Repository: intel/cri-resource-manager
Branch: master
Commit: 886388e7a4a7
Files: 555
Total size: 2.2 MB

Directory structure:
gitextract_8uskpwqi/
├── .githooks/
│   ├── pre-commit.d/
│   │   ├── 00-gofmt
│   │   ├── 10-shellcheck
│   │   └── 20-go-version
│   └── run-hooks
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── feature_request.md
│   │   └── new-release.md
│   └── workflows/
│       ├── common-build-docs.yaml
│       ├── common-build-images.yaml
│       ├── common-codeql.yaml
│       ├── common-trivy.yaml
│       ├── common-verify-code.yaml
│       ├── publish-devel-images.yaml
│       ├── publish-docs.yml
│       ├── release.yaml
│       ├── trivy-csv.tpl
│       ├── verify-periodic.yaml
│       ├── verify-pr-code.yaml
│       └── verify-pr-docs.yaml
├── .gitignore
├── CODEOWNERS
├── Jenkinsfile
├── LICENSE
├── Makefile
├── README.md
├── SECURITY.md
├── cmd/
│   ├── cri-resmgr/
│   │   ├── cri-resource-manager.service.in
│   │   ├── cri-resource-manager.sysconf
│   │   ├── fallback.cfg.sample
│   │   └── main.go
│   ├── cri-resmgr-agent/
│   │   ├── Dockerfile
│   │   ├── agent-deployment.yaml
│   │   └── main.go
│   ├── cri-resmgr-agent-probe/
│   │   └── main.go
│   └── cri-resmgr-webhook/
│       ├── Dockerfile
│       ├── handlers.go
│       ├── main.go
│       ├── mutating-webhook-config.yaml
│       ├── webhook-deployment.yaml
│       └── webhook.go
├── demo/
│   ├── blockio/
│   │   ├── bb-scanner.yaml
│   │   ├── cri-resmgr-config.default.yaml
│   │   └── run.sh
│   └── lib/
│       ├── command.bash
│       ├── distro.bash
│       ├── host.bash
│       ├── numactlH2numajson.py
│       ├── topology.py
│       ├── topology2qemuopts.py
│       └── vm.bash
├── dockerfiles/
│   └── cross-build/
│       ├── Dockerfile.debian-11
│       ├── Dockerfile.debian-12
│       ├── Dockerfile.debian-sid
│       ├── Dockerfile.fedora
│       ├── Dockerfile.opensuse-leap-15.6
│       ├── Dockerfile.ubuntu-18.04
│       ├── Dockerfile.ubuntu-20.04
│       ├── Dockerfile.ubuntu-22.04
│       └── Dockerfile.ubuntu-24.04
├── docs/
│   ├── Dockerfile
│   ├── _templates/
│   │   └── layout.html
│   ├── conf.py
│   ├── contributing.md
│   ├── demos/
│   │   ├── blockio.md
│   │   └── index.rst
│   ├── developers-guide/
│   │   ├── architecture.md
│   │   ├── cri-test.md
│   │   ├── e2e-test.md
│   │   ├── index.rst
│   │   ├── policy-writers-guide.md
│   │   ├── testing.rst
│   │   └── unit-test.md
│   ├── index.html
│   ├── index.rst
│   ├── installation.md
│   ├── introduction.md
│   ├── migration-to-NRI.md
│   ├── node-agent.md
│   ├── policy/
│   │   ├── balloons.md
│   │   ├── blockio.md
│   │   ├── container-affinity.md
│   │   ├── cpu-allocator.md
│   │   ├── dynamic-pools.md
│   │   ├── index.rst
│   │   ├── podpools.md
│   │   ├── rdt.md
│   │   ├── static-pools.md
│   │   └── topology-aware.md
│   ├── quick-start.md
│   ├── reference/
│   │   ├── agent-command-line-reference.md
│   │   ├── configuration-reference.md
│   │   ├── index.rst
│   │   └── resmgr-command-line-reference.md
│   ├── releases/
│   │   ├── conf.py
│   │   └── index.md
│   ├── requirements.txt
│   ├── security.md
│   ├── setup.md
│   └── webhook.md
├── elf/
│   └── avx512.c
├── go.mod
├── go.sum
├── packaging/
│   ├── deb.in/
│   │   ├── changelog
│   │   ├── compat
│   │   ├── control
│   │   └── rules
│   └── rpm/
│       └── cri-resource-manager.spec.in
├── pkg/
│   ├── agent/
│   │   ├── agent.go
│   │   ├── api/
│   │   │   └── v1/
│   │   │       ├── api.go
│   │   │       ├── api.pb.go
│   │   │       ├── api.proto
│   │   │       ├── api_grpc.pb.go
│   │   │       └── constants.go
│   │   ├── config-updater.go
│   │   ├── flags.go
│   │   ├── kubernetes.go
│   │   ├── server.go
│   │   └── watcher.go
│   ├── apis/
│   │   └── resmgr/
│   │       ├── expression.go
│   │       ├── expression_test.go
│   │       ├── generated/
│   │       │   ├── clientset/
│   │       │   │   └── versioned/
│   │       │   │       ├── clientset.go
│   │       │   │       ├── doc.go
│   │       │   │       ├── fake/
│   │       │   │       │   ├── clientset_generated.go
│   │       │   │       │   ├── doc.go
│   │       │   │       │   └── register.go
│   │       │   │       ├── scheme/
│   │       │   │       │   ├── doc.go
│   │       │   │       │   └── register.go
│   │       │   │       └── typed/
│   │       │   │           └── resmgr/
│   │       │   │               └── v1alpha1/
│   │       │   │                   ├── adjustment.go
│   │       │   │                   ├── doc.go
│   │       │   │                   ├── fake/
│   │       │   │                   │   ├── doc.go
│   │       │   │                   │   ├── fake_adjustment.go
│   │       │   │                   │   └── fake_resmgr_client.go
│   │       │   │                   ├── generated_expansion.go
│   │       │   │                   └── resmgr_client.go
│   │       │   ├── informers/
│   │       │   │   └── externalversions/
│   │       │   │       ├── factory.go
│   │       │   │       ├── generic.go
│   │       │   │       ├── internalinterfaces/
│   │       │   │       │   └── factory_interfaces.go
│   │       │   │       └── resmgr/
│   │       │   │           ├── interface.go
│   │       │   │           └── v1alpha1/
│   │       │   │               ├── adjustment.go
│   │       │   │               └── interface.go
│   │       │   └── listers/
│   │       │       └── resmgr/
│   │       │           └── v1alpha1/
│   │       │               ├── adjustment.go
│   │       │               └── expansion_generated.go
│   │       └── v1alpha1/
│   │           ├── adjustment-schema.yaml
│   │           ├── adjustment.go
│   │           ├── doc.go
│   │           ├── register.go
│   │           ├── types.go
│   │           └── zz_generated.deepcopy.go
│   ├── avx/
│   │   ├── collector.go
│   │   ├── elfdump.go
│   │   └── register.go
│   ├── blockio/
│   │   ├── blockio.go
│   │   ├── blockio_test.go
│   │   └── config.go
│   ├── cgroups/
│   │   ├── cgroupblkio.go
│   │   ├── cgroupblkio_test.go
│   │   ├── cgroupcontrol.go
│   │   ├── cgroupid.go
│   │   ├── cgrouppath.go
│   │   └── cgroupstats.go
│   ├── cgroupstats/
│   │   └── collector.go
│   ├── config/
│   │   ├── config.go
│   │   ├── data.go
│   │   ├── duration.go
│   │   ├── error.go
│   │   ├── help.go
│   │   ├── log.go
│   │   └── options.go
│   ├── cpuallocator/
│   │   ├── allocator.go
│   │   └── cpuallocator_test.go
│   ├── cri/
│   │   ├── client/
│   │   │   ├── client.go
│   │   │   └── v1/
│   │   │       └── client.go
│   │   ├── relay/
│   │   │   ├── image-service.go
│   │   │   ├── relay.go
│   │   │   └── runtime-service.go
│   │   ├── resource-manager/
│   │   │   ├── agent/
│   │   │   │   └── agent.go
│   │   │   ├── builtin-policies.go
│   │   │   ├── cache/
│   │   │   │   ├── affinity.go
│   │   │   │   ├── affinity_test.go
│   │   │   │   ├── cache.go
│   │   │   │   ├── cache_test.go
│   │   │   │   ├── container.go
│   │   │   │   ├── container_test.go
│   │   │   │   ├── error.go
│   │   │   │   ├── pod.go
│   │   │   │   └── utils.go
│   │   │   ├── config/
│   │   │   │   ├── api/
│   │   │   │   │   └── v1/
│   │   │   │   │       ├── api.pb.go
│   │   │   │   │       ├── api.proto
│   │   │   │   │       └── api_grpc.pb.go
│   │   │   │   ├── config.go
│   │   │   │   └── server.go
│   │   │   ├── control/
│   │   │   │   ├── blockio/
│   │   │   │   │   └── blockio.go
│   │   │   │   ├── control.go
│   │   │   │   ├── cpu/
│   │   │   │   │   ├── api.go
│   │   │   │   │   ├── cache.go
│   │   │   │   │   └── cpu.go
│   │   │   │   ├── cri/
│   │   │   │   │   └── cri.go
│   │   │   │   ├── flags.go
│   │   │   │   ├── memory/
│   │   │   │   │   └── memory.go
│   │   │   │   ├── page-migrate/
│   │   │   │   │   ├── demoter.go
│   │   │   │   │   ├── demoter_test.go
│   │   │   │   │   ├── flags.go
│   │   │   │   │   ├── page-migrate.go
│   │   │   │   │   └── page-mover.go
│   │   │   │   └── rdt/
│   │   │   │       └── rdt.go
│   │   │   ├── controllers.go
│   │   │   ├── error.go
│   │   │   ├── events/
│   │   │   │   └── events.go
│   │   │   ├── events.go
│   │   │   ├── flags.go
│   │   │   ├── introspect/
│   │   │   │   └── introspect.go
│   │   │   ├── kubernetes/
│   │   │   │   ├── kubernetes.go
│   │   │   │   └── resources.go
│   │   │   ├── metrics/
│   │   │   │   ├── avx.go
│   │   │   │   ├── metrics.go
│   │   │   │   └── prometheus.go
│   │   │   ├── no-test-api.go
│   │   │   ├── policy/
│   │   │   │   ├── builtin/
│   │   │   │   │   ├── balloons/
│   │   │   │   │   │   ├── balloons-policy.go
│   │   │   │   │   │   ├── balloons-policy_test.go
│   │   │   │   │   │   ├── cputree.go
│   │   │   │   │   │   ├── cputree_test.go
│   │   │   │   │   │   ├── fillmethod.go
│   │   │   │   │   │   ├── flags.go
│   │   │   │   │   │   └── metrics.go
│   │   │   │   │   ├── dynamic-pools/
│   │   │   │   │   │   ├── cpu.go
│   │   │   │   │   │   ├── dyp.go
│   │   │   │   │   │   ├── dyp_test.go
│   │   │   │   │   │   ├── flags.go
│   │   │   │   │   │   └── metrics.go
│   │   │   │   │   ├── none/
│   │   │   │   │   │   └── none-policy.go
│   │   │   │   │   ├── podpools/
│   │   │   │   │   │   ├── flags.go
│   │   │   │   │   │   ├── metrics.go
│   │   │   │   │   │   ├── podpools-policy.go
│   │   │   │   │   │   └── podpools-policy_test.go
│   │   │   │   │   ├── static/
│   │   │   │   │   │   ├── flags.go
│   │   │   │   │   │   └── static-policy.go
│   │   │   │   │   ├── static-plus/
│   │   │   │   │   │   └── static-plus-policy.go
│   │   │   │   │   ├── static-pools/
│   │   │   │   │   │   ├── config.go
│   │   │   │   │   │   ├── node.go
│   │   │   │   │   │   ├── stp-policy.go
│   │   │   │   │   │   └── stp-policy_test.go
│   │   │   │   │   └── topology-aware/
│   │   │   │   │       ├── affinity.go
│   │   │   │   │       ├── cache.go
│   │   │   │   │       ├── cache_test.go
│   │   │   │   │       ├── coldstart.go
│   │   │   │   │       ├── coldstart_test.go
│   │   │   │   │       ├── error.go
│   │   │   │   │       ├── flags.go
│   │   │   │   │       ├── hint.go
│   │   │   │   │       ├── hint_test.go
│   │   │   │   │       ├── logging.go
│   │   │   │   │       ├── mocks_test.go
│   │   │   │   │       ├── node.go
│   │   │   │   │       ├── pod-preferences.go
│   │   │   │   │       ├── pod-preferences_test.go
│   │   │   │   │       ├── pools.go
│   │   │   │   │       ├── pools_test.go
│   │   │   │   │       ├── resources.go
│   │   │   │   │       └── topology-aware-policy.go
│   │   │   │   ├── error.go
│   │   │   │   ├── flags.go
│   │   │   │   └── policy.go
│   │   │   ├── requests.go
│   │   │   ├── resource-manager.go
│   │   │   ├── sockets/
│   │   │   │   └── sockets.go
│   │   │   ├── test-api.go
│   │   │   └── visualizer/
│   │   │       ├── bubbles/
│   │   │       │   ├── assets/
│   │   │       │   │   ├── css/
│   │   │       │   │   │   └── style.css
│   │   │       │   │   ├── index.html
│   │   │       │   │   └── js/
│   │   │       │   │       ├── ui-json-adapter.js
│   │   │       │   │       └── ui.js
│   │   │       │   ├── assets.go
│   │   │       │   ├── assets_generate.go
│   │   │       │   └── doc.go
│   │   │       ├── builtins.go
│   │   │       ├── flags.go
│   │   │       └── visualizer.go
│   │   └── server/
│   │       ├── server.go
│   │       └── services.go
│   ├── dump/
│   │   ├── doc.go
│   │   ├── dump.go
│   │   ├── dump_test.go
│   │   └── flags.go
│   ├── instrumentation/
│   │   ├── flags.go
│   │   ├── grpc.go
│   │   ├── http/
│   │   │   ├── http.go
│   │   │   └── http_test.go
│   │   ├── instrumentation.go
│   │   ├── instrumentation_test.go
│   │   ├── jaeger.go
│   │   ├── prometheus.go
│   │   └── service.go
│   ├── log/
│   │   ├── default.go
│   │   ├── flags.go
│   │   ├── grpc-logger.go
│   │   ├── klogcontrol/
│   │   │   └── klogcontrol.go
│   │   ├── log.go
│   │   ├── ratelimit.go
│   │   ├── ratelimit_test.go
│   │   ├── signal.go
│   │   └── stdlog-logger.go
│   ├── metrics/
│   │   ├── metrics.go
│   │   └── register/
│   │       ├── register_metrics.go
│   │       └── register_metrics_avx.go
│   ├── pidfile/
│   │   ├── pidfile.go
│   │   └── pidfile_test.go
│   ├── policycollector/
│   │   └── collector.go
│   ├── procstats/
│   │   └── procstats.go
│   ├── sysfs/
│   │   ├── error.go
│   │   ├── parsers.go
│   │   ├── system.go
│   │   └── utils.go
│   ├── testutils/
│   │   └── verify.go
│   ├── topology/
│   │   ├── go.mod
│   │   ├── test-cleanup.sh
│   │   ├── test-setup.sh
│   │   ├── topology.go
│   │   └── topology_test.go
│   ├── utils/
│   │   ├── cpuset/
│   │   │   ├── cpuset.go
│   │   │   └── cpuset_test.go
│   │   ├── json.go
│   │   ├── net.go
│   │   ├── parse.go
│   │   ├── sort.go
│   │   └── tar.go
│   └── version/
│       └── version.go
├── runtime-deps.csv
├── sample-configs/
│   ├── balloons-policy.cfg
│   ├── blockio.cfg
│   ├── cri-full-message-dump.cfg
│   ├── cri-resmgr-configmap.example.yaml
│   ├── external-adjustment.yaml
│   ├── podpools-policy.cfg
│   ├── static-policy.cfg
│   ├── static-pools-policy.conf.example
│   └── topology-aware-policy.cfg
├── scripts/
│   ├── build/
│   │   ├── docker-build-image
│   │   ├── get-buildid
│   │   └── update-gh-pages.sh
│   ├── code-generator/
│   │   ├── boilerplate.go.txt
│   │   └── generate-groups.sh
│   ├── hack/
│   │   ├── create-webhook-secrets.sh
│   │   ├── go-mod-replace-helper.sh
│   │   ├── go-mod-tree
│   │   └── install-protobuf
│   └── testing/
│       ├── crictl
│       ├── jaeger
│       ├── kube-cgroups
│       ├── pairwise
│       ├── prometheus
│       ├── prometheus.yaml
│       └── set-path
└── test/
    ├── critest/
    │   ├── run.sh
    │   ├── topology-aware-policy.cfg
    │   └── tsl
    ├── e2e/
    │   ├── benchmarks.test-suite/
    │   │   └── memtier_benchmark/
    │   │       ├── cri-resmgr.cfg
    │   │       ├── memtier-benchmark-02.yaml.in
    │   │       ├── memtier-benchmark.yaml.in
    │   │       ├── n4c16/
    │   │       │   ├── test01-memtier-stress-ng/
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── post-process.sh
    │   │       │   ├── test02-multi-memtier/
    │   │       │   │   └── code.var.sh
    │   │       │   └── topology.var.json
    │   │       ├── redis-secret.yaml.in
    │   │       ├── redis-service.yaml.in
    │   │       ├── redis.yaml.in
    │   │       ├── stress-ng-benchmark.yaml.in
    │   │       └── stress-ng.yaml.in
    │   ├── besteffort.yaml.in
    │   ├── blockio.test-suite/
    │   │   ├── blockio/
    │   │   │   └── n4c16/
    │   │   │       ├── test00-slowreader/
    │   │   │       │   └── code.var.sh
    │   │   │       ├── topology.var.json
    │   │   │       └── vm-files/
    │   │   │           └── etc/
    │   │   │               ├── containers/
    │   │   │               │   └── blockio.yaml
    │   │   │               └── crio/
    │   │   │                   └── crio.conf.d/
    │   │   │                       └── 55-blockio
    │   │   ├── containerd_src.var.in.sh
    │   │   ├── crio_src.var.in.sh
    │   │   ├── k8scri.var.in.sh
    │   │   └── omit_cri_resmgr.var.sh
    │   ├── burstable.yaml.in
    │   ├── cri-resmgr-topology-aware.cfg
    │   ├── guaranteed.yaml.in
    │   ├── packages.test-suite/
    │   │   ├── debian-11/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── debian-12/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── debian-sid/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── fedora/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── opensuse-15.6/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── ubuntu-18.04/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── ubuntu-20.04/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── ubuntu-22.04/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   └── ubuntu-24.04/
    │   │       ├── binsrc.var
    │   │       ├── cri-resmgr.cfg
    │   │       ├── distro.var
    │   │       ├── pkgtest/
    │   │       │   ├── test01-systemd/
    │   │       │   │   └── code.var.sh
    │   │       │   └── topology.var.json
    │   │       └── reinstall_cri_resmgr.var
    │   ├── policies.test-suite/
    │   │   ├── balloons/
    │   │   │   ├── balloons-busybox.yaml.in
    │   │   │   ├── balloons-configmap.yaml.in
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── n4c16/
    │   │   │   │   ├── test01-basic-placement/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test02-prometheus-metrics/
    │   │   │   │   │   ├── balloons-metrics.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test03-reserved/
    │   │   │   │   │   ├── balloons-reserved.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test05-namespace/
    │   │   │   │   │   ├── balloons-namespace.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test06-update-configmap/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test07-maxballoons/
    │   │   │   │   │   ├── balloons-maxballoons-impossible.cfg
    │   │   │   │   │   ├── balloons-maxballoons.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test08-numa/
    │   │   │   │   │   ├── balloons-numa.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test09-isolated/
    │   │   │   │   │   ├── balloons-isolated.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test10-allocator-opts/
    │   │   │   │   │   ├── balloons-allocator-opts.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   ├── n4c32/
    │   │   │   │   ├── test01-dynamic-baloons/
    │   │   │   │   │   ├── balloons-dynamic.cfg
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── multicontainerpod.yaml.in
    │   │   │   │   └── topology.var.json
    │   │   │   └── verify.source.sh
    │   │   ├── check-correct-policy.source.sh
    │   │   ├── dynamic-pools/
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── dyp-busybox.yaml.in
    │   │   │   ├── dyp-configmap.yaml.in
    │   │   │   ├── n4c16/
    │   │   │   │   ├── test01-basic-placement/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test02-prometheus-metrics/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── dyp-metrics.cfg
    │   │   │   │   ├── test03-rebalancing/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test04-reserved/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── dyp-reserved.cfg
    │   │   │   │   ├── test05-namespace/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── dyp-namespace.cfg
    │   │   │   │   ├── test06-update-configmap/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test07-numa/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── dyp-numa.cfg
    │   │   │   │   └── topology.var.json
    │   │   │   └── verify.source.sh
    │   │   ├── podpools/
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── n4c16/
    │   │   │   │   ├── podpools-configmap.yaml.in
    │   │   │   │   ├── py_consts.var.py
    │   │   │   │   ├── test01-basic-placement/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test02-fill-order/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test03-qos/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test04-overbook-cpus/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test05-agent-updates-config/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test06-prometheus-metrics/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── podpools-metrics.cfg
    │   │   │   │   ├── test07-custom-default-pool/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── podpools-custom-default.cfg
    │   │   │   │   └── topology.var.json
    │   │   │   └── podpools-busybox.yaml.in
    │   │   ├── static-pools/
    │   │   │   ├── README.txt
    │   │   │   ├── cmk-exclusive.yaml.in
    │   │   │   ├── cmk-isolate.yaml.in
    │   │   │   ├── cmk-tolerating-guaranteed.yaml.in
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── n4c16/
    │   │   │   │   ├── cri-resmgr-static-pools.cfg
    │   │   │   │   ├── py_consts.var.py
    │   │   │   │   ├── test00-node-status/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test01-exclusive-pods/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test02-pods-without-cmk/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test03-cmk-isolate/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test04-cmk-isolate-noaffinity/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test05-negative-tests/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test99-cleanup/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── topology.var.json
    │   │   │   │   └── vm-files/
    │   │   │   │       └── etc/
    │   │   │   │           └── cmk/
    │   │   │   │               └── pools.conf
    │   │   │   └── static-pools-lib.source.sh
    │   │   └── topology-aware/
    │   │       ├── c4pmem4/
    │   │       │   ├── test01-pmem-node-assigning/
    │   │       │   │   └── code.var.sh
    │   │       │   ├── test02-annotation-memory-type/
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── memtype-guaranteed.yaml.in
    │   │       │   ├── test02-annotation-memory-type-deprecated-syntax/
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── memtype-guaranteed.yaml.in
    │   │       │   ├── test03-coldstart/
    │   │       │   │   ├── bb-coldstart.yaml.in
    │   │       │   │   └── code.var.sh
    │   │       │   ├── test03-coldstart-deprecated-syntax/
    │   │       │   │   ├── bb-coldstart.yaml.in
    │   │       │   │   └── code.var.sh
    │   │       │   ├── test04-dynamic-page-demotion/
    │   │       │   │   ├── bb-memload.yaml.in
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── cri-resmgr-dynamic-page-demotion.cfg
    │   │       │   ├── test04-dynamic-page-demotion-deprecated-syntax/
    │   │       │   │   ├── bb-memload.yaml.in
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── cri-resmgr-dynamic-page-demotion.cfg
    │   │       │   ├── test05-guarantee-memory/
    │   │       │   │   └── code.var.sh
    │   │       │   └── topology.var.json
    │   │       ├── cri-resmgr.cfg
    │   │       └── n4c16/
    │   │           ├── test00-basic-placement/
    │   │           │   ├── code.var.sh
    │   │           │   └── cri-resmgr.cfg.in
    │   │           ├── test01-always-fits/
    │   │           │   └── code.var.sh
    │   │           ├── test02-shrink-and-grow-shared/
    │   │           │   └── code.var.sh
    │   │           ├── test03-simple-affinity/
    │   │           │   ├── code.var.sh
    │   │           │   └── guaranteed+affinity.yaml.in
    │   │           ├── test04-available-resources/
    │   │           │   ├── code.var.sh
    │   │           │   └── cri-resmgr-available-resources.cfg.in
    │   │           ├── test05-reserved-resources/
    │   │           │   ├── code.var.sh
    │   │           │   └── cri-resmgr-reserved.cfg.in
    │   │           ├── test06-fuzz/
    │   │           │   ├── code.var.sh
    │   │           │   ├── codelib.sh
    │   │           │   ├── fuzz.aal
    │   │           │   ├── fuzz.fmbt.conf
    │   │           │   └── generate.sh
    │   │           ├── test07-mixed-allocations/
    │   │           │   ├── code.var.sh
    │   │           │   └── guaranteed-annotated.yaml.in
    │   │           ├── test08-isolcpus/
    │   │           │   ├── code.var.sh
    │   │           │   └── guaranteed-annotated.yaml.in
    │   │           ├── test09-container-exit/
    │   │           │   └── code.var.sh
    │   │           ├── test10-additional-reserved-namespaces/
    │   │           │   ├── code.var.sh
    │   │           │   └── cri-resmgr-reserved-namespaces.cfg.in
    │   │           ├── test11-reserved-cpu-annotations/
    │   │           │   ├── code.var.sh
    │   │           │   ├── cri-resmgr-reserved-annotations.cfg.in
    │   │           │   └── reserved-annotated.yaml.in
    │   │           └── topology.var.json
    │   ├── run.sh
    │   ├── run_all_configurations.sh
    │   └── run_tests.sh
    └── functional/
        ├── e2e_test.go
        └── fake_cri_server_test.go

================================================
FILE CONTENTS
================================================

================================================
FILE: .githooks/pre-commit.d/00-gofmt
================================================
#!/bin/bash
# Copyright 2012 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

# git gofmt pre-commit hook
#
# To use, store as .git/hooks/pre-commit inside your repository and make sure
# it has execute permissions.
#
# This script does not handle file names that contain spaces.

if [ -z "$(command -v gofmt)" ]; then
    echo >&2 "WARNING: Cannot check/enforce Go code formatting: can't find gofmt."
    echo >&2 "WARNING: Please consider installing gofmt."
    exit 0
fi

gofiles=$(git diff --cached --name-only --diff-filter=ACM | grep '\.go$')
[ -z "$gofiles" ] && exit 0

# shellcheck disable=SC2086
unformatted=$(gofmt -l $gofiles)
[ -z "$unformatted" ] && exit 0

# Some files are not gofmt'd. Print message and fail.
echo >&2 "Go files must be formatted with gofmt. Please run:"
for fn in $unformatted; do
    echo >&2 "  gofmt -w $PWD/$fn"
done
exit 1

================================================
FILE: .githooks/pre-commit.d/10-shellcheck
================================================
#!/bin/bash
# git shellcheck pre-commit hook
#
# To use, store as .git/hooks/pre-commit/shellcheck inside your repository
# and make sure it has execute permissions.
#
# This script does not handle file names that contain spaces.
#

if [ -z "$(command -v shellcheck)" ]; then
    echo >&2 "WARNING: Cannot shellcheck scripts: can't find shellcheck."
    echo >&2 "WARNING: Please consider installing shellcheck."
    exit 0
fi

shfiles=$(git diff --cached --name-only --diff-filter=ACM -- '*.sh' '*.bash')
#echo >&2 "[$0: shfiles: $shfiles]"
for f in $(git diff --cached --name-only --diff-filter=ACM); do
    if grep -EHn '^#!/bin/.*sh *' "$f" | grep -q ':1:#!'; then
        shfiles="$shfiles $f"
    fi
done
shfiles="$(echo "$shfiles" | tr -s '\t ' '\n' | sort | uniq)"
#echo >&2 "[$0: shfiles: $shfiles]"

# shellcheck disable=SC2086
if [ -z "$shfiles" ] || shellcheck $shfiles; then
    exit 0
fi

# Some files do not pass ShellCheck. Print message and fail.
echo >&2 "shell scripts must pass ShellCheck. Please fix them."
exit 1

================================================
FILE: .githooks/pre-commit.d/20-go-version
================================================
#!/bin/bash

WORKFLOWS=".github/workflows/verify.yml"

if git diff --cached go.mod | grep -q '^+go '; then
    gomod=$(go list -m -f '{{.GoVersion}}')
else
    exit 0
fi

status=0
for wf in $WORKFLOWS; do
    workflow=$(grep 'go-version:' $wf | sed 's/^.*: //')
    if [ "$gomod" != "$workflow" ]; then
        echo >&2 "ERROR: inconsistent golang versions, $gomod in go.mod but $workflow in $wf..."
        status=1
    fi
done

if [ "$status" != 0 ]; then
    echo >&2 "Please consider fixing these inconsistencies before committing..."
fi

exit $status

================================================
FILE: .githooks/run-hooks
================================================
#!/bin/bash

type=${0##*/}
hdir=$0.d
orig=${0%/*}/../.git/hooks/$type

exec 1>&2

for hlet in "$hdir"/???*; do
    case $hlet in
        *~|*.swp) continue ;;
        [0-9][0-9]-*) ;;
    esac
    if [ ! -x "$hlet" ]; then
        continue
    fi
    echo ""
    $hlet
    r=$?
    if [ $r != 0 ]; then
        exit $r
    fi
done

if [ -x "$orig" ]; then
    echo ""
    $orig
    exit $?
fi
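The dispatcher derives the hook type and hooklet directory from $0, so one plausible way to wire it up (an assumption inferred from the script, not documented in this tree) is a pre-commit entry next to pre-commit.d:

# Sketch: enable the hooklets via git's core.hooksPath (git >= 2.9).
# run-hooks resolves type=${0##*/} and hdir=$0.d, so invoking it as
# .githooks/pre-commit makes it run every .githooks/pre-commit.d/NN-* hooklet,
# then chain to any original .git/hooks/pre-commit.
git config core.hooksPath .githooks
ln -s run-hooks .githooks/pre-commit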
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**

**Expected behavior**

**To Reproduce**

**Environment**

**Additional context**

================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Describe the solution you'd like**

**Why this is needed**

================================================
FILE: .github/ISSUE_TEMPLATE/new-release.md
================================================
---
name: New release
about: Propose a new release
title: Release v0.0.0
labels: ''
assignees: ''

---

## Release Process

- [ ] In the issue description, add a changelog section, describing changes since the last release.
- Local release preparations
  - [ ] Perform mandatory internal release checks and preparations.
  - [ ] Run `make release-tests` to run an extended set of tests prior to a release.
  - [ ] Sync/tidy up dependencies.
    - [ ] Run `go mod tidy`.
    - [ ] Run `git commit -m 'go.mod,go.sum: update dependencies.' go.{mod,sum}`, if necessary.
  - [ ] Run `git tag -a -m "CRI Resource Manager release $VERSION" $VERSION`.
- Publishing
  - [ ] Push the tag with `git push $VERSION`. This will automatically build container images and release assets and upload the release assets to a new draft release.
  - [ ] Check that release assets were created for the tag
    - Container images are published
      - https://hub.docker.com/r/intel/cri-resmgr-agent/tags
      - https://hub.docker.com/r/intel/cri-resmgr-webhook/tags
    - Release assets are uploaded to the draft release
      - RPM packages
      - DEB package
      - Binary tarball
      - Source+dependencies tarball (vendored dist)
  - [ ] Update the automatically created draft release corresponding to the tag.
    - [ ] Write the change log to the release.
    - [ ] Mark the release as a non-production pre-release if necessary.
    - [ ] Save as draft.
  - [ ] Get the change log OK'd by other maintainers.
  - [ ] Publish the draft as a release.
- [ ] Add a link to the tagged release in this issue.
- [ ] Close this issue.

## Changelog

### Major changes

### Detailed changelog

================================================
FILE: .github/workflows/common-build-docs.yaml
================================================
name: Build documentation

on:
  workflow_call:
    inputs:
      publish:
        default: false
        required: false
        type: boolean

permissions:
  contents: read

jobs:
  update-gh-pages:
    runs-on: ubuntu-22.04
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Fetch gh-pages
        run: git fetch --no-tags --prune --depth=1 origin refs/heads/gh-pages:refs/heads/gh-pages

      - name: Install build dependencies
        run: |
          pip3 install --user -r docs/requirements.txt
          echo "`python3 -m site --user-base`/bin" >> $GITHUB_PATH

      - name: Add docs from this revision to gh-pages
        run: |
          git config user.name "Github"
          git config user.email "no-reply@github.com"
          ./scripts/build/update-gh-pages.sh

      - name: Publish gh-pages
        if: ${{ inputs.publish }}
        shell: bash
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          git push https://${GITHUB_ACTOR}:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git gh-pages
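The update-gh-pages job can be approximated locally; a rough sketch using only commands the workflow itself runs:

# Approximate the docs build locally (a sketch, not a supported entry point).
git fetch --no-tags --prune --depth=1 origin refs/heads/gh-pages:refs/heads/gh-pages
pip3 install --user -r docs/requirements.txt
export PATH="$(python3 -m site --user-base)/bin:$PATH"
./scripts/build/update-gh-pages.sh   # commits the rebuilt docs onto the local gh-pages branch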
================================================
FILE: .github/workflows/common-build-images.yaml
================================================
name: Build container images

on:
  workflow_call:
    inputs:
      image-tag:
        default: ${{ github.ref_name }}
        required: false
        type: string
      publish:
        default: false
        required: false
        type: boolean
      github-environment:
        default: null
        required: false
        type: string

permissions:
  contents: read

jobs:
  build-images:
    name: Build and publish container images
    runs-on: ubuntu-22.04
    environment: ${{ inputs.github-environment }}
    env:
      IMAGE_REPO: intel
      IMAGE_VERSION: ${{ inputs.image-tag }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Build images
        run: "make images IMAGE_VERSION=${IMAGE_VERSION} Q="

      - name: Login to Docker Hub
        if: ${{ inputs.publish }}
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Push images
        if: ${{ inputs.publish }}
        run: "make images-push IMAGE_VERSION=${IMAGE_VERSION} Q="

================================================
FILE: .github/workflows/common-codeql.yaml
================================================
name: CodeQL scanning

on:
  workflow_call:
    inputs:
      export-report:
        default: false
        required: false
        type: boolean

permissions:
  contents: read

jobs:
  codeql-scan:
    runs-on: ubuntu-22.04
    permissions:
      security-events: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod

      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: go

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3

      - name: Generate CodeQL Security Report
        if: ${{ inputs.export-report }}
        uses: rsdmike/github-security-report-action@v3.0.4
        with:
          template: report
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload PDF report as an artifact
        if: ${{ inputs.export-report }}
        uses: actions/upload-artifact@v4
        with:
          name: codeql-report
          path: report.pdf

================================================
FILE: .github/workflows/common-trivy.yaml
================================================
name: Trivy scanning

on:
  workflow_call:
    inputs:
      upload-to-github-security-tab:
        default: false
        required: false
        type: boolean
      export-csv:
        default: false
        required: false
        type: boolean

permissions:
  contents: read

jobs:
  trivy-scan-licenses:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Run Trivy in fs mode
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: fs
          scan-ref: .
          exit-code: 1
          scanners: license
          severity: "UNKNOWN,MEDIUM,HIGH,CRITICAL"

  trivy-scan-vulns:
    runs-on: ubuntu-22.04
    permissions:
      security-events: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Run Trivy in fs mode
        continue-on-error: true
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: fs
          scan-ref: .
          exit-code: 1
          list-all-pkgs: true
          format: json
          output: trivy-report.json

      - name: Show report in human-readable format
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: convert
          vuln-type: ''
          severity: ''
          image-ref: trivy-report.json
          format: table

      - name: Convert report to sarif
        if: ${{ inputs.upload-to-github-security-tab }}
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: convert
          vuln-type: ''
          severity: ''
          image-ref: trivy-report.json
          format: sarif
          output: trivy-report.sarif

      - name: Upload sarif report to GitHub Security tab
        if: ${{ inputs.upload-to-github-security-tab }}
        uses: github/codeql-action/upload-sarif@v3
        with:
          sarif_file: trivy-report.sarif

      - name: Convert report to csv
        if: ${{ inputs.export-csv }}
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: convert
          vuln-type: ''
          severity: ''
          image-ref: trivy-report.json
          format: template
          template: "@.github/workflows/trivy-csv.tpl"
          output: trivy-report.csv

      - name: Upload CSV report as an artifact
        if: ${{ inputs.export-csv }}
        uses: actions/upload-artifact@v4
        with:
          name: trivy-report
          path: trivy-report.csv

================================================
FILE: .github/workflows/common-verify-code.yaml
================================================
name: Verify code

on:
  - workflow_call

permissions:
  contents: read

jobs:
  build-and-test:
    runs-on: ubuntu-22.04
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
        id: go

      - name: Install golangci-lint
        run: curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.64.7

      - name: Gofmt
        run: make format

      - name: Build
        run: make

      - name: Test
        run: make test

      - name: Golangci-lint
        run: |
          export PATH=$PATH:$(go env GOPATH)/bin
          make golangci-lint

      - name: Codecov report
        run: bash <(curl -s https://codecov.io/bash)

  trivy-scan:
    uses: "./.github/workflows/common-trivy.yaml"
    permissions:
      contents: read
      security-events: write
    with:
      upload-to-github-security-tab: true

  codeql-scan:
    uses: "./.github/workflows/common-codeql.yaml"
    permissions:
      contents: read
      security-events: write
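The same gate can be run locally before opening a PR; a sketch built from the exact commands of the build-and-test job above:

# Rough local equivalent of the build-and-test job.
curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | \
    sh -s -- -b "$(go env GOPATH)/bin" v1.64.7
make format && make && make test
PATH="$PATH:$(go env GOPATH)/bin" make golangci-lint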
================================================
FILE: .github/workflows/publish-devel-images.yaml
================================================
name: Build and publish devel container images

on:
  push:
    branches: ["master"]

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}-${{ github.ref_name }}
  cancel-in-progress: true

jobs:
  trivy-scan:
    uses: "./.github/workflows/common-trivy.yaml"
    permissions:
      contents: read
      security-events: write

  publish-images:
    uses: "./.github/workflows/common-build-images.yaml"
    needs: [trivy-scan]
    secrets: inherit
    with:
      publish: true
      image-tag: "devel"
      github-environment: "staging"

================================================
FILE: .github/workflows/publish-docs.yml
================================================
name: Publish documentation

on:
  push:
    branches:
      - master
      - release-*
    # Path filters are ignored for tags
    paths:
      - "docs/**"
      - "Makefile"
    tags:
      - v*

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

jobs:
  update-gh-pages:
    uses: "./.github/workflows/common-build-docs.yaml"
    permissions:
      contents: write
    with:
      publish: true

================================================
FILE: .github/workflows/release.yaml
================================================
name: Build and publish release artifacts

on:
  push:
    tags: [ 'v*' ]

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}-${{ github.ref_name }}
  cancel-in-progress: true

jobs:
  trivy-scan:
    uses: "./.github/workflows/common-trivy.yaml"
    permissions:
      contents: read
      security-events: write
    with:
      export-csv: true

  codeql:
    uses: "./.github/workflows/common-codeql.yaml"
    permissions:
      contents: read
      security-events: write
    with:
      export-report: true

  publish-images:
    uses: "./.github/workflows/common-build-images.yaml"
    needs: [trivy-scan]
    secrets: inherit
    with:
      publish: true
      image-tag: ${{ github.ref_name }}
      github-environment: "release"

  build-packages:
    needs: [trivy-scan]
    permissions:
      contents: write
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Build packages
        run: "make cross-packages Q="

      - name: Build vendored dist tarball
        run: "make vendored-dist Q="

      - name: Upload release assets
        uses: softprops/action-gh-release@v1
        with:
          name: ${{ github.ref_name }}
          draft: true
          append_body: true
          files: |
            packages/release-assets/*
            vendored-cri-resource-manager-*.tar.gz

================================================
FILE: .github/workflows/trivy-csv.tpl
================================================
{{ range . }}
Trivy Vulnerability Scan Results ({{- .Target -}})
VulnerabilityID,Severity,CVSS Score,Title,Library,Vulnerable Version,Fixed Version,Information URL,Triage Information
{{ range .Vulnerabilities }}
{{- .VulnerabilityID }},
{{- .Severity }},
{{- range $key, $value := .CVSS }}
{{- if (eq $key "nvd") }}
{{- .V3Score -}}
{{- end }}
{{- end }},
{{- quote .Title }},
{{- quote .PkgName }},
{{- quote .InstalledVersion }},
{{- quote .FixedVersion }},
{{- .PrimaryURL }}
{{ else -}}
No vulnerabilities found at this time.
{{ end }}
Trivy Dependency Scan Results ({{ .Target }})
ID,Name,Version,Notes
{{ range .Packages -}}
{{- quote .ID }},
{{- quote .Name }},
{{- quote .Version }}
{{ else -}}
No dependencies found at this time.
{{ end }}
{{ end }}
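The template can also be exercised with a local trivy CLI; a sketch assuming trivy's convert subcommand (the workflow drives the same conversion through trivy-action's scan-type: convert):

# Sketch: produce a JSON report, then render it through the CSV template.
trivy fs --list-all-pkgs --format json --output trivy-report.json .
trivy convert --format template --template "@.github/workflows/trivy-csv.tpl" \
    --output trivy-report.csv trivy-report.json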
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: Makefile ================================================ # We use bashisms in this Makefile. SHELL := /bin/bash # Go compiler/toolchain and extra related binaries we ues/need. GO_PARALLEL := GO_CMD := go GO_BUILD := $(GO_CMD) build $(GO_PARALLEL) GO_GEN := $(GO_CMD) generate -x GO_INSTALL := $(GO_CMD) install GO_FMT := gofmt GO_CYCLO := gocyclo GO_LINT := golint GO_CILINT := golangci-lint GO_VERSION ?= 1.24.1 GOLICENSES_VERSION ?= v1.5.0 # TEST_TAGS is the set of extra build tags passed for tests. # We disable AVX collector for tests by default. TEST_TAGS := noavx,test GO_TEST := $(GO_CMD) test $(GO_PARALLEL) -tags $(TEST_TAGS) GO_VET := $(GO_CMD) vet -tags $(TEST_TAGS) TEST_SETUP := test-setup.sh TEST_CLEANUP := test-cleanup.sh # Disable some golangci_lint checkers for now until we have an more acceptable baseline... GO_CILINT_CHECKERS := -D unused,staticcheck,errcheck,deadcode,structcheck,gosimple,revive -E gofmt GO_CILINT_RUNFLAGS := --build-tags $(TEST_TAGS) # Protoc compiler and protobuf definitions we might need to recompile. 
# Protoc compiler and protobuf definitions we might need to recompile.
PROTOC := $(shell command -v protoc;)
PROTOBUFS = $(shell find cmd pkg -name \*.proto)
PROTOCODE := $(patsubst %.proto,%.pb.go,$(PROTOBUFS))
PROTO_INCLUDE = -I$(PWD):/usr/local/include:/usr/include
PROTO_OPTIONS = --proto_path=. $(PROTO_INCLUDE) \
    --go_opt=paths=source_relative --go_out=. \
    --go-grpc_opt=paths=source_relative --go-grpc_out=.
PROTO_COMPILE = $(PROTOC) $(PROTO_OPTIONS)

# ShellCheck for checking shell scripts.
SHELLCHECK := shellcheck

CLANG := clang
KERNEL_VERSION ?= $(shell uname -r)
KERNEL_HEADERS_DIR ?= /lib/modules/$(KERNEL_VERSION)/source
KERNEL_BUILD_DIR ?= /lib/modules/$(KERNEL_VERSION)/build
# Directory for full kernel sources
KERNEL_SRC_DIR ?= /usr/src/linux

# Binaries and directories for installation.
INSTALL := install
PREFIX ?= /usr
BINDIR ?= $(PREFIX)/bin
UNITDIR ?= $(PREFIX)/lib/systemd/system
DOCDIR ?= $(PREFIX)/share/doc/cri-resource-manager
SYSCONFDIR ?= /etc
CONFIGDIR ?= /etc/cri-resmgr
DEFAULTDIR ?= $(shell \
    [ -d /etc/rpm ] && { echo /etc/sysconfig; exit 0; }; \
    [ -f /etc/debian_version ] && { echo /etc/default; exit 0; }; \
    echo unknown; exit 1)

# Directories (in cmd) with go code we'll want to build and install.
BUILD_DIRS = $(shell find cmd -name \*.go | sed 's:cmd/::g;s:/.*::g' | uniq)
BUILD_BINS = $(foreach dir,$(BUILD_DIRS),bin/$(dir))

# Directories (in cmd) with go code we'll want to create Docker images from.
IMAGE_DIRS = $(shell find cmd -name Dockerfile | sed 's:cmd/::g;s:/.*::g' | uniq)
IMAGE_VERSION := $(shell git describe --dirty 2> /dev/null || echo unknown)
ifdef IMAGE_REPO
    override IMAGE_REPO := $(IMAGE_REPO)/
endif

# List of our active go modules.
GO_LIST_MODULES := $(GO_CMD) list ./... | grep -v vendor/
GO_PKG_SRC = $(shell find pkg -name \*.go)

# List of visualizer collateral files to go generate.
UI_ASSETS := $(shell for i in pkg/cri/resource-manager/visualizer/*; do \
    if [ -d "$$i" -a -e "$$i/assets_generate.go" ]; then \
        echo $$i/assets_gendata.go; \
    fi; \
done)

# Right now we don't depend on libexec/%.o on purpose so make sure the file
# is always up-to-date when elf/avx512.c is changed.
GEN_TARGETS := pkg/avx/programbytes_gendata.go $(PROTOCODE)

# Determine binary version and buildid, and versions for rpm, deb, and tar packages.
BUILD_VERSION := $(shell scripts/build/get-buildid --version --shell=no)
BUILD_BUILDID := $(shell scripts/build/get-buildid --buildid --shell=no)
RPM_VERSION := $(shell scripts/build/get-buildid --rpm --shell=no)
DEB_VERSION := $(shell scripts/build/get-buildid --deb --shell=no)
TAR_VERSION := $(shell scripts/build/get-buildid --tar --shell=no)

# Kubernetes version we pull in as modules and our external API versions.
KUBERNETES_VERSION := $(shell grep 'k8s.io/kubernetes ' go.mod | sed 's/^.* //')
RESMGR_API_VERSION := $(shell ls pkg/apis/resmgr | grep '^v[0-9]*')

# Git (tagged) version and revisions we'll use to linker-tag our binaries with.
RANDOM_ID := "$(shell head -c20 /dev/urandom | od -An -tx1 | tr -d ' \n')"

ifdef STATIC
    STATIC_LDFLAGS:=-extldflags=-static
    BUILD_TAGS:=-tags osusergo,netgo
endif

LDFLAGS = \
    -ldflags "$(STATIC_LDFLAGS) -X=github.com/intel/cri-resource-manager/pkg/version.Version=$(BUILD_VERSION) \
        -X=github.com/intel/cri-resource-manager/pkg/version.Build=$(BUILD_BUILDID) \
        -B 0x$(RANDOM_ID)"

# Build non-optimized version for debugging on make DEBUG=1.
DEBUG ?= 0
ifeq ($(DEBUG),1)
    GCFLAGS=-gcflags "all=-N -l"
else
    GCFLAGS=
endif
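Spelled out, the LDFLAGS above stamp pkg/version at link time; roughly what the bin/% rule further below ends up running for one binary (an illustrative expansion, not an exact transcript):

# Illustrative expansion of a single-binary build with version stamping.
go build \
    -ldflags "-X=github.com/intel/cri-resource-manager/pkg/version.Version=$(scripts/build/get-buildid --version --shell=no) \
        -X=github.com/intel/cri-resource-manager/pkg/version.Build=$(scripts/build/get-buildid --buildid --shell=no)" \
    -o bin/cri-resmgr ./cmd/cri-resmgr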
# Release/end-to-end testing. Specify E2E_TESTS to override the default test set.
E2E_RUN := reinstall_cri_resmgr=1 test/e2e/run_tests.sh

# tar-related commands and options.
TAR := tar
TAR_UPDATE := $(TAR) -uf
GZIP := gzip
GZIP_DC := gzip -dc
GZEXT := .gz

# Metadata for packages, changelog, etc.
USER_NAME ?= $(shell git config user.name)
USER_EMAIL ?= $(shell git config user.email)
BUILD_DATE ?= $(shell date -R)

# RPM spec files we might want to generate.
SPEC_FILES = $(shell find packaging -name \*.spec.in | sed 's/.spec.in/.spec/g' | uniq)

# Systemd collateral.
SYSTEMD_DIRS = $(shell find cmd -name \*.service -o -name \*.socket | sed 's:cmd/::g;s:/.*::g'|uniq)
SYSCONF_DIRS = $(shell find cmd -name \*.sysconf | sed 's:cmd/::g;s:/.*::g' | uniq)

DOCKER := docker
# Extra options to pass to docker (for instance --network host).
DOCKER_OPTIONS =
# Set this to empty to prevent 'docker build' from trying to pull all image refs.
DOCKER_PULL := --pull

# Docker boilerplate/commands to build debian/ubuntu packages.
DOCKER_DEB_BUILD := \
    cd /build && \
    tar -xvf /build/input/cri-resource-manager-$(TAR_VERSION).tar.gz && \
    cd cri-resource-manager-$(TAR_VERSION) && \
    cp -r /build/input/debian . && \
    dpkg-buildpackage -uc && \
    cp ../*.{buildinfo,changes,deb,dsc} /output

# Docker boilerplate/commands to build rpm packages.
DOCKER_RPM_BUILD := \
    mkdir -p ~/rpmbuild/{SOURCES,SPECS} && \
    cp -v /build/input/*.spec ~/rpmbuild/SPECS && \
    cp -v /build/input/*.tar.* ~/rpmbuild/SOURCES && \
    for spec in ~/rpmbuild/SPECS/*.spec; do \
        rpmbuild -bb $$spec; \
    done && \
    cp -v $$(rpm --eval %{_rpmdir}/%{_arch})/*.rpm /output

# Docker boilerplate/commands to build binary tarballs.
DOCKER_TAR_BUILD := \
    cd ~ && \
    $(GZIP_DC) /build/input/cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) | \
        $(TAR) -xf - && \
    cd cri-resource-manager-$(TAR_VERSION) && \
    $(MAKE) OUTPUT=/output/ binary-dist

# Docker boilerplate/commands to build binaries.
DOCKER_BIN_BUILD := \
    mkdir ~/build && cd ~/build && \
    tar -xvzf /build/input/cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) && \
    cd cri-resource-manager-$(TAR_VERSION) && \
    make && \
    cp -v bin/* /output

# Documentation-related variables
SPHINXOPTS ?= -W
SPHINXBUILD = sphinx-build
SITE_BUILDDIR ?= _build

# Docker base command for working with html documentation.
DOCKER_SITE_BUILDER_IMAGE := cri-resmgr-site-builder
DOCKER_SITE_CMD := $(DOCKER) run --rm -v "`pwd`:/docs" --user=`id -u`:`id -g` \
    -p 8081:8081 \
    -e SITE_BUILDDIR=$(SITE_BUILDDIR) -e SPHINXOPTS=$(SPHINXOPTS)

# Supported distros with debian native packaging format.
SUPPORTED_DEB_DISTROS := $(shell \
    grep -l 'apt-get ' dockerfiles/cross-build/Dockerfile.* | \
    egrep -v '((~)|(swp))$$' | \
    sed 's:^.*Dockerfile.::g')

# Supported distros with rpm native packaging format.
SUPPORTED_RPM_DISTROS := $(shell \
    egrep -l '(dnf )|(yum )|(zypper )' dockerfiles/cross-build/Dockerfile.* | \
    egrep -v '((~)|(swp))$$' | \
    sed 's:^.*Dockerfile.::g')

# Directory to leave built distro packages and collateral in.
PACKAGES_DIR := packages
# Directory to leave build distro binaries in.
BINARIES_DIR := binaries
# Directory to use to build distro packages.
BUILD_DIR := build

# dist tarball target name
ifneq ($(wildcard .git/.),)
    DIST_TARGET = dist-git
else
    DIST_TARGET = dist-cwd
endif

# Paths to exclude from tarballs generated by dist-cwd.
DIST_EXCLUDE := \
    --exclude="./$$tarball*" \
    --exclude='./cri-resource-manager-*' \
    --exclude='./$(PACKAGES_DIR)*' \
    --exclude='./$(BUILD_DIR)*'
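The Docker build snippets above are consumed by the cross-packaging targets that follow; typical entry points (make targets as defined below, plus vendored-dist as invoked by release.yaml):

# Typical packaging invocations built on the snippets above.
make packages            # native packages for the host distro
make cross-packages Q=   # rpm + deb + tar for all supported distros (used by release.yaml)
make vendored-dist Q=    # vendored source tarball (also invoked by release.yaml)
make dist                # dist tarball via dist-git or dist-cwd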
# Path name transformations for tarballs generated by dist-cwd.
DIST_TRANSFORM := \
    --transform='s:^.:cri-resource-manager-$(TAR_VERSION):'

# Determine distro ID, version and package type.
DISTRO_ID := $(shell . /etc/os-release; echo "$${ID:-unknown}")
DISTRO_VERSION := $(shell . /etc/os-release; echo "$${VERSION_ID:-unknown}")
DISTRO_PACKAGE := $(shell echo $(DISTRO_ID) | tr -d ' \t' | \
    sed -E 's/.*((fedora)|(suse)).*/rpm/;s/.*((ubuntu)|(debian)).*/deb/')

# Be quiet by default but let folks override it with Q= or V=1 on the command line.
ifneq ($(V),1)
    Q := @
endif

# Default target: just build everything.
all: build

#
# Generic targets: build, install, clean, build images.
#

build: $(BUILD_BINS)

build-static:
	$(MAKE) STATIC=1 build

install: $(BUILD_BINS) $(foreach dir,$(BUILD_DIRS),install-bin-$(dir)) \
    $(foreach dir,$(BUILD_DIRS),install-systemd-$(dir)) \
    $(foreach dir,$(BUILD_DIRS),install-sysconf-$(dir)) \
    $(foreach dir,$(BUILD_DIRS),install-config-$(dir))

clean: clean-bin clean-spec clean-deb clean-ui-assets clean-html

images: $(foreach dir,$(IMAGE_DIRS),image-$(dir))

images-push: $(foreach dir,$(IMAGE_DIRS),image-push-$(dir))

#
# Rules for building and installing binaries, or building docker images, and cleaning up.
#

KERNEL_INCLUDE_DIRS = /include \
    /include/uapi \
    /include/generated/uapi \
    /arch/x86/include \
    /arch/x86/include/uapi \
    /arch/x86/include/generated/uapi

KERNEL_INCLUDES := $(strip $(foreach kernel_dir,$(KERNEL_HEADERS_DIR) $(KERNEL_BUILD_DIR),$(addprefix -I,$(wildcard $(addprefix $(kernel_dir),$(KERNEL_INCLUDE_DIRS))))))

libexec/%.o: elf/%.c
	$(Q)if [ -z "$(KERNEL_INCLUDES)" ]; then echo "Cannot build $@: invalid KERNEL_HEADERS_DIR=$(KERNEL_HEADERS_DIR)"; exit 1; fi
	$(Q)echo "Building $@"
	$(Q)mkdir -p libexec
	$(Q)$(CLANG) -nostdinc -D __KERNEL__ $(KERNEL_INCLUDES) -O2 -Wall -target bpf -c $< -o $@

bin/%: .static.%.$(STATIC)
	$(Q)bin=$(notdir $@); src=./cmd/$$bin; \
	echo "Building $$([ -n "$(STATIC)" ] && echo 'static ')$@ (version $(BUILD_VERSION), build $(BUILD_BUILDID))..."; \
	mkdir -p bin && \
	$(GO_BUILD) $(BUILD_TAGS) $(LDFLAGS) $(GCFLAGS) -o bin/ $$src

.static.%.$(STATIC):
	$(Q)if [ ! -f "$@" ]; then \
	    touch "$@"; \
	fi; \
	old="$@"; old="$${old%.*}"; \
	if [ -n "$(STATIC)" ]; then \
	    rm -f "$$old."; \
	else \
	    rm -f "$$old.1"; \
	fi

.PRECIOUS: $(foreach dir,$(BUILD_DIRS),.static.$(dir).1 .static.$(dir).)
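Common invocations of the build machinery above, per the variables the rules honor (the .static.% marker files exist only to force a rebuild when STATIC toggles between runs):

# Common build invocations.
make                          # build all binaries under bin/
make STATIC=1 build           # statically linked binaries (osusergo,netgo build tags)
make DEBUG=1 bin/cri-resmgr   # non-optimized build of one binary for debugging
make V=1 build                # verbose build; V=1 clears the Q=@ command prefix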
install-bin-%: bin/% $(Q)bin=$(patsubst install-bin-%,%,$@); dir=cmd/$$bin; \ echo "Installing $$bin in $(DESTDIR)$(BINDIR)..."; \ $(INSTALL) -d $(DESTDIR)$(BINDIR) && \ $(INSTALL) -m 0755 -t $(DESTDIR)$(BINDIR) bin/$$bin; \ install-systemd-%: $(Q)bin=$(patsubst install-systemd-%,%,$@); dir=cmd/$$bin; \ echo "Installing systemd collateral for $$bin..."; \ $(INSTALL) -d $(DESTDIR)$(UNITDIR) && \ for f in $$(find $$dir -name \*.service -o -name \*.socket); do \ echo " $$f in $(DESTDIR)$(UNITDIR)..."; \ $(INSTALL) -m 0644 -t $(DESTDIR)$(UNITDIR) $$f.in; \ done; \ for f in $$(find $$dir -name \*.service.in -o -name \*.socket.in); do \ echo " $$f in $(DESTDIR)$(UNITDIR)..."; \ df=$${f##*/}; df=$${df%.in}; \ $(INSTALL) -m 0644 -T $$f $(DESTDIR)$(UNITDIR)/$$df; \ sed -E -i -e "s:__DEFAULTDIR__:$(DEFAULTDIR):g" \ -e "s:__BINDIR__:$(BINDIR):g" $(DESTDIR)$(UNITDIR)/$$df; \ done install-sysconf-%: $(Q)bin=$(patsubst install-sysconf-%,%,$@); dir=cmd/$$bin; \ echo "Installing sysconf/default collateral for $$bin..."; \ $(INSTALL) -d $(DESTDIR)$(DEFAULTDIR) && \ for f in $$(find $$dir -name \*.sysconf); do \ echo " $$f in $(DESTDIR)$(DEFAULTDIR)..."; \ df=$${f##*/}; df=$${df%.sysconf}; \ $(INSTALL) -m 0644 -T $$f $(DESTDIR)$(DEFAULTDIR)/$$df; \ done install-config-%: $(Q)bin=$(patsubst install-config-%,%,$@); dir=cmd/$$bin; \ echo "Installing sample configuration collateral for $$bin..."; \ $(INSTALL) -d $(DESTDIR)$(CONFIGDIR) && \ for f in $$(find $$dir -name \*.cfg.sample); do \ echo " $$f in $(DESTDIR)$(CONFIGDIR)..."; \ df=$${f##*/}; \ $(INSTALL) -m 0644 -T $$f $(DESTDIR)$(CONFIGDIR)/$${df}; \ done install-minimal-docs: $(Q)echo "Installing minimal documentation to $(DOCDIR)..."; \ $(INSTALL) -d $(DESTDIR)$(DOCDIR) && \ for f in LICENSE docs/security.md; do \ echo " $$f in $(DESTDIR)$(DOCDIR)..."; \ df=$${f##*/}; \ $(INSTALL) -m 0644 -T $$f $(DESTDIR)$(DOCDIR)/$${df}; \ done install-licenses: $(Q)for cmd in $(BUILD_DIRS); do \ install -D LICENSE $(DESTDIR)/licenses/$$cmd/LICENSE && \ go-licenses save ./cmd/$$cmd \ --ignore github.com/intel/cri-resource-manager \ --save_path $(DESTDIR)/licenses/$$cmd/go-licenses; \ done clean-bin: $(foreach dir,$(BUILD_DIRS),clean-$(dir)) $(Q)rm -f .static.* clean-%: $(Q)bin=$(patsubst clean-%,%,$@); src=cmd/$$bin; \ echo "Cleaning up $$bin..."; \ rm -f bin/$$bin clean-gen: $(Q)rm -f $(GEN_TARGETS) image-%: $(Q)bin=$(patsubst image-%,%,$@); \ $(DOCKER) build . -f "cmd/$$bin/Dockerfile" \ --build-arg GO_VERSION=$(GO_VERSION) \ --build-arg GOLICENSES_VERSION=$(GOLICENSES_VERSION) \ -t $(IMAGE_REPO)$$bin:$(IMAGE_VERSION) image-push-%: $(Q)bin=$(patsubst image-push-%,%,$@); \ if [ -z "$(IMAGE_REPO)" ]; then echo "ERROR: no IMAGE_REPO specified"; exit 1; fi; \ $(DOCKER) push $(IMAGE_REPO)$$bin:$(IMAGE_VERSION) # # Rules for format checking, various code quality and complexity checks and measures. # format: $(Q)report=`$(GO_FMT) -s -d -w $$(find cmd pkg test/functional -name \*.go)`; \ if [ -n "$$report" ]; then \ echo "$$report"; \ exit 1; \ fi vet: $(Q)$(GO_VET) $(shell $(GO_LIST_MODULES)) cyclomatic-check: $(Q)report=`$(GO_CYCLO) -over 15 cmd pkg`; \ if [ -n "$$report" ]; then \ echo "Complexity is over 15 in"; \ echo "$$report"; \ exit 1; \ fi lint: $(Q)rc=0; \ for f in $$(find -name \*.go | grep -v \.\/vendor); do \ $(GO_LINT) -set_exit_status $$f || rc=1; \ done; \ exit $$rc golangci-lint: $(Q)$(GO_CILINT) run $(GO_CILINT_RUNFLAGS) $(GO_CILINT_CHECKERS) shellcheck: $(Q)for f in $$(git grep -n '^#!/bin/.*sh *' | grep ':1:#!' 
| sed 's/:1:.*//'); do \ echo "shellchecking $$f..."; \ $(SHELLCHECK) $$f; \ done # # Rules for running unit/module tests. # test: test-setup test-run test-cleanup race-test racetest: test-setup racetest-run test-cleanup test-setup: $(Q)for i in $$(find . -name $(TEST_SETUP)); do \ echo "+ Running test setup $$i..."; \ (cd $${i%/*}; \ if [ -x "$(TEST_SETUP)" ]; then \ ./$(TEST_SETUP); \ fi); \ done test-cleanup: $(Q)for i in $$(find . -name $(TEST_CLEANUP)); do \ echo "- Running test cleanup $$i..."; \ (cd $${i%/*}; \ if [ -x "$(TEST_CLEANUP)" ]; then \ ./$(TEST_CLEANUP); \ fi); \ done test-run: ifndef WHAT $(Q)$(GO_TEST) -race -coverprofile=coverage.txt -covermode=atomic \ $(shell $(GO_LIST_MODULES)) else $(Q)if [ -n '$(TESTS)' ]; then \ run="-run $(TESTS)"; \ fi; \ cd $(WHAT) && \ $(GO_TEST) $$run -v -cover -coverprofile cover.out || rc=1; \ $(GO_CMD) tool cover -html=cover.out -o coverage.html; \ rm cover.out; \ echo "Coverage report: file://$$(realpath coverage.html)"; \ exit $$rc endif racetest-run: ifndef WHAT $(Q)$(GO_TEST) -race -coverprofile=coverage.txt -covermode=atomic \ $(shell $(GO_LIST_MODULES)) else $(Q)cd $(WHAT) && \ $(GO_TEST) -race -coverprofile=cover.out -covermode=atomic || rc=1; \ $(GO_CMD) tool cover -html=cover.out -o coverage.html; \ rm cover.out; \ echo "Coverage report: file://$$(realpath coverage.html)"; \ exit $$rc endif release-tests: e2e-tests e2e-tests: build-static $(Q)tests="$(if $(E2E_TESTS),$(E2E_TESTS),test/e2e/policies.test-suite)"; \ $(E2E_RUN) $$tests; \ if [ "$$?" != "0" ]; then \ echo "You will drop into interactive mode on failures if you run the e2e tests as"; \ echo " on_verify_fail=interactive $(E2E_RUN) $$tests"; \ exit 1; \ fi packaging-tests: cross-packages $(Q)cleanup=1 omit_agent=1 $(E2E_RUN) test/e2e/packages.test-suite # # Rules for building distro packages. # ifneq ($(DISTRO_ID),fedora) packages: cross-$(DISTRO_PACKAGE).$(DISTRO_ID)-$(DISTRO_VERSION) else packages: cross-$(DISTRO_PACKAGE).$(DISTRO_ID) endif cross-packages: cross-rpm cross-deb cross-tar cross-rpm: $(foreach d,$(SUPPORTED_RPM_DISTROS),cross-rpm.$(d)) cross-deb: $(foreach d,$(SUPPORTED_DEB_DISTROS),cross-deb.$(d)) cross-bin: $(foreach d,$(SUPPORTED_RPM_DISTROS),cross-bin.$(d)) \ $(foreach d,$(SUPPORTED_DEB_DISTROS),cross-bin.$(d)) # # Rules for building dist-tarballs, rpm, and deb packages. # dist: $(DIST_TARGET) dist-git: $(Q)echo "Using git to create dist tarball $(TAR_VERSION) from $(BUILD_BUILDID)..."; \ tardir=cri-resource-manager-$(TAR_VERSION) && \ tarball=cri-resource-manager-$(TAR_VERSION).tar && \ git archive --format=tar --prefix=$$tardir/ HEAD > $$tarball && \ mkdir -p $$tardir && \ echo $(BUILD_VERSION) > $$tardir/version && \ echo $(BUILD_BUILDID) > $$tardir/buildid && \ $(TAR) -uf $$tarball $$tardir && \ rm -f $$tarball.* && \ $(GZIP) $$tarball && \ rm -fr $$tardir dist-cwd: $(Q)echo "Using tar to create dist tarball $(TAR_VERSION) from $$(pwd)..."; \ tardir=cri-resource-manager-$(TAR_VERSION) && \ tarball=cri-resource-manager-$(TAR_VERSION).tar && \ $(TAR) $(DIST_EXCLUDE) $(DIST_TRANSFORM) -cvf - .
> $$tarball && \ mkdir -p $$tardir && \ echo $(BUILD_VERSION) > $$tardir/version && \ echo $(BUILD_BUILDID) > $$tardir/buildid && \ $(TAR_UPDATE) $$tarball $$tardir && \ rm -f $$tarball.* && \ $(GZIP) $$tarball && \ rm -fr $$tardir vendored-dist: dist $(Q)echo "Creating vendored dist tarball $(TAR_VERSION)..."; \ tardir=cri-resource-manager-$(TAR_VERSION) && \ tarball=cri-resource-manager-$(TAR_VERSION).tar && \ cp $$tarball$(GZEXT) vendored-$$tarball$(GZEXT) && \ $(GZIP_DC) vendored-$$tarball$(GZEXT) | tar -xf - && \ go mod vendor -v && \ mkdir -p $$tardir && \ mv vendor $$tardir && \ rm -f vendored-$$tarball* && \ $(TAR) -cf vendored-$$tarball $$tardir && \ $(GZIP) vendored-$$tarball && \ rm -fr $$tardir binary-dist: $(Q)tarball=$(OUTPUT)cri-resource-manager-$(TAR_VERSION).$$(uname -m).tar; \ echo "Creating binary dist tarball $$tarball..."; \ tardir=binary-dist; \ rm -fr $$tarball* $$tardir && \ $(MAKE) DESTDIR=$$tardir \ BUILD_DIRS=cri-resmgr \ PREFIX=/opt/intel \ DEFAULTDIR=/etc/default \ UNITDIR=$(SYSCONFDIR)/systemd/system install install-minimal-docs && \ $(MAKE) DESTDIR=$$tardir/opt/intel/ install-licenses && \ $(TAR) -C $$tardir -cf $$tarball . && \ $(GZIP) $$tarball && \ rm -fr $$tardir spec: clean-spec $(SPEC_FILES) %.spec: $(Q)echo "Generating RPM spec file $@..."; \ cp $@.in $@ && \ sed -E -i -e "s/__VERSION__/$(RPM_VERSION)/g" \ -e "s/__TARVERSION__/$(TAR_VERSION)/g" \ -e "s/__BUILDID__/$(BUILD_BUILDID)/g" $@ clean-spec: $(Q)rm -f $(SPEC_FILES) cross-rpm.%: docker/cross-build/% clean-spec spec dist $(Q)distro=$(patsubst cross-rpm.%,%,$@); \ builddir=$(BUILD_DIR)/docker/$$distro; \ outdir=$(PACKAGES_DIR)/$$distro; \ echo "Docker cross-building $$distro packages..."; \ mkdir -p $(PACKAGES_DIR)/$$distro && \ rm -fr $$builddir && mkdir -p $$builddir/{input,build} && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) $$builddir/input && \ cp packaging/rpm/cri-resource-manager.spec $$builddir/input && \ $(DOCKER) run --rm $(DOCKER_OPTIONS) --user $$USER \ --env USER_NAME="$(USER_NAME)" --env USER_EMAIL=$(USER_EMAIL) \ -v $$(pwd)/$$builddir:/build \ -v $$(pwd)/$$outdir:/output \ -v "`go env GOMODCACHE`:/home/$$USER/go/pkg/mod" \ $$distro-build /bin/bash -c '$(DOCKER_RPM_BUILD)' && \ rm -fr $$builddir && \ install -D -m644 $$outdir/cri-resource-manager-$(RPM_VERSION)-0.x86_64.rpm $(PACKAGES_DIR)/release-assets/cri-resource-manager-$(RPM_VERSION)-0.$$distro.x86_64.rpm src.rpm source-rpm: spec dist mkdir -p ~/rpmbuild/{SOURCES,SPECS} && \ cp packaging/rpm/cri-resource-manager.spec ~/rpmbuild/SPECS && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) ~/rpmbuild/SOURCES && \ rpmbuild -bs ~/rpmbuild/SPECS/cri-resource-manager.spec rpm: source-rpm rpmbuild -bb ~/rpmbuild/SPECS/cri-resource-manager.spec debian/%: packaging/deb.in/% $(Q)echo "Generating debian packaging file $@..."; \ tardir=cri-resource-manager-$(TAR_VERSION) && \ tarball=cri-resource-manager-$(TAR_VERSION).tar && \ mkdir -p debian; \ cp $< $@ && \ sed -E -i -e "s/__PACKAGE__/cri-resource-manager/g" \ -e "s/__TARBALL__/$$tarball/g" \ -e "s/__VERSION__/$(DEB_VERSION)/g" \ -e "s/__AUTHOR__/$(USER_NAME)/g" \ -e "s/__EMAIL__/$(USER_EMAIL)/g" \ -e "s/__DATE__/$(BUILD_DATE)/g" $@ clean-deb: $(Q)rm -fr debian cross-deb.%: docker/cross-build/% \ clean-deb debian/changelog debian/control debian/rules debian/compat dist $(Q)distro=$(patsubst cross-deb.%,%,$@); \ echo "Docker cross-building $$distro packages..."; \ builddir=$(BUILD_DIR)/docker/$$distro; \ outdir=$(PACKAGES_DIR)/$$distro; \ mkdir -p $(PACKAGES_DIR)/$$distro && \ 
rm -fr $$builddir && mkdir -p $$builddir/{input,build} && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) $$builddir/input && \ cp -r debian $$builddir/input && \ $(DOCKER) run --rm $(DOCKER_OPTIONS) --user $$USER \ --env USER_NAME="$(USER_NAME)" --env USER_EMAIL=$(USER_EMAIL) \ -v $$(pwd)/$$builddir:/build \ -v $$(pwd)/$$outdir:/output \ -v "`go env GOMODCACHE`:/home/$$USER/go/pkg/mod" \ $$distro-build /bin/bash -c '$(DOCKER_DEB_BUILD)' && \ rm -fr $$builddir && \ install -D -m644 $$outdir/cri-resource-manager_$(DEB_VERSION)_amd64.deb $(PACKAGES_DIR)/release-assets/cri-resource-manager_$(DEB_VERSION)_$${distro}_amd64.deb deb: debian/changelog debian/control debian/rules debian/compat dist dpkg-buildpackage -uc cross-bin.%: docker/cross-build/% dist $(Q)distro=$(patsubst cross-bin.%,%,$@); \ echo "Docker cross-building $$distro binaries..."; \ builddir=$(BUILD_DIR)/docker/$$distro; \ outdir=$(BINARIES_DIR)/$$distro; \ mkdir -p $(BINARIES_DIR)/$$distro && \ rm -fr $$builddir && mkdir -p $$builddir/{input,build} && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) $$builddir/input && \ $(DOCKER) run --rm $(DOCKER_OPTIONS) --user $$USER \ --env USER_NAME="$(USER_NAME)" --env USER_EMAIL=$(USER_EMAIL) \ -v $$(pwd)/$$builddir:/build \ -v $$(pwd)/$$outdir:/output \ -v "`go env GOMODCACHE`:/home/$$USER/go/pkg/mod" \ $$distro-build /bin/bash -c '$(DOCKER_BIN_BUILD)' && \ rm -fr $$builddir cross-tar cross-tarball: dist docker/cross-build/fedora $(Q)distro=tarball; \ builddir=$(BUILD_DIR)/docker/$$distro; \ outdir=$(PACKAGES_DIR)/$$distro; \ echo "Docker cross-building $$distro packages..."; \ mkdir -p $$outdir && \ rm -fr $$builddir && mkdir -p $$builddir/{input,build} && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) $$builddir/input && \ $(DOCKER) run --rm $(DOCKER_OPTIONS) --user $$USER \ --env USER_NAME="$(USER_NAME)" --env USER_EMAIL=$(USER_EMAIL) \ -v $$(pwd)/$$builddir:/build \ -v $$(pwd)/$$outdir:/output \ -v "`go env GOMODCACHE`:/home/$$USER/go/pkg/mod" \ fedora-build /bin/bash -c '$(DOCKER_TAR_BUILD)' && \ rm -fr $$builddir && \ install -D -m644 -t $(PACKAGES_DIR)/release-assets $$outdir/cri-resource-manager-$(TAR_VERSION).x86_64.tar.gz # Build a docker image (for distro cross-building). docker/cross-build/%: dockerfiles/cross-build/Dockerfile.% $(Q)distro=$(patsubst docker/cross-build/%,%,$@) && \ echo "Building cross-build docker image for $$distro..." && \ img=$${distro}-build && $(DOCKER) rm $$distro-build || : && \ scripts/build/docker-build-image $$distro-build \ $(DOCKER_PULL) \ --build-arg GO_VERSION=$(GO_VERSION) \ --build-arg GOLICENSES_VERSION=$(GOLICENSES_VERSION) \ $(DOCKER_OPTIONS) # Rule for recompiling a changed protobuf. %.pb.go: %.proto $(Q)if [ -n "$(PROTOC)" -o ! -e "$@" ]; then \ echo "Generating go code ($@) for updated protobuf $<..."; \ $(PROTO_COMPILE) $<; \ else \ echo "WARNING: no protoc found, compiling with OUTDATED $@..."; \ fi # Rule for installing in-repo git hooks. install-git-hooks: $(Q)if [ -d .git -a ! -e .git-hooks.redirected ]; then \ echo -n "Redirecting git hooks to .githooks..."; \ git config core.hookspath .githooks && \ touch .git-hooks.redirected && \ echo "done."; \ fi # Rules for installing protoc and related utilities. 
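# A usage sketch for the protobuf rules (the exact protoc invocation is up to
# $(PROTO_COMPILE)): after editing pkg/agent/api/v1/api.proto, something like
#
#   make install-protoc-tools
#   make PROTOC=protoc pkg/agent/api/v1/api.pb.go
#
# should regenerate the go code; the %.pb.go pattern rule above only
# recompiles when PROTOC is set or the generated file is missing.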
install-protoc: $(Q)./scripts/hack/install-protobuf install-protoc-gen-go: $(Q)$(GO_INSTALL) google.golang.org/protobuf/cmd/protoc-gen-go@v1.28.0 install-protoc-gen-go-grpc: $(Q)$(GO_INSTALL) google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.2.0 install-protoc-tools: install-protoc install-protoc-gen-go install-protoc-gen-go-grpc # # go dependencies for our binaries (careful with that axe, Eugene...) # bin/cri-resmgr: $(wildcard cmd/cri-resmgr/*.go) $(UI_ASSETS) $(GEN_TARGETS) \ $(shell for dir in \ $(shell go list -f '{{ join .Deps "\n"}}' ./cmd/cri-resmgr/... | \ grep cri-resource-manager/pkg/ | \ sed 's#github.com/intel/cri-resource-manager/##g'); do \ find $$dir -name \*.go; \ done | sort | uniq) bin/cri-resmgr-agent: $(wildcard cmd/cri-resmgr-agent/*.go) \ $(shell for dir in \ $(shell go list -f '{{ join .Deps "\n"}}' ./cmd/cri-resmgr-agent/... | \ grep cri-resource-manager/pkg/ | \ sed 's#github.com/intel/cri-resource-manager/##g'); do \ find $$dir -name \*.go; \ done | sort | uniq) bin/cri-resmgr-webhook: $(wildcard cmd/cri-resmgr-webhook/*.go) \ $(shell for dir in \ $(shell go list -f '{{ join .Deps "\n"}}' ./cmd/cri-resmgr-webhook/... | \ grep cri-resource-manager/pkg/ | \ sed 's#github.com/intel/cri-resource-manager/##g'); do \ find $$dir -name \*.go; \ done | sort | uniq) # # rules to run go generators # clean-ui-assets: $(Q)echo "Cleaning up generated UI assets..."; \ for i in $(UI_ASSETS); do \ echo " - $$i"; \ rm -f $$i; \ done %_gendata.go:: $(Q)echo "Generating $@..."; \ cd $(dir $@) && \ $(GO_GEN) || exit 1 && \ cd - > /dev/null pkg/sysfs/sst_types%.go: pkg/sysfs/_sst_types%.go pkg/sysfs/gen_sst_types.sh $(Q)cd $(@D) && \ KERNEL_SRC_DIR=$(KERNEL_SRC_DIR) $(GO_GEN) # # API generation # # unconditionally generate all apis generate-apis: generate-resmgr-api # unconditionally generate (external) resmgr api generate-resmgr-api: $(Q)$(call generate-api,resmgr,$(RESMGR_API_VERSION)) # automatic update of generated code for resource-manager external api pkg/apis/resmgr/$(RESMGR_API_VERSION)/zz_generated.deepcopy.go: \ pkg/apis/resmgr/$(RESMGR_API_VERSION)/types.go $(Q)$(call generate-api,resmgr,$(RESMGR_API_VERSION)) # macro to generate code for api $(1), version $(2) generate-api = \ echo "Generating '$(1)' api, version $(2)..." && \ KUBERNETES_VERSION=$(KUBERNETES_VERSION) \ ./scripts/code-generator/generate-groups.sh all \ github.com/intel/cri-resource-manager/pkg/apis/$(1)/generated \ github.com/intel/cri-resource-manager/pkg/apis $(1):$(2) \ --output-base $(shell pwd)/generate && \ cp -r generate/github.com/intel/cri-resource-manager/pkg/apis/$(1) pkg/apis && \ rm -fr generate/github.com/intel/cri-resource-manager/pkg/apis/$(1) # # dependencies for UI assets baked in using vfsgendev (can't come up with a working pattern rule) # pkg/cri/resource-manager/visualizer/bubbles/assets_gendata.go:: \ $(wildcard pkg/cri/resource-manager/visualizer/bubbles/assets/*.html) \ $(wildcard pkg/cri/resource-manager/visualizer/bubbles/assets/js/*.js) \ $(wildcard pkg/cri/resource-manager/visualizer/bubbles/assets/css/*.css) # phony targets .PHONY: all build install clean test images images-push release-tests e2e-tests \ format vet cyclomatic-check lint golangci-lint \ cross-packages cross-rpm cross-deb # # Rules for documentation # vhtml: _work/venv/.stamp . _work/venv/bin/activate && \ make -C docs html && \ cp -r docs/_build . html: clean-html $(Q)BUILD_VERSION=$(BUILD_VERSION) \ $(SPHINXBUILD) -c docs .
"$(SITE_BUILDDIR)" $(SPHINXOPTS) cp docs/index.html "$(SITE_BUILDDIR)" for d in $$(find docs -name figures -type d); do \ mkdir -p $(SITE_BUILDDIR)/$$d && cp $$d/* $(SITE_BUILDDIR)/$$d; \ done serve-html: html $(Q)cd $(SITE_BUILDDIR) && python3 -m http.server 8081 clean-html: rm -rf $(SITE_BUILDDIR) site-build: .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp $(Q)$(DOCKER_SITE_CMD) $(DOCKER_SITE_BUILDER_IMAGE) make html site-serve: .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp $(Q)$(DOCKER_SITE_CMD) -it $(DOCKER_SITE_BUILDER_IMAGE) make serve-html .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp: docs/Dockerfile docs/requirements.txt docker build -t $(DOCKER_SITE_BUILDER_IMAGE) docs touch $@ # Set up a Python3 environment with the necessary tools for document creation. _work/venv/.stamp: docs/requirements.txt rm -rf ${@D} python3 -m venv ${@D} . ${@D}/bin/activate && pip install -r $< touch $@ ================================================ FILE: README.md ================================================ # CRI Resource Manager for Kubernetes\* ## ⚠️ The project is no longer maintained ⚠️ The CRI Resource manager project is no longer maintained. No further updates, bug fixes or releases are planned. We recommend users migrate to [NRI Plugins](https://github.com/containers/nri-plugins), which provides similar functionality and is actively maintained. Thank you for being part of this journey! ### See our [Documentation][documentation] site for detailed documentation. [documentation]: https://intel.github.io/cri-resource-manager ================================================ FILE: SECURITY.md ================================================ # Security Policy Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. ## Reporting a Vulnerability Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). ================================================ FILE: cmd/cri-resmgr/cri-resource-manager.service.in ================================================ [Unit] Description=A CRI proxy with (hardware) resource aware container placement policies. Documentation=https://github.com/intel/cri-resource-manager Before=kubelet.service LogRateLimitIntervalSec=5 LogRateLimitBurst=100000 [Service] Type=simple EnvironmentFile=__DEFAULTDIR__/cri-resource-manager ExecStart=__BINDIR__/cri-resmgr $CONFIG_OPTIONS $POLICY_OPTIONS Restart=always [Install] WantedBy=multi-user.target ================================================ FILE: cmd/cri-resmgr/cri-resource-manager.sysconf ================================================ # Configuration options to pass to cri-resmgr when started via systemd. # Use a fallback file for configuration if/when we can't acquire one from the agent. CONFIG_OPTIONS="--fallback-config /etc/cri-resmgr/fallback.cfg" # Enable this for preventing the active policy to be changed during startup. #POLICY_OPTIONS="--disable-policy-switch" ================================================ FILE: cmd/cri-resmgr/fallback.cfg.sample ================================================ # # If you pass this file to cri-resmgr using the --fallback-config # command line option, it will be used if configuration cannot be # acquired from any other source (agent, or last configuration # stored in the cache). 
# # Switching Policies: # Recent versions of cri-resmgr will allow changing the active # policy during startup. If you want to prevent this from # happening you can pass the --disable-policy-switch option to # cri-resmgr on the command line. # # With the stock packaging you can control whether # startup-phase policy switching is allowed using the POLICY_OPTIONS # variable in the sysconf file. # # If switching policies is disabled, you can still reset the # active policy manually when cri-resmgr is not running. This # allows cri-resmgr to start up next with a new policy. You # do this by passing the --reset-policy command line option # to cri-resmgr. The full sequence of switching policies this # way is # - stop cri-resmgr (systemctl stop cri-resource-manager), # - reset the active policy (cri-resmgr --reset-policy), # - start cri-resmgr (systemctl start cri-resource-manager) # policy: Active: topology-aware ReservedResources: CPU: 750m logger: Debug: resource-manager,cache,resource-control dump: Config: off:.*,full:((Create)|(Remove)|(Run)|(Update)|(Start)|(Stop)).* ================================================ FILE: cmd/cri-resmgr/main.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package main import ( "flag" "fmt" "os" "strings" "syscall" "time" "github.com/intel/goresctrl/pkg/rdt" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/instrumentation" "github.com/intel/cri-resource-manager/pkg/config" logger "github.com/intel/cri-resource-manager/pkg/log" version "github.com/intel/cri-resource-manager/pkg/version" ) var log = logger.Default() func main() { rate := logger.Rate{Limit: logger.Every(1 * time.Minute)} logger.SetGrpcLogger("grpc", &rate) logger.SetStdLogger("stdlog") rdt.SetLogger(logger.Get("rdt")) printConfig := flag.Bool("print-config", false, "Print configuration and exit.") listPolicies := flag.Bool("list-policies", false, "List available policies.") flag.Parse() switch { case *printConfig: config.Print(nil) os.Exit(0) case *listPolicies: fmt.Printf("Available policies:\n") for _, available := range policy.AvailablePolicies() { fmt.Printf(" * %s: %s\n", available.Name, available.Description) } os.Exit(0) default: if args := flag.Args(); len(args) > 0 { switch args[0] { case "config-help", "help": config.Describe(args[1:]...)
os.Exit(0) default: log.Error("unknown command line arguments: %s", strings.Join(flag.Args(), ",")) flag.Usage() os.Exit(1) } } } logger.Flush() logger.SetupDebugToggleSignal(syscall.SIGUSR1) log.Info("cri-resmgr (version %s, build %s) starting...", version.Version, version.Build) if err := instrumentation.Start(); err != nil { log.Fatal("failed to set up instrumentation: %v", err) } defer instrumentation.Stop() m, err := resmgr.NewResourceManager() if err != nil { log.Fatal("failed to create resource manager instance: %v", err) } if err := m.Start(); err != nil { log.Fatal("failed to start resource manager: %v", err) } for { time.Sleep(15 * time.Second) } } ================================================ FILE: cmd/cri-resmgr-agent/Dockerfile ================================================ ARG GO_VERSION=1.24 FROM golang:${GO_VERSION}-bullseye as builder ARG GOLICENSES_VERSION WORKDIR /go/build # Fetch go dependencies in a separate layer for caching RUN go install github.com/google/go-licenses@${GOLICENSES_VERSION} COPY go.mod go.sum ./ COPY pkg/topology/ pkg/topology/ RUN go mod download -x # Build agent and agent-probe, fully statically linked binary COPY . . RUN CGO_ENABLED=0 make build-static BUILD_DIRS="cri-resmgr-agent cri-resmgr-agent-probe" && \ install -D /go/build/bin/* -t /install_root/bin # Save licenses RUN make install-licenses BUILD_DIRS="cri-resmgr-agent cri-resmgr-agent-probe" DESTDIR=/install_root FROM scratch as final COPY --from=builder /install_root / ENTRYPOINT ["/bin/cri-resmgr-agent"] ================================================ FILE: cmd/cri-resmgr-agent/agent-deployment.yaml ================================================ apiVersion: v1 kind: ServiceAccount metadata: name: cri-resmgr-agent namespace: kube-system --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: cri-resmgr-agent rules: - apiGroups: - "" - criresmgr.intel.com resources: - nodes - configmaps - adjustments - labels - annotations verbs: - get - patch - update - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: cri-resmgr-agent roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: cri-resmgr-agent subjects: - kind: ServiceAccount name: cri-resmgr-agent namespace: kube-system --- apiVersion: apps/v1 kind: DaemonSet metadata: labels: app: cri-resmgr-agent name: cri-resmgr-agent namespace: kube-system spec: selector: matchLabels: app: cri-resmgr-agent template: metadata: labels: app: cri-resmgr-agent spec: serviceAccount: cri-resmgr-agent containers: - name: cri-resmgr-agent env: - name: NODE_NAME valueFrom: fieldRef: fieldPath: spec.nodeName image: IMAGE_PLACEHOLDER imagePullPolicy: Always # for testing securityContext: allowPrivilegeEscalation: false capabilities: drop: ["ALL"] readOnlyRootFilesystem: true volumeMounts: - name: resmgrsockets mountPath: /var/run/cri-resmgr resources: limits: cpu: 1 memory: 512Mi livenessProbe: exec: command: ["/bin/cri-resmgr-agent-probe"] initialDelaySeconds: 5 periodSeconds: 30 # # Notes: This is NOT a readiness probe for the agent itself. # # We (mis)use this readiness probe to propagate information # back to the control plane about any failure on the node to # activate the last updated configuration. Since success or # failure is reflected by whether the agent's pod on the node # is marked Ready, any error in configuration should now be a # watchable condition, at least indirectly. 
One can get more # details about the specifics of any configuration errors by # watching the readiness of the agent's pod and fetching its log # messages if it ever becomes not ready. # readinessProbe: exec: command: ["/bin/cri-resmgr-agent-probe", "-query", "config-status"] initialDelaySeconds: 5 periodSeconds: 30 volumes: - name: resmgrsockets hostPath: path: /var/run/cri-resmgr ================================================ FILE: cmd/cri-resmgr-agent/main.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package main import ( "flag" "github.com/intel/cri-resource-manager/pkg/agent" "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/version" ) func main() { // Disable buffering and make sure that all messages have been emitted at // program exit log.Flush() defer log.Flush() flag.Parse() a, err := agent.NewResourceManagerAgent() if err != nil { log.Fatal("failed to create resource manager agent instance: %v", err) } log.Info("cri-resmgr agent (version %s, build %s) starting...", version.Version, version.Build) if err := a.Run(); err != nil { log.Fatal("%v", err) } } ================================================ FILE: cmd/cri-resmgr-agent-probe/main.go ================================================ /* Copyright 2020 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package main import ( "context" "flag" "fmt" "net" "time" "google.golang.org/grpc" agent_v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1" v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" "github.com/intel/cri-resource-manager/pkg/log" ) func main() { socket := flag.String("agent-socket", sockets.ResourceManagerAgent, "Unix domain socket where agent is serving") query := flag.String("query", "", fmt.Sprintf("query to send, use %q to query status of last config push to resmgr", v1.ConfigStatus)) // Disable logger buffering and make sure that everything has been flushed // when program exits log.Flush() defer log.Flush() flag.Parse() // Try to connect to agent dialOpts := []grpc.DialOption{ grpc.WithInsecure(), grpc.WithDialer(func(sock string, timeout time.Duration) (net.Conn, error) { return net.Dial("unix", sock) }), } conn, err := grpc.Dial(*socket, dialOpts...)
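// Note: grpc.WithInsecure and grpc.WithDialer used above are deprecated in
// newer grpc-go releases. A rough equivalent (a sketch, assuming a grpc-go
// version that ships google.golang.org/grpc/credentials/insecure) would be:
//
//	conn, err := grpc.Dial(*socket,
//		grpc.WithTransportCredentials(insecure.NewCredentials()),
//		grpc.WithContextDialer(func(ctx context.Context, sock string) (net.Conn, error) {
//			return (&net.Dialer{}).DialContext(ctx, "unix", sock)
//		}))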
if err != nil { log.Fatal("failed to connect to agent: %v", err) } cli := agent_v1.NewAgentClient(conn) // Do health check ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel() rpl, err := cli.HealthCheck(ctx, &agent_v1.HealthCheckRequest{ Query: *query, }) if err != nil { log.Fatal("%v", err) } if rpl.Error != "" { log.Fatal("health check negative: %s", rpl.Error) } log.Info("Health check OK") } ================================================ FILE: cmd/cri-resmgr-webhook/Dockerfile ================================================ ARG GO_VERSION=1.24 FROM golang:${GO_VERSION}-bullseye as builder ARG GOLICENSES_VERSION WORKDIR /go/build # Fetch go dependencies in a separate layer for caching RUN go install github.com/google/go-licenses@${GOLICENSES_VERSION} COPY go.mod go.sum ./ COPY pkg/topology/ pkg/topology/ RUN go mod download -x # Build webhook, fully statically linked binary COPY . . RUN CGO_ENABLED=0 make build-static BUILD_DIRS=cri-resmgr-webhook && \ install -D /go/build/bin/* -t /install_root/bin # Save licenses RUN make install-licenses BUILD_DIRS=cri-resmgr-webhook DESTDIR=/install_root FROM scratch as final USER 65534:65534 COPY --from=builder /install_root / ENTRYPOINT ["/bin/cri-resmgr-webhook"] ================================================ FILE: cmd/cri-resmgr-webhook/handlers.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package main import ( "encoding/json" "errors" "fmt" "io" "log" "net/http" admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "sigs.k8s.io/yaml" ) type jsonPatch struct { Op string `json:"op"` Path string `json:"path"` Value interface{} `json:"value"` } type podResourceRequirements struct { InitContainers map[string]corev1.ResourceRequirements `json:"initContainers"` Containers map[string]corev1.ResourceRequirements `json:"containers"` } var scheme = runtime.NewScheme() var codecs = serializer.NewCodecFactory(scheme) // Module inatialization func init() { utilruntime.Must(corev1.AddToScheme(scheme)) utilruntime.Must(admissionv1.AddToScheme(scheme)) } // Helper for creating an AdmissionResponse with an error func errResponse(err error) *admissionv1.AdmissionResponse { return &admissionv1.AdmissionResponse{ Result: &metav1.Status{ Message: err.Error(), }, } } // Dump req/rsp in human-readable form func stringify(r interface{}) string { out, err := yaml.Marshal(r) if err != nil { return fmt.Sprintf("!!!!!\nUnable to stringify %T: %v\n!!!!!", r, err) } return string(out) } // Handle HTTP requests func handle(w http.ResponseWriter, r *http.Request) { var body []byte if r.Body != nil { if data, err := io.ReadAll(r.Body); err == nil { body = data } } // Check Content-Type contentType := r.Header.Get("Content-Type") if contentType != "application/json" { log.Printf("ERROR: incorrect Content-Type (received %s, expect application/json", contentType) return } // Deserialize AdmissionReview request and create an AdmissionReview response arReq := admissionv1.AdmissionReview{} arRsp := admissionv1.AdmissionReview{} deserializer := codecs.UniversalDeserializer() if _, _, err := deserializer.Decode(body, nil, &arReq); err != nil { log.Printf("ERROR: deserializing admission request failed: %v", err) arRsp.Response = errResponse(err) } else if arReq.Request == nil { log.Printf("REQUEST empty") arRsp.Response = errResponse(errors.New("Empty request")) } else { log.Printf("REQUEST:\n%s", stringify(&arReq)) if arReq.Request.Resource.Group != "" || arReq.Request.Resource.Version != "v1" { arRsp.Response = errResponse(fmt.Errorf("Unexpected resource group/version '%s/%s'", arReq.Request.Resource.Group, arReq.Request.Resource.Version)) } else { res := arReq.Request.Resource.Resource switch res { case "pods": arRsp.Kind = "AdmissionReview" arRsp.APIVersion = "admission.k8s.io/v1" arRsp.Response = mutatePodObject(&arReq.Request.Object) default: arRsp.Response = errResponse(fmt.Errorf("Unexpected resource %s", arReq.Request.Resource)) } } // Use the same UID in response that was used in the request arRsp.Response.UID = arReq.Request.UID } log.Printf("RESPONSE:\n%s", stringify(arRsp.Response)) respBytes, err := json.Marshal(arRsp) if err != nil { log.Printf("ERROR: json marshal failed: %v", err) } if _, err := w.Write(respBytes); err != nil { log.Printf("ERROR: failed to write HTTP response: %v", err) } } // Handle AdmissionReview requests for Pod objects func mutatePodObject(rawObj *runtime.RawExtension) *admissionv1.AdmissionResponse { pod := corev1.Pod{} deserializer := codecs.UniversalDeserializer() if _, _, err := deserializer.Decode(rawObj.Raw, nil, &pod); err != nil { log.Printf("ERROR: failed to deserialize Pod object: %v", err) return errResponse(err) } reviewResponse := admissionv1.AdmissionResponse{} 
reviewResponse.Allowed = true patches := []jsonPatch{} // Add a patch to add an empty annotations object if no annotations are found if pod.ObjectMeta.Annotations == nil { patches = append(patches, jsonPatch{Op: "add", Path: "/metadata/annotations", Value: map[string]string{}}) } patch, err := patchResourceAnnotation(&pod) if err != nil { return errResponse(err) } patches = append(patches, patch) reviewResponse.Patch, err = json.Marshal(patches) if err != nil { log.Printf("ERROR: failed to marshal Pod patch: %v", err) return errResponse(err) } patchType := admissionv1.PatchTypeJSONPatch reviewResponse.PatchType = &patchType return &reviewResponse } // Create a Pod (JSON) patch adding resource annotation func patchResourceAnnotation(pod *corev1.Pod) (jsonPatch, error) { patch := jsonPatch{Op: "add", Path: "/metadata/annotations/intel.com~1resources"} // Create annotation that includes all resources of all (init)containers resourceAnnotation := podResourceRequirements{InitContainers: map[string]corev1.ResourceRequirements{}, Containers: map[string]corev1.ResourceRequirements{}} for _, container := range pod.Spec.Containers { resourceAnnotation.Containers[container.Name] = container.Resources } for _, container := range pod.Spec.InitContainers { resourceAnnotation.InitContainers[container.Name] = container.Resources } resourceAnnotationBytes, err := json.Marshal(resourceAnnotation) if err != nil { log.Printf("ERROR: failed to marshal 'intel.com/resources' annotations: %v", err) return patch, err } // Patch Pod annotations to include the "resources" annotation patch.Value = string(resourceAnnotationBytes) return patch, nil } ================================================ FILE: cmd/cri-resmgr-webhook/main.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package main import ( "flag" "log" ) // Parse command line func parseArgs() args { args := args{} flag.IntVar(&args.port, "port", 443, "Port on which to listen for connections") flag.StringVar(&args.certFile, "cert-file", "", "x509 certificate used for authenticating connections") flag.StringVar(&args.keyFile, "key-file", "", "Private x509 key matching --cert-file") flag.Parse() return args } func main() { args := parseArgs() if err := Run(args); err != nil { log.Fatal(err) } } ================================================ FILE: cmd/cri-resmgr-webhook/mutating-webhook-config.yaml ================================================ apiVersion: admissionregistration.k8s.io/v1 kind: MutatingWebhookConfiguration metadata: name: cri-resmgr webhooks: - name: cri-resmgr.intel.com sideEffects: None admissionReviewVersions: ["v1"] rules: - apiGroups: - "" apiVersions: - v1 operations: - CREATE - UPDATE resources: - pods clientConfig: service: namespace: cri-resmgr name: cri-resmgr-webhook caBundle: CA_BUNDLE_PLACEHOLDER ================================================ FILE: cmd/cri-resmgr-webhook/webhook-deployment.yaml ================================================ apiVersion: v1 kind: Namespace metadata: name: cri-resmgr labels: name: cri-resmgr --- apiVersion: apps/v1 kind: Deployment metadata: name: cri-resmgr-webhook namespace: cri-resmgr labels: app: cri-resmgr-webhook spec: replicas: 1 selector: matchLabels: app: cri-resmgr-webhook template: metadata: labels: app: cri-resmgr-webhook spec: containers: - name: cri-resmgr-webhook image: IMAGE_PLACEHOLDER # Convenience pull policy for development imagePullPolicy: Always # Mount the tls cert/key in the default location volumeMounts: - name: certs mountPath: /etc/cri-resmgr-webhook/certs.d/ readOnly: true args: - "-cert-file=/etc/cri-resmgr-webhook/certs.d/svc.crt" - "-key-file=/etc/cri-resmgr-webhook/certs.d/svc.key" - "-port=8443" securityContext: allowPrivilegeEscalation: false capabilities: drop: ["ALL"] readOnlyRootFilesystem: true runAsNonRoot: true resources: limits: cpu: 1 memory: 256Mi livenessProbe: httpGet: scheme: HTTPS port: 8443 httpHeaders: - name: "Content-Type" value: "application/json" initialDelaySeconds: 5 periodSeconds: 30 nodeSelector: node-role.kubernetes.io/control-plane: "" tolerations: - key: "node-role.kubernetes.io/control-plane" operator: "Equal" value: "" effect: "NoSchedule" volumes: # This example deployment uses k8s secrests to store TLS secrets # You need to manually generate the cert/key pair, and, the accompanying secret # Expected filenames are "svc.crt" and "svc.key" - name: certs secret: secretName: cri-resmgr-webhook-secret --- apiVersion: v1 kind: Service metadata: name: cri-resmgr-webhook namespace: cri-resmgr spec: selector: app: cri-resmgr-webhook ports: - port: 443 targetPort: 8443 protocol: TCP ================================================ FILE: cmd/cri-resmgr-webhook/webhook.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package main import ( "crypto/tls" "fmt" "log" "net/http" ) type args struct { port int certFile string keyFile string } // Load server certificate and private key func loadTLS(certFile, keyFile string) *tls.Config { cert, err := tls.LoadX509KeyPair(certFile, keyFile) if err != nil { log.Fatalf("Failed to initialize TLS config: %v", err) } return &tls.Config{ Certificates: []tls.Certificate{cert}, } } // Run is the main entry point for the webhook server func Run(args args) error { // Attach handlers http.HandleFunc("/", handle) // Create and run HTTP server server := &http.Server{ Addr: fmt.Sprintf(":%d", args.port), TLSConfig: loadTLS(args.certFile, args.keyFile), } log.Printf("Listening on port %d", args.port) return server.ListenAndServeTLS("", "") } ================================================ FILE: demo/blockio/bb-scanner.yaml ================================================ # bb-scanner continuously calculates checksums of files found # under /scan. Output reveals added, deleted, renamed and modified # files together with timestamps. # # bb-scanner is configured as a low-priority activity: # 1. CPU usage is limited to 10 %. # 2. Disk/SSD bandwidth is limited by SlowReader configuration. # apiVersion: apps/v1 kind: DaemonSet metadata: name: bb-scanner labels: app: bb-scanner spec: selector: matchLabels: app: bb-scanner template: metadata: name: bb-scanner labels: app: bb-scanner annotations: blockioclass.cri-resource-manager.intel.com/pod: SlowReader spec: terminationGracePeriodSeconds: 1 containers: - image: busybox command: - sh - -c - while true; do find /scan -type f -print0 | xargs -0 md5sum | sort > curr.md5; date +%s >> /output/diffs.md5; diff -U1 prev.md5 curr.md5 >> /output/diffs.md5; cp curr.md5 /output/files.md5; mv curr.md5 prev.md5; done imagePullPolicy: IfNotPresent name: busybox resources: limits: cpu: 100m volumeMounts: - mountPath: /scan/usr-bin name: usr-bin readOnly: true - mountPath: /scan/usr-lib name: usr-lib readOnly: true - mountPath: /output name: output readOnly: false volumes: - name: usr-bin hostPath: path: /usr/bin type: DirectoryOrCreate - name: usr-lib hostPath: path: /usr/lib type: DirectoryOrCreate - name: output hostPath: path: /var/cache/bb-scanner type: DirectoryOrCreate restartPolicy: Always ================================================ FILE: demo/blockio/cri-resmgr-config.default.yaml ================================================ apiVersion: v1 kind: ConfigMap metadata: name: cri-resmgr-config.default namespace: kube-system data: policy: |+ Active: none logger: |+ Debug: blockio,cgroupblkio blockio: |+ Classes: SlowReader: - Devices: - /dev/vda ThrottleReadBps: 512k ================================================ FILE: demo/blockio/run.sh ================================================ #!/bin/bash DEMO_TITLE="CRI Resource Manager: Block I/O Demo" PV='pv -qL' SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" LIB_DIR=$SCRIPT_DIR/../lib BIN_DIR=${bindir-$(realpath "$SCRIPT_DIR/../../bin")} OUTPUT_DIR=${outdir-$SCRIPT_DIR/output} COMMAND_OUTPUT_DIR=$OUTPUT_DIR/commands # shellcheck disable=SC1091 # shellcheck source=../lib/command.bash source "$LIB_DIR/command.bash" # shellcheck disable=SC1091 # shellcheck source=../lib/host.bash source "$LIB_DIR/host.bash" # shellcheck disable=SC1091 # shellcheck source=../lib/vm.bash source "$LIB_DIR/vm.bash" usage() { echo "$DEMO_TITLE" echo "Usage: [VAR=VALUE] ./run.sh MODE" echo " MODE: \"play\" plays the demo." echo " \"record\" plays and records the demo." 
echo " \"test\" runs fast, reports pass or fail." echo " VARs:" echo " vm: govm virtual machine name." echo " The default is \"crirm-demo-blockio\"." echo " speed: Demo play speed." echo " The default is 10 (keypresses per second)." echo " cleanup: 0: leave VM running. (\"play\" mode default)" echo " 1: delete VM (\"test\" mode default)" echo " 2: stop VM, but do not delete it." echo " outdir: Save output under given directory." echo " The default is \"${SCRIPT_DIR}/output\"." echo " binsrc: Where to get cri-resmgr to the VM." echo " \"github\": go get and build in VM (\"play\" mode default)." echo " \"local\": copy from source tree bin/ (\"test\" mode default)" echo " (set bindir=/path/to/cri-resmgr* to override bin/)" } error() { (echo ""; echo "error: $1" ) >&2 exit 1 } out() { if [ -n "$PV" ]; then speed=${speed-10} echo "$1" | $PV "$speed" else echo "$1" fi echo "" } record() { clear out "Recording this screencast..." host-command "asciinema rec -t \"$DEMO_TITLE\" crirm-demo-blockio.cast -c \"./run.sh play\"" } screen-create-vm() { speed=60 out "### Running the demo in VM \"$vm\"." host-create-vm "$vm" vm-networking if [ -z "$VM_IP" ]; then error "creating VM failed" fi } screen-install-k8s() { speed=60 out "### Installing Kubernetes to the VM." vm-install-cri vm-install-k8s } screen-install-cri-resmgr() { speed=60 out "### Installing CRI Resource Manager to VM." vm-install-cri-resmgr } screen-launch-cri-resmgr() { policy=${policy-none} speed=60 out "### Launching cri-resmgr." vm-command "(echo \"policy:\"; echo \" Active: $policy\") > cri-resmgr.fallback.cfg" vm-command "cri-resmgr -relay-socket /var/run/cri-resmgr/cri-resmgr.sock -runtime-socket /var/run/containerd/containerd.sock -fallback-config cri-resmgr.fallback.cfg >cri-resmgr.output.txt 2>&1 &" } screen-create-singlenode-cluster() { speed=60 out "### Setting up single-node Kubernetes cluster." speed=60 out "### CRI Resource Manager + containerd will act as the container runtime." vm-create-singlenode-cluster } screen-launch-cri-resmgr-agent() { speed=60 out "### Launching cri-resmgr-agent." speed=60 out "### The agent will make cri-resmgr configurable with ConfigMaps." vm-command "NODE_NAME=\$(hostname) cri-resmgr-agent -kubeconfig \$HOME/.kube/config >cri-resmgr-agent.output.txt 2>&1 &" } screen-measure-io-speed() { process=$1 measuretime=2 vm-command "echo 3 > /proc/sys/vm/drop_caches" out "### Measuring $process read speed -- twice." cmd="pid=\$(ps -A | awk \"/$process/{print \\\$1}\"); [ -n \"\$pid\" ] && { echo \$(grep read_bytes /proc/\$pid/io; sleep $measuretime; grep read_bytes /proc/\$pid/io) | awk \"{print \\\"$process read speed: \\\"(\\\$4-\\\$2)/$measuretime/1024\\\" kBps\\\"}\"; }" speed=360 outcolor=10 vm-command "$cmd" sleep 1 speed=360 outcolor=10 vm-command "$cmd" } demo-blockio() { out "### Let the show begin!" out "### Configuring cri-resmgr: introduce a SlowReader block I/O class." host-command "scp cri-resmgr-config.default.yaml $VM_SSH_USER@$VM_IP:" vm-command "cat cri-resmgr-config.default.yaml" out "### Note: SlowReaders can read from each of the listed devices up to $(vm-command-q "awk '/ThrottleRead/{print \$2}' < cri-resmgr-config.default.yaml")Bps." vm-command "kubectl apply -f cri-resmgr-config.default.yaml" out "### Our test workload, bb-scanner, is annotated as a SlowReader." host-command "scp bb-scanner.yaml $VM_SSH_USER@$VM_IP:" vm-command "grep -A1 annotations: bb-scanner.yaml" out "### Flushing caches and deploying bb-scanner." 
vm-command "echo 3 > /proc/sys/vm/drop_caches" vm-command "kubectl create -f bb-scanner.yaml" out "### Now bb-scanner is running md5sum to all mounted directories, non-stop." vm-wait-process --timeout 60 md5sum screen-measure-io-speed md5sum out "### Reconfiguring cri-resmgr: set SlowReader read speed to 2 MBps." out "### This applies to all pods and containers in this block I/O class," out "### both new and already running, like our bb-scanner." vm-command "sed -i 's/ThrottleReadBps:.*/ThrottleReadBps: 2Mi/' cri-resmgr-config.default.yaml" vm-command "cat cri-resmgr-config.default.yaml" vm-command "kubectl apply -f cri-resmgr-config.default.yaml" # Give some time for new config to become effective and process # I/O to accelerate. sleep 2; screen-measure-io-speed md5sum out "### Thanks for watching!" out "### Cleaning up: deleting bb-scanner." vm-command "kubectl delete daemonset bb-scanner" } # Validate parameters mode=$1 distro=${distro:="ubuntu-20.04"} cri=${cri:="containerd"} vm=${vm:="blockio-$distro-$cri"} echo "vm is here: \"$vm\"" host-set-vm-config "$vm" "$distro" "$cri" if [ "$mode" == "play" ]; then speed=${speed-10} cleanup=${cleanup-0} binsrc=${binsrc-github} elif [ "$mode" == "test" ]; then PV= cleanup=${cleanup-1} binsrc=${binsrc-local} elif [ "$mode" == "record" ]; then record else usage error "missing valid MODE" exit 1 fi # Prepare for test/demo mkdir -p "$OUTPUT_DIR" mkdir -p "$COMMAND_OUTPUT_DIR" rm -f "$COMMAND_OUTPUT_DIR"/0* ( echo x > "$OUTPUT_DIR"/x && rm -f "$OUTPUT_DIR"/x ) || { error "output directory outdir=\"$OUTPUT_DIR\" is not writable" } if [ "$binsrc" == "local" ]; then [ -f "${BIN_DIR}/cri-resmgr" ] || error "missing \"${BIN_DIR}/cri-resmgr\"" [ -f "${BIN_DIR}/cri-resmgr-agent" ] || error "missing \"${BIN_DIR}/cri-resmgr-agent\"" fi if [ -z "$VM_IP" ] || [ -z "$VM_SSH_USER" ] || [ -z "$VM_NAME" ]; then screen-create-vm fi if ! vm-command-q "dpkg -l | grep -q kubelet"; then screen-install-k8s fi if ! vm-command-q "[ -f /usr/bin/cri-resmgr ] || [ -f /usr/local/bin/cri-resmgr ]"; then screen-install-cri-resmgr fi # start cri-resmgr if not already running if ! vm-command-q "pidof cri-resmgr" >/dev/null; then screen-launch-cri-resmgr fi # create kubernetes cluster or wait that it is online if vm-command-q "[ ! -f /var/lib/kubelet/config.yaml ]"; then screen-create-singlenode-cluster else # wait for kube-apiserver to launch (may be down if the VM was just booted) vm-wait-process kube-apiserver fi # start cri-resmgr-agent if not already running if ! vm-command-q "pidof cri-resmgr-agent >/dev/null"; then screen-launch-cri-resmgr-agent fi # Run test/demo demo-blockio # Cleanup if [ "$cleanup" == "0" ]; then echo "The VM, Kubernetes and cri-resmgr are left running. 
Next steps:" vm-print-usage elif [ "$cleanup" == "1" ]; then host-stop-vm $vm host-delete-vm $vm elif [ "$cleanup" == "2" ]; then host-stop-vm $vm fi # Summarize results SUMMARY_FILE="$OUTPUT_DIR/summary.txt" echo -n "" > "$SUMMARY_FILE" || error "cannot write summary to \"$SUMMARY_FILE\"" first_speed="$(grep "^md5sum read speed:" "$COMMAND_OUTPUT_DIR"/0* | head -n 1 | awk '{print $4}')" last_speed="$(grep "^md5sum read speed:" "$COMMAND_OUTPUT_DIR"/0* | tail -n 1 | awk '{print $4}')" echo "First md5sum read speed (512 kBps throttling): $first_speed kBps" >> "$SUMMARY_FILE" echo "Last md5sum read speed (2 MBps throttling): $last_speed kBps" >> "$SUMMARY_FILE" # Declare verdict in test mode exit_status=0 if [ "$mode" == "test" ]; then min_first=100 max_first=600 min_last=1500 max_last=2500 [[ "$first_speed" -gt "$min_first" ]] || exit_status=1 [[ "$first_speed" -lt "$max_first" ]] || exit_status=1 [[ "$last_speed" -gt "$min_last" ]] || exit_status=1 [[ "$last_speed" -lt "$max_last" ]] || exit_status=1 if [ "$exit_status" == "1" ]; then echo "Error: speeds outside acceptable ranges ($min_first..$max_first kBps and $min_last..$max_last kBps)." >> "$SUMMARY_FILE" echo "Test verdict: FAIL" >> "$SUMMARY_FILE" else echo "Speeds within acceptable ranges ($min_first..$max_first kBps and $min_last..$max_last kBps)." >> "$SUMMARY_FILE" echo "Test verdict: PASS" >> "$SUMMARY_FILE" fi echo "" cat "$SUMMARY_FILE" fi exit $exit_status ================================================ FILE: demo/lib/command.bash ================================================ # Hooks for displaying and logging how shell commands (local and # remote) are executed, and handling their output and exit status. # # Example in a Bash script, run-on-mytargethost function: # command-start mytargethost "ls -la" # ssh mytargethost $COMMAND 2>&1 | command-handle-output # command-end ${PIPESTATUS[0]} # [ "$COMMAND_STATUS" == "0" ] || command-error "non-zero exit status" # # command-start and command-end set environment variables: # COMMAND, COMMAND_STATUS, COMMAND_OUTPUT export LC_NUMERIC=C # These exports force ssh-* to fail instead of prompting for a passphrase. 
export DISPLAY=bogus-none export SSH_ASKPASS=/bin/false SSH_KEY="${HOME}/.ssh/id_rsa" SSH_OPTS="-o StrictHostKeyChecking=No" SSH="ssh $SSH_OPTS" SCP="scp $SSH_OPTS" epochrealtime() { [ -n "$EPOCHREALTIME" ] && echo "$EPOCHREALTIME" || echo "$SECONDS" } COMMAND_COUNTER=0 command_init_time=$(epochrealtime) command-start() { # example: command-start vm prompt "mkdir $MYDIR" COMMAND_TARGET="$1" COMMAND_PROMPT="$2" COMMAND="$3" COMMAND_STATUS="" COMMAND_OUTPUT="" COMMAND_COUNTER=$(( COMMAND_COUNTER + 1 )) local command_start_time=$(epochrealtime) local time_since_start=$(echo "$command_start_time - $command_init_time" | bc) COMMAND_OUT_FILE="$COMMAND_OUTPUT_DIR/$(printf %04g $COMMAND_COUNTER)-$COMMAND_TARGET" echo "# start time: $time_since_start" > "$COMMAND_OUT_FILE" || { echo "cannot write command output to file \"$COMMAND_OUT_FILE\"" exit 1 } echo "# command: $COMMAND" >> "$COMMAND_OUT_FILE" echo -e -n "${COMMAND_PROMPT}" if [ -n "$PV" ]; then echo "$COMMAND" | $PV $speed else echo "$COMMAND" fi if [ -n "$outcolor" ]; then COMMAND_OUTSTART="\e[38;5;${outcolor}m" COMMAND_OUTEND="\e[0m" else COMMAND_OUTSTART="" COMMAND_OUTEND="" fi } command-handle-output() { # example: sh -c $command | command-handle-output tee "$COMMAND_OUT_FILE.tmp" | ( echo -e -n "$COMMAND_OUTSTART"; cat; echo -e -n "$COMMAND_OUTEND" ) cat "$COMMAND_OUT_FILE.tmp" >> "$COMMAND_OUT_FILE" if [ -n "$PV" ]; then echo | $PV $speed fi } command-runs-in-bg() { echo "(runs in background)" echo "" } command-end() { # example: command-end EXIT_STATUS COMMAND_STATUS=$1 local command_end_time=$(epochrealtime) local time_since_start=$(echo "$command_end_time - $command_init_time" | bc) ( echo "# exit status: $COMMAND_STATUS"; echo "# end time: $time_since_start" ) >> "$COMMAND_OUT_FILE" COMMAND_OUTPUT=$(<"$COMMAND_OUT_FILE.tmp") rm -f "$COMMAND_OUT_FILE.tmp" } command-error() { # script API # Usage: command-error MESSAGE # # Print executed command, observed output, exit status and MESSAGE. # Stop script execution. ( echo "command: $COMMAND"; echo "output: $COMMAND_OUTPUT"; echo "exit status: $COMMAND_STATUS"; echo "error: $1" ) >&2 command-exit-if-not-interactive } command-exit-if-not-interactive() { if [ -z "$INTERACTIVE_MODE" ] || [ "$INTERACTIVE_MODE" == "0" ]; then exit ${1:-1} fi } command-debug-log() { if [ "$(type -t -- debug-log)" = "function" ]; then debug-log "$@" return 0 else if [ -n "$OUTPUT_DIR" ] && [ -d "$OUTPUT_DIR" ]; then touch "$OUTPUT_DIR"/debug-log echo "$@" >> "$OUTPUT_DIR"/debug-log return 0 fi fi echo "$@" 1>&2 } ================================================ FILE: demo/lib/distro.bash ================================================ # shellcheck disable=SC2120 GO_URLDIR=https://golang.org/dl GO_VERSION=1.24.1 GOLANG_URL=$GO_URLDIR/go$GO_VERSION.linux-amd64.tar.gz CRICTL_VERSION=${CRICTL_VERSION:-"v1.25.0"} MINIKUBE_VERSION=${MINIKUBE_VERSION:-v1.27.0} ########################################################################### # # distro-agnostic interface # # To add a new distro, implement distro-specific versions of these # functions. You can omit implementing those which already resolve # to an existing function which works for the new distro. # # To add a new API function, add a new bridging resolution entry below.
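# For example (a sketch): to special-case golang installation on Fedora VMs,
# defining
#   fedora-install-golang() { distro-install-pkg golang; }
# should be enough; distro-install-golang() below resolves to it through the
# ${VM_DISTRO%%-*}-<apifn> candidate in distro-resolve-fn().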
# distro-image-url() { distro-resolve "$@"; } distro-ssh-user() { distro-resolve "$@"; } distro-pkg-type() { distro-resolve "$@"; } distro-install-repo-key() { distro-resolve "$@"; } distro-install-repo() { distro-resolve "$@"; } distro-refresh-pkg-db() { distro-resolve "$@"; } distro-install-pkg() { distro-resolve "$@"; } distro-install-pkg-local() { distro-resolve "$@"; } distro-remove-pkg() { distro-resolve "$@"; } distro-setup-proxies() { distro-resolve "$@"; } distro-setup-oneshot() { distro-resolve "$@"; } distro-install-utils() { distro-resolve "$@"; } distro-install-golang() { distro-resolve "$@"; } distro-install-runc() { distro-resolve "$@"; } distro-install-containerd() { distro-resolve "$@"; } distro-config-containerd() { distro-resolve "$@"; } distro-restart-containerd() { distro-resolve "$@"; } distro-install-crio() { distro-resolve "$@"; } distro-config-crio() { distro-resolve "$@"; } distro-restart-crio() { distro-resolve "$@"; } distro-install-crictl() { distro-resolve "$@"; } distro-install-cri-dockerd(){ distro-resolve "$@"; } distro-install-minikube() { distro-resolve "$@"; } distro-install-k8s() { distro-resolve "$@"; } distro-install-kernel-dev() { distro-resolve "$@"; } distro-k8s-cni() { distro-resolve "$@"; } distro-k8s-cni-subnet() { distro-resolve "$@"; } distro-set-kernel-cmdline() { distro-resolve "$@"; } distro-govm-env() { distro-resolve "$@"; } distro-bootstrap-commands() { distro-resolve "$@"; } distro-env-file-dir() { distro-resolve "$@"; } ########################################################################### # distro-specific function resolution distro-resolve() { local apifn="${FUNCNAME[1]}" fn prefn postfn # shellcheck disable=SC2086 { fn="$(distro-resolve-fn $apifn)" prefn="$(distro-resolve-fn $apifn-pre)" postfn="$(distro-resolve-fn $apifn-post)" command-debug-log "$VM_DISTRO/${FUNCNAME[1]}: pre: ${prefn:--}, fn: ${fn:--}, post: ${postfn:--}" } [ -n "$prefn" ] && { $prefn "$@" || return $?; } $fn "$@" || return $? [ -n "$postfn" ] && { $postfn "$@" || return $?; } return 0 } distro-resolve-fn() { # We try resolving distro-agnostic implementations by looping through # a list of candidate function names in decreasing order of precedence # and returning the first one found. The candidate list has # version-exact and unversioned distro-specific functions and a set of # fallbacks based on known distro, derivative, and package type relations. # # For normal functions the last fallback is 'distro-unresolved' which # prints and returns an error. For pre- and post-functions there is no # similar setup. IOW, unresolved normal distro functions fail while # unresolved pre- and post-functions get ignored (in distro-resolve). local apifn="$1" candidates fn case $apifn in distro-*) apifn="${apifn#distro-}";; *) error "internal error: can't resolve non-API function $apifn";; esac candidates="${VM_DISTRO/./_}-$apifn ${VM_DISTRO%%-*}-$apifn" case $VM_DISTRO in ubuntu*) candidates="$candidates debian-$apifn";; fedora*) candidates="$candidates rpm-$apifn";; *suse*) candidates="$candidates rpm-$apifn";; sles*) candidates="$candidates opensuse-$apifn rpm-$apifn";; esac case $apifn in *-pre|*-post) ;; *) candidates="$candidates default-$apifn distro-unresolved";; esac for fn in $candidates; do if [ "$(type -t -- "$fn")" = "function" ]; then echo "$fn" return 0 fi done } # distro-unresolved terminates failed API function resolution with an error.
distro-unresolved() { local apifn="${FUNCNAME[2]}" command-error "internal error: can't resolve \"$apifn\" for \"$VM_DISTRO\"" return 1 } ########################################################################### # # Ubuntu, Debian # ubuntu-18_04-image-url() { echo "https://cloud-images.ubuntu.com/bionic/current/bionic-server-cloudimg-amd64.img" } ubuntu-20_04-image-url() { echo "https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img" } ubuntu-22_04-image-url() { echo "https://cloud-images.ubuntu.com/releases/jammy/release/ubuntu-22.04-server-cloudimg-amd64.img" } ubuntu-24_04-image-url() { echo "https://cloud-images.ubuntu.com/releases/noble/release/ubuntu-24.04-server-cloudimg-amd64.img" } debian-11-image-url() { echo "https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-generic-amd64.qcow2" } debian-12-image-url() { echo "https://cloud.debian.org/images/cloud/bookworm/latest/debian-12-generic-amd64.qcow2" } debian-sid-image-url() { echo "https://cloud.debian.org/images/cloud/sid/daily/latest/debian-sid-generic-amd64-daily.qcow2" } ubuntu-download-kernel() { # Usage: # ubuntu-download-kernel list # ubuntu-download-kernel VERSION # # List or download Ubuntu kernel team kernels. # # Example: # ubuntu-download-kernel list | grep 5.9 # ubuntu-download-kernel 5.9-rc8 # vm-command "dpkg -i kernels/linux*rc8*deb" # vm-reboot # vm-command "uname -a" local version=$1 [ -n "$version" ] || error "missing kernel version to install" if [ "$version" == "list" ]; then wget -q -O- https://kernel.ubuntu.com/~kernel-ppa/mainline/ | grep -E '^.*href="v[5-9]' | sed 's|^.*href="v\([0-9][^"]*\)/".*$|\1|g' return 0 fi vm-command "mkdir -p kernels; rm -f kernels/linux*$version*deb; for deb in \$(wget -q -O- https://kernel.ubuntu.com/~kernel-ppa/mainline/v$version/ | awk -F'\"' '/amd64.*deb/{print \$2}' | grep -v -E 'headers|lowlatency'); do ( cd kernels; wget -q https://kernel.ubuntu.com/~kernel-ppa/mainline/v$version/\$deb ); done; echo; echo 'Downloaded kernel packages:'; du -h kernels/*.deb" || command-error "downloading kernel $version failed" } ubuntu-ssh-user() { echo ubuntu } debian-ssh-user() { echo debian } ubuntu-apparmor-disable-runc() { vm-command "[ -f /etc/apparmor.d/runc ] && ln -s /etc/apparmor.d/runc /etc/apparmor.d/disable/ && apparmor_parser -R /etc/apparmor.d/runc" } ubuntu-config-containerd() { ubuntu-apparmor-disable-runc default-config-containerd } ubuntu-config-crio() { ubuntu-apparmor-disable-runc default-config-crio } debian-pkg-type() { echo deb } debian-install-repo-key() { local key # apt-key needs gnupg2, that might not be available by default vm-command "command -v gpg >/dev/null 2>&1" || { vm-command "apt-get update && apt-get install -y gnupg2" } for key in "$@"; do vm-command "curl -L -s $key | apt-key add -" || command-error "failed to install repo key $key" done } debian-install-repo() { if [ $# = 1 ]; then # shellcheck disable=SC2086,SC2048 set -- $* fi vm-command "echo $* > /etc/apt/sources.list.d/$3-$4.list && apt-get update" || command-error "failed to install apt repository $*" } debian-refresh-pkg-db() { vm-command "apt-get update" || command-error "failed to refresh apt package DB" } debian-install-pkg() { # dpkg configure may ask "The default action is to keep your # current version", for instance when a test has added # /etc/containerd/config.toml and then apt-get installs # containerd. 'yes ""' will continue with the default answer (N: # keep existing) in this case. Without 'yes' installation fails. 
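# (In effect the install below runs as: yes "" | apt-get install ..., so
# every interactive dpkg prompt receives an empty line, i.e. its default
# answer.)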
# Add apt-get option "--reinstall" if any environment variable # reinstall_=1 local pkg local opts="" for pkg in "$@"; do if [ "$(eval echo \$reinstall_$pkg)" == "1" ]; then opts="$opts --reinstall" break fi done vm-command "yes \"\" | DEBIAN_FRONTEND=noninteractive apt-get install $opts -y --allow-downgrades $*" || command-error "failed to install $*" } debian-remove-pkg() { vm-command "for pkg in $*; do dpkg -l \$pkg >& /dev/null && apt remove -y --purge \$pkg || :; done" || command-error "failed to remove package(s) $*" } debian-install-pkg-local() { local force="" if [ "$1" == "--force" ]; then force="--force-all" shift fi vm-command "dpkg -i $force $*" || command-error "failed to install local package(s)" } debian-install-golang() { debian-refresh-pkg-db debian-install-pkg golang git-core } debian-install-kernel-dev() { distro-refresh-pkg-db distro-install-pkg git-core build-essential linux-source bc kmod cpio flex libncurses5-dev libelf-dev libssl-dev dwarves bison vm-command "[ -d linux ] || git clone https://github.com/torvalds/linux" vm-command '[ -f linux/.config ] || cp -v /boot/config-$(uname -r) linux/.config' echo "Kernel ready for patching and configuring." echo "build: cd linux && make bindeb-pkg" echo "install: dpkg -i linux-*.deb" } debian-11-install-containerd-pre() { debian-install-repo-key https://download.docker.com/linux/debian/gpg debian-install-repo "deb https://download.docker.com/linux/debian bullseye stable" } debian-11-install-containerd() { vm-command-q "[ -f /usr/bin/containerd ]" || { distro-install-pkg containerd.io } } debian-sid-config-containerd-post() { vm-command "sed -e 's|bin_dir = \"/usr/lib/cni\"|bin_dir = \"/opt/cni/bin\"|g' -i /etc/containerd/config.toml" } debian-install-cri-dockerd-pre() { debian-refresh-pkg-db debian-install-pkg docker.io conntrack vm-command "addgroup $(vm-ssh-user) docker" distro-install-golang } debian-install-crio-pre() { debian-refresh-pkg-db debian-install-pkg libgpgme11 conmon runc containernetworking-plugins conntrack || true } debian-install-k8s() { local _k8s=$k8s debian-refresh-pkg-db debian-install-pkg gpg apt-transport-https curl if [[ -z "$k8s" ]] || [[ "$k8s" == "latest" ]]; then vm-command "curl -s https://api.github.com/repos/kubernetes/kubernetes/releases/latest | grep tag_name | sed -e 's/.*v\([0-9]\+\.[0-9]\+\).*/\1/g'" _k8s=$COMMAND_OUTPUT fi echo "installing Kubernetes v${_k8s}" vm-command "curl -fsSL https://pkgs.k8s.io/core:/stable:/v${_k8s}/deb/Release.key -o /tmp/Release.key" || \ command-error "failed to download Kubernetes v${_k8s} key" if vm-command "command -v apt-key >/dev/null"; then vm-command "sudo apt-key add /tmp/Release.key" vm-command "echo 'deb https://pkgs.k8s.io/core:/stable:/v${_k8s}/deb/ /' > /etc/apt/sources.list.d/kubernetes.list && apt update" || \ command-error "failed to add Kubernetes v${_k8s} repo" else vm-command "sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg /tmp/Release.key" vm-command "echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v${_k8s}/deb/ /' > /etc/apt/sources.list.d/kubernetes.list && apt update" || \ command-error "failed to add Kubernetes v${_k8s} repo" fi debian-install-pkg "kubeadm" "kubelet" "kubectl" } debian-set-kernel-cmdline() { local e2e_defaults="$*" vm-command "echo 'GRUB_CMDLINE_LINUX_DEFAULT=\"\${GRUB_CMDLINE_LINUX_DEFAULT} ${e2e_defaults}\"' > /etc/default/grub.d/60-e2e-defaults.cfg" || { command-error "writing new command line parameters failed" } vm-command 
"update-grub" || { command-error "updating grub failed" } } debian-env-file-dir() { echo "/etc/default" } debian-sid-govm-env() { echo "DISABLE_VGA=N" } ########################################################################### # # Generic Fedora # YUM_INSTALL="yum install --disableplugin=fastestmirror -y" YUM_REMOVE="yum remove --disableplugin=fastestmirror -y" fedora-image-url() { fedora-40-image-url } fedora-40-image-url() { echo "https://mirrors.xtom.de/fedora/releases/40/Cloud/x86_64/images/Fedora-Cloud-Base-Generic.x86_64-40-1.14.qcow2" } fedora-ssh-user() { echo fedora } fedora-install-utils() { distro-install-pkg /usr/bin/pidof } fedora-install-repo() { distro-install-pkg dnf-plugins-core vm-command "dnf config-manager --add-repo $*" || command-error "failed to install DNF repository $*" } fedora-install-pkg() { local pkg local do_reinstall=0 for pkg in "$@"; do if [ "$(eval echo \$reinstall_$pkg)" == "1" ]; then do_reinstall=1 break fi done vm-command "dnf install -y $*" || command-error "failed to install $*" # When requesting reinstallation, detect which packages were # already installed and reinstall those. # (Unlike apt and zypper, dnf offers no option for reinstalling # existing and installing new packages on the same run.) if [ "$do_reinstall" == "1" ]; then local reinstall_pkgs reinstall_pkgs=$(awk -F '[ -]' -v ORS=" " '/Package .* already installed/{print $2}' <<< "$COMMAND_OUTPUT") if [ -n "$reinstall_pkgs" ]; then vm-command "dnf reinstall -y $reinstall_pkgs" fi fi } fedora-remove-pkg() { vm-command "dnf remove -y $*" || command-error "failed to remove package(s) $*" } fedora-install-pkg-local() { local force="" if [ "$1" == "--force" ]; then force="--nodeps --force" shift fi vm-command "rpm -Uvh $force $*" || command-error "failed to install local package(s)" } fedora-install-kernel-dev() { fedora-install-pkg fedpkg fedora-packager rpmdevtools ncurses-devel pesign grubby git-core vm-command "(set -x -e echo root >> /etc/pesign/users echo $(vm-ssh-user) >> /etc/pesign/users /usr/libexec/pesign/pesign-authorize fedpkg clone -a kernel cd kernel git fetch git switch ${VM_DISTRO/edora-/} # example: git switch f40 in fedora-40 sed -i 's/# define buildid .local/%define buildid .e2e/g' kernel.spec )" || { echo "installing kernel development environment failed" return 1 } echo "Kernel ready for patching and configuring." 
echo "build: cd kernel && dnf builddep -y kernel.spec && fedpkg local" echo "install: cd kernel/x86_64 && dnf install -y --nogpgcheck kernel-{core-,modules-,}[5-9]*.e2e.fc*.x86_64.rpm" } fedora-install-golang() { fedora-install-pkg wget tar gzip git-core from-tarball-install-golang } fedora-install-crio-version() { distro-install-pkg runc conmon vm-command "ln -sf /usr/lib64/libdevmapper.so.1.02 /usr/lib64/libdevmapper.so.1.02.1" || true if [ -z "$crio_src" ]; then vm-command "dnf -y module enable cri-o:${crio_version:-$1}" fi } fedora-install-containernetworking-plugins() { distro-install-pkg containernetworking-plugins vm-command "[ -x /opt/cni/bin/loopback ] || { mkdir -p /opt/cni/bin; mount --bind /usr/libexec/cni /opt/cni/bin; }" vm-command "grep /usr/libexec/cni /etc/fstab || echo /usr/libexec/cni /opt/cni/bin none defaults,bind,nofail 0 0 >> /etc/fstab" } fedora-install-cri-dockerd-pre() { distro-install-pkg docker git-core conntrack vm-command "systemctl enable docker --now; usermod --append --groups docker $(vm-ssh-user)" distro-install-golang } fedora-install-crio-pre() { fedora-install-crio-version 1.21 fedora-install-containernetworking-plugins } fedora-install-crio() { if [ -n "$crio_src" ]; then default-install-crio else distro-install-pkg cri-o vm-command "systemctl enable --now crio" || command-error "failed to enable cri-o" fi } fedora-install-containerd-pre() { distro-install-repo https://download.docker.com/linux/fedora/docker-ce.repo fedora-install-containernetworking-plugins } fedora-install-containerd-post() { vm-command "systemctl enable containerd" } fedora-install-k8s() { _k8s=$k8s if [[ -z "$_k8s" ]] || [[ "$_k8s" == "latest" ]]; then vm-command "curl -s https://api.github.com/repos/kubernetes/kubernetes/releases/latest | grep tag_name | sed -e 's/.*v\([0-9]\+\.[0-9]\+\).*/\1/g'" _k8s=$COMMAND_OUTPUT fi local repo="/etc/yum.repos.d/kubernetes.repo" cat < /etc/sudoers.d/10-norequiretty setenforce 0 sed -E -i 's/^SELINUX=.*$/SELINUX=permissive/' /etc/selinux/config echo PATH='\$PATH:/usr/local/bin:/usr/local/sbin' > /etc/profile.d/usr-local-path.sh EOF if [[ "${cgroups:-}" != "v2" ]]; then cat <> /etc/default/grub" || { command-error "writing new command line parameters failed" } vm-command "grub2-mkconfig -o /boot/grub2/grub.cfg" || { command-error "updating grub failed" } } ########################################################################### # # OpenSUSE and SLES # ZYPPER="zypper --non-interactive --no-gpg-checks" sles-image-url() { echo "/DOWNLOAD-MANUALLY-TO-HOME/vms/images/SLES15-SP3-JeOS.x86_64-15.3-OpenStack-Cloud-GM.qcow2" } sles-ssh-user() { echo "sles" } sles-install-utils() { local sles_registered=0 local sles_version="" vm-command "SUSEConnect -s" || { command-error "cannot run SUSEConnect" } # Parse registration status and SLES version. if [ "$(jq '.[] | select(.identifier == "SLES") | .status' <<< $COMMAND_OUTPUT)" == '"Registered"' ]; then sles_registered=1 fi sles_version="$(jq -r '.[] | select(.identifier == "SLES") | .version' <<< $COMMAND_OUTPUT)" if [ -z "$sles_version" ]; then command-error "cannot read SLES version information from SUSEConnect -s output" fi # Try automatic registration if registration code is provided. if [ "$sles_registered" == 0 ] && [ -n "$VM_SLES_REGCODE" ]; then vm-command "SUSEConnect -r $VM_SLES_REGCODE" || { echo "ERROR:" echo "ERROR: Registering to SUSE Customer Center failed." echo "ERROR: - Verify VM_SLES_REGCODE and try again." 
echo "ERROR: - Unset VM_SLES_REGCODE to skip registration (use unsupported repos)." echo "ERROR:" exit 1 } sles_registered=1 fi # Add correct repo, depending on registration status. if [ "$sles_registered" == 0 ]; then echo "WARNING:" echo "WARNING: Unregistered SUSE Linux Enterprise Server." echo "WARNING: VM_SLES_REGCODE is not set, automatic registration skipped." echo "WARNING: Fallback to use OpenSUSE OSS repository." echo "WARNING:" sleep "${warning_delay:-0}" vm-command-q "$ZYPPER lr openSUSE-Oss >/dev/null" || { distro-install-repo "http://download.opensuse.org/distribution/leap/${sles_version}/repo/oss/" openSUSE-Oss } else vm-command-q "$ZYPPER lr | grep -q SUSE-PackageHub" || { vm-command "SUSEConnect -p PackageHub/${sles_version}/x86_64" } fi distro-install-pkg sysvinit-tools psmisc } opensuse-image-url() { opensuse-15_6-image-url } opensuse-15_6-image-url() { echo "https://download.opensuse.org/pub/opensuse/distribution/leap/15.6/appliances/openSUSE-Leap-15.6-Minimal-VM.x86_64-Cloud.qcow2" } opensuse-tumbleweed-image-url() { echo "https://ftp.uni-erlangen.de/opensuse/tumbleweed/appliances/openSUSE-MicroOS.x86_64-ContainerHost-OpenStack-Cloud.qcow2" } opensuse-install-utils() { distro-install-pkg psmisc sysvinit-tools } opensuse-ssh-user() { echo "opensuse" } opensuse-pkg-type() { echo "rpm" } opensuse-set-kernel-cmdline() { local e2e_defaults="$*" vm-command "mkdir -p /etc/default; touch /etc/default/grub; sed -i '/e2e:opensuse-set-kernel-cmdline/d' /etc/default/grub" vm-command "echo 'GRUB_CMDLINE_LINUX_DEFAULT=\"\${GRUB_CMDLINE_LINUX_DEFAULT} ${e2e_defaults}\" # by e2e:opensuse-set-kernel-cmdline' >> /etc/default/grub" || { command-error "writing new command line parameters failed" } vm-command "grub2-mkconfig -o /boot/grub2/grub.cfg" || { command-error "updating grub failed" } } opensuse-setup-oneshot() { # Remove bad version of containerd if it is already installed, # otherwise valid version of the package will not be installed. vm-command "rpm -q containerd && ( zypper info containerd | awk '/Repository/{print $3}' | grep -v Virtualization ) && echo Removing wrong containerd version && zypper --non-interactive rm containerd" } opensuse-install-repo() { opensuse-wait-for-zypper vm-command "$ZYPPER addrepo $* && $ZYPPER refresh" || command-error "failed to add zypper repository $*" } opensuse-refresh-pkg-db() { opensuse-wait-for-zypper vm-command "$ZYPPER refresh" || command-error "failed to refresh zypper package DB" } opensuse-install-pkg() { opensuse-wait-for-zypper # Add zypper option "--force" if environment variable reinstall_=1 local pkg local opts="" for pkg in "$@"; do if [ "$(eval echo \$reinstall_$pkg)" == "1" ]; then opts="$opts --force" break fi done # In OpenSUSE 15.2 zypper exits with status 106 if already installed, # in 15.3 the exit status is 0. Do not consider "already installed" # as an error. vm-command "$ZYPPER install $opts $*" || [ "$COMMAND_STATUS" == "106" ] || command-error "failed to install $*" } opensuse-install-pkg-local() { opensuse-wait-for-zypper local force="" if [ "$1" == "--force" ]; then force="--nodeps --force" shift fi vm-command "rpm -Uvh $force $*" || command-error "failed to install local package(s)" } opensuse-remove-pkg() { vm-command 'for i in $*; do rpm -q --quiet $i || continue; $ZYPPER remove $i || exit 1; done' || command-error "failed to remove package(s) $*" } opensuse-install-golang() { distro-install-pkg wget tar gzip git-core from-tarball-install-golang } opensuse-wait-for-zypper() { vm-run-until --timeout 5 '( ! 
pgrep zypper >/dev/null ) || ( pkill -9 zypper; sleep 1; exit 1 )' || error "Failed to stop zypper running in the background" } opensuse-install-k8s() { vm-command "( lsmod | grep -q br_netfilter ) || { echo br_netfilter > /etc/modules-load.d/50-br_netfilter.conf; modprobe br_netfilter; }" vm-command "echo 1 > /proc/sys/net/ipv4/ip_forward" vm-command "zypper ls" if ! grep -q snappy <<< "$COMMAND_OUTPUT"; then distro-install-repo "http://download.opensuse.org/repositories/system:/snappy/openSUSE_Leap_15.6 snappy" distro-refresh-pkg-db fi distro-install-pkg "snapd apparmor-profiles socat ebtables conntrackd iptables ethtool cni-plugins" distro-install-crictl vm-command "mkdir -p /opt/cni && ln -fs /usr/lib/cni/ -T /opt/cni/bin" vm-command "systemctl enable --now snapd" vm-command "snap wait system seed.loaded" for kubepart in kubelet kubectl kubeadm; do local snapcmd=install local k8sverparam if vm-command-q "snap info $kubepart | grep -q tracking"; then # $kubepart is already installed, either refresh or reinstall it. if [ "$(eval echo \$reinstall_$kubepart)" == "1" ]; then # Reinstalling $kubepart requested. # snap has no option for direct reinstalling, # so the package needs to be removed first. vm-command "snap remove $kubepart" snapcmd=install else snapcmd=refresh fi fi # Specify snap channel if user has requested a specific k8s version. if [[ "$k8s" == *.*.* ]]; then echo "WARNING: cannot snap install k8s=X.Y.Z, installing latest X.Y" k8sverparam="--channel ${k8s%.*}/edge" elif [[ "$k8s" == *.* ]]; then k8sverparam="--channel ${k8s}/edge" elif [[ -z "$k8s" ]]; then k8sverparam="" else error "invalid k8s version ${k8s}, expected k8s=X.Y" fi vm-command "snap $snapcmd $k8sverparam $kubepart --classic" done # Manage kubelet with systemd rather than snap vm-command "snap stop kubelet" cat < /etc/profile.d/linux_git.sh" } opensuse-bootstrap-commands-pre() { cat <> /etc/modules-load.d/k8s.conf || : modprobe nf-tables-bridge && echo nf-tables-bridge >> /etc/modules-load.d/k8s.conf || : modprobe br_netfilter && echo br_netfilter >> /etc/modules-load.d/k8s.conf || : touch /etc/sysctl.d/k8s.conf echo "net.bridge.bridge-nf-call-ip6tables = 1" >> /etc/sysctl.d/k8s.conf echo "net.bridge.bridge-nf-call-iptables = 1" >> /etc/sysctl.d/k8s.conf echo "net.ipv4.ip_forward = 1" >> /etc/sysctl.d/k8s.conf # rp_filter (partially) mitigates DDOS attacks with spoofed IP addresses # by dropping packages with non-routable (unanswerable) source addresses. # However, rp_filter > 0 breaks cilium networking. Make sure it's disabled. echo "net.ipv4.conf.*.rp_filter = 0" >> /etc/sysctl.d/k8s.conf /sbin/sysctl -p /etc/sysctl.d/k8s.conf || : EOF } default-setup-proxies() { # Notes: # We blindly assume that upper- vs. lower-case env vars are identical. 
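# (That is, http_proxy and HTTP_PROXY are assumed to hold the same value,
# so only the lower-case variants are inspected here.)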
# shellcheck disable=SC2154 if [ -z "$http_proxy$https_proxy$ftp_proxy$no_proxy" ]; then return 0 fi if vm-command-q "grep -q \"http_proxy=$http_proxy\" /etc/profile.d/proxy.sh && \ grep -q \"https_proxy=$https_proxy\" /etc/profile.d/proxy.sh && \ grep -q \"ftp_proxy=$ftp_proxy\" /etc/profile.d/proxy.sh && \ grep -q \"no_proxy=$no_proxy\" /etc/profile.d/proxy.sh" 2>/dev/null; then # No changes in proxy configuration return 0 fi local file scope="" append="--append" hn ext_no_proxy hn="$(vm-command-q hostname)" local master_node_ip_comma="" if [ -n "$k8smaster" ]; then local master_user_ip master_user_ip="$(vm-ssh-user-ip $k8smaster)" master_node_ip_comma=${master_user_ip/*@}, fi ext_no_proxy="$master_node_ip_comma$VM_IP,10.0.0.0/8,$CNI_SUBNET,$hn,.svc,.internal,192.168.0.0/16" for file in /etc/environment /etc/profile.d/proxy.sh; do cat < /etc/containerd/config.toml" fi vm-sed-file /etc/containerd/config.toml 's/^.*disabled_plugins *= *.*$/disabled_plugins = []/' if vm-command-q "containerd config dump | grep -v -q SystemdCgroup"; then vm-command "containerd config dump > /etc/containerd/config.toml" fi vm-sed-file /etc/containerd/config.toml 's/SystemdCgroup = false/SystemdCgroup = true/g' } default-restart-containerd() { vm-command "systemctl daemon-reload && systemctl restart containerd" || command-error "failed to restart containerd systemd service" } default-install-crio() { [ -n "$crio_src" ] || error "crio install error: crio_src is not set" [ -x "$crio_src/bin/crio" ] || error "crio install error: file not found $crio_src/bin/crio" for f in crio crio-status pinns; do vm-put-file "$crio_src/bin/$f" "/usr/bin/$f" done cat < /etc/systemd/system/crio.service.d/path.conf; systemctl daemon-reload" } default-config-crio() { vm-command "mkdir -p /etc/containers" echo '{"default": [{"type":"insecureAcceptAnything"}]}' | vm-pipe-to-file /etc/containers/policy.json cat </dev/null && rm go.tgz" && \ vm-command "echo 'PATH=/usr/local/go/bin:\$PATH' > /etc/profile.d/go.sh" && \ vm-command "echo \* installed \$(go version)" } } create-ext4-var-lib-containerd() { local dir="/var/lib/containerd" file="/loop-ext4.dsk" dev echo "Creating loopback-mounted ext4 $dir..." if ! dev="$(vm-command-q "losetup -f")" || [ -z "$dev" ]; then command-error "failed to find unused loopback device" fi vm-command "dd if=/dev/zero of=$file bs=$((1024*1000)) count=$((1000*5))" || command-error "failed to create file for ext4 loopback mount" vm-command "losetup $dev $file" || command-error "failed to attach $file to $dev" vm-command "mkfs.ext4 $dev" || command-error "failed to create ext4 filesystem on $dev ($file)" if vm-command "[ -d $dir ]"; then vm-command "mv $dir $dir.orig" || command-error "failed to rename original $dir to $dir.orig" fi vm-command "mkdir -p $dir" || command-error "failed to create $dir" cat <\e[0m "} HOST_LIB_DIR="$(dirname "${BASH_SOURCE[0]}")" HOST_PROJECT_DIR="$(dirname "$(dirname "$(realpath "$HOST_LIB_DIR")")")" HOST_VM_IMAGE_DIR=~/vms/images HOST_VM_DATA_DIR_TEMPLATE="~/vms/data/\${VM_NAME}" if [ -z "$HOST_GORESCTRL_DIR" ]; then HOST_GORESCTRL_DIR="$(realpath "$HOST_PROJECT_DIR/../goresctrl")" fi GOVM=${GOVM-govm} host-command() { command-start "host" "$HOST_PROMPT" "$1" bash -c "$COMMAND" 2>&1 | command-handle-output command-end ${PIPESTATUS[0]} return $COMMAND_STATUS } host-require-govm() { command -v "$GOVM" >/dev/null || error "cannot run govm \"$GOVM\". Check PATH or set GOVM=/path/to/govm." } host-require-cmd() { command -v "$1" >/dev/null || error "cannot run \"$1\". 
Check dependencies." } host-get-vm-config() { if [ -z "$1" ]; then error "can't get VM configuration, name not set" fi VM_NAME="$1" HOST_VM_DATA_DIR="$(eval "echo $HOST_VM_DATA_DIR_TEMPLATE")" VM_DATA_CONFIG="$HOST_VM_DATA_DIR/vm-config" if ! [ -f "$VM_DATA_CONFIG" ]; then return 1 fi source "$VM_DATA_CONFIG" if [ -z "$VM_NAME" ] || [ -z "$VM_DISTRO" ] || [ -z "$VM_CRI" ] || [ -z "$VM_SSH_USER" ]; then return 1 fi VM_COMPOSE_YAML="$HOST_VM_DATA_DIR/govm-compose.yaml" } host-set-vm-config() { if [ -z "$1" ]; then error "can't configure VM, name not set" fi if [ -z "$2" ]; then error "can't configure VM, distro not set" fi if [ -z "$3" ]; then error "can't configure VM, CRI runtime not set" fi VM_NAME="$1" VM_DISTRO="$2" VM_CRI="$3" VM_SSH_USER="$(vm-ssh-user)" HOST_VM_DATA_DIR="$(eval "echo $HOST_VM_DATA_DIR_TEMPLATE")" mkdir -p "$HOST_VM_DATA_DIR" VM_COMPOSE_YAML="$HOST_VM_DATA_DIR/govm-compose.yaml" VM_DATA_CONFIG="$HOST_VM_DATA_DIR/vm-config" cat > "$VM_DATA_CONFIG" < "$VM_COMPOSE_YAML" host-command "${GOVM} compose -f \"$VM_COMPOSE_YAML\"" echo "# VM base image : $VM_IMAGE" echo "# VM govm yaml : $VM_COMPOSE_YAML" } sleep 1 VM_CONTAINER_ID=$(${GOVM} ls | awk "/$VM_NAME/{print \$1}") # Verify Qemu version. Refuse to run if Qemu < 5.0. # Use "docker run IMAGE" instead of "docker exec CONTAINER", # because the container may have already failed. VM_CONTAINER_IMAGE=$(docker inspect $VM_CONTAINER_ID | jq '.[0].Image' -r | awk -F: '{print $2}') echo "# VM name : $VM_NAME" echo "# VM Linux distro: $VM_DISTRO" echo "# VM CRI : $VM_CRI" echo "# VM Docker image: $VM_CONTAINER_IMAGE" echo "# VM Docker cntnr: $VM_CONTAINER_ID" if [ -n "$VM_CONTAINER_IMAGE" ]; then VM_CONTAINER_QEMU_VERSION=$(docker run --rm --entrypoint=/usr/bin/qemu-system-x86_64 $VM_CONTAINER_IMAGE -version | awk '/QEMU emulator version/{print $4}') fi if [ -n "$VM_CONTAINER_QEMU_VERSION" ]; then if [[ "$VM_CONTAINER_QEMU_VERSION" > "5" ]]; then echo "# VM Qemu version: $VM_CONTAINER_QEMU_VERSION" else if [[ "$QEMU_CPUMEM" =~ ",dies=" ]]; then error "Too old Qemu version \"$VM_CONTAINER_QEMU_VERSION\". Topology with dies > 1 requires Qemu >= 5.0" else echo "# (Your Qemu does not support dies > 1, consider updating for full topology support)" fi fi else echo "Warning: cannot verify Qemu version on govm image. In case of failure, check it is >= 5.0" >&2 fi echo "# VM Qemu output : docker logs $VM_CONTAINER_ID" echo "# VM Qemu monitor: docker exec -it $VM_CONTAINER_ID nc local:/data/monitor" VM_MONITOR="docker exec -i $VM_CONTAINER_ID nc local:/data/monitor" host-wait-vm-ssh-server host-wait-cloud-init } get-ssh-timeout() { echo $((`date +%s` + $1)) } host-wait-vm-ssh-server() { timeout=`get-ssh-timeout 120` while [ "${1#-}" != "$1" ] && [ -n "$1" ]; do case "$1" in --timeout) timeout=`get-ssh-timeout $2` shift; shift ;; *) invalid="${invalid}${invalid:+,}\"$1\"" shift ;; esac done if [ -n "$invalid" ]; then error "invalid options: $invalid" return 1 fi if [ -z "$VM_IP" ]; then VM_IP=$(${GOVM} ls | awk "/$VM_NAME/{print \$4}") while [ "x$VM_IP" == "x" ]; do host-command "${GOVM} start \"$VM_NAME\"" sleep 5 VM_IP=$(${GOVM} ls | awk "/$VM_NAME/{print \$4}") done fi echo "# VM SSH server : ssh $VM_SSH_USER@$VM_IP" if [ -d "$HOME/vms/data/$VM_NAME" ]; then SSH_OPTS="$SSH_OPTS -o ControlMaster=auto -o ControlPath=$HOME/vms/data/$VM_NAME/ssh -o ControlPersist=30" SSH="${SSH%% *} $SSH_OPTS" SCP="${SCP%% *} $SSH_OPTS" export SSH SSH_OPTS SCP fi ssh-keygen -f "$HOME/.ssh/known_hosts" -R "$VM_IP" >/dev/null 2>&1 print_info=1 while ! 
$SSH ${VM_SSH_USER}@${VM_IP} -o ConnectTimeout=2 true 2>/dev/null; do CURR_TIME=`date +%s` if [ $CURR_TIME -gt $timeout ]; then error "timeout" fi if [ "$print_info" == 1 ]; then echo -n "Waiting for VM SSH server to respond..." print_info=0 fi sleep 2 echo -n "." done echo "" } host-wait-cloud-init() { retries=60 retries_left=$retries while true; do $SSH -o ConnectTimeout=2 ${VM_SSH_USER}@${VM_IP} sudo cloud-init status --wait 2>/dev/null [ "$?" -eq 0 -o "$?" -eq 2 ] && break if [ "$retries" == "$retries_left" ]; then echo -n "Waiting for VM cloud-init to finish..." fi sleep 2 echo -n "." retries_left=$(( $retries_left - 1 )) if [ "$retries_left" == "0" ]; then error "timeout" fi done [ "$retries" == "$retries_left" ] || echo "" } host-stop-vm() { #VM_NAME=$1 host-require-govm host-command "${GOVM} stop $VM_NAME" || { command-error "stopping govm \"$VM_NAME\" failed" } } host-delete-vm() { #VM_NAME=$1 host-require-govm host-command "${GOVM} delete $VM_NAME" || { command-error "deleting govm \"$VM_NAME\" failed" } } host-is-encrypted-ssh-key() { ssh-keygen -y -f "$1" < /dev/null >& /dev/null if [ $? != 0 ]; then return 0 else return 1 fi } host-mount-vm() { # Usage: host-mount-vm # # Mount VM / to VM data directory on host. # host-get-vm-config NAME must be run first. local mountpoint="${HOST_VM_DATA_DIR}/sshfs" local vm_sftp_server="" local vm_sftp_server_candidates=(/usr/lib/openssh/sftp-server /usr/libexec/sftp-server) local vm_sftp_server_candidate command -v sshfs >/dev/null || { error "host-mount-vm: missing sshfs" } if mount | grep "${mountpoint}"; then echo "host-mount-vm: already mounted" return 0 fi for vm_sftp_server_candidate in "${vm_sftp_server_candidates[@]}"; do if vm-command-q "command -v ${vm_sftp_server_candidate} >/dev/null"; then vm_sftp_server="${vm_sftp_server_candidate}" break fi done if [ -z "${vm_sftp_server}" ]; then error "cannot find sftp-server from vm" fi mkdir -p "${mountpoint}" sshfs "${VM_SSH_USER}@${VM_IP}:/" "${mountpoint}" -o sftp_server="/usr/bin/sudo ${vm_sftp_server}" $SSH_OPTS || { error "sshfs mount failed" } echo "host-mount-vm: mounted ${VM_NAME}:/ to ${mountpoint}" } ================================================ FILE: demo/lib/numactlH2numajson.py ================================================ #!/usr/bin/env python3 """numactlH2numajson - convert numactl -H output to numajson Example: numactl -H | numactlH2numajson """ import json import math import re import sys QEMU_DEFAULT_DIST_OTHER = 20 QEMU_DEFAULT_DIST_SELF = 10 def error(msg, exit_status=1): sys.stderr.write("numactlH2numajson: %s\n" % (msg,)) if not exit_status is None: sys.exit(1) def round_size(size, size_unit, non_zero_numbers=3): if size_unit == "kB": size_mb = size / 1024 elif size_unit == "MB": size_mb = size elif size_unit == "GB": size_mb = size * 1024 elif size_unit == "TB": size_mb = size * 1024 * 1024 else: raise Exception("unsupported size unit: %r" % (size_unit,)) if size_mb == 0: return "0G" size_mul = 10**int(math.log10(size_mb)) rounded = round(size_mb * 10**(non_zero_numbers-1) / size_mul) * size_mul / (10**(non_zero_numbers-1)) if size_mul < 1000: return "%.0fM" % (rounded,) else: return "%.0fG" % (rounded/1000) def add_dists_to_numalist(numalist, dists): """Add/replace distance information in numalist with node distances in dists. dists[i][j] = distance from node i to node j. 
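    For example, with two nodes, a matrix dists = [[10, 20], [20, 10]]
    says that each node is at distance 10 from itself and 20 from the other.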
dists can be a matrix or a dict: {sourcenode: {destnode: dist}}""" dist_matrix = [] node = -1 node_group = {} # {node: group_index_in_numalist} group_nodes = {} # {group_index_in_numalist: set_of_nodes} for groupindex, numaspec in enumerate(numalist): group_nodes[groupindex] = set() nodecount = int(numaspec.get("nodes", 1)) for _ in range(nodecount): node += 1 group_nodes[groupindex].add(node) node_group[node] = groupindex lastnode = node if isinstance(dists, list): # dists is a dist matrix. dist_matrix = dists else: # dists is a dict. create dist_matrix from it. for sourcenode in range(lastnode + 1): dist_matrix.append([]) for destnode in range(lastnode + 1): if sourcenode in dists and destnode in dists[sourcenode]: d = dists[sourcenode][destnode] elif sourcenode != destnode: d = QEMU_DEFAULT_DIST_OTHER else: d = QEMU_DEFAULT_DIST_SELF dist_matrix[-1].append(d) dist_freq = {} # {distance: number-of-appearances} try: for sourcenode in range(lastnode + 1): for destnode in range(lastnode + 1): if sourcenode != destnode: d = dist_matrix[sourcenode][destnode] dist_freq[d] = dist_freq.get(d, 0) + 1 except IndexError: raise ValueError("invalid dists matrix dimensions, %sx%s expected" % (lastnode + 1, lastnode + 1)) # Read the most common distance from the matrix, ignore distance-to-self. if len(dist_freq) > 0: default_dist = max([(v, k) for k, v in dist_freq.items()])[1] else: default_dist = QEMU_DEFAULT_DIST_SELF # don't care: there's only one node # Try filling symmetric distances with the default dist. # There may be asymmetry or node grouping that making this impossible. # In those cases sym_dist_errors > 0. sym_dist_errors = 0 group_node_dist = {} # {group_index: {othernode: dist}} for sourcenode in range(lastnode + 1): sourcegroup = node_group[sourcenode] if not sourcegroup in group_node_dist: group_node_dist[sourcegroup] = {} for destnode in range(lastnode + 1): destgroup = node_group[destnode] if sourcenode == destnode: continue elif dist_matrix[sourcenode][destnode] == default_dist: continue elif dist_matrix[sourcenode][destnode] != dist_matrix[destnode][sourcenode]: # There is asymmetry. sym_dist_errors += 1 continue for othernode in [n for n in group_nodes[sourcegroup] if n != sourcenode and n != destnode]: if (dist_matrix[othernode][destnode] != dist_matrix[sourcenode][destnode] or dist_matrix[othernode][destnode] != dist_matrix[destnode][sourcenode]): # Different nodes in the same group have different distances. sym_dist_errors += 1 group_node_dist[sourcegroup][destnode] = dist_matrix[sourcenode][destnode] # Clear existing distance definitions from numalist. for numaspec in numalist: if "dist" in numaspec: del numaspec["dist"] if "dist-all" in numaspec: del numaspec["dist-all"] if "node-dist" in numaspec: del numaspec["node-dist"] # Now we are ready to add distance information. if sym_dist_errors == 0 and len(str(group_node_dist)) < len(str(dist_matrix)): # Add info using "dist" and "node-dist", that is symmetrical distances. # This time it is more compact representation than a matrix. for groupindex, numaspec in enumerate(numalist): if group_node_dist[groupindex] != {}: # if all nodes mentioned in node-dist are in earlier groups, # there is no need to inject this definition, because it has been # covered by distance symmetry. 
nodes_with_dists = set(group_node_dist[groupindex].keys()) for earlier_group in range(groupindex): nodes_with_dists -= group_nodes[earlier_group] # there are new distance definitions, include all if len(nodes_with_dists) > 0: numaspec["node-dist"] = group_node_dist[groupindex] if default_dist != QEMU_DEFAULT_DIST_OTHER: numalist[0]["dist"] = default_dist elif len(numalist) > 1: # Add distances as a matrix. numalist[-1]["dist-all"] = dist_matrix else: # There is no need for distance information in the numalist, # as there is only one node. pass def numactlH2numajson(input_line_iter): numalist = [] dist_matrix = [] re_node_cpus = re.compile('^node (?P[0-9]+) cpus:( (?P([0-9]+\s?)*))?') re_node_size = re.compile('^node (?P[0-9]+) size:( (?P[0-9]+) (?P[a-zA-Z]+))?') re_node_distances = re.compile('^\s*(?P[0-9]+):(?P(\s*[0-9]+)*)') for line in input_line_iter: m = re_node_cpus.match(line) if m: m_dict = m.groupdict() node = int(m_dict["node"]) if m_dict["cpus"] is None: cpus = [] else: cpus = [int(cpu) for cpu in m.groupdict()["cpus"].strip().split()] continue m = re_node_size.match(line) if m: m_dict = m.groupdict() if int(m_dict["node"]) != node: raise Exception("expected node %s size, got %r" % (node, line)) size_unit = m_dict["size_unit"] mem = round_size(int(m_dict["size"]), size_unit) if (len(numalist) == 0 or numalist[-1]["cpu"] != len(cpus) or numalist[-1]["mem"] != mem): # found a node that is different from the previous numalist.append({"cpu": len(cpus), "mem": mem, "nodes": 1}) else: # found a node that looks the same as the previous numalist[-1]["nodes"] += 1 nodecount = node + 1 continue m = re_node_distances.match(line) if m: m_dict = m.groupdict() dist_matrix.append([int(d) for d in m_dict['dists'].strip().split()]) # filter out unnecessary "nodes": 1 from the list: for d in numalist: if d["nodes"] == 1: del d["nodes"] # parse distances add_dists_to_numalist(numalist, dist_matrix) return numalist def self_test(): input_output = { """available: 5 nodes (0-4) node 0 cpus: 0 node 0 size: 1007 MB node 0 free: 784 MB node 1 cpus: 1 node 1 size: 1007 MB node 1 free: 262 MB node 2 cpus: 2 3 node 2 size: 1951 MB node 2 free: 1081 MB node 3 cpus: 4 5 6 7 node 3 size: 4030 MB node 3 free: 693 MB node 4 cpus: node 4 size: 8039 MB node 4 free: 8029 MB node distances: node 0 1 2 3 4 0: 10 22 22 22 88 1: 22 10 22 22 88 2: 22 22 10 22 88 3: 22 22 22 10 88 4: 88 88 88 88 10 """: [{'cpu': 1, 'mem': '1G', 'nodes': 2, 'node-dist': {4: 88}, 'dist': 22}, {'cpu': 2, 'mem': '2G', 'node-dist': {4: 88}}, {'cpu': 4, 'mem': '4G', 'node-dist': {4: 88}}, {'cpu': 0, 'mem': '8G'}], """available: 2 nodes (0-1) node 0 cpus: 0 1 2 3 node 0 size: 3966 MB node 0 free: 1649 MB node 1 cpus: 4 5 6 7 node 1 size: 4006 MB node 1 free: 983 MB node distances: node 0 1 0: 10 20 1: 20 10 """: [{'cpu': 4, 'mem': '4G', 'nodes': 2}], """available: 4 nodes (0-3) node 0 cpus: 0 1 2 3 node 0 size: 3966 MB node 0 free: 1649 MB node 1 cpus: 4 5 6 7 node 1 size: 4006 MB node 1 free: 983 MB node 1 cpus: 8 9 10 11 node 1 size: 4006 MB node 1 free: 983 MB node 1 cpus: 12 13 14 15 node 1 size: 4006 MB node 1 free: 983 MB node distances: node 0 1 2 3 0: 10 55 55 55 1: 55 10 55 55 2: 55 55 10 55 3: 55 55 55 10 """: [{'cpu': 4, 'mem': '4G', 'nodes': 4, 'dist': 55}], """available: 1 nodes (0) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 node 0 size: 128000 MB node 0 free: 80000 MB node distances: node 0 0: 10 """: [{'cpu': 20, 'mem': '128G'}], """available: 5 nodes (0-4) node 0 cpus: 0 node 0 size: 4007 MB node 0 free: 784 
MB node 1 cpus: 1 node 1 size: 1007 MB node 1 free: 262 MB node 2 cpus: 2 3 node 2 size: 1951 MB node 2 free: 1081 MB node 3 cpus: 4 5 6 7 node 3 size: 4030 MB node 3 free: 693 MB node 4 cpus: node 4 size: 8039 MB node 4 free: 8029 MB node distances: node 0 1 2 3 4 0: 10 22 33 44 55 1: 22 10 22 22 22 2: 33 22 10 22 22 3: 44 22 22 10 22 4: 55 22 22 22 10 """: [{'cpu': 1, 'mem': '4G', 'node-dist': {2: 33, 3: 44, 4: 55}, 'dist': 22}, {'cpu': 1, 'mem': '1G'}, {'cpu': 2, 'mem': '2G'}, {'cpu': 4, 'mem': '4G'}, {'cpu': 0, 'mem': '8G'}] } for input_string in input_output.keys(): observed = numactlH2numajson(input_string.splitlines()) expected = input_output[input_string] if observed != expected: raise Exception("self-test: observed/expected mismatch on numanodes\n%s\n\nobserved: %r\nexpected: %r" % (input_string, observed, expected)) add_dists_to_numalist([], []) return 0 if __name__ == "__main__": if len(sys.argv) > 1 and sys.argv[1] == "test": sys.exit(self_test()) try: numalist = numactlH2numajson(sys.stdin) except Exception as e: raise error(str(e)) print(json.dumps(numalist)) ================================================ FILE: demo/lib/topology.py ================================================ #!/usr/bin/env python3 """topology.py - topology utility Usage: topology.py [options] command Options: -t TOPOLOGY_DUMP load topology_dump from TOPOLOGY_DUMP file instead of the "topology_dump" environment variable or local host. -r RES_ALLOWED load res_allowed from RES_ALLOWED file instead of the "res_allowed" environment variable or local host. -o OUTPUT_FORMAT "json" or "text". The default is "text". Commands: help print help cpus view CPU topology from topology_dump. cpus_allowed [PROCESS...] view how matching PROCESSes are allowed to use CPUs. (Uses RES_ALLOWED like res_allowed below.) res view CPU and memory topology from topology_dump. res_allowed [PROCESS...] view how matching PROCESSes are allowed to use CPUs and memory in CPU/mem topology tree. If the RES_ALLOWED file or the res_allowed environment variable are not defined, "pgrep -f PROCESS" is used to match processes. bash_topology_dump print a Bash command that creates topology_dump. bash_res_allowed PROCESS [PROCESS...] print a Bash command that creates res_allowed dump that contains Cpus_allowed and Mems_allowed masks of processes matching "pgrep -f PROCESS". 
Examples: Print local host CPU topology $ topology.py cpus Print how processes with pod0..2 in their names are allowed to use CPUs $ topology.py res_allowed pod0 pod1 pod2 Print remote host CPU topology $ topology_dump="$(ssh remotehost "$(topology.py bash_topology_dump)")" topology.py cpus Watch how pod0..2 are allowed to CPUS on remote host, read topology only once $ export topology_dump="$(ssh remotehost "$(topology.py bash_topology_dump)")" $ watch 'res_allowed=$(ssh remotehost "$(topology.py bash_res_allowed pod0 pod1 pod2)") topology.py res_allowed' """ import getopt import json import os import re import subprocess import sys _bash_topology_dump = """for cpu in /sys/devices/system/cpu/cpu[0-9]*; do cpu_id=${cpu#/sys/devices/system/cpu/cpu}; echo "cpu p:$(< ${cpu}/topology/physical_package_id) d:$(< ${cpu}/topology/die_id) n:$(basename ${cpu}/node* | sed 's:node::g') c:$(< ${cpu}/topology/core_id) t:$(< ${cpu}/topology/thread_siblings) cpu:${cpu_id}" ; done; for node in /sys/devices/system/node/node[0-9]*; do node_id=${node#/sys/devices/system/node/node}; echo "dist n:$node_id d:$(< $node/distance)"; echo "mem n:$node_id s:$(awk '/MemTotal/{print $4/1024}' < $node/meminfo)"; done""" _bash_res_allowed = r"""for process in '%s'; do for pid in $(pgrep -f "$process"); do proc_pid_cmdline=$(< /proc/$pid/cmdline) || continue; proc_pid_status=$(< /proc/$pid/status) || continue; name=$(echo "$proc_pid_cmdline" | tr '\0 ' '\n' | grep -E "^$process" | head -n 1); [ -n "$name" ] && [ "$pid" != "$$" ] && [ "$pid" != "$PPID" ] && echo "${name}/${pid} $(awk '/Cpus_allowed:/{c=$2}/Mems_allowed:/{m=$2}END{print "c:"c" m:"m}' <<< "$proc_pid_status")"; done 2>/dev/null; done""" def error(msg, exit_status=1): """Print error message and exit.""" if not msg is None: sys.stderr.write('topology.py: %s\n' % (msg,)) if not exit_status is None: sys.exit(exit_status) def warning(msg): """Print warning.""" sys.stderr.write('topology.py warning: %s\n' % (msg,)) def output_tree(tree): """Print tree to output in OUTPUT_FORMAT""" if opt_output_format == "json": sys.stdout.write(json.dumps(tree)) else: sys.stdout.write(str_tree(tree) + "\n") sys.stdout.flush() def add_tree(root, branch, value_dict): """Add key-value pairs in value_dict to given branch in the tree starting from root. If the branch does not exist in the tree, it will be created. Example: add_tree(tree, ("package0", "die1", "node3", "core7", "thread0", "cpu15"), {"GHz", 4.2}) """ node = root for b in branch: if b in node: node = node[b] else: node[b] = {} node = node[b] node.update(value_dict) def _str_node(root, lines, branch): """Format node names in tree to lines ([[line1col1, line1col2], ...]).""" for key in sorted(root.keys()): branch.append(key) if root[key]: _str_node(root[key], lines, branch) else: # Add those column texts to the new line which does not have the same value # as previous non-empty text in the same column. 
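            # For example, if a previous line already shows
            # ["package0", "die0", "node0", ...] and the current branch is
            # ("package0", "die0", "node1", ...), the repeated "package0" and
            # "die0" columns are emitted as "" and only "node1" onwards is kept.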
new_line = [] new_col_txt_added = False for col, txt in enumerate(branch): if new_col_txt_added: prev_col_txt = "" else: for prev_line in lines[::-1]: if len(prev_line) > col and prev_line[col] != "": prev_col_txt = prev_line[col] break else: prev_col_txt = "" if txt != prev_col_txt: new_line.append(txt) new_col_txt_added = True else: new_line.append("") lines.append(new_line) branch.pop() def str_tree(root): """Format tree to string.""" lines = [] _str_node(root, lines, []) col_max_len = {} # {column-index: max-string-length} max_col = -1 for line in lines: for col, txt in enumerate(line): if col > max_col: max_col = col if len(txt) > col_max_len.get(col, -1): col_max_len[col] = len(txt) str_lines = [] for line in lines: line_cols = len(line) new_str_fmt = "" for col, txt in enumerate(line): new_str_fmt += "%-" + str(col_max_len[col] + 1) + "s" str_lines.append(new_str_fmt % tuple(line)) return "\n".join(str_lines) def bash_output(cmd): """Return standard output of executing cmd in Bash.""" p = subprocess.Popen(["bash", "-c", cmd], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p.communicate() return out.decode("utf-8") def get_local_topology_dump(): """Return topology_dump from local system.""" return bash_output(_bash_topology_dump) def get_local_res_allowed_dump(processes): """Return res_allowed from local system.""" return bash_output(_bash_res_allowed % ("' '".join(processes),)) def dump_to_topology(dump, show_mem=True): """Parse topology_dump, return topology data structures.""" # Output data structures: tree = {} # {"package0": {"die1": {"node1": ...}}} cpu_branch = {} # {cpu_id: (package_name, die_name, node_name, core_name, thread_name, cpu_name)} node_branch = {} # {node_id: (package_name, die_name, node_name)} mem_branch = {} # {node_id: (package_name, ...)} # Example input line to be parsed: # cpu line: # "cpu p:0 d:1 n:3 c:2 t:00003000 cpu:13" # mem line: # "mem n:4: s:8063.83" re_cpu_line = re.compile('cpu p:(?P[0-9]+) d:(?P[0-9]*) n:(?P[0-9]+) c:(?P[0-9]+) t:(?P[0-9a-f,]+) cpu:(?P[0-9]+)') re_mem_line = re.compile('mem n:(?P[0-9]+) s:(?P[0-9.]+)') re_dist_line = re.compile('dist n:(?P[0-9]+) d:(?P([0-9 ]+))') numeric_cpu_lines = [] numeric_mem_lines = [] numeric_dist_lines = [] for line in dump.splitlines(): m = re_cpu_line.match(line) if m: mdict = m.groupdict() package = int(mdict["package"]) try: die = int(mdict["die"]) except ValueError: die = 0 # handle kernels that do not provide topology/die_id node = int(mdict["node"]) core = int(mdict["core"]) thread_siblings = eval("0x" + mdict["thread_siblings"].replace(",", "")) cpu_id = int(mdict["cpu_id"]) # Calculate thread id. # Let the lowest CPU bit owner in thread_siblings be thread 0, next thread 1 and so on. thread = -1 bit = 1 << cpu_id while bit: if thread_siblings & bit: thread += 1 bit >>= 1 numeric_cpu_lines.append((package, die, node, core, thread, cpu_id)) continue m = re_mem_line.match(line) if m: mdict = m.groupdict() numeric_mem_lines.append((int(mdict["node"]), float(mdict["size"]))) continue m = re_dist_line.match(line) if m: mdict = m.groupdict() numeric_dist_lines.append((int(mdict["node"]), tuple([int(n) for n in mdict["dist"].strip().split()]))) numeric_mem_lines.sort() # make sure memory sizes are from node 0, 1, ... 
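    # likewise, keep distance vectors in node id order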
numeric_dist_lines.sort() # Build tree on CPUs max_package_len = max(len(str(nl[0])) for nl in numeric_cpu_lines) max_die_len = max(len(str(nl[1])) for nl in numeric_cpu_lines) max_node_len = max(len(str(nl[2])) for nl in numeric_cpu_lines) max_core_len = max(len(str(nl[3])) for nl in numeric_cpu_lines) max_thread_len = max(len(str(nl[4])) for nl in numeric_cpu_lines) max_cpu_id_len = max(len(str(nl[5])) for nl in numeric_cpu_lines) for (package, die, node, core, thread, cpu_id) in numeric_cpu_lines: branch = ("package" + str(package).zfill(max_package_len), "die" + str(die).zfill(max_die_len), "node" + str(node).zfill(max_node_len), "core" + str(core).zfill(max_core_len), "thread" + str(thread).zfill(max_thread_len), "cpu" + str(cpu_id).zfill(max_cpu_id_len)) add_tree(tree, branch, {}) cpu_branch[cpu_id] = branch node_branch[node] = branch[:3] if show_mem: # Add node memory information to the tree for node, distvec in numeric_dist_lines: mem_node_name = "node" + str(node).zfill(max_node_len) node_mem_size = str(int(round((numeric_mem_lines[node][1]/1024)))) + "G" dists = sorted(distvec) if node in node_branch: # This node has CPU(s) as it has been added to the tree already in CPU lines. # Add memory branch to the tree under the existing node branch. branch = node_branch[node] + ( "mem", mem_node_name, node_mem_size) elif (dists[0] == 10 # sane distance-to-self and (len(dists) < 3 or dists[1] < dists[2]) # there is a node closer than others and distvec.index(dists[1]) in node_branch): # that node is already in the tree # This means that the node has the same memory controller as this node. # Add memory branch from this node under the existing node. node_same_ctrl = distvec.index(dists[1]) branch = node_branch[node_same_ctrl] + ( "mem", mem_node_name, node_mem_size) node_branch[node] = branch[:3] else: # Suitable memory controller not found, create completely separate branch. 
branch = ("packagex", "mem", "node" + str(node).zfill(max_node_len), "mem", mem_node_name, node_mem_size) node_branch[node] = branch[:3] add_tree(tree, branch, {}) mem_branch[node] = branch return {"tree": tree, "cpu_branch": cpu_branch, "node_branch": node_branch, "mem_branch": mem_branch} def dump_to_res_allowed(res_allowed_dump): """Parse res_allowed data, return allowed cpu and mem bitmasks in a data structure.""" # Output data structure: owner_mask = {} # {owner_string: {"cpu": bitmask_int, "mem": bitmask_int}} # Example input line to be parsed: # "pod2 c:040c0000,00000000 m:00000000,00000300" re_owner_mask = re.compile(r'(?P[^ ]+)\s+c:(?P[0-9a-f,]+)\s+m:(?P[0-9a-f,]+)') for line in res_allowed_dump.splitlines(): if not line: continue try: mdict = re_owner_mask.match(line).groupdict() except: warning("cannot parse res_allowed line %r" % (line,)) continue owner_mask[mdict["owner"]] = { "cpu": eval("0x" + mdict["cpumask"].replace(",", "")), "mem": eval("0x" + mdict["memmask"].replace(",", "")) } return owner_mask def get_topology(show_mem=True): """Return topology data structure.""" # Priority: use file, environment variable or read from local system if opt_topology_dump: topology_dump = opt_topology_dump else: topology_dump = os.getenv("topology_dump", None) if topology_dump is None: topology_dump = get_local_topology_dump() return dump_to_topology(topology_dump, show_mem=show_mem) def get_res_allowed(processes): """Return res_allowed data structure.""" # Priority: use file, environment variable or read from local system if opt_res_allowed_dump: res_allowed_dump = opt_res_allowed_dump else: res_allowed_dump = os.getenv("res_allowed", None) if res_allowed_dump is None: res_allowed_dump = get_local_res_allowed_dump(processes) return dump_to_res_allowed(res_allowed_dump) def report_res(show_mem=True): """Print topology tree.""" topology = get_topology(show_mem=show_mem) output_tree(topology["tree"]) def report_res_allowed(processes, show_mem=True): """Print topology tree with allowed processes as leaf nodes.""" topology = get_topology(show_mem=show_mem) tree = topology["tree"] cpu_branch = topology["cpu_branch"] mem_branch = topology["mem_branch"] node_branch = topology["node_branch"] max_cpu = max(cpu_branch.keys()) max_node = max(node_branch.keys()) res_allowed = get_res_allowed(processes) # add found owners to tree as children of cpus for owner, masks in sorted(res_allowed.items()): cpumask = masks["cpu"] memmask = masks["mem"] for cpu in range(max_cpu + 1): if cpumask & (1 << cpu): add_tree(tree, cpu_branch[cpu], {owner: {}}) if show_mem: for node in range(max_node + 1): if memmask & (1 << node): add_tree(tree, mem_branch[node], {owner: {}}) output_tree(tree) if __name__ == "__main__": opt_topology_dump = None opt_res_allowed_dump = None opt_output_format = "text" try: options, commands = getopt.gnu_getopt( sys.argv[1:], 'ht:r:o:', ['help', '--topology-dump-file=', '--res-allowed-file=']) except getopt.GetoptError as e: error(str(e)) for opt, arg in options: if opt in ["-h", "--help"]: print(__doc__) error(None, exit_status=0) elif opt in ["-t", "--topology-file"]: try: opt_topology_dump = open(arg).read() except IOError as e: error("cannot read topology dump from file %r: %s" % (arg, e)) elif opt in ["-r", "--res-allowed-file"]: try: opt_res_allowed_dump = open(arg).read() except IOError as e: error("cannot read res_allowed dump from file %r: %s" % (arg, e)) elif opt in ["-o"]: if arg in ["json", "text"]: opt_output_format = arg else: error("invalid output format %r") if not 
commands: error("missing command, see --help") elif commands[0] == "help": print(__doc__) error(None, exit_status=0) elif commands[0] == "cpus": report_res(show_mem=False) elif commands[0] == "cpus_allowed": report_res_allowed(commands[1:], show_mem=False) elif commands[0] == "res": report_res(show_mem=True) elif commands[0] == "res_allowed": report_res_allowed(commands[1:]) elif commands[0] == "bash_topology_dump": print(_bash_topology_dump) elif commands[0] == "bash_res_allowed": print(_bash_res_allowed % ("' '".join(commands[1:]),)) else: error('invalid command %r' % (commands[0],)) ================================================ FILE: demo/lib/topology2qemuopts.py ================================================ #!/usr/bin/env python3 """topology2qemuopts - convert NUMA node list from JSON to Qemu options NUMA node group definitions: "mem" mem (RAM) size on each NUMA node in this group. The default is "0G". "nvmem" nvmem (non-volatile RAM) size on each NUMA node in this group. The default is "0G". "dimm" "": the default, memory is there without pc-dimm defined. "plugged": start with cold plugged pc-dimm. "unplugged": start with free slot for hot plug. Add the dimm in Qemu monitor at runtime: device_add pc-dimm,id=dimmX,memdev=memX,node=X or device_add nvdimm,id=nvdimmX,memdev=nvmemX,node=X "cores" number of CPU cores on each NUMA node in this group. The default is 0. "threads" number of threads on each CPU core. The default is 2. "nodes" number of NUMA nodes on each die. The default is 1. "dies" number of dies on each package. The default is 1. "packages" number of packages. The default is 1. NUMA node distances are defined with following keys: "dist-all": [[from0to0, from0to1, ...], [from1to0, from1to1, ...], ...] distances from every node to all nodes. The order is the same as in to numactl -H "node distances:" output. "node-dist": {"node": dist, ...} symmetrical distances from nodes in this group to other nodes. Distances that apply to all NUMA groups if defined in any: "dist-same-die": N the default distance between NUMA nodes on the same die. "dist-same-package": N the default distance between NUMA nodes on the same package. "dist-other-package": N the default distance between NUMA nodes in other packages. Note that the distance from a node to itself is always 10. The default distance to a node on the same die is 11, and to other nodes on the same and different packages is 21. Example: Each of the first two NUMA groups in the list contains two NUMA nodes. Each node in the first group includes two CPU cores and 2G RAM, while nodes in the second group two CPU cores and 1G RAM. The only NUMA node defined in the third group has 8G of NVRAM, and no CPU. Every NUMA group with CPU cores adds a package (a socket) to the configuration, or many identical packages if "packages" > 1. This example creates a two-socket system, four CPU cores per package. Note that CPU cores are divided symmetrically to packages, meaning that every NUMA group with CPU cores should contain the same number of cores. 
$ ( cat << EOF [ { "mem": "2G", "cores": 2, "nodes": 2 }, { "mem": "1G", "cores": 2, "nodes": 2 }, { "nvmem": "8G", "node-dist": {"0": 88, "1": 88, "2": 88, "3": 88, "4": 66, "5": 66, "7": 66, "8": 66} } ] EOF ) | python3 topology2qemuopts.py """
import sys import json
DEFAULT_DIST = 21 DEFAULT_DIST_SAME_PACKAGE = 21 DEFAULT_DIST_SAME_DIE = 11 DEFAULT_DIST_SAME_NODE = 10
def error(msg, exitstatus=1): sys.stderr.write("topology2qemuopts: %s\n" % (msg,)) if exitstatus is not None: sys.exit(exitstatus)
def siadd(s1, s2): if s1.lower().endswith("g") and s2.lower().endswith("g"): return str(int(s1[:-1]) + int(s2[:-1])) + "G" raise ValueError('supports only sizes in gigabytes, example: 2G')
def sisub(s1, s2): if s1.lower().endswith("g") and s2.lower().endswith("g"): return str(int(s1[:-1]) - int(s2[:-1])) + "G" raise ValueError('supports only sizes in gigabytes, example: 2G')
def validate(numalist): if not isinstance(numalist, list): raise ValueError('expected list containing dicts, got %s' % (type(numalist).__name__)) valid_keys = set(("mem", "nvmem", "dimm", "cores", "threads", "nodes", "dies", "packages", "node-dist", "dist-all", "dist-other-package", "dist-same-package", "dist-same-die")) int_range_keys = {'cores': ('>= 0', lambda v: v >= 0), 'threads': ('> 0', lambda v: v > 0), 'nodes': ('> 0', lambda v: v > 0), 'dies': ('> 0', lambda v: v > 0), 'packages': ('> 0', lambda v: v > 0)} for numalistindex, numaspec in enumerate(numalist): for key in numaspec: if not key in valid_keys: raise ValueError('invalid name %r in node %r' % (key, numaspec)) if key in ["mem", "nvmem"]: val = numaspec.get(key) if val == "0": continue errmsg = 'invalid %s in node %r, expected string like "2G"' % (key, numaspec) if not isinstance(val, str): raise ValueError(errmsg) try: siadd(val, "0G") except ValueError: raise ValueError(errmsg) if key in int_range_keys: try: val = int(numaspec[key]) if not int_range_keys[key][1](val): raise Exception() except: raise ValueError('invalid %s in node %r, expected integer %s' % (key, numaspec, int_range_keys[key][0])) if 'threads' in numaspec and int(numaspec.get('cores', 0)) == 0: raise ValueError('threads set to %s but "cores" is 0 in node %r' % (numaspec["threads"], numaspec))
def dists(numalist): dist_dict = {} # Return value: {sourcenode: {destnode: dist}}, fully defined for all nodes sourcenode = -1 lastsocket = -1 dist_same_die = DEFAULT_DIST_SAME_DIE dist_same_package = DEFAULT_DIST_SAME_PACKAGE dist_other_package = DEFAULT_DIST # numalist "dist-other-package", if defined node_package_die = {} # topology {node: (package, die)} dist_matrix = None # numalist "dist-all", if defined node_node_dist = {} # numalist {sourcenode: {destnode: dist}}, if defined for sourcenode lastnode_in_group = -1 for groupindex, numaspec in enumerate(numalist): nodecount = int(numaspec.get("nodes", 1)) corecount = int(numaspec.get("cores", 0)) diecount = int(numaspec.get("dies", 1)) packagecount = int(numaspec.get("packages", 1)) first_node_in_group = sourcenode + 1 for package in range(packagecount): if nodecount > 0: lastsocket += 1 for die in range(diecount): for node in range(nodecount): sourcenode += 1 dist_dict[sourcenode] = {} node_package_die[sourcenode] = (lastsocket, die) lastnode_in_group = sourcenode + 1 if "dist-other-package" in numaspec: dist_other_package = numaspec["dist-other-package"] if "dist-same-die" in numaspec: dist_same_die = numaspec["dist-same-die"] if "dist-same-package" in numaspec: dist_same_package = numaspec["dist-same-package"] if "dist-all" in numaspec: dist_matrix = numaspec["dist-all"] if "node-dist" in numaspec: for n in range(first_node_in_group, lastnode_in_group): node_node_dist[n] = {int(nodename): value for nodename, value in numaspec["node-dist"].items()} if lastnode_in_group < 0: raise ValueError('no NUMA nodes found') lastnode = lastnode_in_group - 1 if dist_matrix is not None: # Fill the dist_dict directly from dist_matrix. # It must cover all distances. if len(dist_matrix) != lastnode + 1: raise ValueError("wrong dimensions in dist-all: %s rows seen, %s expected" % (len(dist_matrix), lastnode + 1)) for sourcenode, row in enumerate(dist_matrix): if len(row) != lastnode + 1: raise ValueError("wrong dimensions in dist-all on row %s: %s distances seen, %s expected" % (sourcenode + 1, len(row), lastnode + 1)) for destnode, source_dest_dist in enumerate(row): dist_dict[sourcenode][destnode] = source_dest_dist else: for sourcenode in range(lastnode + 1): for destnode in range(lastnode + 1): if sourcenode == destnode: dist_dict[sourcenode][destnode] = DEFAULT_DIST_SAME_NODE elif sourcenode in node_node_dist and destnode in node_node_dist[sourcenode]: # User specified explicit node-to-node distance dist_dict[sourcenode][destnode] = node_node_dist[sourcenode][destnode] dist_dict[destnode][sourcenode] = node_node_dist[sourcenode][destnode] elif not destnode in dist_dict[sourcenode]: # Set distance based on topology if node_package_die[sourcenode] == node_package_die[destnode]: dist_dict[sourcenode][destnode] = dist_same_die elif node_package_die[sourcenode][0] == node_package_die[destnode][0]: dist_dict[sourcenode][destnode] = dist_same_package else: dist_dict[sourcenode][destnode] = dist_other_package return dist_dict
def qemuopts(numalist): machineparam = "-machine pc" numaparams = [] objectparams = [] deviceparams = [] lastnode = -1 lastcpu = -1 lastdie = -1 lastsocket = -1 lastmem = -1 lastnvmem = -1 totalmem = "0G" totalnvmem = "0G" unpluggedmem = "0G" pluggedmem = "0G" memslots = 0 groupnodes = {} # groupnodes[NUMALISTINDEX] = (NODEID, ...) validate(numalist) # Read cpu counts, and "mem" and "nvmem" sizes for all nodes.
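# The nested loops below walk packages -> dies -> nodes within each NUMA
# group, assign ascending node/cpu/memory ids, and emit the matching
# "-object", "-numa" and "-device" Qemu options for each node.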
threadcount = -1 for numalistindex, numaspec in enumerate(numalist): nodecount = int(numaspec.get("nodes", 1)) groupnodes[numalistindex] = tuple(range(lastnode + 1, lastnode + 1 + nodecount)) corecount = int(numaspec.get("cores", 0)) if corecount > 0: if threadcount < 0: # threads per cpu, set only once based on the first cpu-ful numa node threadcount = int(numaspec.get("threads", 2)) threads_set_node = numaspec else: # threadcount already set, only check that there is no mismatch if (numaspec.get("threads", None) is not None and threadcount != int(numaspec.get("threads"))): raise ValueError('all CPUs must have the same number of threads, ' 'but %r had %s threads (the default) which contradicts %r' % (threads_set_node, threadcount, numaspec)) cpucount = int(numaspec.get("cores", 0)) * threadcount # logical cpus per numa node (cores * threads) diecount = int(numaspec.get("dies", 1)) packagecount = int(numaspec.get("packages", 1)) memsize = numaspec.get("mem", "0") memdimm = numaspec.get("dimm", "") if memsize != "0": memcount = 1 else: memcount = 0 nvmemsize = numaspec.get("nvmem", "0") if nvmemsize != "0": nvmemcount = 1 else: nvmemcount = 0 for package in range(packagecount): if nodecount > 0 and cpucount > 0: lastsocket += 1 for die in range(diecount): if nodecount > 0 and cpucount > 0: lastdie += 1 for node in range(nodecount): lastnode += 1 currentnumaparams = [] for mem in range(memcount): lastmem += 1 if memdimm == "": objectparams.append("-object memory-backend-ram,size=%s,id=membuiltin_%s_node_%s" % (memsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s,memdev=membuiltin_%s_node_%s" % (lastnode, lastmem, lastnode)) elif memdimm == "plugged": objectparams.append("-object memory-backend-ram,size=%s,id=memdimm_%s_node_%s" % (memsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) deviceparams.append("-device pc-dimm,node=%s,id=dimm%s,memdev=memdimm_%s_node_%s" % (lastnode, lastmem, lastmem, lastnode)) pluggedmem = siadd(pluggedmem, memsize) memslots += 1 elif memdimm == "unplugged": objectparams.append("-object memory-backend-ram,size=%s,id=memdimm_%s_node_%s" % (memsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) unpluggedmem = siadd(unpluggedmem, memsize) memslots += 1 else: raise ValueError("unsupported dimm %r, expected 'plugged' or 'unplugged'" % (memdimm,)) totalmem = siadd(totalmem, memsize) for nvmem in range(nvmemcount): lastnvmem += 1 lastmem += 1 if lastnvmem == 0: machineparam += ",nvdimm=on" # Don't use file-backed nvdimms because the file would # need to be accessible from the govm VM # container. Everything is ram-backed on host for now. 
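# For example (illustrative values only), a "plugged" nvdimm on node 4
# with nvmemsize 8G and lastmem 2 produces the option pair:
#   -object memory-backend-ram,size=8G,id=memnvdimm_2_node_4
#   -device nvdimm,node=4,id=nvdimm2,memdev=memnvdimm_2_node_4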
if memdimm == "": objectparams.append("-object memory-backend-ram,size=%s,id=memnvbuiltin_%s_node_%s" % (nvmemsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s,memdev=memnvbuiltin_%s_node_%s" % (lastnode, lastmem, lastnode)) elif memdimm == "plugged": objectparams.append("-object memory-backend-ram,size=%s,id=memnvdimm_%s_node_%s" % (nvmemsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) deviceparams.append("-device nvdimm,node=%s,id=nvdimm%s,memdev=memnvdimm_%s_node_%s" % (lastnode, lastmem, lastmem, lastnode)) pluggedmem = siadd(pluggedmem, nvmemsize) memslots += 1 elif memdimm == "unplugged": objectparams.append("-object memory-backend-ram,size=%s,id=memnvdimm_%s_node_%s" % (nvmemsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) unpluggedmem = siadd(unpluggedmem, nvmemsize) memslots += 1 else: raise ValueError("unsupported dimm %r, expected 'plugged' or 'unplugged'" % (memdimm,)) totalnvmem = siadd(totalnvmem, nvmemsize) if cpucount > 0: if not currentnumaparams: currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) currentnumaparams[-1] = currentnumaparams[-1] + (",cpus=%s-%s" % (lastcpu + 1, lastcpu + cpucount)) lastcpu += cpucount numaparams.extend(currentnumaparams) node_node_dist = dists(numalist) for sourcenode in sorted(node_node_dist.keys()): for destnode in sorted(node_node_dist[sourcenode].keys()): if sourcenode == destnode: continue numaparams.append("-numa dist,src=%s,dst=%s,val=%s" % ( sourcenode, destnode, node_node_dist[sourcenode][destnode])) if lastcpu == -1: raise ValueError('no CPUs found, make sure at least one NUMA node has "cores" > 0') if (lastdie + 1) // (lastsocket + 1) > 1: diesparam = ",dies=%s" % ((lastdie + 1) // (lastsocket + 1),) else: # Don't give dies parameter unless it is absolutely necessary # because it requires Qemu >= 5.0. 
diesparam = "" cpuparam = "-smp cpus=%s,threads=%s%s,sockets=%s" % (lastcpu + 1, threadcount, diesparam, lastsocket + 1) maxmem = siadd(totalmem, totalnvmem) startmem = sisub(sisub(maxmem, unpluggedmem), pluggedmem) memparam = "-m size=%s,slots=%s,maxmem=%s" % (startmem, memslots, maxmem) if startmem.startswith("0"): if pluggedmem.startswith("0"): raise ValueError('no memory in any NUMA node') raise ValueError("no initial memory in any NUMA node - cannot boot with hotpluggable memory") return (machineparam + " " + cpuparam + " " + memparam + " " + " ".join(numaparams) + " " + " ".join(deviceparams) + " " + " ".join(objectparams) ) def main(input_file): try: numalist = json.loads(input_file.read()) except Exception as e: error("error reading JSON: %s" % (e,)) try: print(qemuopts(numalist)) except Exception as e: error("error converting JSON to Qemu opts: %s" % (e,)) if __name__ == "__main__": if len(sys.argv) > 1: if sys.argv[1] in ["-h", "--help"]: print(__doc__) sys.exit(0) else: input_file = open(sys.argv[1]) else: input_file = sys.stdin main(input_file) ================================================ FILE: demo/lib/vm.bash ================================================ # shellcheck disable=SC1091 # shellcheck source=command.bash source "$(dirname "${BASH_SOURCE[0]}")/command.bash" # shellcheck disable=SC1091 # shellcheck source=distro.bash source "$(dirname "${BASH_SOURCE[0]}")/distro.bash" VM_PROMPT=${VM_PROMPT-"\e[38;5;11mroot@vm>\e[0m "} vm-compose-govm-template() { (echo " vms: - name: ${VM_NAME} image: ${VM_IMAGE} cloud: true ContainerEnvVars: - KVM_CPU_OPTS=${VM_QEMU_CPUMEM:=-machine pc -smp cpus=4 -m 8G} - EXTRA_QEMU_OPTS=-monitor unix:/data/monitor,server,nowait ${VM_QEMU_EXTRA} - USE_NET_BRIDGES=${USE_NET_BRIDGES:-0} $(for govm_env in $(distro-govm-env); do echo " - ${govm_env}"; done) user-data: | #!/bin/bash set -e " (if [ -n "$VM_EXTRA_BOOTSTRAP_COMMANDS" ]; then # shellcheck disable=SC2001 sed 's/^/ /g' <<< "${VM_EXTRA_BOOTSTRAP_COMMANDS}" fi # shellcheck disable=SC2001 sed 's/^/ /g' <<< "$(distro-bootstrap-commands)")) | grep -E -v '^ *$' } vm-bootstrap() { distro-bootstrap-commands | vm-pipe-to-file "./e2e-bootstrap.sh" vm-command "sh ./e2e-bootstrap.sh" host-wait-vm-ssh-server --timeout 600 } vm-image-url() { distro-image-url } vm-ssh-user() { if [ -n "$VM_SSH_USER" ]; then echo "$VM_SSH_USER" else distro-ssh-user fi } vm-is-govm() { # script API local name="${1:-$VM_NAME}" # Usage: vm-is-govm [name] # # Check if the given name (or $VM_NAME if omitted) corresponds to # a govm-managed virtual machine. Returns 0 if it does. Returns 1 # if it does not. Returns 2 if govm is not installed. if ! type -f govm >& /dev/null; then return 2 fi if [ -z "$name" ]; then return 1 fi if govm ls | cut -d ' ' -f 2 | grep -q "^$name$"; then return 0 fi return 1 } vm-check-env() { # If VM IP address is already defined, govm is not needed. if [ -n "$VM_IP" ]; then if [ "x$(vm-command-q "whoami")" != "xroot" ]; then echo "ERROR:" echo "ERROR: environment check failed:" echo "ERROR: cannot run commands (with sudo) when connecting" echo "ERROR: $SSH $VM_SSH_USER@$VM_IP" echo "ERROR:" return 1 fi return 0 fi # Check that VM created/managed with govm in this environment. type -p govm >& /dev/null || { echo "ERROR:" echo "ERROR: environment check failed:" echo "ERROR: govm binary not found." 
echo "ERROR:" echo "ERROR: You can install it using the following commands:" echo "ERROR:" echo "ERROR: git clone https://github.com/govm-project/govm" echo "ERROR: cd govm" echo "ERROR: go build -o govm" echo "ERROR: cp -v govm \$GOPATH/bin" echo "ERROR: docker build . -t govm/govm:latest" echo "ERROR: cd .." echo "ERROR:" return 1 } docker inspect govm/govm >& /dev/null || { echo "ERROR:" echo "ERROR: environment check failed:" echo "ERROR: govm/govm docker image not present (but govm needs it)." echo "ERROR:" echo "ERROR: You can install it using the following commands:" echo "ERROR:" echo "ERROR: git clone https://github.com/govm-project/govm" echo "ERROR: cd govm" echo "ERROR: docker build . -t govm/govm:latest" echo "ERROR: cd .." echo "ERROR:" return 1 } if [ ! -e "$SSH_KEY".pub ]; then echo "ERROR:" echo "ERROR: environment check failed:" echo "ERROR: $SSH_KEY.pub SSH public key not found (but govm needs it)." echo "ERROR:" echo "ERROR: You can generate it using the following command:" echo "ERROR:" echo "ERROR: ssh-keygen" echo "ERROR:" return 1 fi if [ -n "$SSH_AUTH_SOCK" ] && [ -e "$SSH_AUTH_SOCK" ]; then if ! ssh-add -l | grep -q "$(ssh-keygen -l -f "$SSH_KEY" < /dev/null 2>/dev/null | awk '{print $2}')"; then if ! ssh-add "$SSH_KEY" < /dev/null; then echo "ERROR:" echo "ERROR: environment setup failed:" echo "ERROR: Failed to load $SSH_KEY SSH key to agent." echo "ERROR:" echo "ERROR: Please make sure an SSH agent is running, then" echo "ERROR: try loading the key using the following command:" echo "ERROR:" echo "ERROR: ssh-add $SSH_KEY" echo "ERROR:" return 1 fi fi else if host-is-encrypted-ssh-key "$SSH_KEY"; then echo "ERROR:" echo "ERROR: environment setup failed:" echo "ERROR: $SSH_KEY SSH key is encrypted, but agent is not running." echo "ERROR:" echo "ERROR: Please make sure an SSH agent is running, then" echo "ERROR: try loading the key using the following command:" echo "ERROR:" echo "ERROR: ssh-add $SSH_KEY" echo "ERROR:" return 1 fi fi } vm-check-running-binary() { local bin_file="$1" local bin_name bin_name="$(basename "$bin_file")" pid_of_bin="$(vm-command-q "pidof $bin_name")" if [ -f "$bin_file" ] && [ -n "$pid_of_bin" ] && [ "$(vm-command-q "md5sum < /proc/$pid_of_bin/exe")" != "$(md5sum < "$bin_file")" ]; then echo "WARNING:" echo "WARNING: Running $bin_name binary is different from" echo "WARNING: $bin_file" echo "WARNING: Consider restarting with reinstall_${bin_name//-/_}=1." echo "WARNING:" sleep "${warning_delay:-0}" return 1 fi return 0 } vm-check-source-files-changed() { local bin_change local src_change local src_dir="$1" local bin_file="$2" bin_change=$(stat --format "%Z" "$bin_file") src_change=$(find "$src_dir" -name '*.go' -type f -print0 | xargs -0 stat --format "%Z" | sort -n | tail -n 1) if [[ "$src_change" > "$bin_change" ]]; then echo "WARNING:" echo "WARNING: Source files changed, outdated binaries in" echo "WARNING: $(dirname "$bin_file")/" echo "WARNING:" sleep "${warning_delay:-0}" fi } vm-command() { # script API # Usage: vm-command COMMAND # # Execute COMMAND on virtual machine as root. # Returns the exit status of the execution. # Environment variable COMMAND_OUTPUT contains what COMMAND printed # in standard output and error. 
# # Examples: # vm-command "kubectl get pods" # vm-command "whoami | grep myuser" || command-error "user is not myuser" command-start "vm" "$VM_PROMPT" "$1" if [ "$2" == "bg" ]; then ( $SSH "${VM_SSH_USER}@${VM_IP}" sudo bash -l <<<"$COMMAND" 2>&1 | command-handle-output ; command-end "${PIPESTATUS[0]}" ) & command-runs-in-bg else $SSH "${VM_SSH_USER}@${VM_IP}" sudo bash -l <<<"$COMMAND" 2>&1 | command-handle-output ; command-end "${PIPESTATUS[0]}" fi return "$COMMAND_STATUS" } vm-command-q() { $SSH "${VM_SSH_USER}@${VM_IP}" sudo bash -l <<<"$1" } vm-ssh-user-ip() { # Usage: vm-ssh-user-ip NODE # # Print canonical USER@HOST for NODE. NODE can be a govm vm name # or already of the form: USER@HOST. local NODE="$1" local node_ssh_user="" local node_ssh_ip="" if [[ "$NODE" == *"@"* ]]; then node_ssh_ip=${NODE/*@} node_ssh_user=${NODE%@*} else node_ssh_ip=$(${GOVM} ls | awk "/$NODE/{print \$4}") node_ssh_user=$( host-get-vm-config $NODE && echo $VM_SSH_USER ) fi if [ -z "$node_ssh_ip" ]; then error "cannot find IP address for NODE=$NODE" fi if [ -z "$node_ssh_user" ]; then error "cannot find ssh user for NODE=$NODE" fi echo "${node_ssh_user}@${node_ssh_ip}" } vm-join() { # Usage: vm-join MASTER_NODE # # Join vm to the cluster whose master node is MASTER_NODE." # MASTER_NODE is a name of a govm virtual machine, or # "USER@HOST" that can be logged into using ssh. local MASTER_NODE="$1" local master_user_ip local k8s_join_cmd k8s_join_cmd="$(vm-join-cmd "$MASTER_NODE")" vm-command "$k8s_join_cmd" || { command-error "joining to the cluster master ($MASTER_NODE) failed" } # Enable using kubectl on the worker vm by # copying k8s admin configuration on it. master_user_ip="$(vm-ssh-user-ip $MASTER_NODE)" ssh "$master_user_ip" "sudo cat /etc/kubernetes/admin.conf" | vm-pipe-to-file "/root/.kube/config" } vm-join-cmd() { # Usage: vm-join-cmd MASTER_NODE # # Print a join command to join VM to existing cluster MASTER_NODE. # MASTER_NODE is a name of a govm virtual machine (exists in "govm ls") # or USERNAME@IP. local MASTER_NODE="$1" local master_user_ip local k8s_join_cmd="" master_user_ip="$(vm-ssh-user-ip $MASTER_NODE)" local ssh_get_join_cmd="ssh $master_user_ip sudo kubeadm token create --print-join-command" k8s_join_cmd="$( $ssh_get_join_cmd )" if [[ "$k8s_join_cmd" != *" join "* ]]; then error "failed to get kubeadm join command: $k8s_join_cmd" fi echo $k8s_join_cmd } vm-mem-hotplug() { # script API # Usage: vm-mem-hotplug MEMORY # # Hotplug currently unplugged MEMORY to VM. # Find unplugged memory with "vm-mem-hw | grep unplugged". # # Examples: # vm-mem-hotplug mem2 local memmatch memline memid memdimm memnode memdriver memmatch=$1 if [ -z "$memmatch" ]; then error "missing MEMORY" return 1 fi memline="$(vm-mem-hw | grep unplugged | grep "$memmatch")" if [ -z "$memline" ]; then error "unplugged memory matching '$memmatch' not found" return 1 fi memid="$(awk '{print $1}' <<< "$memline")" memid=${memid#mem} memid=${memid%[: ]*} memdimm="$(awk '{print $2}' <<< "$memline")" memnode="$(awk '{print $4}' <<< "$memline")" memnode=${memnode#node} if [ "$memdimm" == "nvdimm" ]; then memdriver="nvdimm" else memdriver="pc-dimm" fi vm-monitor "device_add ${memdriver},id=${memdimm}${memid},memdev=mem${memdimm}_${memid}_node_${memnode},node=${memnode}" } vm-mem-hotremove() { # script API # Usage: vm-mem-hotremove MEMORY # # Hotremove currently plugged MEMORY from VM. # Find plugged memory with "vm-mem-hw | grep ' plugged'". 
# # Examples: # vm-mem-hotremove mem2 local memmatch memline memid memdimm memnode memdriver memmatch=$1 if [ -z "$memmatch" ]; then error "missing MEMORY" return 1 fi memline="$(vm-mem-hw | grep ' plugged' | grep "$memmatch")" if [ -z "$memline" ]; then error "plugged memory matching '$memmatch' not found" return 1 fi memid="$(awk '{print $1}' <<< "$memline")" memid=${memid#mem} memid=${memid%[: ]*} memdimm="$(awk '{print $2}' <<< "$memline")" vm-monitor "device_del ${memdimm}${memid}" }
vm-mem-hw() { # script API # Usage: vm-mem-hw # # List VM memory hardware with current status. # See also: vm-mem-hotplug, vm-mem-hotremove vm-monitor "$(echo info memdev; echo info memory-devices)" | awk ' /memdev: /{ split($2,a,"_"); state[a[2]]="plugged "; } /memory backend: membuiltin/{ split($3,a,"_"); backend=1; type[a[2]]="ram "; state[a[2]]="builtin "; node[a[2]]=a[4]; } /memory backend: memnvbuiltin/{ split($3,a,"_"); backend=1; type[a[2]]="nvram "; state[a[2]]="builtin "; node[a[2]]=a[4]; } /memory backend: memnvdimm/{ split($3,a,"_"); backend=1; type[a[2]]="nvdimm "; state[a[2]]="unplugged"; node[a[2]]=a[4]; } /memory backend: memdimm/{ split($3,a,"_"); backend=1; type[a[2]]="dimm "; state[a[2]]="unplugged"; node[a[2]]=a[4]; } /size: /{sz=$2/1024/1024; if (backend==1) {size[a[2]]=sz;backend=0;}} END{ for (m in node) print "mem"m": "type[m]" "state[m]" node"node[m]" size="size[m]"M"; }' }
vm-monitor() { # script API # Usage: vm-monitor COMMAND # # Execute COMMAND on Qemu monitor. # # Example: VM monitor help: # vm-monitor "help" | less # # Example: print memdev objects and plugged in memory devices: # vm-monitor "info memdev" # vm-monitor "info memory-devices" # # Example: hot plug a NVDIMM to NUMA node 1 when launched with topology # topology='[{"cores":2,"mem":"2G"},{"nvmem":"4G","dimm":"unplugged"}]': # vm-monitor "device_add pc-dimm,id=nvdimm0,memdev=nvmem0,node=1" [ -n "$VM_MONITOR" ] || error "VM is not running" eval "$VM_MONITOR" <<< "$1" | sed 's/\r//g' if [ "${PIPESTATUS[0]}" != "0" ]; then error "sending command to Qemu monitor failed" fi echo "" }
vm-wait-process() { # script API # Usage: vm-wait-process [--timeout TIMEOUT] [--pidfile PIDFILE] PROCESS # # Wait for a PROCESS (string) to appear in process list (pidof output). # If pidfile parameter is given, we also check that the process has that file open. # The default TIMEOUT is 30 seconds. local process timeout pidfile invalid timeout=30 while [ "${1#-}" != "$1" ] && [ -n "$1" ]; do case "$1" in --timeout) timeout="$2" shift 2 ;; --pidfile) pidfile="$2" shift 2 ;; *) invalid="${invalid}${invalid:+,}\"$1\"" shift ;; esac done if [ -n "$invalid" ]; then error "invalid options: $invalid" return 1 fi process="$1" vm-run-until --timeout "$timeout" "pidof \"$process\" > /dev/null" || error "timeout while waiting $process" # As we first wait for the process, and then wait for the pidfile (if enabled) # we might wait longer than expected. Accept that anomaly atm. if [ ! -z "$pidfile" ]; then vm-run-until --timeout $timeout "[ ! -z \"\$(fuser $pidfile 2>/dev/null)\" ]" || error "timeout while waiting $pidfile" vm-run-until --timeout $timeout "[ \$(fuser $pidfile 2>/dev/null) -eq \$(pidof $process) ]" || error "timeout while waiting $process and $pidfile" fi }
vm-run-until() { # script API # Usage: vm-run-until [--timeout TIMEOUT] CMD # # Keep running CMD (string) until it exits successfully. # The default TIMEOUT is 30 seconds.
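    #
    # Example (as used elsewhere in this library):
    #   vm-run-until --timeout 30 "kubectl get sa default > /dev/null"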
local cmd timeout invalid timeout=30 while [ "${1#-}" != "$1" ] && [ -n "$1" ]; do case "$1" in --timeout) timeout="$2" shift; shift ;; *) invalid="${invalid}${invalid:+,}\"$1\"" shift ;; esac done if [ -n "$invalid" ]; then error "invalid options: $invalid" return 1 fi cmd="$1" if ! vm-command-q "retry=$timeout; until $cmd; do retry=\$(( \$retry - 1 )); [ \"\$retry\" == \"0\" ] && exit 1; sleep 1; done"; then error "waiting for command \"$cmd\" to exit successfully timed out after $timeout s" fi } vm-write-file() { local vm_path_file="$1" local file_content_b64 file_content_b64="$(base64 <<<"$2")" vm-command-q "mkdir -p $(dirname "$vm_path_file"); echo -n \"$file_content_b64\" | base64 -d > \"$vm_path_file\"" } vm-put-file() { # script API # Usage: vm-put-file [--cleanup] [--append] SRC-HOST-FILE DST-VM-FILE # # Copy SRC-HOST-FILE to DST-VM-FILE on the VM, removing # SRC-HOST-FILE if called with the --cleanup flag, and # appending instead of copying if the --append flag is # specified. # # Example: # src=$(mktemp) && \ # echo 'Ahoy, Matey...' > $src && \ # vm-put-file --cleanup $src /etc/motd local cleanup append invalid while [ "${1#-}" != "$1" ] && [ -n "$1" ]; do case "$1" in --cleanup) cleanup=1 shift ;; --append) append=1 shift ;; *) invalid="${invalid}${invalid:+,}\"$1\"" shift ;; esac done if [ -n "$cleanup" ] && [ -n "$1" ]; then # shellcheck disable=SC2064 trap "rm -f \"$1\"" RETURN EXIT fi if [ -n "$invalid" ]; then error "invalid options: $invalid" return 1 fi [ "$(dirname "$2")" == "." ] || vm-command-q "[ -d \"$(dirname "$2")\" ]" || vm-command "mkdir -p \"$(dirname "$2")\"" || command-error "cannot create vm-put-file destination directory to VM" host-command "$SCP \"$1\" ${VM_SSH_USER}@${VM_IP}:\"vm-put-file.${1##*/}\"" || command-error "failed to copy file to VM" if [ -z "$append" ]; then vm-command "mv \"vm-put-file.${1##*/}\" \"$2\"" || command-error "failed to rename file" else vm-command "touch \"$2\" && cat \"vm-put-file.${1##*/}\" >> \"$2\" && rm -f \"vm-put-file.${1##*/}\"" || command-error "failed to append file" fi } vm-put-pkg() { # script API # Usage: vm-put-pkg [--force] HOST-FILE... # # Copies HOST-FILEs from host to vm and installs them. # # Examples: # vm-put-pkg /tmp/kernel.rpm /tmp/myutil.rpm local host_pkg local vm_pkgs="" local force="" if [ "$1" == "--force" ]; then force="--force " shift fi for host_pkg in "$@"; do local vm_pkg="pkgs/$(basename "$host_pkg")" vm-command-q "mkdir -p $(dirname "$vm_pkg")" vm-put-file "$host_pkg" "$vm_pkg" vm_pkgs="$vm_pkgs $vm_pkg" done distro-install-pkg-local $force "$vm_pkgs" } vm-put-docker-image() { # script API # Usage: vm-put-docker-image IMAGE # # Exports IMAGE from docker images on the host, and # imports it in the "k8s.io" namespace (visible # for kubernetes containers) on the vm. # # Works with containerd only. # # Examples: # vm-put-docker-image busybox:latest local image_name="$1" local image_file_on_vm="images/${image_name//:/__}" vm-command-q "mkdir -p $(dirname "$image_file_on_vm")" docker save "$image_name" | vm-pipe-to-file "$image_file_on_vm" || error "failed to save and pipe image '$image_name'" vm-cri-import-image "$image_name" "$image_file_on_vm" } vm-pipe-to-file() { # script API # Usage: vm-pipe-to-file [--append] DST-VM-FILE # # Reads stdin and writes the content to DST-VM-FILE, creating any # intermediate directories necessary. # # Example: # echo 'Ahoy, Matey...' 
| vm-pipe-to-file /etc/motd local tmp append tmp="$(mktemp vm-pipe-to-file.XXXXXX)" if [ "$1" = "--append" ]; then append="--append" shift fi cat > "$tmp" vm-put-file --cleanup $append "$tmp" "$1" } vm-sed-file() { # script API # Usage: vm-sed-file PATH-IN-VM SED-EXTENDED-REGEXP-COMMANDS # # Edits the given file in place with the given extended regexp # sed commands. # # Example: # vm-sed-file /etc/motd 's/Matey/Guybrush Threepwood/' local file="$1" cmd shift for cmd in "$@"; do vm-command "sed -E -i \"$cmd\" $file" || command-error "failed to edit $file with sed" done } vm-set-kernel-cmdline() { # script API # Usage: vm-set-kernel-cmdline E2E-DEFAULTS # # Adds/replaces E2E-DEFAULTS to kernel command line" # # Example: # vm-set-kernel-cmdline nr_cpus=4 # vm-reboot # vm-command "cat /proc/cmdline" # launch cri-resmgr distro-set-kernel-cmdline "$@" } vm-reboot() { # script API # Usage: vm-reboot # # Reboots the virtual machine and waits that the ssh server starts # responding again. vm-command "reboot" sleep 10 if ! host-wait-vm-ssh-server; then vm-monitor system_reset host-wait-vm-ssh-server fi } vm-setup-proxies() { distro-setup-proxies } vm-networking() { vm-command-q "touch /etc/hosts; grep -q \$(hostname) /etc/hosts" || { vm-command "echo \"$VM_IP \$(hostname)\" >>/etc/hosts" } vm-setup-proxies } vm-install-cri-resmgr() { prefix=/usr/local # shellcheck disable=SC2154 if [ "$binsrc" == "github" ]; then vm-install-golang vm-install-pkg make vm-command "go get -d -v github.com/intel/cri-resource-manager" CRI_RESMGR_SOURCE_DIR=$(awk '/package.*cri-resource-manager/{print $NF}' <<< "$COMMAND_OUTPUT") vm-command "cd $CRI_RESMGR_SOURCE_DIR && make install && cd -" elif [ "${binsrc#packages/}" != "$binsrc" ]; then suf=$(vm-pkg-type) vm-command "rm -f *.$suf" local pkg_count # shellcheck disable=SC2010,SC2126 pkg_count="$(ls "$HOST_PROJECT_DIR/$binsrc"/cri-resource-manager*."$suf" | grep -v dbg | wc -l)" if [ "$pkg_count" == "0" ]; then error "installing from $binsrc failed: cannot find cri-resource-manager_*.$suf from $HOST_PROJECT_DIR/$binsrc" elif [[ "$pkg_count" -gt 1 ]]; then error "installing from $binsrc failed: expected exactly one cri-resource-manager*.$suf in $HOST_PROJECT_DIR/$binsrc, found $pkg_count alternatives." 
fi vm-command "mkdir -p /etc/cri-resmgr && touch /etc/cri-resmgr/fallback.cfg" host-command "$SCP $HOST_PROJECT_DIR/$binsrc/*.$suf $VM_SSH_USER@$VM_IP:/tmp" || { command-error "copying *.$suf to vm failed, run \"make cross-$suf\" first" } vm-install-pkg "/tmp/cri-resource-manager*.$suf" || { command-error "installing packages failed" } vm-command "systemctl daemon-reload" elif [ -z "$binsrc" ] || [ "$binsrc" == "local" ]; then vm-put-file "$BIN_DIR/cri-resmgr" "$prefix/bin/cri-resmgr" vm-put-file "$BIN_DIR/cri-resmgr-agent" "$prefix/bin/cri-resmgr-agent" sed -E -e "s:__DEFAULTDIR__:$(distro-env-file-dir):g" \ -E -e "s:__BINDIR__:$prefix/bin:g" < "$HOST_PROJECT_DIR/cmd/cri-resmgr/cri-resource-manager.service.in" | vm-pipe-to-file /usr/lib/systemd/system/cri-resource-manager.service cat < "$bin_change" ]]; then echo "WARNING:" echo "WARNING: Source files changed - installing possibly outdated binaries from" echo "WARNING: $BIN_DIR/" echo "WARNING:" sleep "${warning_delay:-0}" fi vm-put-file "$BIN_DIR/cri-resmgr-agent" "$prefix/bin/cri-resmgr-agent" } vm-cri-import-image() { local image_name="$1" local image_tar="$2" case "$VM_CRI" in containerd) vm-command "ctr -n k8s.io images import '$image_tar'" || command-error "failed to import \"$image_tar\" on VM" ;; *) error "vm-cri-import-image unsupported container runtime: \"$VM_CRI\"" esac } vm-install-cri-resmgr-webhook() { local service=cri-resmgr-webhook local namespace=cri-resmgr vm-command-q "\ kubectl delete secret -n ${namespace} cri-resmgr-webhook-secret 2>/dev/null; \ kubectl delete csr ${service}.${namespace} 2>/dev/null; \ kubectl delete -f webhook/mutating-webhook-config.yaml 2>/dev/null; \ kubectl delete -f webhook/webhook-deployment.yaml 2>/dev/null; \ " local webhook_image_info webhook_image_id webhook_image_repotag webhook_image_tar webhook_image_info="$(docker images --filter=reference=cri-resmgr-webhook --format '{{.ID}} {{.Repository}}:{{.Tag}} (created {{.CreatedSince}}, {{.CreatedAt}})' | head -n 1)" if [ -z "$webhook_image_info" ]; then error "cannot find cri-resmgr-webhook image on host, run \"make images\" and check \"docker images --filter=reference=cri-resmgr-webhook\"" fi echo "installing webhook to VM from image: $webhook_image_info" sleep 2 webhook_image_id="$(awk '{print $1}' <<< "$webhook_image_info")" webhook_image_repotag="$(awk '{print $2}' <<< "$webhook_image_info")" webhook_image_tar="$(realpath "$OUTPUT_DIR/webhook-image-$webhook_image_id.tar")" # It is better to export (save) the image with image_repotag rather than image_id # because otherwise manifest.json RepoTags will be null and containerd will # remove the image immediately after impoting it as part of garbage collection. 
docker image save "$webhook_image_repotag" > "$webhook_image_tar" vm-put-file "$webhook_image_tar" "webhook/$(basename "$webhook_image_tar")" || { command-error "copying webhook image to VM failed" } vm-cri-import-image cri-resmgr-webhook "webhook/$(basename "$webhook_image_tar")" # Create a self-signed certificate with SANs vm-command "openssl req -x509 -newkey rsa:2048 -sha256 -days 365 -nodes -keyout webhook/server-key.pem -out webhook/server-crt.pem -subj '/CN=${service}.${namespace}.svc' -addext 'subjectAltName=DNS:${service},DNS:${service}.${namespace},DNS:${service}.${namespace}.svc'" || command-error "creating self-signed certificate failed, requires openssl >= 1.1.1" # Allow webhook to run on node tainted by cmk=true sed -e "s|IMAGE_PLACEHOLDER|$webhook_image_repotag|" \ -e 's|^\(\s*\)tolerations:$|\1tolerations:\n\1 - {"key": "cmk", "operator": "Equal", "value": "true", "effect": "NoSchedule"}|g' \ -e 's/imagePullPolicy: Always/imagePullPolicy: Never/' \ < "${HOST_PROJECT_DIR}/cmd/cri-resmgr-webhook/webhook-deployment.yaml" \ | vm-pipe-to-file webhook/webhook-deployment.yaml # Create secret that contains svc.crt and svc.key for webhook deployment local server_crt_b64 server_key_b64 server_crt_b64="$(vm-command-q "cat webhook/server-crt.pem" | base64 -w 0)" server_key_b64="$(vm-command-q "cat webhook/server-key.pem" | base64 -w 0)" cat </dev/null" || { error "required command '$util' missing on VM, fix/implement $distro-install-utils()" } done }
vm-install-golang() { distro-install-golang }
vm-install-runc() { local host_runc="$runc_src/runc" local vm_runc="/usr/sbin/runc" if [ -n "$runc_src" ]; then # Check if runc is already installed on VM. # If it is, replace existing binary with local build. vm-command 'command -v runc' if [ -n "$COMMAND_OUTPUT" ] && [ "x$COMMAND_STATUS" == "x0" ]; then vm_runc="$COMMAND_OUTPUT" fi vm-put-file "$host_runc" "$vm_runc" else distro-install-runc fi }
vm-install-cri() { local vm_cri_dir="/usr/bin" distro-install-"$VM_CRI" distro-config-"$VM_CRI" if [ "$VM_CRI" == "containerd" ]; then if [ -n "$containerd_src" ]; then vm-command "systemctl stop containerd" vm-command 'command -v containerd' if [ -n "$COMMAND_OUTPUT" ] && [ "x$COMMAND_STATUS" == "x0" ]; then vm_cri_dir="${COMMAND_OUTPUT%/*}" fi for f in ctr containerd containerd-stress containerd-shim containerd-shim-runc-v1 containerd-shim-runc-v2; do vm-put-file "$containerd_src/bin/$f" "$vm_cri_dir/$f" done vm-command "mkdir -p /etc/containerd; containerd config default | sed -e 's/SystemdCgroup = false/SystemdCgroup = true/g' > /etc/containerd/config.toml" vm-command "systemctl enable --now containerd" fi elif [ "$VM_CRI" == "crio" ]; then if [ -n "$crio_src" ]; then vm-command "systemctl stop crio" vm-command 'command -v crio' if [ -n "$COMMAND_OUTPUT" ] && [ "x$COMMAND_STATUS" == "x0" ]; then vm_cri_dir="${COMMAND_OUTPUT%/*}" fi for f in crio crio-status pinns; do vm-put-file "$crio_src/bin/$f" "$vm_cri_dir/$f" done vm-command "systemctl enable --now crio" fi fi }
vm-install-containernetworking() { vm-install-golang vm-command "GO111MODULE=off go get -d github.com/containernetworking/plugins" CNI_PLUGINS_SOURCE_DIR="$(awk '/package.*plugins/{print $NF}' <<< "$COMMAND_OUTPUT")" [ -n "$CNI_PLUGINS_SOURCE_DIR" ] || { command-error "downloading containernetworking plugins failed" } vm-command "pushd \"$CNI_PLUGINS_SOURCE_DIR\" && ./build_linux.sh && mkdir -p /opt/cni && cp -rv bin /opt/cni && popd" || { command-error "building and installing containernetworking plugins failed" } vm-command "rm -rf /etc/cni/net.d
&& mkdir -p /etc/cni/net.d && cat > /etc/cni/net.d/10-bridge.conf < /etc/cni/net.d/20-portmap.conf < /etc/cni/net.d/99-loopback.conf < \"\$HOME/.config/dlv/config.yml.d/00-substitute-path\"" } vm-install-glibc() { # script API # Usage: vm-install-glibc [VERSION] # # If glibc_src=/host/path/to/glibc is set, install a glibc that is # built and installed on host using configure --prefix $glibc_src. # If glibc_src is not set, download, build and install a glibc on vm. # In both cases glibc is installed to /opt/glibc/VERSION on vm. # # vm-set-glibc wraps selected binaries to use an installed glibc. # # Example: install a glibc from host and use it with two binaries. # glibc_src=/host/glibc/install/prefix vm-install-glibc host-2.34 # vm-set-glibc host-2.34 /usr/bin/containerd /usr/local/bin/cri-resmgr # # Example: download, build and install glibc 2.32 on vm: # vm-install-glibc 2.32 # vm-set-glibc 2.32 /usr/bin/containerd /usr/local/bin/cri-resmgr local glibc_ver="${1:-host}" local vm_glibc_dir="/opt/glibc/${glibc_ver}" if [ -n "$glibc_src" ] && [ -d "$glibc_src" ]; then vm-command "mkdir -p $vm_glibc_dir" ( cd "$glibc_src" && tar cz . ) | vm-pipe-to-file "$vm_glibc_dir/glibc-$glibc_ver.tar.gz" || error "failed to package glibc from '$glibc_src'" vm-command "cd $vm_glibc_dir && tar xf glibc-$glibc_ver.tar.gz && rm -f glibc-$glibc_ver.tar.gz" || command-error "failed to extract glibc-$glibc_ver.tar.gz" return 0 fi if [[ "$glibc_ver" == "host"* ]]; then error "vm-install-glibc: invalid glibc_src='$glibc_src' when installing glibc from host" fi local vm_glibc_src="$vm_glibc_dir/src/glibc-${glibc_ver}" local vm_glibc_build="$vm_glibc_dir/src/build" local vm_glibc_install="$vm_glibc_dir" vm-install-pkg make bison flex gcc vm-command "mkdir -p $vm_glibc_src; cd $vm_glibc_src; curl -L --remote-name-all https://ftp.gnu.org/gnu/glibc/glibc-${glibc_ver}.tar.gz" || command-error "failed to download glibc" vm-command "mkdir -p $vm_glibc_src; cd $vm_glibc_src/..; tar xzf $vm_glibc_src/glibc-${glibc_ver}.tar.gz" || command-error "failed to extract glibc" vm-command "mkdir -p $vm_glibc_build; cd $vm_glibc_build && $vm_glibc_src/configure --prefix=$vm_glibc_install" || command-error "failed to configure glibc" vm-command "cd $vm_glibc_build && make -j 4 >make.output.txt 2>&1 || ( tail make.output.txt; exit 1 )" || command-error "failed to build glibc, see $vm_glibc_build/make.output.txt" vm-command "cd $vm_glibc_build && make install" || command-error "failed to install glibc" } vm-set-glibc() { # script API # Usage: vm-set-glibc VERSION BIN [BIN...] # # Wrap binaries to use glibc VERSION. # # Note glibc VERSION must be installed first. # See vm-install-glibc. 
local glibc_ver="$1" local vm_glibc_dir="/opt/glibc/${glibc_ver}" local vm_glibc_install="$vm_glibc_dir" local vm_glibc_ld="$vm_glibc_install/lib/ld-linux-x86-64.so.2" shift if [ -z "$glibc_ver" ]; then error "vm-set-glibc: missing glibc version to switch to" fi vm-command "[ -x $vm_glibc_ld ]" || command-error "cannot find loader $vm_glibc_ld" local vm_bin for vm_bin in "$@"; do vm-command "[ -x $vm_bin ]" || command-error "cannot find binary to be wrapped: $vm_bin" vm-command "( [ \"\$(dd bs=1 count=3 skip=1 if=$vm_bin)\" == \"ELF\" ] && mv $vm_bin ${vm_bin}.bin ) || [ -f $vm_bin.bin ]" || command-error "failed to rename binary" vm-pipe-to-file "$vm_bin" < \"\$HOME/.config/dlv/config.yml.d/01-$(basename "$host_src_dir")\"" vm-dlv-update-config }
vm-dlv-update-config() { vm-command "( echo 'substitute-path:'; cat \$HOME/.config/dlv/config.yml.d/* ) > \$HOME/.config/dlv/config.yml" }
vm-install-k8s() { distro-install-k8s distro-restart-$VM_CRI }
vm-install-minikube() { vm-install-containernetworking distro-install-cri-dockerd distro-install-minikube }
vm-create-minikube-cluster() { vm-command "sysctl fs.protected_regular=0; minikube start --driver=none --alsologtostderr=true" }
vm-create-singlenode-cluster() { if ! [ "$(type -t vm-install-cni-$(distro-k8s-cni))" == "function" ]; then error "invalid CNI: $(distro-k8s-cni)" fi vm-create-cluster vm-command "kubectl taint nodes --all node-role.kubernetes.io/control-plane-" vm-command "kubectl taint nodes --all node-role.kubernetes.io/master-" vm-install-cni-"$(distro-k8s-cni)" if ! vm-command "kubectl wait --for=condition=Ready node/\$(hostname) --timeout=240s"; then command-error "kubectl waiting for node readiness timed out" fi vm-run-until --timeout 30 "kubectl get sa default > /dev/null" || error "serviceaccount 'default' not found" }
vm-create-cluster() { vm-command "kubeadm init --pod-network-cidr=$CNI_SUBNET --cri-socket ${k8scri_sock}" if ! grep -q "initialized successfully" <<< "$COMMAND_OUTPUT"; then command-error "kubeadm init failed" fi user="$(vm-ssh-user)" vm-command "mkdir -p ~$user/.kube" vm-command "cp /etc/kubernetes/admin.conf ~$user/.kube/config" vm-command "chown -R $user:$user ~$user/.kube" vm-command "mkdir -p ~root/.kube" vm-command "cp /etc/kubernetes/admin.conf ~root/.kube/config" }
vm-destroy-cluster() { user="$(vm-ssh-user)" vm-command "yes | kubeadm reset; rm -f ~$user/.kube/config ~root/.kube/config /etc/kubernetes" }
vm-install-cni-bridge() { vm-command "rm -rf /etc/cni/net.d/* && mkdir -p /etc/cni/net.d && cat > /etc/cni/net.d/10-bridge.conf <
================================================ FILE: docs/_templates/layout.html ================================================
GitHub Pages {{ versions_menu_this_version }}
[HTML markup stripped in extraction; the template renders a versions menu with a "{{ _('Versions') }}" list and an "all releases" link]
{% endif %} {% endblock %} ================================================ FILE: docs/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # import os # import sys # sys.path.insert(0, os.path.abspath('.')) from docutils import nodes from os.path import isdir, isfile, join, basename, dirname from os import makedirs, getenv from shutil import copyfile from subprocess import run, STDOUT # -- Project information ----------------------------------------------------- project = 'CRI Resource Manager' copyright = '2020, various' author = 'various' master_doc = 'docs/index' ############################################################################## # # This section determines the behavior of links to local items in .md files. # # if useGitHubURL == True: # # links to local files and directories will be turned into github URLs # using either the baseBranch defined here or using the commit SHA. # # if useGitHubURL == False: # # local files will be moved to the website directory structure when built # local directories will still be links to github URLs # # if built with GitHub workflows: # # the GitHub URLs will use the commit SHA (GITHUB_SHA environment variable # is defined by GitHub workflows) to link to the specific commit. # ############################################################################## baseBranch = "master" useGitHubURL = True commitSHA = getenv('GITHUB_SHA') githubServerURL = getenv('GITHUB_SERVER_URL') githubRepository = getenv('GITHUB_REPOSITORY') if githubServerURL and githubRepository: githubBaseURL = join(githubServerURL, githubRepository) else: githubBaseURL = "https://github.com/intel/cri-resource-manager/" githubFileURL = join(githubBaseURL, "blob/") githubDirURL = join(githubBaseURL, "tree/") if commitSHA: githubFileURL = join(githubFileURL, commitSHA) githubDirURL = join(githubDirURL, commitSHA) else: githubFileURL = join(githubFileURL, baseBranch) githubDirURL = join(githubDirURL, baseBranch) # Version displayed in the upper left corner of the site ref = getenv('GITHUB_REF', default="") if ref == "refs/heads/master": version = "devel" elif ref.startswith("refs/heads/release-"): # For release branches just show the latest tag name buildVersion = getenv("BUILD_VERSION", default="unknown") version = buildVersion.split('-')[0] elif ref.startswith("refs/tags/"): version = ref[len("refs/tags/"):] else: version = getenv("BUILD_VERSION", default="unknown") release = getenv("BUILD_VERSION", default="unknown") # Versions to show in the version menu if getenv('VERSIONS_MENU'): html_context = { 'versions_menu': True, 'versions_menu_this_version': getenv('VERSIONS_MENU_THIS_VERSION', version)} # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = ['myst_parser', 'sphinx_markdown_tables'] myst_enable_extensions = ['substitution'] source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'}
# Substitution variables
def module_version(module, version): version=version.split('-', 1)[0] if module == 'github.com/intel/goresctrl': version = '.'.join(version.split('.')[0:2]) + '.0' return version
def gomod_versions(modules): versions = {} gocmd = run(['go', 'list', '-m', '-f', '{{.GoVersion}}'], check=True, capture_output=True, universal_newlines=True) versions['golang'] = gocmd.stdout.strip() for m in modules: gocmd = run(['go', 'list', '-m', '-f', '{{.Version}}', '%s' % m], check=True, capture_output=True, universal_newlines=True) versions[m] = module_version(m, gocmd.stdout.strip()) return versions
mod_versions = gomod_versions(['github.com/intel/goresctrl']) myst_substitutions = { 'golang_version': mod_versions['golang'], 'goresctrl_version': mod_versions['github.com/intel/goresctrl'] } myst_heading_anchors = 3
# Add any paths that contain templates here, relative to this directory. templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', '.github', '_work', 'generate', 'README.md', 'SECURITY.md', 'docs/releases']
# -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' html_theme_options = { 'display_version': True, }
# Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". #html_static_path = ['_static']
def setup(app): app.connect('doctree-resolved',fixLocalMDAnchors) app.connect('missing-reference',fixRSTLinkInMD)
############################################################################### # # This section defines callbacks that make markdown specific tweaks to # either: # # 1. Fix something that recommonmark does wrong. # 2. Provide support for .md files that are written as READMEs in a GitHub # repo. # # Only use these changes if using the extension ``recommonmark``. # ###############################################################################
def isHTTPLink(uri): return uri.startswith('http://') or uri.startswith('https://')
def isMDFileLink(uri): return uri.endswith('.md') or '.md#' in uri
def isRSTFileLink(uri): return uri.endswith('.rst')
# Callback registered with 'missing-reference'.
def fixRSTLinkInMD(app, env, node, contnode): refTarget = node.get('reftarget') if isHTTPLink(refTarget): return if isRSTFileLink(refTarget) and not isHTTPLink(refTarget): # This occurs when a .rst file is referenced from a .md file # Currently unable to check if file exists as no file # context is provided and links are relative. # # Example: [Application examples](examples/readme.rst) # contnode['refuri'] = contnode['refuri'].replace('.rst','.html') contnode['internal'] = "True" return contnode elif refTarget.startswith("/"): # This occurs when a file is referenced for download from an .md file. # Construct a list of them and short-circuit the warning. The files # are moved later (need file location context). To avoid warnings when # .md files make these links absolute, this only marks them as fixed # if it can verify that they exist. # # Example: [Makefile](/Makefile) # filePath = refTarget.lstrip("/") if isfile(filePath) or isdir(filePath): return contnode
def normalizePath(docPath,uriPath): if uriPath == "": return uriPath if "#" in uriPath: # Strip out anchors uriPath = uriPath.split("#")[0] if uriPath.startswith("/"): # It's an absolute path return uriPath.lstrip("/") #path to file from project directory else: # It's a relative path docDir = dirname(docPath) return join(docDir,uriPath) #path to file from referencing file
# Callback registered with 'doctree-resolved'.
def fixLocalMDAnchors(app, doctree, docname): for node in doctree.traverse(nodes.reference): uri = node.get('refuri') if uri is None: print("fixLocalMDAnchors: skipping anchor with no URI at node: ", node) continue if isHTTPLink(uri): continue filePath = normalizePath(docname,uri) if isfile(filePath): # Only do this if the file exists. # # TODO: Pop a warning if the file doesn't exist. # if isMDFileLink(uri) and not isHTTPLink(uri): # Make sure .md file links that weren't caught are converted. # These occur when creating an explicit link to an .md file # from an .rst file. By default these are not validated by Sphinx # or recommonmark. Only toctree references are validated. recommonmark # also fails to convert links to local Markdown files that include # anchors. This fixes that as well. # # Only include this code if .md files are being converted to html # # Example: `Google Cloud Engine `__ # [configuration options](autotest.md#configuration-options) # node['refuri'] = node['refuri'].replace('.md','.html') else: # Handle the case where markdown is referencing local files in the repo # # Example: [Makefile](/Makefile) # if useGitHubURL: # Replace references to local files with links to the GitHub repo # newURI = join(githubFileURL, filePath) print("new url: ", newURI) node['refuri']=newURI else: # If there are links to local files other than .md (.rst files are caught # when warnings are fired), move the files into the Sphinx project, so # they can be accessed. newFileDir = join(app.outdir,dirname(filePath)) # where to move the file in Sphinx output. newFilePath = join(app.outdir,filePath) newURI = uri # if the path is relative no need to change it. if uri.startswith("/"): # It's an absolute path. Need to make it relative. uri = uri.lstrip("/") docDirDepth = len(docname.split("/")) - 1 newURI = "../"*docDirDepth + uri if not isdir(newFileDir): makedirs(newFileDir) copyfile(filePath,newFilePath) node['refuri'] = newURI elif "#" not in uri: # ignore anchors # turn links to directories into links to the repo if isdir(filePath): newURI = join(githubDirURL, filePath) node['refuri']=newURI
================================================ FILE: docs/contributing.md ================================================

# Contributing

Please use the GitHub\* infrastructure for contributing to CRI Resource Manager. Use [pull requests](https://github.com/intel/cri-resource-manager/pulls) to contribute code, bug fixes, or if you want to discuss your ideas in terms of code. Open [issues](https://github.com/intel/cri-resource-manager/issues) to report bugs, request new features, or if you want to discuss any other topics related to CRI Resource Manager or orchestration resource management in general.
================================================ FILE: docs/demos/blockio.md ================================================

# Block I/O Demo

This demo creates a virtual machine for a single-node Kubernetes\* cluster where container runtime features are extended by `cri-resmgr`. In this setup, `cri-resmgr` is configured with block I/O parameters that throttle the I/O bandwidth of a container that constantly scans system file checksums.

## Prerequisites

Install:
- `docker`
- `govm`

## Run the demo

```
./run.sh play
```

The demo does not delete the virtual machine so that you can experiment with it. You can log in to the virtual machine:

```
$ govm ssh crirm-demo-blockio
```

## Clean up - and run the demo from scratch

In order to run the demo from scratch again, delete the virtual machine:

```
$ govm delete crirm-demo-blockio
```

================================================ FILE: docs/demos/index.rst ================================================

Demos
#####

.. toctree::
   :maxdepth: 1

   blockio.md

================================================ FILE: docs/developers-guide/architecture.md ================================================

# Architecture

## Overview

CRI Resource Manager (CRI-RM) is a pluggable add-on for controlling how much and which resources are assigned to containers in a Kubernetes\* cluster. It's an add-on because you install it in addition to the normal selection of your components. It's pluggable since you inject it on the signaling path between two existing components, with the rest of the cluster unaware of its presence. CRI-RM plugs in between kubelet and CRI, the Kubernetes node agent and the container runtime implementation. CRI-RM intercepts CRI protocol requests from the kubelet, acting as a non-transparent proxy towards the runtime. Proxying by CRI-RM is non-transparent in nature because it usually alters intercepted protocol messages before forwarding them.

CRI-RM keeps track of the states of all containers running on a Kubernetes node. Whenever it intercepts a CRI request that results in changes to the resource allocation of any container (container creation, deletion, or resource assignment update request), CRI-RM runs one of its built-in policy algorithms. This policy makes a decision about how the assignment of resources should be updated and, eventually, the intercepted request is modified according to this decision. The policy can make changes to any container in the system, not just the one associated with the intercepted CRI request. Therefore it does not operate directly on CRI requests. Instead, CRI-RM's internal state tracking cache provides an abstraction for modifying containers and the policy uses this abstraction for recording its decisions.

In addition to policies, CRI-RM has a number of built-in resource controllers. These are used to put policy decisions—in practice pending changes made to containers by a policy—into effect. A special in-band CRI controller is used to control all resources that are controllable via the CRI runtime. This controller handles the practical details of updating the intercepted CRI request and generating any additional unsolicited update requests for other existing containers updated by the policy decision. Additional out-of-band controllers exist to exercise control over resources that the current CRI runtimes are unable to handle.
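To make this decide-then-enforce flow more concrete, the sketch below shows one possible shape of it in Go. All type and function names here are invented for illustration only; they are not CRI-RM's actual internal APIs, which live under `/pkg/cri/resource-manager/`.

```go
package sketch

// Illustrative only: hypothetical names, not CRI-RM's real interfaces.
type Container struct{ ID string }

// Cache tracks container state and remembers which containers have
// pending, not-yet-enforced changes.
type Cache interface {
	PendingContainers() []*Container
}

// Policy records resource decisions for the container (and any other
// affected containers) in the cache.
type Policy interface {
	AllocateResources(cache Cache, c *Container) error
}

// Controller puts pending changes in its own domain into effect.
type Controller interface {
	Enforce(c *Container) error
}

// interceptCreate mimics the request pipeline: decide first, then let
// every controller enforce pending changes on every touched container.
func interceptCreate(cache Cache, pol Policy, ctrls []Controller, c *Container) error {
	if err := pol.AllocateResources(cache, c); err != nil {
		return err
	}
	for _, changed := range cache.PendingContainers() {
		for _, ctrl := range ctrls {
			if err := ctrl.Enforce(changed); err != nil {
				return err
			}
		}
	}
	return nil
}
```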
To tell which containers need to be handed off to various controllers for updating, CRI-RM uses the internal state tracking cache's ability to tell which containers have pending unenforced changes and to which controllers' domain these changes belong.

The CRI controller currently handles CPU and memory resources, including huge pages. The level of control covers per-container CPU sets, CFS parametrization, memory limits, OOM score adjustment, and pinning to memory controllers. The two existing out-of-band controllers, Intel® Resource Director Technology (Intel® RDT) and Block I/O, handle last-level cache and memory bandwidth allocation, and the arbitration of Block I/O bandwidth, respectively.

Many of the details of how CRI-RM operates are configurable. These include, for instance, which policy is active within CRI-RM, configuration of the resource assignment algorithm for the active policy, and configuration for the various resource controllers. Although CRI-RM can be configured using a configuration file present on the node running CRI-RM, the preferred way to configure all CRI-RM instances in a cluster is to use Kubernetes ConfigMaps and the CRI-RM Node Agent.

## Components

### [Node Agent](/pkg/agent/)

The node agent is a component external to CRI-RM itself. All interactions by CRI-RM with the Kubernetes Control Plane go through the node agent, which performs any direct interactions on behalf of CRI-RM.

The node agent communicates with CRI-RM using two gRPC interfaces. The [config interface](/pkg/cri/resource-manager/config/api/v1/) is used to:

- push updated external configuration data to CRI-RM
- push adjustments to container resource assignments to CRI-RM

The [cluster interface](/pkg/agent/api/v1/) implements the necessary low-level plumbing for the agent interface CRI-RM internally exposes for its policies and other components. This interface in turn implements the following (see the sketch after this section):

- updating resource capacity of the node
- getting, setting, or removing labels on the node
- getting, setting, or removing annotations on the node
- getting, setting, or removing taints on the node

The config interface is defined and has its gRPC server running in CRI-RM. The agent acts as a gRPC client for this interface. The low-level cluster interface is defined and has its gRPC server running in the agent, with the [convenience layer](/pkg/cri/resource-manager/agent) defined in CRI-RM. CRI-RM acts as a gRPC client for the low-level plumbing interface.

Additionally, the stock node agent that comes with CRI-RM implements schemes for:

- configuration management for all CRI-RM instances
- management of dynamic adjustments to container resource assignments
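As a rough illustration, the cluster interface operations listed above could be rendered as a Go interface along the following lines. The type and method names here are hypothetical; the real gRPC definitions live under [/pkg/agent/api/v1/](/pkg/agent/api/v1/).

```go
package sketch

// Taint mirrors the key/value/effect triple of a Kubernetes node taint.
type Taint struct {
	Key, Value, Effect string
}

// ClusterInterface is a hypothetical rendering of the low-level plumbing
// operations listed above, not the actual generated gRPC API.
type ClusterInterface interface {
	UpdateNodeCapacity(capacity map[string]string) error
	SetLabels(labels map[string]string) error
	RemoveLabels(keys []string) error
	SetAnnotations(annotations map[string]string) error
	RemoveAnnotations(keys []string) error
	SetTaints(taints []Taint) error
	RemoveTaints(taints []Taint) error
}
```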

### [Resource Manager](/pkg/cri/resource-manager/)

CRI-RM implements a request processing pipeline and an event processing pipeline. The request processing pipeline takes care of proxying CRI requests and responses between CRI clients and the CRI runtime. The event processing pipeline processes a set of other events that are not directly related to or the result of CRI requests. These events are typically internally generated within CRI-RM. They can be the result of changes in the state of some containers or the utilization of a shared system resource, which potentially could warrant an attempt to rebalance the distribution of resources among containers to bring the system closer to an optimal state. Some events can also be generated by policies.

The Resource Manager component of CRI-RM implements the basic control flow of both of these processing pipelines. It passes control to all the necessary sub-components of CRI-RM at the various phases of processing a request or an event. Additionally, it serializes the processing of these, making sure there is at most one (intercepted) request or event being processed at any point in time.

The high-level control flow of the request processing pipeline is as follows:

A. If the request does not need policying, let it bypass the processing pipeline; hand it off for logging, then relay it to the server and the corresponding response back to the client.

B. If the request needs to be intercepted for policying, do the following:

1. Lock the processing pipeline serialization lock.
2. Look up/create cache objects (pod/container) for the request.
3. If the request has no resource allocation consequences, do proxying (step 6).
4. Otherwise, invoke the policy layer for resource allocation:
   - Pass it on to the configured active policy, which will
   - Allocate resources for the container.
   - Update the assignments for the container in the cache.
   - Update any other containers affected by the allocation in the cache.
5. Invoke the controller layer for post-policy processing, which will:
   - Collect controllers with pending changes in their domain of control
   - for each invoke the post-policy processing function corresponding to the request.
   - Clear pending markers for the controllers.
6. Proxy the request:
   - Relay the request to the server.
   - Send update requests for any additional affected containers.
   - Update the cache if/as necessary based on the response.
   - Relay the response back to the client.
7. Release the processing pipeline serialization lock.

The high-level control flow of the event processing pipeline is one of the following, based on the event type:

- For policy-specific events:
  1. Engage the processing pipeline lock.
  2. Call policy event handler.
  3. Invoke the controller layer for post-policy processing (same as step 5 for requests).
  4. Release the pipeline lock.
- For metrics events:
  1. Perform collection/processing/correlation.
  2. Engage the processing pipeline lock.
  3. Update cache objects as/if necessary.
  4. Request rebalancing as/if necessary.
  5. Release pipeline lock.
- For rebalance events:
  1. Engage the processing pipeline lock.
  2. Invoke policy layer for rebalancing.
  3. Invoke the controller layer for post-policy processing (same as step 5 for requests).
  4. Release the pipeline lock.

### [Cache](/pkg/cri/resource-manager/cache/)

The cache is a shared internal storage location within CRI-RM.
It tracks the runtime state of pods and containers known to CRI-RM, as well as the state of CRI-RM itself, including the active configuration and the state of the active policy. The cache is saved to permanent storage in the filesystem and is used to restore the runtime state of CRI-RM across restarts. The cache provides functions for querying and updating the state of pods and containers. This is the mechanism used by the active policy to make resource assignment decisions. The policy simply updates the state of the affected containers in the cache according to the decisions. The cache's ability to associate containers with resource domains and track changes to them is used to enforce policy decisions. The generic controller layer first queries which containers have pending changes, then invokes each controller for each such container. The controllers use the querying functions provided by the cache to decide if anything in their resource/control domain needs to be changed and then act accordingly. Access to the cache needs to be serialized. However, this serialization is not provided by the cache itself. Instead, the cache assumes that its callers ensure proper protection is in place against concurrent read-write access. The request and event processing pipelines in the resource manager use a lock to serialize request and event processing and, consequently, access to the cache. If a policy needs to do processing unsolicited by the resource manager, in other words processing other than handling the internal policy backend API calls from the resource manager, then it should inject a policy event into the resource manager's event loop. This causes a callback from the resource manager to the policy's event handler with the injected event as an argument and with the cache properly locked. ### [Generic Policy Layer](/pkg/cri/resource-manager/policy/policy.go) The generic policy layer defines the abstract interface the rest of CRI-RM uses to interact with policy implementations and takes care of the details of activating and dispatching calls through to the configured active policy. ### [Generic Resource Controller Layer](/pkg/cri/resource-manager/control/control.go) The generic resource controller layer defines the abstract interface the rest of CRI-RM uses to interact with resource controller implementations and takes care of the details of dispatching calls to the controller implementations for post-policy enforcement of decisions. ### [Metrics Collector](/pkg/metrics/) The metrics collector gathers a set of runtime metrics about the containers running on the node. CRI-RM can be configured to periodically evaluate this collected data to determine how optimal the current assignment of container resources is and to attempt a rebalancing/reallocation if it is deemed both possible and necessary. ### [Policy Implementations](/pkg/cri/resource-manager/policy/builtin/) #### [None](/pkg/cri/resource-manager/policy/builtin/none/) An empty policy that makes no policy decisions. It is included merely for the sake of completeness, analogous to the none policy of the CPU Manager in kubelet. #### [Static Pools](/pkg/cri/resource-manager/policy/builtin/static-pools/) A backward-compatible reimplementation of [CMK](https://github.com/intel/CPU-Manager-for-Kubernetes) for CRI-RM with a few extra features. #### [Static](/pkg/cri/resource-manager/policy/builtin/static/) Part of the code from the static policy of the CPU Manager in kubelet, brutally hacked to work within CRI-RM.
Serves merely as a proof-of-concept that the current policies of kubelet can be implemented in CRI-RM. #### [Static Plus](/pkg/cri/resource-manager/policy/builtin/static-plus/) A fairly simplistic policy similar in spirit to the static policy of the CPU Manager in kubelet, with a few extra features. #### [Topology Aware](/pkg/cri/resource-manager/policy/builtin/topology-aware/) A topology-aware policy capable of handling multiple tiers/types of memory, typically a DRAM/PMEM combination configured in 2-layer memory mode. ### [Resource Controller Implementations](/pkg/cri/resource-manager/control/) #### [Intel RDT](/pkg/cri/resource-manager/control/rdt/) A resource controller implementation responsible for the practical details of associating a container with Intel RDT classes. This class effectively determines how much last-level cache and memory bandwidth will be available for the container. This controller uses the resctrl pseudo-filesystem of the Linux kernel for control. #### [Block I/O](/pkg/cri/resource-manager/control/blockio/) A resource controller implementation responsible for the practical details of associating a container with a Block I/O class. This class effectively determines how much Block I/O bandwidth will be available for the container. This controller uses the blkio cgroup controller and the cgroupfs pseudo-filesystem of the Linux kernel for control. #### [CRI](/pkg/cri/resource-manager/control/cri/) A resource controller responsible for modifying intercepted CRI container creation requests and creating CRI container resource update requests, according to the changes the active policy makes to containers. ================================================ FILE: docs/developers-guide/cri-test.md ================================================ # CRI Validation [This test](/test/critest) runs [`critest`](https://github.com/kubernetes-sigs/cri-tools/blob/master/docs/validation.md) from [cri-tools](https://github.com/kubernetes-sigs/cri-tools/) to make sure that various `cri-resmgr` configurations do not break CRI runtime conformance. ## Prerequisites Install: - `docker` - `govm` ## Run the test ``` cd test/critest ./run.sh test ``` ================================================ FILE: docs/developers-guide/e2e-test.md ================================================ # End-to-End tests ## Prerequisites Install: - `docker` - `govm` v0.95 In case of errors in building `govm` with `go get`, or creating a virtual machine (`Error when creating the new VM: repository name must be canonical`), these are the workarounds: ``` git clone https://github.com/govm-project/govm -b 0.95 && cd govm && go install && docker build . -t govm/govm:latest ``` ## Usage Run policy tests: ``` [VAR=VALUE...] ./run_tests.sh policies ``` Run tests only for a certain policy or topology, or run only a selected test: ``` [VAR=VALUE...] ./run_tests.sh policies[/POLICY[/TOPOLOGY[/testNN-*]]] ``` Run custom tests: ``` [VAR=VALUE...] ./run.sh MODE ``` Get help on the available `VAR=VALUE`'s with `./run.sh help`. `run_tests.sh` calls `run.sh` in order to execute the selected tests. Therefore the same `VAR=VALUE` definitions apply to both scripts. ## Test phases In the *setup phase* `run.sh` creates a virtual machine unless it already exists. Once it is running, the tests create a single-node cluster and launch `cri-resmgr` on it, unless they are already running. In the *test phase* `run.sh` runs a test script, or gives a prompt (`run.sh> `) asking a user to run test script commands in the `interactive` mode.
*Test scripts* are `bash` scripts that can use helper functions for running commands and observing the status of the virtual machine and the software running on it. In the *tear down phase* `run.sh` copies logs from the virtual machine and finally stops or deletes the virtual machine, if that is wanted. ## Test modes - `test` mode runs fast and reports `Test verdict: PASS` or `FAIL`. The exit status is zero if and only if the test passed. - `play` mode runs the same phases and scripts as the `test` mode, but slower. This is good for following and demonstrating what is happening. - `interactive` mode runs the setup and tear down phases, but instead of executing a test script it gives an interactive prompt. Print the help to see cleanup, execution speed, and other options for all modes. ## Running from scratch and quick rerun in existing virtual machine The test will use the `govm`-managed virtual machine named in the `vm` environment variable. The default is `crirm-test-e2e`. If a virtual machine with that name exists, the test will be run on it. Otherwise the test will create a virtual machine with that name from scratch. You can delete a virtual machine with `govm delete NAME`. If you want to rerun the test many times, possibly with different test inputs or against different versions of `cri-resmgr`, either use the `play` mode or set `cleanup=0` in order to keep the virtual machine after each run. Then tests will run in the same single-node cluster, and the test script will only delete running pods before launching new ones. ## Testing locally built cri-resmgr and cri-resmgr from github If you make changes to the `cri-resmgr` sources and rebuild it, you can force the test script to reinstall the newly built `cri-resmgr` to an existing virtual machine before rerunning the test: ``` cri-resource-manager$ make cri-resource-manager$ cd test/e2e e2e$ reinstall_cri_resmgr=1 speed=1000 ./run.sh play ``` You can also let the test script build `cri-resmgr` from the github master branch. This takes place inside the virtual machine, so your local git sources will not be affected: ``` e2e$ reinstall_cri_resmgr=1 binsrc=github ./run.sh play ``` ## Custom tests You can run a custom test script in a virtual machine that runs a single-node Kubernetes\* cluster. Example: ``` $ cat > myscript.sh << EOF # create two pods, each requesting two CPUs CPU=2 n=2 create guaranteed # create four pods, no resource requests n=4 create besteffort # show pods kubectl get pods # check that the first two pods are not allowed to use the same CPUs verify 'cpus["pod0c0"].isdisjoint(cpus["pod1c0"])' EOF $ ./run.sh test myscript.sh ``` ## Custom topologies If you change the NUMA node topology of an existing virtual machine, you must delete the virtual machine first. Otherwise the `topology` variable is ignored and the test will run in the existing NUMA configuration. The `topology` variable is a JSON array of objects. Each object defines one or more NUMA nodes. Keys in objects: ``` "mem" mem (RAM) size on each NUMA node in this group. The default is "0G". "nvmem" nvmem (non-volatile RAM) size on each NUMA node in this group. The default is "0G". "cores" number of CPU cores on each NUMA node in this group. The default is 0. "threads" number of threads on each CPU core. The default is 2. "nodes" number of NUMA nodes on each die. The default is 1. "dies" number of dies on each package. The default is 1. "packages" number of packages. The default is 1. ``` Example: Run the test in a VM with two NUMA nodes.
There are 4 CPUs (two cores, two threads per core by default) and 4G of RAM in each node: ``` e2e$ govm delete my2x4 ; vm=my2x4 topology='[{"mem":"4G","cores":2,"nodes":2}]' ./run.sh play ``` Run the test in a VM with 32 CPUs in total: two packages (sockets) in the system, each containing two dies, each die containing two NUMA nodes, each node containing two CPU cores, and each core containing two threads; plus a NUMA node with 16G of non-volatile memory (NVRAM) but no CPUs: ``` e2e$ vm=mynvram topology='[{"mem":"4G","cores":2,"nodes":2,"dies":2,"packages":2},{"nvmem":"16G"}]' ./run.sh play ``` ## Test output All test output is saved under the directory in the environment variable `outdir`. The default is `./output`. Executed commands with their output, exit status, and timestamps are saved under the `output/commands` directory. You can find Qemu output in the Docker\* logs. For instance, the output of the most recent Qemu launched by `govm`: ``` $ docker logs $(docker ps | awk '/govm/{print $1; exit}') ``` ## Manual testing and debugging The interactive mode helps with developing and debugging scripts: ``` $ ./run.sh interactive ... run.sh> CPU=2 n=2 create guaranteed ``` You can get help on the functions available in test scripts with `./run.sh help script`, or with `help` and `help FUNCTION` when in the interactive mode. If a test has stopped at a failing `verify`, you can inspect the `cri-resmgr` cache and allowed OS resources in Python\* after the test run: ``` $ PYTHONPATH= python3 >>> from pyexec_state import * >>> pp(allowed) # allowed OS resources >>> pp(pods["pod0"]) # pod entry in cache >>> pp(containers["pod0c0"]) # container entry in cache ``` If you want to get the interactive prompt in the middle of a test run wherever a `verify` or `create` fails, you can set an `on_FUNC_fail` hook on either or both of them. Example: ``` $ on_verify_fail=interactive ./run.sh myscript.sh ``` ================================================ FILE: docs/developers-guide/index.rst ================================================ Developer's Guide ################# .. toctree:: :maxdepth: 1 architecture.md policy-writers-guide.md testing.rst ================================================ FILE: docs/developers-guide/policy-writers-guide.md ================================================ # Policy Writer's Guide ***WORK IN PROGRESS*** ================================================ FILE: docs/developers-guide/testing.rst ================================================ Testing ####### .. toctree:: :maxdepth: 1 unit-test.md cri-test.md e2e-test.md ================================================ FILE: docs/developers-guide/unit-test.md ================================================ # Unit tests Run unit tests with ``` make test ``` ================================================ FILE: docs/index.html ================================================ ================================================ FILE: docs/index.rst ================================================ .. CRI Resource Manager documentation master file Welcome to CRI Resource Manager's documentation! ================================================ ..
toctree:: :maxdepth: 2 :caption: Contents: introduction.md quick-start.md installation.md setup.md policy/index.rst node-agent.md webhook.md developers-guide/index.rst migration-to-NRI.md demos/index.rst reference/index.md contributing.md security.md Project GitHub repository ================================================ FILE: docs/installation.md ================================================ # Installation ## Installing from packages You can install CRI Resource Manager from `deb` or `rpm` packages for supported distros. - [download](https://github.com/intel/cri-resource-manager/releases/latest) packages - install them: - for rpm packages: `sudo rpm -Uvh <package-file>` - for deb packages: `sudo dpkg -i <package-file>` ## Installing from sources Although not recommended, you can install CRI Resource Manager from sources: - get the sources: `git clone https://github.com/intel/cri-resource-manager` - build and install: `cd cri-resource-manager; make build && sudo make install` You will need at least `git`, {{ '`golang '+ '{}'.format(golang_version) + '`' }} or newer, `GNU make`, `bash`, `find`, `sed`, `head`, `date`, and `install` to be able to build and install from sources. ## Building packages for the distro of your host You can build packages for the `$distro` of your host by executing the following command: ``` make packages ``` If the `$version` of your `$distro` is supported, this will leave the resulting packages in `packages/$distro-$version`. Building packages this way requires `docker`, but it does not require you to install the full set of build dependencies of CRI Resource Manager on your host. If you want to build packages without docker, you can use either `make rpm` or `make deb`, depending on which supported distro you are running. Building this way requires all the build dependencies to be installed on your host. You can check which `$distro`s and `$version`s are supported by running ``` ls dockerfiles/cross-build ``` If you see a `Dockerfile.$distro-$version` matching your host then your distro is supported. ## Building packages for another distro You can cross-build packages of the native `$type` for a particular `$version` of a `$distro` by running the following command: ``` make cross-$type.$distro-$version ``` Similarly to `make packages`, this will build packages using a `Docker\*` container. However, instead of building for your host, it will build them for the specified distro. For instance, `make cross-deb.ubuntu-18.04` will build `deb` packages for `Ubuntu\* 18.04`. ## Post-install configuration The provided packages install `systemd` service files and a sample configuration. The easiest way to get up and running is to rename the sample configuration and start CRI Resource Manager using systemd. You can do this using the following commands: ``` mv /etc/cri-resmgr/fallback.cfg.sample /etc/cri-resmgr/fallback.cfg systemctl start cri-resource-manager ``` If you want, you can set CRI Resource Manager to automatically start when your system boots with this command: ``` systemctl enable cri-resource-manager ``` The provided packages also install a file for managing the default options passed to CRI Resource Manager upon startup. You can change these by editing this file and then restarting CRI Resource Manager, like this: ``` # On Debian\*-based systems edit the defaults like this: ${EDITOR:-vi} /etc/default/cri-resource-manager # On rpm-based systems edit the defaults like this: ${EDITOR:-vi} /etc/sysconfig/cri-resource-manager # Restart the service.
systemctl restart cri-resource-manager ``` ================================================ FILE: docs/introduction.md ================================================ # Introduction CRI Resource Manager is a Container Runtime Interface Proxy. It sits between clients and the actual Container Runtime implementation (containerd, cri-o), relaying requests and responses back and forth. The main purpose of the proxy is to apply hardware-aware resource allocation policies to the containers running in the system. Policies are applied by either modifying a request before forwarding it or by performing extra actions related to the request during its processing and proxying. There are several policies available, each with a different set of goals in mind and implementing different hardware allocation strategies. The details of whether and how a CRI request is altered or if extra actions are performed depend on which policy is active in CRI Resource Manager and how that policy is configured. The current goal for the CRI Resource Manager is to prototype and experiment with new Kubernetes\* container placement policies. The existing policies are written with this in mind and the intended setup is for the Resource Manager to only act as a proxy for the Kubernetes Node Agent, kubelet. ================================================ FILE: docs/migration-to-NRI.md ================================================ # Migrating from CRI-RM to NRI ## Prerequisites - An up-and-running CRI Resource Manager - One of the two supported policies in use: balloons or topology-aware. - For other policies a little bit more work is required and the policies need to be 'ported'. This can be done by just following the example of how balloons or topology-aware were converted. ## Steps for an initial/basic migration test ### Containerd Replace containerd with version 1.7 or newer (the NRI server is not supported in older versions). Replace kubelet's --container-runtime-endpoint=/var/run/cri-resmgr/cri-resmgr.sock with --container-runtime-endpoint=/var/run/containerd/containerd.sock Replacing the runtime endpoint on a node that was set up using Kubeadm: ```console # Get the Kubelet args systemctl cat kubelet <- Look for: EnvironmentFile=/.../kubeadm-flags.env vim /.../kubeadm-flags.env KUBELET_KUBEADM_ARGS="--container-runtime-endpoint=unix:///var/run/containerd/containerd.sock --pod-infra-container-image=registry.k8s.io/pause:3.9" vim /etc/sysconfig/kubelet KUBELET_EXTRA_ARGS= --container-runtime-endpoint=/var/run/containerd/containerd.sock <- Remember this as well systemctl restart kubelet ``` Edit the containerd config file and look for the section [plugins."io.containerd.nri.v1.nri"] and replace "disable = true" with "disable = false": ```console vim /etc/containerd/config.toml ``` ```toml [plugins."io.containerd.nri.v1.nri"] disable = false disable_connections = false plugin_config_path = "/etc/nri/conf.d" plugin_path = "/opt/nri/plugins" plugin_registration_timeout = "5s" plugin_request_timeout = "2s" socket_path = "/var/run/nri/nri.sock" ``` ```console systemctl restart containerd ``` ### CRI-O Ensure that CRI-O version 1.26.2 or newer is used.
Replace kubelet's --container-runtime-endpoint=/var/run/cri-resmgr/cri-resmgr.sock with --container-runtime-endpoint=/var/run/crio/crio.sock Replacing the runtime endpoint on a node that was set up using Kubeadm: ```console # Get the Kubelet args systemctl cat kubelet <- Look for: EnvironmentFile=/.../kubeadm-flags.env vim /.../kubeadm-flags.env KUBELET_KUBEADM_ARGS="--container-runtime-endpoint=unix:///var/run/crio/crio.sock --pod-infra-container-image=registry.k8s.io/pause:3.9" vim /etc/sysconfig/kubelet KUBELET_EXTRA_ARGS= --container-runtime-endpoint=/var/run/crio/crio.sock <- Remember this as well systemctl restart kubelet ``` Enable NRI: ```console CRIO_CONF=/etc/crio/crio.conf cp $CRIO_CONF $CRIO_CONF.orig crio --enable-nri config > $CRIO_CONF systemctl restart crio ``` ### Build the NRI policies ```console git clone https://github.com/containers/nri-plugins.git cd nri-plugins make # Build the images, specify your image repo to easily push the image later. make images IMAGE_REPO=my-repo IMAGE_VERSION=my-tag ``` ### Create required CRDs ```console kubectl apply -f deployment/base/crds/noderesourcetopology_crd.yaml ``` ### Import the image of the NRI plugin you want to run Containerd ```console ctr -n k8s.io images import build/images/nri-resmgr-topology-aware-image-*.tar ``` CRI-O See the section [below](#steps-for-a-more-real-life-migration-using-self-hosted-image-repository) for instructions on how to push the images to a registry, then pull from there. ### Deploy the plugin ```console kubectl apply -f build/images/nri-resmgr-topology-aware-deployment.yaml ``` ### Deploy a test pod ```console kubectl run mypod --image busybox -- sleep inf kubectl exec mypod -- grep allowed_list: /proc/self/status ``` ### See the resources assigned to the pod ```console kubectl exec $pod -c $container -- grep allowed_list: /proc/self/status # Output should look similar to the output of CRI-RM ``` ## Steps for a more real-life migration using self-hosted image repository - Same steps as above for enabling NRI with Containerd/CRI-O and building the images. - Push the built images to your repository: ```console # Replace my-repo and my-tag with the IMAGE_REPO and IMAGE_VERSION you specified when building the images with make images docker push my-repo:my-tag ``` - Remember to change the image name & pull policy in the plugin's .yaml file to match your registry and image, e.g.: ```console vim build/images/nri-resmgr-topology-aware-deployment.yaml ``` - Then deploy the plugin similarly to the earlier step. ## Migrating existing configuration - The ConfigMap used by the ported policies/infra has a different name/naming scheme than the original one used in CRI-RM, e.g.: - configMapName: ```diff - configmap-name: cri-resmgr-config + configmap-name: nri-resource-policy-config ``` - The details of grouping nodes by labeling to share configuration: ```diff - cri-resource-manager.intel.com/group: $GROUP_NAME + resource-policy.nri.io/group: $GROUP_NAME ``` ## Migrating existing workloads - The annotations one can use to customize how a policy treats a workload use slightly different keys than the original ones in CRI-RM. The collective 'key namespace' for policy- and resource-manager-specific annotations has been changed from cri-resource-manager.intel.com to resource-policy.nri.io. - For instance, an explicit type annotation for the balloons policy, which used to be: ```yaml ... metadata: annotations: balloon.balloons.cri-resource-manager.intel.com/container.$CONTAINER_NAME: $BALLOON_TYPE ...
``` - Should now be: ```yaml ... metadata: annotations: balloon.balloons.resource-policy.nri.io/container.$CONTAINER_NAME: $BALLOON_TYPE ... ``` - Similarly, a workload opt-out annotation from exclusive CPU allocation for the topology-aware policy, which used to be: ```yaml ... metadata: annotations: prefer-shared-cpus.cri-resource-manager.intel.com/container.$CONTAINER_NAME: "true" ... ``` - Should now be: ```yaml ... metadata: annotations: prefer-shared-cpus.resource-policy.nri.io/container.$CONTAINER_NAME: "true" ... ``` - Similar changes are needed for any cri-resmgr-specific annotation that uses the same semantic scoping for its key syntax. All of the annotations: | Was | Is now | | --------------------------------------------------- | ------------------------------------------- | | cri-resource-manager.intel.com/affinity | resource-policy.nri.io/affinity | | cri-resource-manager.intel.com/anti-affinity | resource-policy.nri.io/anti-affinity | | cri-resource-manager.intel.com/prefer-isolated-cpus | resource-policy.nri.io/prefer-isolated-cpus | | cri-resource-manager.intel.com/prefer-shared-cpus | resource-policy.nri.io/prefer-shared-cpus | | cri-resource-manager.intel.com/cold-start | resource-policy.nri.io/cold-start | | cri-resource-manager.intel.com/memory-type | resource-policy.nri.io/memory-type | | prefer-isolated-cpus.cri-resource-manager.intel.com | prefer-isolated-cpus.resource-policy.nri.io | | prefer-shared-cpus.cri-resource-manager.intel.com | prefer-shared-cpus.resource-policy.nri.io | | memory-type.cri-resource-manager.intel.com | memory-type.resource-policy.nri.io | | cold-start.cri-resource-manager.intel.com | cold-start.resource-policy.nri.io | | prefer-reserved-cpus.cri-resource-manager.intel.com | prefer-reserved-cpus.resource-policy.nri.io | | rdtclass.cri-resource-manager.intel.com | rdtclass.resource-policy.nri.io | | blockioclass.cri-resource-manager.intel.com | blockioclass.resource-policy.nri.io | | toptierlimit.cri-resource-manager.intel.com | toptierlimit.resource-policy.nri.io | | topologyhints.cri-resource-manager.intel.com | topologyhints.resource-policy.nri.io | | balloon.balloons.cri-resource-manager.intel.com | balloon.balloons.resource-policy.nri.io | ================================================ FILE: docs/node-agent.md ================================================ # Node Agent CRI Resource Manager can be configured dynamically using the CRI Resource Manager Node Agent and Kubernetes\* ConfigMaps. ## Running as a DaemonSet The agent can be built using the [provided Dockerfile](/cmd/cri-resmgr-agent/Dockerfile). It can be deployed as a `DaemonSet` in the cluster using the [provided deployment file](/cmd/cri-resmgr-agent/agent-deployment.yaml). When using the provided or a similar deployment, the agent uses a readiness probe to propagate the status of the last configuration update back to the control plane. If the configuration could not be taken into use for any reason, the agent's probe will fail, which eventually marks the agent as not being `Ready`. In this case, more details about the failure should be present among the latest messages logged by the agent or the probe itself. If the reason for the failure is a configuration error, then once the error is fixed, the agent should eventually become `Ready` again. ## Running as a Host Service To run the agent manually or as a `systemd` service, set the environment variable `NODE_NAME` to the name of the cluster node the agent is running on. If necessary, pass it the credentials for accessing the cluster using the `-kubeconfig <file>` command line option. ## ConfigMap to Node Mapping Conventions The agent monitors two ConfigMaps for the node, a primary node-specific one and a secondary group-specific or default one, depending on whether the node belongs to a configuration group. The node-specific ConfigMap always takes precedence over the others. The names of these ConfigMaps are: 1. `cri-resmgr-config.node.$NODE_NAME`: primary, node-specific configuration 2. `cri-resmgr-config.group.$GROUP_NAME`: secondary, group-specific node configuration 3. `cri-resmgr-config.default`: secondary, default node configuration You can assign a node to a configuration group by setting the `cri-resource-manager.intel.com/group` label on the node to the name of the configuration group. You can remove a node from its group by deleting the node group label.
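For example, the following illustrative node manifest fragment assigns a node to a hypothetical configuration group named `hi-perf`, which makes the agent use `cri-resmgr-config.group.hi-perf` as the secondary ConfigMap for that node:

```yaml
apiVersion: v1
kind: Node
metadata:
  name: worker-1                                    # hypothetical node name
  labels:
    cri-resource-manager.intel.com/group: hi-perf   # configuration group
```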
There is a [sample ConfigMap spec](/sample-configs/cri-resmgr-configmap.example.yaml) that contains a node-specific, a group-specific, and a default ConfigMap example. See [any available policy-specific documentation](policy/index.rst) for more information on the policy configurations. ================================================ FILE: docs/policy/balloons.md ================================================ # Balloons Policy ## Overview The balloons policy implements workload placement into "balloons" that are disjoint CPU pools. Balloons can be inflated and deflated, that is, CPUs can be added and removed, based on the CPU resource requests of containers. Balloons can be static, or they can be created and destroyed dynamically. CPUs in balloons can be configured, for example, by setting min and max frequencies on CPU cores and uncore. ## How It Works 1. The user configures balloon types from which the policy instantiates balloons. 2. A balloon has a set of CPUs and a set of containers that run on the CPUs. 3. Every container is assigned to exactly one balloon. A container is allowed to use all CPUs of its balloon and no other CPUs. 4. Every logical CPU belongs to at most one balloon. There can be CPUs that do not belong to any balloon. 5. The number of CPUs in a balloon can change during the lifetime of the balloon. If a balloon inflates, that is, CPUs are added to it, all containers in the balloon are allowed to use more CPUs. If a balloon deflates, the opposite is true. 6. When a new container is created on a Kubernetes node, the policy first decides the type of the balloon that will run the container. The decision is based on annotations of the pod, or on the namespace if annotations are not given. 7. Next, the policy decides which balloon of the decided type will run the container. The options are: - an existing balloon that already has enough CPUs to run its current and new containers - an existing balloon that can be inflated to fit its current and new containers - a new balloon. 8. When a CPU is added to a balloon or removed from it, the CPU is reconfigured based on the balloon's CPU class attributes, or the idle CPU class attributes. ## Deployment ### Install cri-resmgr Deploy cri-resmgr on each node as you would for any other policy. See [installation](../installation.md) for more details. ## Configuration The balloons policy is configured using the yaml-based configuration system of CRI-RM. See [setup and usage](../setup.md#setting-up-cri-resource-manager) for more details on managing the configuration. ### Parameters Balloons policy parameters: - `PinCPU` controls pinning a container to the CPUs of its balloon.
The default is `true`: the container cannot use other CPUs. - `PinMemory` controls pinning a container to the memories that are closest to the CPUs of its balloon. The default is `true`: allow using memory only from the closest NUMA nodes. Warning: this may cause the kernel to kill workloads due to an out-of-memory error when the closest NUMA nodes do not have enough memory. In this situation consider switching this option to `false`. - `IdleCPUClass` specifies the CPU class of those CPUs that do not belong to any balloon. - `ReservedPoolNamespaces` is a list of namespaces (wildcards allowed) that are assigned to the special reserved balloon, that is, they will run on reserved CPUs. This always includes the `kube-system` namespace. - `AllocatorTopologyBalancing` affects selecting CPUs for new balloons. If `true`, new balloons are created using CPUs on the NUMA node/die/package with the most free CPUs, that is, balloons are spread across the hardware topology. This makes it easier to inflate balloons within the same NUMA node/die/package and reduces interference between workloads in balloons when the system is not fully loaded. The default is `false`: pack new balloons tightly into the same NUMA nodes/dies/packages. This helps keep large portions of the hardware idle so they can enter deep power-saving states. - `PreferSpreadOnPhysicalCores` prefers allocating logical CPUs (possibly hyperthreads) for a balloon from separate physical CPU cores. This prevents workloads in the balloon from interfering with themselves, as they do not compete for the resources of the same CPU cores. On the other hand, it allows more interference between workloads in different balloons. The default is `false`: balloons are packed tightly onto a minimum number of physical CPU cores. The value set here is the default for all balloon types, but it can be overridden with the balloon-type-specific setting of the same name. - `BalloonTypes` is a list of balloon type definitions. Each type can be configured with the following parameters: - `Name` of the balloon type. This is used in pod annotations to assign containers to balloons of this type. - `Namespaces` is a list of namespaces (wildcards allowed) whose pods should be assigned to this balloon type, unless overridden by pod annotations. - `MinBalloons` is the minimum number of balloons of this type that is always present, even if the balloons would not have any containers. The default is 0: if a balloon has no containers, it can be destroyed. - `MaxBalloons` is the maximum number of balloons of this type that is allowed to co-exist. The default is 0: creating new balloons is not limited by the number of existing balloons. - `MaxCPUs` specifies the maximum number of CPUs in any balloon of this type. Balloons will not be inflated larger than this. 0 means unlimited. - `MinCPUs` specifies the minimum number of CPUs in any balloon of this type. When a balloon is created or deflated, it will always have at least this many CPUs, even if containers in the balloon request fewer. - `CpuClass` specifies the name of the CPU class according to which the CPUs of balloons are configured. - `PreferSpreadingPods`: if `true`, containers of the same pod should be spread to different balloons of this type. The default is `false`: prefer placing containers of the same pod in the same balloon(s). - `PreferPerNamespaceBalloon`: if `true`, containers in the same namespace will be placed in the same balloon(s). On the other hand, containers in different namespaces are preferably placed in different balloons.
The default is `false`: the namespace has no effect on choosing the balloon of this type. - `PreferNewBalloons`: if `true`, prefer creating new balloons over placing containers into existing balloons. This results in preferring exclusive CPUs, as long as there are enough free CPUs. The default is `false`: prefer filling and inflating existing balloons over creating new ones. - `ShareIdleCPUsInSame`: whenever the number or sizes of balloons change, idle CPUs (that do not belong to any balloon) are reshared as extra CPUs to the workloads in balloons with this option. The value sets the locality of the allowed extra CPUs that will be common to these workloads. - `system`: workloads are allowed to use idle CPUs available anywhere in the system. - `package`: ...allowed to use idle CPUs in the same package(s) (sockets) as the balloon. - `die`: ...in the same die(s) as the balloon. - `numa`: ...in the same NUMA node(s) as the balloon. - `core`: ...allowed to use idle CPU threads in the same cores as the balloon. - `PreferSpreadOnPhysicalCores` overrides the policy-level option of the same name in the scope of this balloon type. - `AllocatorPriority` (0: High, 1: Normal, 2: Low, 3: None). CPU allocator parameter, used when creating new or resizing existing balloons. If there are balloon types with pre-created balloons (`MinBalloons` > 0), balloons of the type with the highest `AllocatorPriority` are created first. Related configuration parameters: - `policy.ReservedResources.CPU` specifies the (number of) CPUs in the special `reserved` balloon. By default all containers in the `kube-system` namespace are assigned to the reserved balloon. - `cpu.classes` defines CPU classes and their parameters (such as `minFreq`, `maxFreq`, `uncoreMinFreq` and `uncoreMaxFreq`). ### Example Example configuration that runs all pods in balloons of 1-4 CPUs. ```yaml policy: Active: balloons ReservedResources: CPU: 1 balloons: PinCPU: true PinMemory: true IdleCPUClass: lowpower BalloonTypes: - Name: "quad" MinCPUs: 1 MaxCPUs: 4 CpuClass: dynamic Namespaces: - "*" cpu: classes: lowpower: minFreq: 800 maxFreq: 800 dynamic: minFreq: 800 maxFreq: 3600 turbo: minFreq: 3000 maxFreq: 3600 uncoreMinFreq: 2000 uncoreMaxFreq: 2400 ``` See the [sample configmap](/sample-configs/balloons-policy.cfg) for a complete example. ## Assigning a Container to a Balloon The balloon type of a container can be defined in pod annotations. In the example below, the first annotation sets the balloon type (`BT`) of a single container (`CONTAINER_NAME`). The last two annotations set the default balloon type for all containers in the pod. ```yaml balloon.balloons.cri-resource-manager.intel.com/container.CONTAINER_NAME: BT balloon.balloons.cri-resource-manager.intel.com/pod: BT balloon.balloons.cri-resource-manager.intel.com: BT ``` If a pod has no annotations, its namespace is matched to the `Namespaces` of the balloon types. The first matching balloon type is used. If the namespace does not match, the container is assigned to the special `default` balloon, which means reserved CPUs unless `MinCPUs` or `MaxCPUs` of the `default` balloon type are explicitly defined in the `BalloonTypes` configuration. ## Metrics and Debugging In order to enable more verbose logging and metrics exporting from the balloons policy, enable instrumentation and policy debugging in the CRI-RM global config: ```yaml instrumentation: # The balloons policy exports containers running in each balloon, # and cpusets of balloons.
Accessible in command line: # curl --silent http://localhost:8891/metrics HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy ``` ================================================ FILE: docs/policy/blockio.md ================================================ # Block IO ## Overview The Block IO controller provides means to control: - block device IO scheduling priority (weight) - throttling IO bandwidth - throttling the number of IO operations. CRI Resource Manager applies block IO controller parameters to pods via the [cgroups block io controller](https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v1/blkio-controller.html). ## Configuration See the [sample blockio configuration](/sample-configs/blockio.cfg). ## Demo See the [Block IO demo](../demos/blockio.md) ================================================ FILE: docs/policy/container-affinity.md ================================================ # Container Affinity and Anti-Affinity ## Introduction Some policies allow the user to give hints about how particular containers should be *co-located* within a node. In particular these hints express whether containers should be located *'close'* to each other or *'far away'* from each other, in a hardware topology sense. Since these hints are always interpreted by a particular *policy implementation*, the exact definitions of 'close' and 'far' are also somewhat *policy-specific*. However, as a general rule of thumb, containers running - on CPUs within the *same NUMA nodes* are considered *'close'* to each other, - on CPUs within *different NUMA nodes* in the *same socket* are *'farther'*, and - on CPUs within *different sockets* are *'far'* from each other. These hints are expressed by `container affinity annotations` on the Pod. There are two types of affinities: - `affinity` (or `positive affinity`): causes affected containers to *pull* each other closer - `anti-affinity` (or `negative affinity`): causes affected containers to *push* each other further away Policies try to place a container - close to those containers it has affinity towards - far from those containers it has anti-affinity towards. ## Affinity Annotation Syntax *Affinities* are defined as the `cri-resource-manager.intel.com/affinity` annotation. *Anti-affinities* are defined as the `cri-resource-manager.intel.com/anti-affinity` annotation. They are specified in the `metadata` section of the `Pod YAML`, under `annotations` as a dictionary, with each dictionary key being the name of the *container* within the Pod to which the annotation belongs. ```yaml metadata: annotations: cri-resource-manager.intel.com/affinity: | container1: - scope: key: key-ref operator: op values: - value1 ... - valueN match: key: key-ref operator: op values: - value1 ... - valueN weight: w ``` An anti-affinity is defined similarly but using `cri-resource-manager.intel.com/anti-affinity` as the annotation key. ```yaml metadata: annotations: cri-resource-manager.intel.com/anti-affinity: | container1: - scope: key: key-ref operator: op values: - value1 ... - valueN match: key: key-ref operator: op values: - value1 ...
- valueN weight: w ``` ## Affinity Semantics An affinity consists of three parts: - `scope expression`: defines which containers this affinity is evaluated against - `match expression`: defines the containers (within the scope) to which the affinity applies - `weight`: defines how *strong* a pull or a push the affinity causes *Affinities* are also sometimes referred to as *positive affinities* while *anti-affinities* are referred to as *negative affinities*. The reason for this is that the only difference between the two is that affinities have a *positive weight* while anti-affinities have a *negative weight*. The *scope* of an affinity defines the *bounding set of containers* the affinity can apply to. The affinity *expression* is evaluated against the containers *in scope* and it *selects the containers* the affinity really has an effect on. The *weight* specifies whether the effect is a *pull* or a *push*. *Positive* weights cause a *pull* while *negative* weights cause a *push*. Additionally, the *weight* specifies *how strong* the push or the pull is. This is useful in situations where the policy needs to make some compromises because an optimal placement is not possible. The weight then also acts as a way to specify preferences, or priorities, between the various compromises: the heavier the weight, the stronger the pull or push, and the larger the probability that it will be honored, if this is possible at all. The scope can be omitted from an affinity, in which case it implies *Pod scope*, in other words the scope of all containers that belong to the same Pod as the container for which the affinity is defined. The weight can also be omitted, in which case it defaults to -1 for anti-affinities and +1 for affinities. Weights are currently limited to the range [-1000,1000]. Both the affinity scope and the match select containers, and they share an identical syntax: both are *expressions*. An expression consists of three parts: - key: specifies what *metadata* to pick from a container for evaluation - operation (op): specifies what *logical operation* the expression evaluates - values: a set of *strings* to evaluate the value of the key against The supported keys are: - for pods: - `name` - `namespace` - `qosclass` - `labels/` - `id` - `uid` - for containers: - `pod/` - `name` - `namespace` - `qosclass` - `labels/` - `tags/` - `id` Essentially an expression defines a logical operation of the form (key op values). Evaluating this expression takes the value of the key and evaluates the operation against the given values, yielding a boolean true/false result.
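As a simple illustration (the namespace value `monitoring` is hypothetical), an expression that evaluates to true for all containers of pods in the `monitoring` namespace would look like this, using the `Equals` operation described below:

```yaml
key: namespace
operator: Equals
values:
  - monitoring
```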
Currently the following operations are supported: - `Equals`: equality, true if the *value of key* equals the single item in *values* - `NotEqual`: inequality, true if the *value of key* is not equal to the single item in *values* - `In`: membership, true if the *value of key* equals any item among *values* - `NotIn`: negated membership, true if the *value of key* is not equal to any item among *values* - `Exists`: true if the given *key* exists with any value - `NotExists`: true if the given *key* does not exist - `AlwaysTrue`: always evaluates to true, can be used to denote node-global scope (all containers) - `Matches`: true if the *value of key* matches the globbing pattern in values - `MatchesNot`: true if the *value of key* does not match the globbing pattern in values - `MatchesAny`: true if the *value of key* matches any of the globbing patterns in values - `MatchesNone`: true if the *value of key* does not match any of the globbing patterns in values The effective affinity between containers C_1 and C_2, A(C_1, C_2), is the sum of the weights of all pairwise in-scope matching affinities W(C_1, C_2). To put it another way, evaluating an affinity for a container C_1 is done by first using the scope (expression) to determine which containers are in the scope of the affinity. Then, for each in-scope container C_2 for which the match expression evaluates to true, the weight of the affinity is added to the effective affinity A(C_1, C_2). Note that currently (for the topology-aware policy) this evaluation is asymmetric: A(C_1, C_2) and A(C_2, C_1) can and will be different unless the affinity annotations are crafted to prevent this (by making them fully symmetric). Moreover, A(C_1, C_2) is calculated and taken into consideration during resource allocation for C_1, while A(C_2, C_1) is calculated and taken into account during resource allocation for C_2. This might be changed in a future version. Currently affinity expressions lack support for boolean operators (and, or, not). Sometimes this limitation can be overcome by using joint keys, especially with matching operators. The joint key syntax allows joining the values of several keys with a separator into a single value. A joint key can be specified in a simple or full format: - simple: a plain colon-separated list of subkeys; this is equivalent to the full format with `:` as both the subkey separator and the joining separator - full: a format in which the subkey separator and the joining separator are given explicitly as a prefix, followed by the separated list of subkeys A joint key evaluates to the values of all of its subkeys joined by the joining separator. A non-existent subkey evaluates to the empty string. For instance, the joint key `:pod/qosclass:pod/name:name` evaluates to the pod QoS class, the pod name, and the container name joined by colons. For existence operators, a joint key is considered to exist if any of its subkeys exists. ## Examples Put the container `peter` close to the container `sheep` but far away from the container `wolf`. ```yaml metadata: annotations: cri-resource-manager.intel.com/affinity: | peter: - match: key: name operator: Equals values: - sheep weight: 5 cri-resource-manager.intel.com/anti-affinity: | peter: - match: key: name operator: Equals values: - wolf weight: 5 ```
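As a further, illustrative sketch combining a joint key with a globbing operator (the pod name pattern `sheep*` is hypothetical), the following annotation pulls `peter` towards any Guaranteed-class container of a pod whose name starts with `sheep`:

```yaml
metadata:
  annotations:
    cri-resource-manager.intel.com/affinity: |
      peter:
      - match:
          key: :pod/qosclass:pod/name:name
          operator: Matches
          values:
            - "Guaranteed:sheep*:*"
        weight: 5
```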
## Shorthand Notation There is an alternative shorthand syntax for what is considered to be the most common case: defining affinities between containers within the same pod. With this notation one needs to give just the names of the containers, like in the example below. ```yaml annotations: cri-resource-manager.intel.com/affinity: | container3: [ container1 ] cri-resource-manager.intel.com/anti-affinity: | container3: [ container2 ] container4: [ container2, container3 ] ``` This shorthand notation defines: - `container3` having - affinity (weight 1) to `container1` - `anti-affinity` (weight -1) to `container2` - `container4` having - `anti-affinity` (weight -1) to `container2`, and `container3` The equivalent annotation in full syntax would be: ```yaml metadata: annotations: cri-resource-manager.intel.com/affinity: |+ container3: - match: key: labels/io.kubernetes.container.name operator: In values: - container1 cri-resource-manager.intel.com/anti-affinity: |+ container3: - match: key: labels/io.kubernetes.container.name operator: In values: - container2 container4: - match: key: labels/io.kubernetes.container.name operator: In values: - container2 - container3 ``` ================================================ FILE: docs/policy/cpu-allocator.md ================================================ # CPU Allocator CRI Resource Manager has a separate CPU allocator component that helps policies make educated allocations of CPU cores for workloads. Currently all policies except [static-pools](static-pools.md) utilize the built-in CPU allocator. See the policy-specific documentation for more details. ## Topology Based Allocation The CPU allocator tries to optimize the allocation of CPUs in terms of the hardware topology. More specifically, it aims at packing all CPUs of one request "near" each other in order to minimize memory latencies between CPUs. ## CPU Prioritization The CPU allocator also does automatic CPU prioritization by detecting CPU features and their configuration parameters. Currently, CRI Resource Manager supports CPU priority detection based on the `intel_pstate` scaling driver in the Linux CPUFreq subsystem and Intel Speed Select Technology (SST). CPUs are divided into three priority classes, i.e. *high*, *normal* and *low*. Policies utilizing the CPU allocator may choose to prefer a certain priority class for certain types of workloads. For example, prefer (and preserve) high priority CPUs for high priority workloads. ### Intel Speed Select Technology (SST) CRI Resource Manager supports detection of all Intel Speed Select Technology (SST) features, i.e. Speed Select Technology Performance Profile (SST-PP), Base Frequency (SST-BF), Turbo Frequency (SST-TF) and Core Power (SST-CP). CPU prioritization is based on detection of the currently active SST features and their parameterization: 1. If SST-TF has been enabled, all CPUs prioritized by SST-TF are flagged as high priority. 1. If SST-CP is enabled but SST-TF is disabled, the CPU allocator examines the active Classes of Service (CLOSes) and their parameters. CPUs associated with the highest priority CLOS will be flagged as high priority, CPUs associated with the lowest priority CLOS will be flagged as low priority, and a possible "middle priority" CLOS as normal priority. 1. If SST-BF has been enabled and SST-TF and SST-CP are inactive, all BF high priority cores (having a higher guaranteed base frequency) will be flagged as high priority. ### Linux CPUFreq CPUFreq based prioritization only takes effect if Intel Speed Select Technology (SST) is disabled (or not supported). CRI-RM divides CPU cores into priority classes based on two parameters: - base frequency - EPP (Energy-Performance Preference) CPU cores with a high base frequency (relative to the other cores in the system) will be flagged as high priority.
Low base frequency will map to low priority, correspondingly. CPU cores with a high EPP priority (relative to the other cores in the system) will be marked as high priority cores. ================================================ FILE: docs/policy/dynamic-pools.md ================================================ # Dynamic-Pools Policy ## Overview The dynamic-pools policy places workloads into different dynamic-pools. Each dynamic-pool contains several CPUs and can be resized dynamically by a specific algorithm. The main idea of the algorithm is that, provided the CPUs in each dynamic-pool can satisfy the requests of the pods in that pool, CPUs are allocated based on the CPU utilization of the workload. The dynamic-pools policy tries to keep CPU utilization balanced. CPUs in dynamic-pools can be configured, for example, by setting min and max frequencies on CPU cores and uncore. ## How It Works 1. The user configures the dynamic-pool types from which the policy instantiates dynamic-pools. In addition to the dynamic-pools configured by the user, there is also a built-in dynamic-pool, the shared pool. 2. A dynamic-pool has a set of CPUs and a set of containers running on the CPUs. 3. Every container is assigned to a dynamic-pool. The dynamic-pools policy allows a container to use all CPUs of its pool and no other CPUs. 4. Each logical CPU belongs to exactly one dynamic-pool. There cannot be CPUs that do not belong to any dynamic-pool. 5. The number of CPUs in a dynamic-pool can change. If CPUs are added to a dynamic-pool, then all containers in the dynamic-pool can use more CPUs. The opposite is true if CPUs are removed. 6. As CPUs are added to or removed from a dynamic-pool, the CPUs are reconfigured according to the dynamic-pool's CPU class attributes or the idle CPU class attributes. 7. Updating the number of CPUs in dynamic-pools: - The dynamic-pools policy needs to update the number of CPUs in dynamic-pools when starting the policy, creating pods, deleting pods, updating configurations, and at regular intervals. - The number of CPUs in the dynamic-pools is determined by the requests of the containers and the CPU utilization in the dynamic-pools. - The number of CPUs allocated to each dynamic-pool is the sum of the requests of the containers in the dynamic-pool and the CPUs allocated based on the CPU utilization of the workload. 8. When a new container is created on a Kubernetes node, the policy first decides the type of the dynamic-pool that will run the container. The decision is based on the annotations of the pod, or on the namespace if annotations are not given. ## Deployment ### Install cri-resmgr Deploy cri-resmgr on each node as you would for any other policy. See [installation](https://intel.github.io/cri-resource-manager/stable/docs/installation.html) for more details. ## Configuration The dynamic-pools policy is configured using the yaml-based configuration system of CRI-RM. See [setup and usage](https://intel.github.io/cri-resource-manager/stable/docs/setup.html#setting-up-cri-resource-manager) for more details on managing the configuration. ### Parameters Dynamic-pools policy parameters: * `PinCPU` controls pinning a container to the CPUs of its dynamic-pool. The default is `true`: the container cannot use other CPUs. * `PinMemory` controls pinning a container to the memories that are closest to the CPUs of its dynamic-pool. The default is `true`: allow using memory only from the closest NUMA nodes.
Warning: this may cause the kernel to kill workloads due to an out-of-memory error when the closest NUMA nodes do not have enough memory. In this situation consider switching this option to `false`. * `ReservedPoolNamespaces` is a list of namespaces (wildcards allowed) that are assigned to the special reserved dynamic-pool, that is, they will run on reserved CPUs. This always includes the `kube-system` namespace. * `DynamicPoolTypes` is a list of dynamic-pool type definitions. Each type can be configured with the following parameters: - `Name` of the dynamic-pool type. This is used in pod annotations to assign containers to dynamic-pools of this type. - `Namespaces` is a list of namespaces (wildcards allowed) whose pods should be assigned to this dynamic-pool type, unless overridden by pod annotations. - `CpuClass` specifies the name of the CPU class according to which the CPUs of dynamic-pools are configured. - `AllocatorPriority` (0: High, 1: Normal, 2: Low, 3: None). CPU allocator parameter, used when creating new or resizing existing dynamic-pools. Related configuration parameters: * `policy.ReservedResources.CPU` specifies the (number of) CPUs in the special `reserved` dynamic-pool. By default all containers in the `kube-system` namespace are assigned to the reserved dynamic-pool. * `policy.AvailableResources.CPU` specifies the CPUs that can be used by the policy, including `policy.ReservedResources.CPU`. * `cpu.classes` defines CPU classes and their parameters (such as `minFreq`, `maxFreq`, `uncoreMinFreq` and `uncoreMaxFreq`). ### Example ```yaml cpu: classes: pool1-cpuclass: minFreq: 1500000 maxFreq: 2000000 pool2-cpuclass: minFreq: 2000000 maxFreq: 2500000 policy: Active: dynamic-pools ReservedResources: CPU: cpuset:0 dynamic-pools: PinCPU: true PinMemory: true DynamicPoolTypes: - Name: "pool1" Namespaces: - "pool1" CpuClass: "pool1-cpuclass" - Name: "pool2" Namespaces: - "pool2" CpuClass: "pool2-cpuclass" ``` ### Update Dynamic-Pools at Regular Intervals The dynamic-pools policy can update the CPU allocation at regular intervals, based on the CPU utilization of the workload in each pool. Use the `--rebalance-interval` option to set the interval. ### Assigning a Container to a Dynamic-pool The dynamic-pool type of a container can be defined in pod annotations. In the example below, the first annotation sets the dynamic-pool type (`DPT`) of a single container (`CONTAINER_NAME`). The last two annotations set the default dynamic-pool type for all containers in the pod. ```yaml dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/container.CONTAINER_NAME: DPT dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: DPT dynamic-pool.dynamic-pools.cri-resource-manager.intel.com: DPT ``` If a pod has no annotations, its namespace is matched to the `Namespaces` of the dynamic-pool types. The first matching dynamic-pool type is used. If the namespace does not match, the container is assigned to the `shared` dynamic-pool. ## Metrics and Debugging In order to enable more verbose logging and metrics exporting from the dynamic-pools policy, enable instrumentation and policy debugging in the CRI-RM global config: ```yaml instrumentation: # The dynamic-pools policy exports containers running in each dynamic-pool, # and cpusets of dynamic-pools. Accessible in command line: # curl --silent http://localhost:8891/metrics HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy ``` Use the `--metrics-interval` option to set the interval for updating metrics data.
================================================
FILE: docs/policy/index.rst
================================================
Policies
########

.. toctree::
   :maxdepth: 1

   topology-aware.md
   static-pools.md
   balloons.md
   podpools.md
   container-affinity.md
   blockio.md
   rdt.md
   cpu-allocator.md
   dynamic-pools.md

================================================
FILE: docs/policy/podpools.md
================================================
# Podpools Policy

## Overview

The podpools policy implements pod-level workload placement. It assigns all containers of a pod to the same CPU/memory pool. The number of CPUs in a pool is configurable by the user.

## Deployment

### Install cri-resmgr

Deploy cri-resmgr on each node as you would for any other policy. See [installation](../installation.md) for more details.

## Configuration

The policy is configured using the yaml-based configuration system of CRI-RM. See [setup and usage](../setup.md#setting-up-cri-resource-manager) for more details on managing the configuration.

At minimum, you need to specify the active policy in the configuration, and define at least one pod pool. For example, the following configuration dedicates 95% of the non-reserved CPUs on the node to `dualcpu` pools. Every pool instance (`dualcpu[0]`, `dualcpu[1]`, ...) contains two exclusive CPUs and has a capacity (`MaxPods`) of one pod. The CPUs are used only by containers of pods assigned to the pool. The remaining CPUs will be used for running pods that are not `dualcpu` or `kube-system` pods.

```yaml
policy:
  Active: podpools
  ReservedResources:
    CPU: 1
  podpools:
    Pools:
      - Name: dualcpu
        CPU: 2
        MaxPods: 1
        Instances: 95%
```

Note that the configuration above allocates two exclusive CPUs for each pod assigned to the pool. To align with kube-scheduler resource accounting, the requested CPUs of all containers in such pods must sum up to CPU/MaxPods, that is, 2000m CPU in this case.

See the [sample configmap](/sample-configs/podpools-policy.cfg) for a complete example.

### Debugging

In order to enable more verbose logging for the podpools policy, enable policy debug from the CRI-RM global config:

```yaml
logger:
  Debug: policy
```

## Running Pods in Podpools

The podpools policy assigns a pod to a pod pool instance if the pod has the annotation

```yaml
pool.podpools.cri-resource-manager.intel.com: POOLNAME
```

The following Pod runs in a `dualcpu` pool. This example assumes that `dualcpu` pools include two CPUs per pod, as in the above configuration example. Therefore, the containers in the yaml request 2000m CPU in total.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: podpools-test
  annotations:
    pool.podpools.cri-resource-manager.intel.com: dualcpu
spec:
  containers:
    - name: testcont0
      image: busybox
      command:
        - "sh"
        - "-c"
        - "while :; do grep _allowed_list /proc/self/status; sleep 5; done"
      resources:
        requests:
          cpu: 1200m
    - name: testcont1
      image: busybox
      command:
        - "sh"
        - "-c"
        - "while :; do grep _allowed_list /proc/self/status; sleep 5; done"
      resources:
        requests:
          cpu: 800m
```

If a pod is not annotated to run in any specific pod pool and it is not a `kube-system` pod, it will run on shared CPUs. Shared CPUs include the left-over CPUs after creating user-defined pools. If all CPUs were allocated to other pools, reserved CPUs will be used as shared, too.

================================================
FILE: docs/policy/rdt.md
================================================
# RDT (Intel® Resource Director Technology)

## Background

Intel® RDT provides capabilities for cache and memory allocation and monitoring.
On Linux, the functionality is exposed to user space via the [resctrl](https://docs.kernel.org/x86/resctrl.html) filesystem.

Cache and memory allocation in RDT is handled by using resource control groups. Resource allocation is specified on the group level and each task (process/thread) is assigned to one group. In the context of CRI Resource Manager, we use the term 'RDT class' instead of 'resource control group'.

CRI Resource Manager supports all available RDT technologies, i.e. L2 and L3 Cache Allocation (CAT) with Code and Data Prioritization (CDP) and Memory Bandwidth Allocation (MBA), plus Cache Monitoring (CMT) and Memory Bandwidth Monitoring (MBM).

## Overview

RDT configuration in CRI-RM is class-based. Each container gets assigned to an RDT class. In turn, all processes of the container will be assigned to the RDT Class of Service (CLOS) (under `/sys/fs/resctrl`) corresponding to the RDT class. CRI-RM will configure the CLOSes according to its configuration at startup or whenever the configuration changes.

CRI-RM maintains a direct mapping between Pod QoS classes and RDT classes. If RDT is enabled, CRI-RM tries to assign containers into an RDT class with a name matching their Pod QoS class. This default behavior can be overridden with pod annotations.

## Class Assignment

By default, containers get an RDT class with the same name as their Pod QoS class (Guaranteed, Burstable or BestEffort). If the RDT class is missing, the container will be assigned to the system root class.

The default behavior can be overridden with pod annotations:

- `rdtclass.cri-resource-manager.intel.com/pod: <class-name>` specifies a pod-level default that will be used for all containers of a pod
- `rdtclass.cri-resource-manager.intel.com/container.<container-name>: <class-name>` specifies a container-specific assignment, taking preference over a possible pod-level annotation (above)

With pod annotations it is possible to specify RDT classes other than Guaranteed, Burstable or BestEffort.

The default assignment could also be overridden by a policy, but currently none of the builtin policies do that.

## Configuration

### Operating Modes

The RDT controller supports three operating modes, controlled by the `rdt.options.mode` configuration option.

- Disabled: The RDT controller is effectively disabled; containers will not be assigned to classes and no monitoring groups will be created. Upon activation of this mode, all CRI-RM specific control and monitoring groups are removed from the resctrl filesystem.
- Discovery: The RDT controller detects existing non-CRI-RM specific classes from the resctrl filesystem and uses these. The configuration of the discovered classes is considered read-only and will not be altered. Upon activation of this mode, all CRI-RM specific control groups are removed from the resctrl filesystem.
- Full: Full operating mode. The controller manages the configuration of the resctrl filesystem according to the RDT class definitions in the CRI-RM configuration. This is the default operating mode.

### RDT Classes

The RDT class configuration in CRI-RM is a two-level hierarchy consisting of partitions and classes. It specifies a set of partitions, each having a set of classes.

#### Partitions

Partitions represent a logical grouping of the underlying classes, each partition specifying a portion of the available resources (L2/L3/MB) which will be shared by the classes under it. Partitions guarantee non-overlapping exclusive cache allocation, i.e. no overlap on the cache ways between partitions is allowed. However, by the nature of the technology, MB allocations are not exclusive.
Thus, it is possible to assign all partitions 100% of memory bandwidth, for example.

#### Classes

Classes represent the actual RDT classes containers are assigned to. In contrast to partitions, cache allocation between classes under a specific partition may overlap (and they usually do).

The set of RDT classes can be freely specified, but it should be ensured that classes corresponding to the Pod QoS classes are specified. Also, the maximum number of classes (CLOSes) supported by the underlying hardware must not be exceeded.

### Example

Below is a config snippet that would allocate 60% of the L3 cache lines exclusively to the Guaranteed class. The remaining 40% of L3 is for Burstable and BestEffort, with BestEffort getting only 50% of that share. The Guaranteed class gets full memory bandwidth whereas the other classes are throttled to 50%.

```yaml
rdt:
  # Common options
  options:
    # One of Full, Discovery or Disabled
    mode: Full
    # Set to true to disable creation of monitoring groups
    monitoringDisabled: false
    l3:
      # Make this false if L3 CAT must be available
      optional: true
    mb:
      # Make this false if MBA must be available
      optional: true

  # Configuration of classes
  partitions:
    exclusive:
      # Allocate 60% of all L3 cache to the "exclusive" partition
      l3Allocation: "60%"
      mbAllocation: ["100%"]
      classes:
        Guaranteed:
          # Allocate all of the partition's cache lines to "Guaranteed"
          l3Allocation: "100%"
    shared:
      # Allocate 40% of all L3 cache to the "shared" partition
      # These will NOT overlap with the cache lines allocated for "exclusive" partition
      l3Allocation: "40%"
      mbAllocation: ["50%"]
      classes:
        Burstable:
          # Allow "Burstable" to use all cache lines of the "shared" partition
          l3Allocation: "100%"
        BestEffort:
          # Allow "BestEffort" to use only half of the L3 cache
          # lines of the "shared" partition.
          # These will overlap with those used by "Burstable"
          l3Allocation: "50%"
```

The configuration also supports far more fine-grained control, e.g. per cache-ID configuration (i.e. different sockets having different allocation) and Code and Data Prioritization (CDP) allowing different cache allocation for code and data paths. If the hardware details are known, raw bitmasks or bit numbers ("0x1f" or 0-4) can be used instead of percentages in order to configure cache allocations exactly as required. For a detailed description of the RDT configuration format with examples, see the {{ '[goresctrl library documentation](https://github.com/intel/goresctrl/blob/{}/doc/rdt.md)'.format(goresctrl_version) }}

See `rdt` in the [example ConfigMap spec](/sample-configs/cri-resmgr-configmap.example.yaml) for another example configuration.

### Dynamic Configuration

RDT supports dynamic configuration, i.e. the resctrl filesystem is reconfigured whenever a configuration update is received, e.g. via the [Node Agent](../node-agent.md). However, the configuration update is rejected if it is incompatible with the set of currently running containers - e.g. the new config is missing a class that a running container has been assigned to.
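To tie this together with the Class Assignment section above, a Pod could request a specific RDT class roughly as sketched below. The pod and container names are hypothetical; the class names come from the example configuration above:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: rdt-test   # hypothetical name
  annotations:
    # pod-level default RDT class for all containers
    rdtclass.cri-resource-manager.intel.com/pod: Burstable
    # container-specific override for container "cont1"
    rdtclass.cri-resource-manager.intel.com/container.cont1: Guaranteed
spec:
  containers:
    - name: cont1
      image: busybox
      command: ["sh", "-c", "sleep inf"]
```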
================================================
FILE: docs/policy/static-pools.md
================================================
# Static-Pools (STP) Policy

## Overview

The `static-pools` (STP) builtin policy was inspired by [CMK (CPU Manager for Kubernetes)][cmk]. It is an example policy demonstrating the capabilities of `cri-resource-manager` and is not considered production ready. Basically, the STP policy aims to replicate the functionality of the `cmk isolate` command of CMK. It also has compatibility features to function as a drop-in replacement in order to allow easier testing and prototyping.

Features:

- arbitrary number of configurable CPU list pools
- dynamic configuration updates via the [node agent](../node-agent.md)

Please see the documentation of [CMK][cmk] for a more detailed description of the terminology and functionality.

CMK compatibility features:

- supports the same environment variables as the original CMK, except for:
  - `CMK_LOCK_TIMEOUT` and `CMK_PROC_FS`: configuration variables that are not applicable in the cri-resmgr context
  - `CMK_LOG_LEVEL`: not implemented yet
  - `CMK_NUM_CORES`: not needed in cri-resmgr as we take this value directly from the container resource request
- supports the existing configuration directory format of CMK for retrieving the pool configuration
- parses the container command/args in an attempt to retrieve command line options of `cmk isolate`
- supports generating the CMK-specific node label and taint (off by default)

## Deployment

### Install cri-resmgr

Deploy cri-resmgr on each node as you would for any other policy. See [installation](../installation.md) for more details.

### Deploy Node Agent

The CRI-RM node agent is required in order to communicate with the Kubernetes control plane. In particular, the STP policy needs this capability for updating the extended resource (that represents exclusive cores) as well as managing the legacy CMK node annotation and taint. In addition, the node agent enables dynamic configuration updates.

See [node agent](../node-agent.md) for detailed instructions for set-up and usage.

### Deploy Admission Webhook

You need to run and enable the cri-resmgr mutating admission webhook which creates pod annotations consumed by CRI-RM. This is required so that the STP policy is able to inspect the extended resources (in this case, exclusive CPU cores) requested by containers. See the [webhook](../webhook.md) documentation for instructions on how to set it up.

## Configuration

The policy is configured using the yaml-based configuration system of CRI-RM. See [setup and usage](../setup.md#setting-up-cri-resource-manager) for more details on managing the configuration.

At minimum, you need to specify the active policy in the configuration. Policy-specific options control the pool configuration and the legacy node label and taint.

```yaml
policy:
  Active: static-pools
  static-pools:
    # Set to true to create CMK node label
    #LabelNode: false
    # Set to true to create CMK node taint
    #TaintNode: false
    ...
```

See the [sample configmap](/sample-configs/cri-resmgr-configmap.example.yaml) for a complete example containing all available configuration options.

If dynamic configuration via the [node agent](../node-agent.md) is in use, the policy options, including the pools configuration, may be altered at runtime. **NOTE**: the active policy (`policy.Active`) cannot be changed at runtime. In order to change the active policy, cri-resmgr needs to be restarted.

### Pools Configuration

There are three possible sources of the pools configuration, in decreasing priority order:

1. CRI-RM global config
1. stand-alone static-pools config file
1. CMK directory tree

The configuration is fully evaluated whenever a re-configuration event is received (e.g. from the node agent). Thus, a valid pools config appearing in the CRI-RM global config will take precedence over a directory tree based config that was previously active. Similarly, removing the pools config from the CRI-RM global config will make a local config (file or directory tree) effective.
**NOTE:** cri-resmgr does not have any utility for generating a pool configuration. Thus, you need to either write one manually, or run the `cmk init` command (of the original CMK) in order to create a legacy configuration directory structure.

#### Global Config

Configuration from the global CRI-RM config takes the highest precedence, if specified (under `policy.static-pools.pools`). A referential example:

```yaml
policy:
  static-pools:
    pools:
      exclusive:
        exclusive: true
        cpuLists:
        ...
      shared:
        cpuLists:
        ...
      infra:
        cpuLists:
        ...
```

#### Stand-alone YAML File

The path to a stand-alone configuration file can be specified with the `policy.static-pools.ConfFilePath` option (empty by default) in the CRI-RM global config:

```yaml
policy:
  static-pools:
    ConfFilePath: "/path/to/conf.yaml"
```

The format of the configuration file is similar to the pools config used in the global CRI-RM config. You can also see the [example config file](/sample-configs/static-pools-policy.conf.example) for a starting point.

#### CMK Directory Tree

The STP policy also supports the configuration directory format of the original CMK. It reads the configuration from a location specified by the `policy.static-pools.ConfFileDir` field (`/etc/cmk` by default) in the CRI-RM global config:

```yaml
policy:
  static-pools:
    ConfFileDir: "/etc/cmk"
```

### Debugging

In order to enable more verbose logging for the STP policy, set the `LOGGER_DEBUG=static-pools` environment variable or enable debug from the CRI-RM global config:

```yaml
logger:
  Debug: static-pools
```

## Running Workloads

The preferred way to specify the pod configuration is through environment variables. However, exclusive cores must be reserved by making a request of the `cmk.intel.com/exclusive-cores` extended resource. The name of the extended resource has a `cmk` prefix in order to provide backwards compatibility with the original CMK.

### Pod Configuration Using Env Variables

The following environment variables are recognized:

| Name            | Description                                                  |
| --------------- | ------------------------------------------------------------ |
| STP_NO_AFFINITY | Do not set cpu affinity. The workload is responsible for reading the `CMK_CPUS_ASSIGNED` environment variable and setting the affinity itself. |
| STP_POOL        | Name of the pool to run in. |
| STP_SOCKET_ID   | Socket where cores should be allocated. Set to -1 to accept any socket. |

An example Pod spec for running a workload in the `exclusive` pool with one core reserved from socket id 0:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: stp-test
spec:
  containers:
    - name: stp-test
      image: busybox
      env:
        - name: STP_POOL
          value: "exclusive"
        - name: STP_SOCKET_ID
          value: "0"
      command:
        - "sh"
        - "-c"
        - "while :; do echo ASSIGNED: $CMK_CPUS_ASSIGNED; sleep 1; done"
      resources:
        requests:
          cmk.intel.com/exclusive-cores: "1"
        limits:
          cmk.intel.com/exclusive-cores: "1"
```

### Backwards Compatibility for `cmk isolate`

The STP policy parses the container command/args in an attempt to retrieve the Pod configuration (from `cmk isolate` options). This is done to provide backwards compatibility with existing CMK workload specs. It manipulates the container command and args so that `cmk isolate` and all its arguments are removed. In the example below, the STP policy will run `sh -c "sleep 10000"` in the `infra` pool.
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: cmk-test
spec:
  containers:
    - name: cmk-test
      image: busybox
      command:
        - "sh"
        - "-c"
      args:
        - "/opt/bin/cmk isolate --conf-dir=/etc/cmk --pool=infra sleep 10000"
```

[cmk]: https://github.com/intel/CPU-Manager-for-Kubernetes

================================================
FILE: docs/policy/topology-aware.md
================================================
# Topology-Aware Policy

## Background

On server-grade hardware the CPU cores, I/O devices and other peripherals form a rather complex network together with the memory controllers, the I/O bus hierarchy and the CPU interconnect. When a combination of these resources is allocated to a single workload, the performance of that workload can vary greatly, depending on how efficiently data is transferred between them or, in other words, on how well the resources are aligned.

There are a number of inherent architectural hardware properties that, unless properly taken into account, can cause resource misalignment and workload performance degradation. There are a multitude of CPU cores available to run workloads. There are a multitude of memory controllers these workloads can use to store and retrieve data from main memory. There are a multitude of I/O devices attached to a number of I/O buses the same workloads can access. The CPU cores can be divided into a number of groups, with each group having different access latency and bandwidth to each memory controller and I/O device.

If a workload is not assigned to run with a properly aligned set of CPU, memory and devices, it will not be able to achieve optimal performance. Given the idiosyncrasies of hardware, allocating a properly aligned set of resources for optimal workload performance requires identifying and understanding the multiple dimensions of access latency locality present in hardware or, in other words, hardware topology awareness.

## Overview

The `topology-aware` policy automatically builds a tree of pools based on the detected hardware topology. Each pool has a set of CPUs and memory zones assigned as its resources. Resource allocation for workloads happens by first picking the pool which is considered the best fit for the resource requirements of the workload and then assigning CPU and memory from this pool.

The pool nodes at various depths from bottom to top represent the NUMA nodes, dies, sockets, and finally the whole of the system at the root node. Leaf NUMA nodes are assigned the memory behind their controllers / zones and the CPU cores with the smallest distance / access penalty to this memory. If the machine has multiple types of memory separately visible to both the kernel and user space, for instance both DRAM and [PMEM](https://www.intel.com/content/www/us/en/products/memory-storage/optane-dc-persistent-memory.html), each zone of a special memory type is assigned to the closest NUMA node pool.

Each non-leaf pool node in the tree is assigned the union of the resources of its children. So in practice, die nodes end up containing all the CPU cores and memory zones in the corresponding die, socket nodes end up containing the CPU cores and memory zones in the corresponding socket's dies, and the root node ends up containing all CPU cores and memory zones in all sockets.

With this setup, each pool in the tree has a topologically aligned set of CPU and memory resources. The amount of available resources gradually increases in the tree from bottom to top, while the strictness of alignment is gradually relaxed.
In other words, as one moves from bottom to top in the tree, it gets gradually easier to fit in a workload, but the price paid for this is a gradually increasing maximum potential cost or penalty for memory access and data transfer between CPU cores.

Another property of this setup is that the resource sets of sibling pools at the same depth in the tree are disjoint, while the resource sets of descendant pools along the same path in the tree partially overlap, with the intersection decreasing as the distance between pools increases. This makes it easy to isolate workloads from each other. As long as workloads are assigned to pools which have no common ancestor other than the root, the resources of these workloads should be as well isolated from each other as possible on the given hardware.

With such an arrangement, this policy should handle topology-aware alignment of resources without any special or extra configuration. When allocating resources, the policy

- filters out all pools with insufficient free capacity
- runs a scoring algorithm for the remaining ones
- picks the one with the best score
- assigns resources to the workload from there

Although the details of the scoring algorithm are subject to change as the implementation evolves, its basic principles are roughly

- prefer pools lower in the tree, IOW stricter alignment and lower latency
- prefer idle pools over busy ones, IOW more remaining free capacity and fewer workloads
- prefer pools with better overall device alignment

## Features

The `topology-aware` policy has the following features:

- topologically aligned allocation of CPU and memory
  * assign CPU and memory to workloads with the tightest available alignment
- aligned allocation of devices
  * pick pool for workload based on locality of devices already assigned
- shared allocation of CPU cores
  * assign workload to shared subset of pool CPUs
- exclusive allocation of CPU cores
  * dynamically slice off CPU cores from shared subset and assign to workload
- mixed allocation of CPU cores
  * assign both exclusive and shared CPU cores to workload
- discovering and using kernel-isolated CPU cores (['isolcpus'](https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists))
  * use kernel-isolated CPU cores for exclusively assigned CPU cores
- exposing assigned resources to workloads
- notifying workloads about changes in resource assignment
- dynamic relaxation of memory alignment to prevent OOM
  * dynamically widen workload memory set to avoid pool/workload OOM
- multi-tier memory allocation
  * assign workloads to memory zones of their preferred type
  * the policy knows about three kinds of memory:
    - DRAM is regular system main memory
    - PMEM is large-capacity memory, such as [Intel® Optane™ memory](https://www.intel.com/content/www/us/en/products/memory-storage/optane-dc-persistent-memory.html)
    - [HBM](https://en.wikipedia.org/wiki/High_Bandwidth_Memory) is high speed memory, typically found on some special-purpose computing systems
- cold start
  * pin workload exclusively to PMEM for an initial warm-up period
- dynamic page demotion
  * forcibly migrate read-only and idle container memory pages to PMEM

## Activating the Policy

You can activate the `topology-aware` policy by using the following configuration fragment in the configuration for `cri-resmgr`:

```yaml
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
```

## Configuring the Policy

The policy has a number of configuration options which affect its default behavior.
These options can be supplied as part of the [dynamic configuration](../setup.md#using-cri-resource-manager-agent-and-a-configmap) received via the [`node agent`](../node-agent.md), or in a fallback or forced [configuration file](../setup.md#using-a-local-configuration-from-a-file).

These configuration options are

- `PinCPU`
  * whether to pin workloads to assigned pool CPU sets
- `PinMemory`
  * whether to pin workloads to assigned pool memory zones
- `PreferIsolatedCPUs`
  * whether isolated CPUs are preferred by default for workloads that are eligible for exclusive CPU allocation
- `PreferSharedCPUs`
  * whether shared allocation is preferred by default for workloads that would otherwise be eligible for exclusive CPU allocation
- `ReservedPoolNamespaces`
  * list of extra namespaces (or glob patterns) that will be allocated to reserved CPUs
- `ColocatePods`
  * whether to try to allocate containers in a pod to the same or nearby topology pools
- `ColocateNamespaces`
  * whether to try to allocate containers in a namespace to the same or nearby topology pools
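As a concrete sketch, a configuration fragment setting several of these options could look like the following. The values are purely illustrative, not recommendations, and the namespace glob is hypothetical:

```yaml
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
  topology-aware:
    PinCPU: true
    PinMemory: true
    PreferIsolatedCPUs: true
    PreferSharedCPUs: false
    ReservedPoolNamespaces: ["monitoring-*"]   # hypothetical namespace glob
    ColocatePods: true
```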
## Policy CPU Allocation Preferences

There are a number of workload properties this policy actively checks to decide if the workload could potentially benefit from extra resource allocation optimizations. Unless configured differently, containers fulfilling certain corresponding criteria are considered eligible for these optimizations. This will be reflected in the assigned resources whenever that is possible at the time the container's creation / resource allocation request hits the policy.

The set of these extra optimizations consists of

- assignment of `kube-reserved` CPUs
- assignment of exclusively allocated CPU cores
- usage of kernel-isolated CPU cores (for exclusive allocation)

The policy uses a combination of the QoS class and the resource requirements of the container to decide if any of these extra allocation preferences should be applied. Containers are divided into five groups, with each group having a slightly different set of criteria for eligibility.

- `kube-system` group
  * all containers in the `kube-system` namespace
- `low-priority` group
  * containers in the `BestEffort` or `Burstable` QoS class
- `sub-core` group
  * Guaranteed QoS class containers with `CPU request < 1 CPU`
- `mixed` group
  * Guaranteed QoS class containers with `1 <= CPU request < 2`
- `multi-core` group
  * Guaranteed QoS class containers with `CPU request >= 2`

The eligibility rules for extra optimization are slightly different among these groups.

- `kube-system`
  * not eligible for extra optimizations
  * eligible to run on `kube-reserved` CPU cores
  * always run on shared CPU cores
- `low-priority`
  * not eligible for extra optimizations
  * always run on shared CPU cores
- `sub-core`
  * not eligible for extra optimizations
  * always run on shared CPU cores
- `mixed`
  * by default eligible for exclusive and isolated allocation
  * not eligible for either if `PreferSharedCPUs` is set to true
  * not eligible for either if annotated to opt out from exclusive allocation
  * not eligible for isolated allocation if annotated to opt out
- `multi-core`
  * CPU request fractional (`(CPU request % 1000 milli-CPU) != 0`):
    - by default not eligible for extra optimizations
    - eligible for exclusive and isolated allocation if annotated to opt in
  * CPU request not fractional:
    - by default eligible for exclusive allocation
    - by default not eligible for isolated allocation
    - not eligible for exclusive allocation if annotated to opt out
    - eligible for isolated allocation if annotated to opt in

Eligibility for kube-reserved CPU core allocation should always be possible to honor. If this is not the case, it is probably due to an incorrect configuration which underdeclares `ReservedResources`. In that case, ordinary shared CPU cores will be used instead of kube-reserved ones.

Eligibility for exclusive CPU allocation should always be possible to honor. Eligibility for isolated core allocation is only honored if there are enough isolated cores available to fulfill the exclusive part of the container's CPU request with isolated cores alone. Otherwise, ordinary CPUs will be allocated by slicing them off for exclusive usage from the shared subset of CPU cores in the container's assigned pool.

Containers in the kube-system group are pinned to share all kube-reserved CPU cores. Containers in the low-priority or sub-core groups, and containers which are only eligible for shared CPU core allocation in the mixed and multi-core groups, are all pinned to run on the shared subset of CPU cores in the container's assigned pool. This shared subset can and usually does change dynamically as exclusive CPU cores are allocated and released in the pool.

## Container CPU Allocation Preference Annotations

Containers can be annotated to diverge from the default CPU allocation preferences the policy would otherwise apply to them. These Pod annotations can be given with both per-pod and per-container resolution. If both of these exist for a container, the container-specific one takes precedence.

### Shared, Exclusive, and Isolated CPU Preference

A container can opt in to or opt out from shared CPU allocation using the following Pod annotation.

```yaml
metadata:
  annotations:
    # opt in container C1 to shared CPU core allocation
    prefer-shared-cpus.cri-resource-manager.intel.com/container.C1: "true"
    # opt in the whole pod to shared CPU core allocation
    prefer-shared-cpus.cri-resource-manager.intel.com/pod: "true"
    # selectively opt out container C2 from shared CPU core allocation
    prefer-shared-cpus.cri-resource-manager.intel.com/container.C2: "false"
```

Opting in to exclusive allocation happens by opting out from shared allocation, and opting out from exclusive allocation happens by opting in to shared allocation.

A container can opt in to or opt out from isolated exclusive CPU core allocation using the following Pod annotation.
```yaml
metadata:
  annotations:
    # opt in container C1 to isolated exclusive CPU core allocation
    prefer-isolated-cpus.cri-resource-manager.intel.com/container.C1: "true"
    # opt in the whole pod to isolated exclusive CPU core allocation
    prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"
    # selectively opt out container C2 from isolated exclusive CPU core allocation
    prefer-isolated-cpus.cri-resource-manager.intel.com/container.C2: "false"
```

These Pod annotations have no effect on containers which are not eligible for exclusive allocation.

### Implicit Hardware Topology Hints

`CRI Resource Manager` automatically generates HW `Topology Hints` for devices assigned to a container, prior to handing the container off to the active policy for resource allocation. The `topology-aware` policy is hint-aware and normally takes topology hints into account when picking the best pool to allocate resources from. Hints indicate optimal `HW locality` for device access and they can significantly alter which pool gets picked for a container.

Since device topology hints are implicitly generated, there are cases where one would like the policy to disregard them altogether. For instance, when a local volume is used by a container but not in any performance critical manner.

Containers can be annotated to opt out from and selectively opt in to hint-aware pool selection using the following Pod annotations.

```yaml
metadata:
  annotations:
    # only disregard hints for container C1
    topologyhints.cri-resource-manager.intel.com/container.C1: "false"
    # disregard hints for all containers by default
    topologyhints.cri-resource-manager.intel.com/pod: "false"
    # but take hints into account for container C2
    topologyhints.cri-resource-manager.intel.com/container.C2: "true"
```

Topology hint generation is globally enabled by default. Therefore, using the Pod annotation as an opt-in only has an effect when the whole pod is annotated to opt out from hint-aware pool selection.

### Implicit Topological Co-location for Pods and Namespaces

The `ColocatePods` and `ColocateNamespaces` configuration options control whether the policy will try to co-locate, that is allocate topologically close, containers within the same Pod or K8s namespace. Both of these options are false by default.

Setting them to true is a shorthand for adding to each container an affinity of weight 10 for all other containers in the same pod or namespace.

Containers with user-defined affinities are never extended with either of these co-location affinities. However, such containers can still have affinity effects on other containers that do get extended with co-location. Therefore, mixing user-defined affinities with implicit co-location requires both careful consideration and a thorough understanding of affinity evaluation, or it should be avoided altogether.

## Cold Start

The `topology-aware` policy supports "cold start" functionality. When cold start is enabled and the workload is allocated to a topology node with both DRAM and PMEM memory, the initial memory controller is only the PMEM controller. The DRAM controller is added to the workload only after the cold start timeout has expired. The effect of this is that large, allocated but initially unused memory areas do not need to be migrated to PMEM, because they were allocated there to begin with.
Cold start is configured like this in the pod metadata:

```yaml
metadata:
  annotations:
    memory-type.cri-resource-manager.intel.com/container.container1: dram,pmem
    cold-start.cri-resource-manager.intel.com/container.container1: |
      duration: 60s
```

Alternatively, you can use the following deprecated Pod annotation syntax to achieve the same, but support for this syntax is subject to be dropped in a future release:

```yaml
metadata:
  annotations:
    cri-resource-manager.intel.com/memory-type: |
      container1: dram,pmem
    cri-resource-manager.intel.com/cold-start: |
      container1:
        duration: 60s
```

In the above example, `container1` would initially be granted only the PMEM memory controller, but after 60 seconds the DRAM controller would be added to the container's memset.

## Dynamic Page Demotion

The `topology-aware` policy also supports dynamic page demotion. With dynamic demotion enabled, rarely-used pages are periodically moved from DRAM to PMEM for those workloads which are assigned to use both DRAM and PMEM memory types. The configuration for this feature is done using three configuration keys: `DirtyBitScanPeriod`, `PageMovePeriod`, and `PageMoveCount`. All of these parameters need to be set to non-zero values in order for dynamic page demotion to get enabled. See this configuration file fragment as an example:

```yaml
policy:
  Active: topology-aware
  topology-aware:
    DirtyBitScanPeriod: 10s
    PageMovePeriod: 2s
    PageMoveCount: 1000
```

In this setup, every pid in every container in every non-system pod fulfilling the memory type requirements would have its page ranges scanned for non-accessed pages every ten seconds. The results of the scan would be fed to a page-moving loop, which would attempt to move 1000 pages every two seconds from DRAM to PMEM.

## Container memory requests and limits

Due to inaccuracies in how `cri-resmgr` calculates memory requests for pods in the `Burstable` QoS class, you should either use `Limit` for setting the amount of memory for containers in `Burstable` pods, or run the [resource-annotating webhook](../webhook.md) to provide `cri-resmgr` with an exact copy of the resource requirements from the Pod Spec as an extra Pod annotation.

## Reserved pool namespaces

Users can mark certain namespaces to have a reserved CPU allocation. Containers belonging to such namespaces will only run on CPUs set aside according to the global CPU reservation, as configured by the `ReservedResources` configuration option in the policy section.

The `ReservedPoolNamespaces` option is a list of namespace globs that will be allocated to the reserved CPU class. For example:

```yaml
policy:
  Active: topology-aware
  topology-aware:
    ReservedPoolNamespaces: ["my-pool","reserved-*"]
```

In this setup, all the workloads in the `my-pool` namespace and in namespaces starting with the `reserved-` string are allocated to the reserved CPU class. The workloads in `kube-system` are automatically assigned to the reserved CPU class, so there is no need to mention `kube-system` in this list.

## Reserved CPU annotations

Users can mark certain pods and containers to have a reserved CPU allocation by using annotations. Containers having such an annotation will only run on CPUs set aside according to the global CPU reservation, as configured by the `ReservedResources` configuration option in the policy section.
For example:

```yaml
metadata:
  annotations:
    prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "true"
    prefer-reserved-cpus.cri-resource-manager.intel.com/container.special: "false"
```

================================================
FILE: docs/quick-start.md
================================================
# Quick-start

The following describes the minimum number of steps to get started with CRI Resource Manager.

## Pre-requisites

- containerd container runtime installed and running
- kubelet installed on your nodes

## Setup CRI-Resmgr

First, install and set up cri-resource-manager.

### Install package

#### Fedora\* and SUSE\*

```
CRIRM_VERSION=`curl -s "https://api.github.com/repos/intel/cri-resource-manager/releases/latest" | \
  jq .tag_name | tr -d '"v'`
source /etc/os-release
[ "$ID" = "sles" ] && export ID=opensuse-leap
sudo rpm -Uvh https://github.com/intel/cri-resource-manager/releases/download/v${CRIRM_VERSION}/cri-resource-manager-${CRIRM_VERSION}-0.${ID}-${VERSION_ID}.x86_64.rpm
```

#### Ubuntu\* and Debian\*

```
CRIRM_VERSION=`curl -s "https://api.github.com/repos/intel/cri-resource-manager/releases/latest" | \
  jq .tag_name | tr -d '"v'`
source /etc/os-release
pkg=cri-resource-manager_${CRIRM_VERSION}_${ID}-${VERSION_ID}_amd64.deb
curl -LO https://github.com/intel/cri-resource-manager/releases/download/v${CRIRM_VERSION}/${pkg}
sudo dpkg -i ${pkg}
rm ${pkg}
```

### Setup and verify

Create a configuration and start cri-resource-manager:

```
sudo cp /etc/cri-resmgr/fallback.cfg.sample /etc/cri-resmgr/fallback.cfg
sudo systemctl enable cri-resource-manager && sudo systemctl start cri-resource-manager
```

Check that cri-resource-manager is running:

```
systemctl status cri-resource-manager
```

## Kubelet setup

Next, you need to configure kubelet to use cri-resource-manager as its container runtime endpoint.

### Existing cluster

When integrating into an existing cluster, you need to change kubelet to use cri-resmgr instead of the existing container runtime (containerd is assumed here).

#### Fedora and SUSE

```
sudo sed '/KUBELET_EXTRA_ARGS/ s!$! --container-runtime-endpoint=/var/run/cri-resmgr/cri-resmgr.sock!' -i /etc/sysconfig/kubelet
sudo systemctl restart kubelet
```

#### Ubuntu and Debian

```
sudo sed '/KUBELET_EXTRA_ARGS/ s!$! --container-runtime-endpoint=/var/run/cri-resmgr/cri-resmgr.sock!' -i /etc/default/kubelet
sudo systemctl restart kubelet
```

### New Cluster

When in the process of setting up a new cluster, you simply point kubelet to the cri-resmgr CRI socket at cluster node setup time. Here's an example with kubeadm:

```
kubeadm join --cri-socket /var/run/cri-resmgr/cri-resmgr.sock \
  ...
```

## What Next

Congratulations, you now have cri-resource-manager running on your system and policing container resource allocations.
Next, you may want to see:

- [Installation](installation.md) for more installation options and detailed installation instructions
- [Setup](setup.md) for details on setup and usage
- [Node Agent](node-agent.md) for setting up cri-resmgr-agent for dynamic configuration and more
- [Webhook](webhook.md) for setting up our resource-annotating webhook
- [Support for Kata Containers\*](setup.md#kata-containers) for setting up CRI-RM with Kata Containers

================================================
FILE: docs/reference/agent-command-line-reference.md
================================================
# CRI-Resmgr-Agent Command-line Reference

***WORK IN PROGRESS***

================================================
FILE: docs/reference/configuration-reference.md
================================================
# Configuration Reference

## Configuration file

***WORK IN PROGRESS***

### `policy`

**Active** specifies the active policy.

```yaml
policy:
  Active: static
```

**AvailableResources** specifies the available hardware resources.

**ReservedResources** specifies the hardware resources reserved for system and kube tasks. Currently, only CPU resources are supported. CPUs may be specified as a cpuset or as a numerical value, similar to Kubernetes resource quantities. Not all policies use these configuration settings. See the policy-specific documentation for details.

```yaml
policy:
  AvailableResources:
    cpu: cpuset:0-63
  ReservedResources:
    cpu: cpuset:0-3
    # Alternative ways to specify CPUs:
    #cpu: 4
    #cpu: 4000m
```

### `policy.static`

**RelaxedIsolation** controls whether isolated CPUs are preferred for Guaranteed Pods.

```yaml
policy:
  static:
    RelaxedIsolation: true
```

### `policy.static-plus`

### `policy.topology-aware`

### `policy.static-pools`

### `policy.eda`

### `control`

### `control.blockio`

### `control.rdt`

### `blockio`

### `rdt`

### `instrumentation`

### `log`

### `dump`

================================================
FILE: docs/reference/index.rst
================================================
Reference
#########

.. toctree::
   :maxdepth: 1

   resmgr-command-line-reference.md
   agent-command-line-reference.md
   configuration-reference.md

================================================
FILE: docs/reference/resmgr-command-line-reference.md
================================================
# CRI-Resmgr Command-line Reference

***WORK IN PROGRESS***

================================================
FILE: docs/releases/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import os
# sys.path.insert(0, os.path.abspath('.'))

# -- Project information -----------------------------------------------------

project = 'CRI Resource Manager'
copyright = '2020, various'
author = 'various'

# Versions to show in the version menu
version = "all releases"

if os.getenv('VERSIONS_MENU'):
    html_context = {
        'versions_menu': True,
        'versions_menu_this_version': version}

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'myst_parser',
    'sphinx_markdown_tables'
]

source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown'
}

# Add any paths that contain templates here, relative to this directory.
templates_path = ['../_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

html_theme_options = {
    'display_version': True,
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
#html_static_path = ['_static']

# Callbacks for recommonmark
def setup(app):
    app.connect('missing-reference', ignoreMissingRefs)

def ignoreMissingRefs(app, env, node, contnode):
    return contnode

================================================
FILE: docs/releases/index.md
================================================
# Releases

For up-to-date user documentation, see the [documentation site](/cri-resource-manager).

## Documentation for Released Versions
================================================
FILE: docs/requirements.txt
================================================
sphinx==5.3.0
sphinx_rtd_theme
myst-parser==0.18.1
sphinx-markdown-tables
Pygments==2.15.1

================================================
FILE: docs/security.md
================================================
# Reporting a Potential Security Vulnerability

Please visit [intel.com/security](https://intel.com/security) to report security issues.

================================================
FILE: docs/setup.md
================================================
# Setup and Usage

If you want to give CRI Resource Manager a try, here is the list of things you need to do, assuming you already have a Kubernetes\* cluster up and running, using either `containerd` or `cri-o` as the runtime.

0. [Install](installation.md) CRI Resource Manager.
1. Set up kubelet to use CRI Resource Manager as the runtime.
2. Set up CRI Resource Manager to use the runtime with a policy.

For kubelet you do this by altering its command line options like this:

```
kubelet --container-runtime=remote \
    --container-runtime-endpoint=unix:///var/run/cri-resmgr/cri-resmgr.sock
```

For CRI Resource Manager, you need to provide a configuration file, and also a socket path if you don't use `containerd` or you run it with a different socket path.

```
# for containerd with default socket path
cri-resmgr --force-config <config-file> --runtime-socket unix:///var/run/containerd/containerd.sock

# for cri-o
cri-resmgr --force-config <config-file> --runtime-socket unix:///var/run/crio/crio.sock
```

The choice of policy to use along with any potential parameters specific to that policy are taken from the configuration file. You can take a look at the [sample configurations](/sample-configs) for some minimal/trivial examples. For instance, you can use [sample-configs/topology-aware-policy.cfg](/sample-configs/topology-aware-policy.cfg) as `<config-file>` to activate the topology aware policy with memory tiering support.

**NOTE**: Currently, the available policies are a work in progress.

## Setting up kubelet to use CRI Resource Manager as the runtime

To let CRI Resource Manager act as a proxy between kubelet and the CRI runtime, you need to configure kubelet to connect to CRI Resource Manager instead of the runtime. You do this by passing extra command line options to kubelet as shown below:

```
kubelet --container-runtime=remote \
    --container-runtime-endpoint=unix:///var/run/cri-resmgr/cri-resmgr.sock
```

## Setting up CRI Resource Manager

Setting up CRI Resource Manager involves pointing it to your runtime and providing it with a configuration. Pointing to the runtime is done using the `--runtime-socket <socket>` option and, optionally, the `--image-socket <socket>` option. For providing a configuration, there are two options:

1. use a local configuration YAML file
2. use the [CRI Resource Manager Node Agent][agent] and a `ConfigMap`

The former is easier to set up and it is also the preferred way to run CRI Resource Manager for development, and in some cases testing. Setting up the latter is a bit more involved, but it allows you to

- manage policy configuration for your cluster as a single source, and
- dynamically update that configuration

### Using a local configuration from a file

This is the easiest way to run CRI Resource Manager for development or testing. You can do it with the following command:

```
cri-resmgr --force-config <config-file> --runtime-socket <socket>
```

When started this way, CRI Resource Manager reads its configuration from the given file.
It does not fetch external configuration from the node agent and also disables the config interface for receiving configuration updates.

### Using CRI Resource Manager Agent and a ConfigMap

This setup requires an extra component, the [CRI Resource Manager Node Agent][agent], to monitor and fetch configuration from the ConfigMap and pass it on to CRI Resource Manager. By default, CRI Resource Manager automatically tries to use the agent to acquire configuration, unless you override this by forcing a static local configuration using the `--force-config <config-file>` option. When using the agent, it is also possible to provide an initial fallback configuration using the `--fallback-config <config-file>` option. This file is used before the very first configuration is successfully acquired from the agent.

Whenever a new configuration is acquired from the agent and successfully taken into use, this configuration is stored in the cache and becomes the default configuration to take into use the next time CRI Resource Manager is restarted (unless the `--force-config` option is used that time). While CRI Resource Manager is shut down, any cached configuration can be cleared from the cache using the `--reset-config` command line option.

See the [Node Agent][agent] documentation for how to set up and configure the agent.

### Changing the active policy

Currently, CRI Resource Manager disables changing the active policy using the [agent][agent]. That is, once the active policy is recorded in the cache, any configuration received through the agent that requests a different policy is rejected. This limitation will be removed in a future version of CRI Resource Manager.

However, by default CRI Resource Manager allows you to change policies during its startup phase. If you want to disable this, you can pass the command line option `--disable-policy-switch` to CRI Resource Manager.

If you run CRI Resource Manager with policy switching disabled, you can still switch policies by clearing any policy-specific data stored in the cache while CRI Resource Manager is shut down. You can do this by using the command line option `--reset-policy`. The whole sequence of switching policies this way is

- stop cri-resmgr (`systemctl stop cri-resource-manager`)
- reset policy data (`cri-resmgr --reset-policy`)
- change policy (`$EDITOR /etc/cri-resource-manager/fallback.cfg`)
- start cri-resmgr (`systemctl start cri-resource-manager`)

### Container adjustments

When the [agent][agent] is in use, it is also possible to `adjust` container `resource assignments` externally, using dedicated `Adjustment` `Custom Resources` in the `adjustments.criresmgr.intel.com` group. You can use the [provided schema](/pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml) to define the `Adjustment` resource. Then you can copy and modify the [sample adjustment CR](/sample-configs/external-adjustment.yaml) as a starting point to test some overrides.

An `Adjustment` consists of the following:

- `scope`:
  - the nodes and containers to which the adjustment applies
- adjustment data:
  - updated native/compute resources (`cpu`/`memory` `requests` and `limits`)
  - updated `RDT` and/or `Block I/O` class
  - updated top tier (practically now DRAM) memory limit

All adjustment data is optional. An adjustment can choose to set any or all of them as necessary.
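As a rough sketch, an `Adjustment` following this structure could look like the example below. Note that everything here beyond the API group and the concepts listed above is an assumption: the field names, the match operator, and all values are illustrative only, so consult the [provided schema](/pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml) and the [sample adjustment CR](/sample-configs/external-adjustment.yaml) for the authoritative layout:

```yaml
# Illustrative sketch only - field names are not taken from the real schema.
apiVersion: criresmgr.intel.com/v1alpha1
kind: Adjustment
metadata:
  name: adjustment-example         # hypothetical name
  namespace: kube-system
spec:
  scope:
    - nodes: ["worker-*"]          # node names, trailing wildcard allowed
      containers:                  # container match expression, as in affinity scopes
        key: pod/name
        operator: Matches
        values: ["my-pod-*"]
  resources:                       # updated compute resources
    requests:
      cpu: 750m
      memory: 500Mi
  rdtClass: Guaranteed             # updated RDT class
  blockioClass: HighPrio           # updated Block I/O class
  toptierLimit: 1Gi                # updated top tier memory limit
```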
The current handling of an adjustment update updates the resource assignments of containers, marks all existing containers as having pending changes in all controller domains, and then triggers a rebalancing in the active policy. This causes all containers to be updated.

The scope defines to which containers on what nodes the adjustment applies. Nodes are currently matched/picked by name, but a trailing wildcard (`*`) is allowed and matches all nodes with the given prefix in their names. Containers are matched by expressions. These are exactly the same as the expressions for defining [affinity scopes](policy/container-affinity.md). A single adjustment can specify multiple node/container match pairs.

An adjustment applies to all containers in its scope. If an adjustment/update results in conflicts for some container, that is, at least one container is in the scope of multiple adjustments, the adjustment is rejected and the whole update is ignored.

#### Commands for declaring, creating, deleting, and examining adjustments

You can declare the custom resource for adjustments with this command:

```
kubectl apply -f pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml
```

You can then add adjustments with a command like this:

```
kubectl apply -f sample-configs/external-adjustment.yaml
```

You can list existing adjustments with the following command. Use the correct `-n namespace` option according to the namespace you use for the agent, for the configuration, and in your adjustment specifications.

```
kubectl get adjustments.criresmgr.intel.com -n kube-system
```

You can examine the contents of a single adjustment with these commands:

```
kubectl describe adjustments external-adjustment -n kube-system
kubectl get adjustments.criresmgr.intel.com/<adjustment-name> -n kube-system -oyaml
```

Or you can examine the contents of all adjustments using this command:

```
kubectl get adjustments.criresmgr.intel.com -n kube-system -oyaml
```

Finally, you can delete an adjustment with commands like these:

```
kubectl delete -f sample-configs/external-adjustment.yaml
kubectl delete adjustments.criresmgr.intel.com/<adjustment-name> -n kube-system
```

The status of adjustment updates is propagated back to the `Adjustment` `Custom Resources`, more specifically into their `Status` fields. With the help of `jq`, you can easily examine the status of external adjustments using a command like this:

```
kli@r640-1:~> kubectl get -n kube-system adjustments.criresmgr.intel.com -ojson | jq '.items[].status'
{
  "nodes": {
    "r640-1": {
      "errors": {}
    }
  }
}
{
  "nodes": {
    "r640-1": {
      "errors": {}
    }
  }
}
```

The above response is what you get for adjustments applied without conflicts or errors. You can see here that node *r640-1* is in the scope of both of your existing adjustments, and that both applied without errors. If your adjustments resulted in errors, the output will look something like this:

```
klitkey1@r640-1:~> kubectl get -n kube-system adjustments.criresmgr.intel.com -ojson | jq '.items[].status'
{
  "nodes": {
    "r640-1": {
      "errors": {
        "b71a93523e58cb4ba0310aa225b2e2a329cef895ca4b96fcd9d12b375337ea35": "cache: conflicting adjustments for my-pod-r640-1:my-container: adjustment-1,adjustment-2"
      }
    }
  }
}
{
  "nodes": {
    "r640-1": {
      "errors": {
        "b71a93523e58cb4ba0310aa225b2e2a329cef895ca4b96fcd9d12b375337ea35": "cache: conflicting adjustments for my-pod-r640-1:my-container: adjustment-1,adjustment-2"
      }
    }
  }
}
```

In the sample above, you can see that on node *r640-1* the container with ID *b71a93523e58cb4ba0310aa225b2e2a329cef895ca4b96fcd9d12b375337ea35*, or *my-container* of *my-pod-r640-1*, had a conflict. Moreover, you can see that the reason for the conflict is that the container is in the scope of both *adjustment-1* and *adjustment-2*.
You can now fix those adjustments to resolve/remove the conflict, then reapply the adjustments, and finally verify that the conflicts are gone.

```
kli@r640-1:~> $EDITOR adjustment-1.yaml adjustment-2.yaml
kli@r640-1:~> kubectl apply -f adjustment-1.yaml && kubectl apply -f adjustment-2.yaml && sleep 2
kli@r640-1:~> kubectl get -n kube-system adjustments.criresmgr.intel.com -ojson | jq '.items[].status'
{
  "nodes": {
    "r640-1": {
      "errors": {}
    }
  }
}
{
  "nodes": {
    "r640-1": {
      "errors": {}
    }
  }
}
```

## Using CRI Resource Manager as a message dumper

You can use CRI Resource Manager to simply inspect all proxied CRI requests and responses without applying any policy. Run CRI Resource Manager with the provided [sample configuration](/sample-configs/cri-full-message-dump.cfg) to do this.

## Kata Containers

[Kata Containers](https://katacontainers.io/) is an open source container runtime, building lightweight virtual machines that seamlessly plug into the containers ecosystem.

In order to enable Kata Containers in a Kubernetes-CRI-RM stack, both Kubernetes and the Container Runtime need to be aware of the new runtime environment:

* The Container Runtime can only be CRI-O or containerd, and needs to have the Kata runtimes enabled in their configuration files.
* Kubernetes must be made aware of the CRI-O/containerd runtimes via a "RuntimeClass" [resource](https://kubernetes.io/docs/concepts/containers/runtime-class/), as sketched below.
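For the second point, a minimal `RuntimeClass` resource could look like the following sketch. The `handler` value is an assumption here; it must match the runtime name actually configured in CRI-O/containerd:

```yaml
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: kata
handler: kata   # must match the runtime name configured in CRI-O/containerd
```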
### Reference

If you have a pre-existing Kubernetes cluster, follow this [document](https://github.com/kata-containers/packaging/blob/master/kata-deploy/README.md#kubernetes-quick-start) for an easy deployment.

Starting from scratch:

* [Kata installation guide](https://github.com/kata-containers/kata-containers/tree/2.0-dev/docs/install#manual-installation)
* [Kata Containers + CRI-O](https://github.com/kata-containers/documentation/blob/master/how-to/run-kata-with-k8s.md)
* [Kata Containers + containerd](https://github.com/kata-containers/documentation/blob/master/how-to/containerd-kata.md)
* [Kubernetes Runtime Class](https://kubernetes.io/docs/concepts/containers/runtime-class/)
* [Cgroup and Kata containers](https://github.com/kata-containers/kata-containers/blob/stable-2.0.0/docs/design/host-cgroups.md)

## Running with Untested Runtimes

CRI Resource Manager is tested with `containerd` and `CRI-O`. If any other runtime is detected during startup, `cri-resmgr` will refuse to start. This default behavior can be changed using the `--allow-untested-runtimes` command line option.

## Logging and debugging

You can control logging with the klog command line options or by setting the corresponding environment variables. You can get the name of the environment variable for a command line option by prepending the `LOGGER_` prefix to the capitalized option name without any leading dashes. For instance, setting the environment variable `LOGGER_SKIP_HEADERS=true` has the same effect as using the `-skip_headers` command line option. Additionally, the `LOGGER_DEBUG` environment variable controls debug logs. These are globally disabled by default. You can turn on full debugging by setting `LOGGER_DEBUG='*'`.

When using environment variables, be careful about which configuration you pass to CRI Resource Manager using a file or ConfigMap. The environment is treated as the default configuration, but a file or a ConfigMap has higher precedence. If something is configured in both, the environment is only in effect until the configuration is applied. However, in such a case, if you later push an updated configuration to CRI Resource Manager with the overlapping settings removed, the original ones from the environment take effect again.

For debug logs, the settings from the configuration are applied in addition to any settings in the environment. That said, if you turn something on in the environment but off in the configuration, it will eventually be turned off.
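As an example of this naming scheme, the one-off invocation below is a sketch, assuming `cri-resmgr` is on `PATH`; with the systemd service the same variables would typically go into the environment/sysconfig file shipped with the packaging instead.

```bash
# LOGGER_DEBUG takes '*' for everything or a list of debug sources;
# the source names used here are illustrative.
LOGGER_DEBUG='resource-manager,cache' LOGGER_SKIP_HEADERS=true cri-resmgr
```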
[agent]: node-agent.md


================================================
FILE: docs/webhook.md
================================================
# Webhook

By default, CRI Resource Manager does not see the original container *resource requirements* specified in the *Pod Spec*. It tries to calculate these for `cpu` and `memory` *compute resources* using the related parameters present in the CRI container creation request. The resulting estimates are normally accurate for `cpu`, and also for `memory` `limits`. However, it is not possible to use these parameters to estimate `memory` `requests` or any *extended resources*.

If you want to make sure that CRI Resource Manager uses the original *Pod Spec* *resource requirements*, you need to duplicate these as *annotations* on the Pod. This is necessary if you plan on using or writing a policy which needs *extended resources*.

This process can be fully automated using the [CRI Resource Manager Annotating Webhook](/cmd/cri-resmgr-webhook). Once you have built the Docker\* image for it using the [provided Dockerfile](/cmd/cri-resmgr-webhook/Dockerfile) and published it, you can set up the webhook as follows:

- Fill in the `IMAGE_PLACEHOLDER` in [webhook-deployment.yaml](/cmd/cri-resmgr-webhook/webhook-deployment.yaml) to match the image.
- Create a `cri-resmgr-webhook-secret` that carries a key and a certificate to `cri-resmgr-webhook`. You can create a key, a self-signed certificate, and the secret that holds them with the following commands:

```bash
SVC=cri-resmgr-webhook
NS=cri-resmgr
openssl req -x509 -newkey rsa:2048 -sha256 -days 365 -nodes \
  -keyout cmd/cri-resmgr-webhook/server-key.pem \
  -out cmd/cri-resmgr-webhook/server-crt.pem \
  -subj "/CN=$SVC.$NS.svc" \
  -addext "subjectAltName=DNS:$SVC,DNS:$SVC.$NS,DNS:$SVC.$NS.svc"
cat >cmd/cri-resmgr-webhook/webhook-secret.yaml <<EOF
...
EOF
```
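Before wiring the webhook up, the generated key material can be sanity-checked; a quick sketch, assuming the `NS` value and secret name used above:

```bash
# The certificate should carry the service DNS names in its SANs...
openssl x509 -in cmd/cri-resmgr-webhook/server-crt.pem -noout -subject -ext subjectAltName
# ...and the secret should exist in the webhook's namespace.
kubectl get secret cri-resmgr-webhook-secret -n cri-resmgr
```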
================================================
FILE: elf/avx512.c
================================================
#include <linux/bpf.h>
#include <asm/fpu/types.h>

#define SEC(NAME) __attribute__((section(NAME), used))

#ifndef KERNEL_VERSION
#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
#endif

#define BUF_SIZE_MAP_NS 256

typedef struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
	unsigned int map_flags;
	unsigned int pinning;
	char namespace[BUF_SIZE_MAP_NS];
} bpf_map_def;

/* BPF helper stubs, resolved to the corresponding helper ids by the loader. */
static int (*bpf_probe_read)(void *dst, u64 size, const void *unsafe_ptr) =
	(void *)BPF_FUNC_probe_read;
static u64 (*bpf_get_current_cgroup_id)(void) =
	(void *)BPF_FUNC_get_current_cgroup_id;
static u64 (*bpf_ktime_get_ns)(void) =
	(void *)BPF_FUNC_ktime_get_ns;
static int (*bpf_map_update_elem)(void *map, void *key, void *value, u64 flags) =
	(void *)BPF_FUNC_map_update_elem;
static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	(void *)BPF_FUNC_map_lookup_elem;

/* All maps below are keyed by cgroup id. */
struct bpf_map_def SEC("maps/all_context_switch_count") all_context_switch_count_hash = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u64),
	.value_size = sizeof(u32),
	.max_entries = 1024,
};

struct bpf_map_def SEC("maps/avx_context_switch_count") avx_context_switch_count_hash = {
	.type = BPF_MAP_TYPE_PERCPU_HASH,
	.key_size = sizeof(u64),
	.value_size = sizeof(u32),
	.max_entries = 1024,
};

struct bpf_map_def SEC("maps/avx_timestamp") avx_timestamp_hash = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u64),
	.value_size = sizeof(u32),
	.max_entries = 1024,
};

struct bpf_map_def SEC("maps/last_update_ns") last_update_ns_hash = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u64),
	.value_size = sizeof(u64),
	.max_entries = 1024,
};

SEC("tracepoint/sched/sched_switch")
int tracepoint__sched_switch(void *args)
{
	u64 cgroup_id = bpf_get_current_cgroup_id();
	u32 *count, *found;
	u32 new_count = 1;

	found = bpf_map_lookup_elem(&avx_context_switch_count_hash, &cgroup_id);
	/* store sched_switch counts only for cgroups that have AVX activity */
	if (!found) {
		return 0;
	}

	count = bpf_map_lookup_elem(&all_context_switch_count_hash, &cgroup_id);
	if (count) {
		__sync_fetch_and_add(count, 1);
	} else {
		bpf_map_update_elem(&all_context_switch_count_hash, &cgroup_id, &new_count, BPF_ANY);
	}

	return 0;
}

struct x86_fpu_args {
	u64 pad;
	struct fpu *fpu;
	bool load_fpu;
	u64 xfeatures;
	u64 xcomp_bv;
};

SEC("tracepoint/x86_fpu/x86_fpu_regs_deactivated")
int tracepoint__x86_fpu_regs_deactivated(struct x86_fpu_args *args)
{
	u32 *counter;
	u32 ts;

	bpf_probe_read(&ts, sizeof(u32), (void *)&args->fpu->avx512_timestamp);
	/* a zero timestamp means the task has never used AVX-512 */
	if (ts == 0) {
		return 0;
	}

	u64 cgroup_id = bpf_get_current_cgroup_id();
	u32 ts_prev;
	u32 *tsp;

	tsp = bpf_map_lookup_elem(&avx_timestamp_hash, &cgroup_id);
	ts_prev = tsp ? *tsp : 0;
	/* count only switches where the AVX-512 timestamp has advanced */
	if (ts == ts_prev) {
		return 0;
	}
	bpf_map_update_elem(&avx_timestamp_hash, &cgroup_id, &ts, BPF_ANY);

	u32 count = 1;
	counter = bpf_map_lookup_elem(&avx_context_switch_count_hash, &cgroup_id);
	if (counter) {
		__sync_fetch_and_add(counter, 1);
	} else {
		bpf_map_update_elem(&avx_context_switch_count_hash, &cgroup_id, &count, BPF_ANY);
	}

	u64 last = bpf_ktime_get_ns();
	bpf_map_update_elem(&last_update_ns_hash, &cgroup_id, &last, BPF_ANY);

	return 0;
}

char _license[] SEC("license") = "GPL";

/* Notes about the Linux version:
 * We don't check LINUX_VERSION_CODE at build time. It's the user's
 * responsibility to provide new enough headers. Build failures may happen
 * due to too old kernel headers (currently, Linux >= 5.1 headers are needed).
 * Our dependency on the kernel ABI is the x86_fpu tracepoint parameters and
 * struct fpu.
 * The host kernel needs to run Linux >= 5.2 and the version is checked upon
 * eBPF loading. We store the minimum supported version in the SEC("version")
 * section. A maximum supported version is not checked, but the check may be
 * added later.
 */
unsigned int _version SEC("version") = KERNEL_VERSION(5, 2, 0);
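/* A quick host-side sketch of the two runtime requirements noted above
 * (on older systems tracefs may be mounted under /sys/kernel/debug/tracing
 * instead of /sys/kernel/tracing):
 *
 *   uname -r   # must report >= 5.2, matching the SEC("version") value
 *   ls /sys/kernel/tracing/events/x86_fpu/x86_fpu_regs_deactivated
 */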
================================================
FILE: go.mod
================================================
module github.com/intel/cri-resource-manager

go 1.24

require (
	contrib.go.opencensus.io/exporter/jaeger v0.2.1
	contrib.go.opencensus.io/exporter/prometheus v0.4.2
	github.com/cilium/ebpf v0.12.3
	github.com/evanphx/json-patch v5.7.0+incompatible
	github.com/google/go-cmp v0.6.0
	github.com/intel/cri-resource-manager/pkg/topology v0.0.0
	github.com/intel/goresctrl v0.5.0
	github.com/pkg/errors v0.9.1
	github.com/prometheus/client_golang v1.18.0
	github.com/prometheus/client_model v0.5.0
	github.com/prometheus/common v0.45.0
	github.com/shurcooL/vfsgen v0.0.0-20230704071429-0000e147ea92
	github.com/stretchr/testify v1.8.4
	go.opencensus.io v0.24.0
	golang.org/x/sys v0.31.0
	golang.org/x/time v0.5.0
	google.golang.org/grpc v1.60.1
	google.golang.org/protobuf v1.33.0
	k8s.io/api v0.29.0
	k8s.io/apimachinery v0.29.0
	k8s.io/client-go v0.29.0
	k8s.io/cri-api v0.29.0
	k8s.io/klog/v2 v2.110.1
	k8s.io/utils v0.0.0-20240102154912-e7106e64919e
	sigs.k8s.io/yaml v1.4.0
)

require (
	github.com/beorn7/perks v1.0.1 // indirect
	github.com/cespare/xxhash/v2 v2.2.0 // indirect
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
	github.com/go-kit/log v0.2.1 // indirect
	github.com/go-logfmt/logfmt v0.6.0 // indirect
	github.com/go-logr/logr v1.3.0 // indirect
	github.com/go-openapi/jsonpointer v0.19.6 // indirect
	github.com/go-openapi/jsonreference v0.20.2 // indirect
	github.com/go-openapi/swag v0.22.3 // indirect
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
	github.com/golang/protobuf v1.5.3 // indirect
	github.com/google/gnostic-models v0.6.8 // indirect
	github.com/google/gofuzz v1.2.0 // indirect
	github.com/google/uuid v1.5.0 // indirect
	github.com/imdario/mergo v0.3.12 // indirect
	github.com/josharian/intern v1.0.0 // indirect
	github.com/json-iterator/go v1.1.12 // indirect
	github.com/mailru/easyjson v0.7.7 // indirect
	github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
	github.com/modern-go/reflect2 v1.0.2 // indirect
	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
	github.com/pmezard/go-difflib v1.0.0 // indirect
	github.com/prometheus/procfs v0.12.0 // indirect
	github.com/prometheus/statsd_exporter v0.26.0 // indirect
	github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c // indirect
	github.com/spf13/pflag v1.0.5 // indirect
	github.com/uber/jaeger-client-go v2.25.0+incompatible // indirect
	golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc // indirect
	golang.org/x/net v0.38.0 // indirect
	golang.org/x/oauth2 v0.27.0 // indirect
	golang.org/x/sync v0.12.0 // indirect
	golang.org/x/term v0.30.0 // indirect
	golang.org/x/text v0.23.0 // indirect
	google.golang.org/api v0.155.0 // indirect
	google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917 // indirect
	gopkg.in/inf.v0 v0.9.1 // indirect
	gopkg.in/yaml.v2 v2.4.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
	k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
	sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
	sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
)

replace (
	github.com/intel/cri-resource-manager/pkg/topology v0.0.0 => ./pkg/topology
	go.opentelemetry.io/contrib => go.opentelemetry.io/contrib v0.20.0
	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc => go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.20.0
	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp => go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.20.0
	go.opentelemetry.io/otel => go.opentelemetry.io/otel v0.20.0
	go.opentelemetry.io/otel/exporters/otlp => go.opentelemetry.io/otel/exporters/otlp v0.20.0
	go.opentelemetry.io/otel/metric => go.opentelemetry.io/otel/metric v0.20.0
	go.opentelemetry.io/otel/oteltest => go.opentelemetry.io/otel/oteltest v0.20.0
	go.opentelemetry.io/otel/sdk => go.opentelemetry.io/otel/sdk v0.20.0
	go.opentelemetry.io/otel/sdk/export/metric => go.opentelemetry.io/otel/sdk/export/metric v0.20.0
	go.opentelemetry.io/otel/sdk/metric => go.opentelemetry.io/otel/sdk/metric v0.20.0
	go.opentelemetry.io/otel/trace => go.opentelemetry.io/otel/trace v0.20.0
	k8s.io/api => k8s.io/api v0.29.0
	k8s.io/apimachinery => k8s.io/apimachinery v0.29.0
	k8s.io/apiserver => k8s.io/apiserver v0.29.0
	k8s.io/client-go => k8s.io/client-go v0.29.0
	k8s.io/component-base => k8s.io/component-base v0.29.0
	k8s.io/cri-api => k8s.io/cri-api v0.29.0
)

================================================
FILE: go.sum
================================================
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU=
cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU=
cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY=
cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc=
cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0=
cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To=
cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4=
cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M=
cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc=
cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk=
cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs=
cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc=
cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= contrib.go.opencensus.io/exporter/jaeger v0.2.1 h1:yGBYzYMewVL0yO9qqJv3Z5+IRhPdU7e9o/2oKpX4YvI= contrib.go.opencensus.io/exporter/jaeger v0.2.1/go.mod h1:Y8IsLgdxqh1QxYxPC5IgXVmBaeLUeQFfBeBi9PbeZd0= contrib.go.opencensus.io/exporter/prometheus v0.4.2 h1:sqfsYl5GIY/L570iT+l93ehxaWJs2/OwXtiWwew3oAg= contrib.go.opencensus.io/exporter/prometheus v0.4.2/go.mod h1:dvEHbiKmgvbr5pjaF9fpw1KeYcjrnC1J8B+JKjsZyRQ= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= 
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/cilium/ebpf v0.12.3 h1:8ht6F9MquybnY97at+VDZb3eQQr8ev79RueWeVaEcG4= github.com/cilium/ebpf v0.12.3/go.mod h1:TctK1ivibvI3znr66ljgi4hqOT8EYQjz1KWBfb1UVgM= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= github.com/go-kit/log v0.2.0/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU= github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-logfmt/logfmt 
v0.5.1/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod 
h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod 
h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.12 h1:b6R2BslTbIEToALKP7LxUvijTsNI9TAe80pLWN2g/HU= github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= github.com/intel/goresctrl v0.5.0 h1:kcDhjE3ZF/mNrJuRzLS3LY2Hp6atFaF1XVFBT7SVL2g= github.com/intel/goresctrl v0.5.0/go.mod h1:mIe63ggylWYr0cU/l8n11FAkesqfvuP3oktIsxvu0T0= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 
h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/onsi/ginkgo/v2 v2.13.0 h1:0jY9lJquiL8fcf3M4LAXN5aMlS/b2BV86HFFPCPMgE4= github.com/onsi/ginkgo/v2 v2.13.0/go.mod h1:TE309ZR8s5FsKKpuB1YAQYBzCaAfUgatB/xlT/ETL/o= github.com/onsi/gomega v1.29.0 h1:KIA/t2t5UBzoirT4H9tsML45GEbo3ouUnBHsCfD2tVg= github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= github.com/prometheus/client_golang 
v1.12.2/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= github.com/prometheus/client_golang v1.13.0/go.mod h1:vTeo+zgvILHsnnj/39Ou/1fPN5nJFOEMgftOUOmlvYQ= github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= github.com/prometheus/client_golang v1.18.0/go.mod h1:T+GXkCk5wSJyOqMIzVgvvjFDlkOQntgjkJWKrN5txjA= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= github.com/prometheus/common v0.35.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/common v0.45.0 h1:2BGz0eBc2hdMDLnO/8n0jeB3oPrt2D08CekT0lneoxM= github.com/prometheus/common v0.45.0/go.mod h1:YJmSTw9BoKxJplESWWxlbyttQR4uaEcGyv9MZjVOJsY= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= github.com/prometheus/statsd_exporter v0.22.7/go.mod h1:N/TevpjkIh9ccs6nuzY3jQn9dFqnUakOjnEuMPJJJnI= github.com/prometheus/statsd_exporter v0.26.0 h1:SQl3M6suC6NWQYEzOvIv+EF6dAMYEqIuZy+o4H9F5Ig= github.com/prometheus/statsd_exporter v0.26.0/go.mod h1:GXFLADOmBTVDrHc7b04nX8ooq3azG61pnECNqT7O5DM= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c h1:aqg5Vm5dwtvL+YgDpBcK1ITf3o96N/K7/wsRXQnUTEs= github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c/go.mod h1:owqhoLW1qZoYLZzLnBw+QkPP9WZnjlSWihhxAJC1+/M= github.com/shurcooL/vfsgen v0.0.0-20230704071429-0000e147ea92 h1:OfRzdxCzDhp+rsKWXuOO2I/quKMJ/+TQwVbIP/gltZg= github.com/shurcooL/vfsgen v0.0.0-20230704071429-0000e147ea92/go.mod h1:7/OT02F6S6I7v6WXb+IjhMuZEYfH/RJ5RwEWnEo5BMg= github.com/sirupsen/logrus 
v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stvp/go-udp-testing v0.0.0-20201019212854-469649b16807/go.mod h1:7jxmlfBCDBXRzr0eAQJ48XC1hBu1np4CS5+cHEYfwpc= github.com/uber/jaeger-client-go v2.25.0+incompatible h1:IxcNZ7WRY1Y3G4poYlx24szfsn/3LvK9QHCq9oQw8+U= github.com/uber/jaeger-client-go v2.25.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/exp 
v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc h1:ao2WRsKSzW6KuUY9IWPwWahcHCgR0s52IfwutMfEbdM= golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod 
h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220708085239-5a0f0661e09d/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= golang.org/x/term v0.30.0/go.mod 
h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod 
h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.15.0/go.mod 
h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= google.golang.org/api v0.155.0 h1:vBmGhCYs0djJttDNynWo44zosHlPvHmA0XiN2zP2DtA= google.golang.org/api v0.155.0/go.mod h1:GI5qK5f40kCpHfPn6+YzGAByIKWv8ujFnmoWm7Igduk= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto 
v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917 h1:6G8oQ016D88m1xAKljMlBOOGWDZkes4kMhgGFlf8WcQ= google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917/go.mod h1:xtjpI3tXFPP051KaWnhvxkiubL/6dJ18vLVf7q2pTOU= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= google.golang.org/grpc v1.60.1 h1:26+wFr+cNqSGFcOXcabYC0lUVJVRa2Sb2ortSK7VrEU= google.golang.org/grpc v1.60.1/go.mod h1:OlCHIeLYqSSsLi6i49B5QGdzaMZK9+M7LXN2FKz4eGM= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= google.golang.org/protobuf v1.22.0/go.mod 
h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= k8s.io/api v0.29.0 h1:NiCdQMY1QOp1H8lfRyeEf8eOwV6+0xA6XEE44ohDX2A= k8s.io/api v0.29.0/go.mod 
h1:sdVmXoz2Bo/cb77Pxi71IPTSErEW32xa4aXwKH7gfBA= k8s.io/apimachinery v0.29.0 h1:+ACVktwyicPz0oc6MTMLwa2Pw3ouLAfAon1wPLtG48o= k8s.io/apimachinery v0.29.0/go.mod h1:eVBxQ/cwiJxH58eK/jd/vAk4mrxmVlnpBH5J2GbMeis= k8s.io/client-go v0.29.0 h1:KmlDtFcrdUzOYrBhXHgKw5ycWzc3ryPX5mQe0SkG3y8= k8s.io/client-go v0.29.0/go.mod h1:yLkXH4HKMAywcrD82KMSmfYg2DlE8mepPR4JGSo5n38= k8s.io/cri-api v0.29.0 h1:atenAqOltRsFqcCQlFFpDnl/R4aGfOELoNLTDJfd7t8= k8s.io/cri-api v0.29.0/go.mod h1:Rls2JoVwfC7kW3tndm7267kriuRukQ02qfht0PCRuIc= k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0= k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo= k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780= k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA= k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ= k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= ================================================ FILE: packaging/deb.in/changelog ================================================ __PACKAGE__ (__VERSION__) unstable; urgency=low * Release build of __PACKAGE__ __VERSION__ for debian/ubuntu. 
-- __AUTHOR__ <__EMAIL__> __DATE__ ================================================ FILE: packaging/deb.in/compat ================================================ 11 ================================================ FILE: packaging/deb.in/control ================================================ Source: __PACKAGE__ Maintainer: __AUTHOR__ <__EMAIL__> Package: __PACKAGE__ Architecture: any Description: A CRI Proxy for Hardware Resource Management ================================================ FILE: packaging/deb.in/rules ================================================ #!/usr/bin/make -f #-*- make -*- DISTRIBUTION = $(shell sed -n "s/^VERSION_CODENAME=//p" /etc/os-release) VERSION = __VERSION__ PACKAGEVERSION = $(VERSION) TARBALL = __TARBALL__ URL = http://github.com/intel/cri-resource-manager %: dh $@ override_dh_auto_clean: override_dh_auto_test: override_dh_auto_build: override_dh_auto_install: export PATH="$$PATH:$$(go env GOPATH)/bin"; \ make BUILD_DIRS=cri-resmgr install DESTDIR=debian/__PACKAGE__ make BUILD_DIRS=cri-resmgr install-licenses DESTDIR=debian/__PACKAGE__/usr/share/doc/__PACKAGE__ cp README.md docs/*.md cmd/*/*.sample \ debian/__PACKAGE__/usr/share/doc/__PACKAGE__ override_dh_gencontrol: dh_gencontrol -- -v$(PACKAGEVERSION) ================================================ FILE: packaging/rpm/cri-resource-manager.spec.in ================================================ Name: cri-resource-manager Version: __VERSION__ Release: 0 Summary: CRI Resource Manager, a CRI proxy with various in-node workload placement policies License: ASL 2.0 URL: https://github.com/intel/cri-resource-manager Source0: https://github.com/intel/cri-resource-manager/archive/cri-resource-manager-__TARVERSION__.tar.gz BuildRequires: coreutils, make, kernel-devel # Disable the building of debug package(s). %define debug_package %{nil} %description Kubernetes Container Runtime Interface proxy service with hardware resource aware workload placement policies. %prep %setup -q -n cri-resource-manager-__TARVERSION__ %build make build BUILD_DIRS=cri-resmgr make install-licenses BUILD_DIRS=cri-resmgr DESTDIR=. %install %make_install UNITDIR=%{_unitdir} SYSCONFDIR=%{_sysconfdir} BUILD_DIRS=cri-resmgr install -m 0700 -d %{?buildroot}%{_sharedstatedir}/cri-resmgr %files %defattr(-,root,root,-) %{_bindir}/* %{_sysconfdir}/sysconfig/* %{_unitdir}/* %dir %attr(0700,root,root) %{_sharedstatedir}/cri-resmgr %dir %attr(0700,root,root) %{_sysconfdir}/cri-resmgr %config(noreplace) %{_sysconfdir}/cri-resmgr/* %license licenses/cri-resmgr/* %doc README.md docs/*.md %doc cmd/*/*.sample ================================================ FILE: pkg/agent/agent.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package agent import ( "fmt" "github.com/intel/cri-resource-manager/pkg/log" k8sclient "k8s.io/client-go/kubernetes" resmgrcs "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" ) // configInterface provides access to the cri-resmgr configuration and the last configuration error type configInterface interface { getConfig() resmgrConfig getError() error } // resmgrConfig represents cri-resmgr configuration type resmgrConfig map[string]string // resmgrAdjustment represents external adjustments for the resource-manager type resmgrAdjustment map[string]*resmgr.Adjustment // resmgrStatus represents the status of an external adjustment update type resmgrStatus struct { request error errors map[string]string } // ResourceManagerAgent is the interface exposed for the CRI Resource Manager Config Agent type ResourceManagerAgent interface { Run() error } // agent implements ResourceManagerAgent type agent struct { log.Logger // Our logging interface cli *k8sclient.Clientset // K8s client extCli *resmgrcs.CriresmgrV1alpha1Client server agentServer // gRPC server listening for requests from cri-resource-manager watcher k8sWatcher // Watcher monitoring events in K8s cluster updater configUpdater // Client sending config updates to cri-resource-manager } // NewResourceManagerAgent creates a new instance of ResourceManagerAgent func NewResourceManagerAgent() (ResourceManagerAgent, error) { var err error a := &agent{ Logger: log.NewLogger("resource-manager-agent"), } if a.cli, a.extCli, err = a.getK8sClient(opts.kubeconfig); err != nil { return nil, agentError("failed to get k8s client: %v", err) } if a.watcher, err = newK8sWatcher(a.cli, a.extCli); err != nil { return nil, agentError("failed to initialize watcher instance: %v", err) } if a.server, err = newAgentServer(a.cli, a); err != nil { return nil, agentError("failed to initialize gRPC server: %v", err) } if a.updater, err = newConfigUpdater(); err != nil { return nil, agentError("failed to initialize config updater instance: %v", err) } return a, nil } // Run starts the agent's gRPC server, watcher, and config updater, then runs the main event loop. func (a *agent) Run() error { a.Info("starting CRI Resource Manager Agent") if err := a.server.Start(opts.agentSocket); err != nil { return agentError("failed to start gRPC server: %v", err) } if err := a.watcher.Start(); err != nil { return agentError("failed to start watcher: %v", err) } if err := a.updater.Start(); err != nil { return agentError("failed to start config updater: %v", err) } for { select { case config, ok := <-a.watcher.ConfigChan(): if ok { a.updater.UpdateConfig(&config) } case adjust, ok := <-a.watcher.AdjustmentChan(): if ok { a.updater.UpdateAdjustment(&adjust) } case status, ok := <-a.updater.StatusChan(): if ok { a.Info("got status %v", status) if err := a.watcher.UpdateStatus(status); err != nil { a.Error("failed to update adjustment node status: %v", err) } } } } } func (a *agent) getConfig() resmgrConfig { if a.watcher == nil { return nil } return a.watcher.GetConfig() } func (a *agent) getError() error { if a.updater == nil { return nil } return a.updater.GetError() } func agentError(format string, args ...interface{}) error { return fmt.Errorf(format, args...) } ================================================ FILE: pkg/agent/api/v1/api.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package v1 import ( "encoding/json" ) var _ json.Marshaler = &JsonPatch{} // MarshalJSON marshals JsonPatch to valid Json func (j *JsonPatch) MarshalJSON() ([]byte, error) { // Don't really encode anything. Op and Path are ascii strings and value // is assumed to be in marshaled format if len(j.Value) == 0 { return []byte(`{"op":"` + j.Op + `","path":"` + j.Path + `"}`), nil } return []byte(`{"op":"` + j.Op + `","path":"` + j.Path + `","value":` + j.Value + `}`), nil } ================================================ FILE: pkg/agent/api/v1/api.pb.go ================================================ // //Copyright 2019 Intel Corporation // //Licensed under the Apache License, Version 2.0 (the "License"); //you may not use this file except in compliance with the License. //You may obtain a copy of the License at // //http://www.apache.org/licenses/LICENSE-2.0 // //Unless required by applicable law or agreed to in writing, software //distributed under the License is distributed on an "AS IS" BASIS, //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //See the License for the specific language governing permissions and //limitations under the License. // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.28.0 // protoc v3.20.1 // source: pkg/agent/api/v1/api.proto package v1 import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type GetNodeRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields } func (x *GetNodeRequest) Reset() { *x = GetNodeRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *GetNodeRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*GetNodeRequest) ProtoMessage() {} func (x *GetNodeRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use GetNodeRequest.ProtoReflect.Descriptor instead. 
func (*GetNodeRequest) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{0} } type GetNodeReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Node string `protobuf:"bytes,1,opt,name=node,proto3" json:"node,omitempty"` } func (x *GetNodeReply) Reset() { *x = GetNodeReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *GetNodeReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*GetNodeReply) ProtoMessage() {} func (x *GetNodeReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use GetNodeReply.ProtoReflect.Descriptor instead. func (*GetNodeReply) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{1} } func (x *GetNodeReply) GetNode() string { if x != nil { return x.Node } return "" } // JsonPatch holds one JSON patch type JsonPatch struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Op string `protobuf:"bytes,1,opt,name=op,proto3" json:"op,omitempty"` Path string `protobuf:"bytes,2,opt,name=path,proto3" json:"path,omitempty"` Value string `protobuf:"bytes,3,opt,name=value,proto3" json:"value,omitempty"` } func (x *JsonPatch) Reset() { *x = JsonPatch{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *JsonPatch) String() string { return protoimpl.X.MessageStringOf(x) } func (*JsonPatch) ProtoMessage() {} func (x *JsonPatch) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use JsonPatch.ProtoReflect.Descriptor instead.
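// A minimal usage sketch (hypothetical, not part of the generated API) of the
// custom JsonPatch.MarshalJSON defined in pkg/agent/api/v1/api.go; the patch
// path and value below are illustrative assumptions, and Value must already be
// valid marshaled JSON as MarshalJSON's comment requires:
//
//	p := &JsonPatch{Op: "replace", Path: "/metadata/labels/hypothetical-label", Value: `"v1"`}
//	out, err := json.Marshal(p) // dispatches to (*JsonPatch).MarshalJSON
//	// on success: out == []byte(`{"op":"replace","path":"/metadata/labels/hypothetical-label","value":"v1"}`)
//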
func (*JsonPatch) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{2} } func (x *JsonPatch) GetOp() string { if x != nil { return x.Op } return "" } func (x *JsonPatch) GetPath() string { if x != nil { return x.Path } return "" } func (x *JsonPatch) GetValue() string { if x != nil { return x.Value } return "" } type PatchNodeRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // List of JSON patches to apply on the node Patches []*JsonPatch `protobuf:"bytes,1,rep,name=patches,proto3" json:"patches,omitempty"` } func (x *PatchNodeRequest) Reset() { *x = PatchNodeRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *PatchNodeRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*PatchNodeRequest) ProtoMessage() {} func (x *PatchNodeRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[3] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use PatchNodeRequest.ProtoReflect.Descriptor instead. func (*PatchNodeRequest) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{3} } func (x *PatchNodeRequest) GetPatches() []*JsonPatch { if x != nil { return x.Patches } return nil } type PatchNodeReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields } func (x *PatchNodeReply) Reset() { *x = PatchNodeReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[4] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *PatchNodeReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*PatchNodeReply) ProtoMessage() {} func (x *PatchNodeReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[4] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use PatchNodeReply.ProtoReflect.Descriptor instead. 
func (*PatchNodeReply) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{4} } type UpdateNodeCapacityRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // Name-value map of status.capacity to update Capacities map[string]string `protobuf:"bytes,1,rep,name=capacities,proto3" json:"capacities,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` } func (x *UpdateNodeCapacityRequest) Reset() { *x = UpdateNodeCapacityRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[5] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *UpdateNodeCapacityRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*UpdateNodeCapacityRequest) ProtoMessage() {} func (x *UpdateNodeCapacityRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[5] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use UpdateNodeCapacityRequest.ProtoReflect.Descriptor instead. func (*UpdateNodeCapacityRequest) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{5} } func (x *UpdateNodeCapacityRequest) GetCapacities() map[string]string { if x != nil { return x.Capacities } return nil } type UpdateNodeCapacityReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields } func (x *UpdateNodeCapacityReply) Reset() { *x = UpdateNodeCapacityReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[6] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *UpdateNodeCapacityReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*UpdateNodeCapacityReply) ProtoMessage() {} func (x *UpdateNodeCapacityReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[6] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use UpdateNodeCapacityReply.ProtoReflect.Descriptor instead. func (*UpdateNodeCapacityReply) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{6} } type HealthCheckRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Query string `protobuf:"bytes,1,opt,name=query,proto3" json:"query,omitempty"` } func (x *HealthCheckRequest) Reset() { *x = HealthCheckRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[7] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *HealthCheckRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*HealthCheckRequest) ProtoMessage() {} func (x *HealthCheckRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[7] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use HealthCheckRequest.ProtoReflect.Descriptor instead. 
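// A hypothetical sketch of an UpdateNodeCapacityRequest advertising an
// extended resource in the node's status.capacity; the resource name and
// quantity here are illustrative assumptions only:
//
//	req := &UpdateNodeCapacityRequest{
//		Capacities: map[string]string{"example.com/hypothetical-resource": "4"},
//	}
//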
func (*HealthCheckRequest) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{7} } func (x *HealthCheckRequest) GetQuery() string { if x != nil { return x.Query } return "" } type HealthCheckReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Error string `protobuf:"bytes,1,opt,name=error,proto3" json:"error,omitempty"` } func (x *HealthCheckReply) Reset() { *x = HealthCheckReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *HealthCheckReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*HealthCheckReply) ProtoMessage() {} func (x *HealthCheckReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[8] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use HealthCheckReply.ProtoReflect.Descriptor instead. func (*HealthCheckReply) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{8} } func (x *HealthCheckReply) GetError() string { if x != nil { return x.Error } return "" } var File_pkg_agent_api_v1_api_proto protoreflect.FileDescriptor var file_pkg_agent_api_v1_api_proto_rawDesc = []byte{ 0x0a, 0x1a, 0x70, 0x6b, 0x67, 0x2f, 0x61, 0x67, 0x65, 0x6e, 0x74, 0x2f, 0x61, 0x70, 0x69, 0x2f, 0x76, 0x31, 0x2f, 0x61, 0x70, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x02, 0x76, 0x31, 0x22, 0x10, 0x0a, 0x0e, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x22, 0x22, 0x0a, 0x0c, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x6f, 0x64, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x6f, 0x64, 0x65, 0x22, 0x45, 0x0a, 0x09, 0x4a, 0x73, 0x6f, 0x6e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x12, 0x0e, 0x0a, 0x02, 0x6f, 0x70, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x6f, 0x70, 0x12, 0x12, 0x0a, 0x04, 0x70, 0x61, 0x74, 0x68, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x70, 0x61, 0x74, 0x68, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x3b, 0x0a, 0x10, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x27, 0x0a, 0x07, 0x70, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0d, 0x2e, 0x76, 0x31, 0x2e, 0x4a, 0x73, 0x6f, 0x6e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x52, 0x07, 0x70, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x22, 0x10, 0x0a, 0x0e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0xa9, 0x01, 0x0a, 0x19, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x4d, 0x0a, 0x0a, 0x63, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x69, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2d, 0x2e, 0x76, 0x31, 0x2e, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x2e, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x69, 0x65, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0a, 0x63, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x69, 0x65, 0x73, 
0x1a, 0x3d, 0x0a, 0x0f, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x69, 0x65, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x19, 0x0a, 0x17, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x2a, 0x0a, 0x12, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x71, 0x75, 0x65, 0x72, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x71, 0x75, 0x65, 0x72, 0x79, 0x22, 0x28, 0x0a, 0x10, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x32, 0x86, 0x02, 0x0a, 0x05, 0x41, 0x67, 0x65, 0x6e, 0x74, 0x12, 0x31, 0x0a, 0x07, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x12, 0x12, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x10, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x37, 0x0a, 0x09, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x12, 0x14, 0x2e, 0x76, 0x31, 0x2e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x12, 0x2e, 0x76, 0x31, 0x2e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x52, 0x0a, 0x12, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x12, 0x1d, 0x2e, 0x76, 0x31, 0x2e, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1b, 0x2e, 0x76, 0x31, 0x2e, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x3d, 0x0a, 0x0b, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x12, 0x16, 0x2e, 0x76, 0x31, 0x2e, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x14, 0x2e, 0x76, 0x31, 0x2e, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x42, 0x07, 0x5a, 0x05, 0x2e, 0x2e, 0x2f, 0x76, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_agent_api_v1_api_proto_rawDescOnce sync.Once file_pkg_agent_api_v1_api_proto_rawDescData = file_pkg_agent_api_v1_api_proto_rawDesc ) func file_pkg_agent_api_v1_api_proto_rawDescGZIP() []byte { file_pkg_agent_api_v1_api_proto_rawDescOnce.Do(func() { file_pkg_agent_api_v1_api_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_agent_api_v1_api_proto_rawDescData) }) return file_pkg_agent_api_v1_api_proto_rawDescData } var file_pkg_agent_api_v1_api_proto_msgTypes = make([]protoimpl.MessageInfo, 10) var file_pkg_agent_api_v1_api_proto_goTypes = []interface{}{ (*GetNodeRequest)(nil), // 0: v1.GetNodeRequest (*GetNodeReply)(nil), // 1: v1.GetNodeReply (*JsonPatch)(nil), // 2: v1.JsonPatch (*PatchNodeRequest)(nil), // 3: v1.PatchNodeRequest (*PatchNodeReply)(nil), // 4: 
v1.PatchNodeReply (*UpdateNodeCapacityRequest)(nil), // 5: v1.UpdateNodeCapacityRequest (*UpdateNodeCapacityReply)(nil), // 6: v1.UpdateNodeCapacityReply (*HealthCheckRequest)(nil), // 7: v1.HealthCheckRequest (*HealthCheckReply)(nil), // 8: v1.HealthCheckReply nil, // 9: v1.UpdateNodeCapacityRequest.CapacitiesEntry } var file_pkg_agent_api_v1_api_proto_depIdxs = []int32{ 2, // 0: v1.PatchNodeRequest.patches:type_name -> v1.JsonPatch 9, // 1: v1.UpdateNodeCapacityRequest.capacities:type_name -> v1.UpdateNodeCapacityRequest.CapacitiesEntry 0, // 2: v1.Agent.GetNode:input_type -> v1.GetNodeRequest 3, // 3: v1.Agent.PatchNode:input_type -> v1.PatchNodeRequest 5, // 4: v1.Agent.UpdateNodeCapacity:input_type -> v1.UpdateNodeCapacityRequest 7, // 5: v1.Agent.HealthCheck:input_type -> v1.HealthCheckRequest 1, // 6: v1.Agent.GetNode:output_type -> v1.GetNodeReply 4, // 7: v1.Agent.PatchNode:output_type -> v1.PatchNodeReply 6, // 8: v1.Agent.UpdateNodeCapacity:output_type -> v1.UpdateNodeCapacityReply 8, // 9: v1.Agent.HealthCheck:output_type -> v1.HealthCheckReply 6, // [6:10] is the sub-list for method output_type 2, // [2:6] is the sub-list for method input_type 2, // [2:2] is the sub-list for extension type_name 2, // [2:2] is the sub-list for extension extendee 0, // [0:2] is the sub-list for field type_name } func init() { file_pkg_agent_api_v1_api_proto_init() } func file_pkg_agent_api_v1_api_proto_init() { if File_pkg_agent_api_v1_api_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_agent_api_v1_api_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*GetNodeRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*GetNodeReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*JsonPatch); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*PatchNodeRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*PatchNodeReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*UpdateNodeCapacityRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*UpdateNodeCapacityReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[7].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*HealthCheckRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[8].Exporter = func(v interface{}, i int) interface{} { 
switch v := v.(*HealthCheckReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_agent_api_v1_api_proto_rawDesc, NumEnums: 0, NumMessages: 10, NumExtensions: 0, NumServices: 1, }, GoTypes: file_pkg_agent_api_v1_api_proto_goTypes, DependencyIndexes: file_pkg_agent_api_v1_api_proto_depIdxs, MessageInfos: file_pkg_agent_api_v1_api_proto_msgTypes, }.Build() File_pkg_agent_api_v1_api_proto = out.File file_pkg_agent_api_v1_api_proto_rawDesc = nil file_pkg_agent_api_v1_api_proto_goTypes = nil file_pkg_agent_api_v1_api_proto_depIdxs = nil } ================================================ FILE: pkg/agent/api/v1/api.proto ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto3"; package v1; option go_package = "../v1"; service Agent{ rpc GetNode(GetNodeRequest) returns (GetNodeReply) {} rpc PatchNode(PatchNodeRequest) returns (PatchNodeReply) {} rpc UpdateNodeCapacity(UpdateNodeCapacityRequest) returns (UpdateNodeCapacityReply) {} rpc HealthCheck(HealthCheckRequest) returns (HealthCheckReply) {} } message GetNodeRequest { } message GetNodeReply { string node = 1; } // JsonPatch holds one JSON patch message JsonPatch { string op = 1; string path = 2; string value = 3; } message PatchNodeRequest { // List of JSON patches to apply on the node repeated JsonPatch patches = 1; } message PatchNodeReply { } message UpdateNodeCapacityRequest { // Name-value map of status.capacity to update map<string, string> capacities = 1; } message UpdateNodeCapacityReply { } message HealthCheckRequest { string query = 1; } message HealthCheckReply { string error = 1; } ================================================ FILE: pkg/agent/api/v1/api_grpc.pb.go ================================================ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.2.0 // - protoc v3.20.1 // source: pkg/agent/api/v1/api.proto package v1 import ( context "context" grpc "google.golang.org/grpc" codes "google.golang.org/grpc/codes" status "google.golang.org/grpc/status" ) // This is a compile-time assertion to ensure that this generated file // is compatible with the grpc package it is being compiled against. // Requires gRPC-Go v1.32.0 or later. const _ = grpc.SupportPackageIsVersion7 // AgentClient is the client API for Agent service. // // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
type AgentClient interface { GetNode(ctx context.Context, in *GetNodeRequest, opts ...grpc.CallOption) (*GetNodeReply, error) PatchNode(ctx context.Context, in *PatchNodeRequest, opts ...grpc.CallOption) (*PatchNodeReply, error) UpdateNodeCapacity(ctx context.Context, in *UpdateNodeCapacityRequest, opts ...grpc.CallOption) (*UpdateNodeCapacityReply, error) HealthCheck(ctx context.Context, in *HealthCheckRequest, opts ...grpc.CallOption) (*HealthCheckReply, error) } type agentClient struct { cc grpc.ClientConnInterface } func NewAgentClient(cc grpc.ClientConnInterface) AgentClient { return &agentClient{cc} } func (c *agentClient) GetNode(ctx context.Context, in *GetNodeRequest, opts ...grpc.CallOption) (*GetNodeReply, error) { out := new(GetNodeReply) err := c.cc.Invoke(ctx, "/v1.Agent/GetNode", in, out, opts...) if err != nil { return nil, err } return out, nil } func (c *agentClient) PatchNode(ctx context.Context, in *PatchNodeRequest, opts ...grpc.CallOption) (*PatchNodeReply, error) { out := new(PatchNodeReply) err := c.cc.Invoke(ctx, "/v1.Agent/PatchNode", in, out, opts...) if err != nil { return nil, err } return out, nil } func (c *agentClient) UpdateNodeCapacity(ctx context.Context, in *UpdateNodeCapacityRequest, opts ...grpc.CallOption) (*UpdateNodeCapacityReply, error) { out := new(UpdateNodeCapacityReply) err := c.cc.Invoke(ctx, "/v1.Agent/UpdateNodeCapacity", in, out, opts...) if err != nil { return nil, err } return out, nil } func (c *agentClient) HealthCheck(ctx context.Context, in *HealthCheckRequest, opts ...grpc.CallOption) (*HealthCheckReply, error) { out := new(HealthCheckReply) err := c.cc.Invoke(ctx, "/v1.Agent/HealthCheck", in, out, opts...) if err != nil { return nil, err } return out, nil } // AgentServer is the server API for Agent service. // All implementations must embed UnimplementedAgentServer // for forward compatibility type AgentServer interface { GetNode(context.Context, *GetNodeRequest) (*GetNodeReply, error) PatchNode(context.Context, *PatchNodeRequest) (*PatchNodeReply, error) UpdateNodeCapacity(context.Context, *UpdateNodeCapacityRequest) (*UpdateNodeCapacityReply, error) HealthCheck(context.Context, *HealthCheckRequest) (*HealthCheckReply, error) mustEmbedUnimplementedAgentServer() } // UnimplementedAgentServer must be embedded to have forward compatible implementations. type UnimplementedAgentServer struct { } func (UnimplementedAgentServer) GetNode(context.Context, *GetNodeRequest) (*GetNodeReply, error) { return nil, status.Errorf(codes.Unimplemented, "method GetNode not implemented") } func (UnimplementedAgentServer) PatchNode(context.Context, *PatchNodeRequest) (*PatchNodeReply, error) { return nil, status.Errorf(codes.Unimplemented, "method PatchNode not implemented") } func (UnimplementedAgentServer) UpdateNodeCapacity(context.Context, *UpdateNodeCapacityRequest) (*UpdateNodeCapacityReply, error) { return nil, status.Errorf(codes.Unimplemented, "method UpdateNodeCapacity not implemented") } func (UnimplementedAgentServer) HealthCheck(context.Context, *HealthCheckRequest) (*HealthCheckReply, error) { return nil, status.Errorf(codes.Unimplemented, "method HealthCheck not implemented") } func (UnimplementedAgentServer) mustEmbedUnimplementedAgentServer() {} // UnsafeAgentServer may be embedded to opt out of forward compatibility for this service. // Use of this interface is not recommended, as added methods to AgentServer will // result in compilation errors. 
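// A minimal, hypothetical sketch of calling the Agent service through the
// generated client over the agent's unix socket; the socket path and the
// insecure dial option are assumptions, not mandated by this API. ConfigStatus
// is the query constant defined in pkg/agent/api/v1/constants.go:
//
//	conn, err := grpc.Dial("unix:///var/run/hypothetical/agent.sock", grpc.WithInsecure())
//	if err != nil {
//		// handle the dial error
//	}
//	cli := NewAgentClient(conn)
//	reply, err := cli.HealthCheck(context.Background(), &HealthCheckRequest{Query: ConfigStatus})
//	// on success, reply.Error is empty if the last configuration push to cri-resmgr succeeded
//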
type UnsafeAgentServer interface { mustEmbedUnimplementedAgentServer() } func RegisterAgentServer(s grpc.ServiceRegistrar, srv AgentServer) { s.RegisterService(&Agent_ServiceDesc, srv) } func _Agent_GetNode_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(GetNodeRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(AgentServer).GetNode(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Agent/GetNode", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(AgentServer).GetNode(ctx, req.(*GetNodeRequest)) } return interceptor(ctx, in, info, handler) } func _Agent_PatchNode_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(PatchNodeRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(AgentServer).PatchNode(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Agent/PatchNode", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(AgentServer).PatchNode(ctx, req.(*PatchNodeRequest)) } return interceptor(ctx, in, info, handler) } func _Agent_UpdateNodeCapacity_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(UpdateNodeCapacityRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(AgentServer).UpdateNodeCapacity(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Agent/UpdateNodeCapacity", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(AgentServer).UpdateNodeCapacity(ctx, req.(*UpdateNodeCapacityRequest)) } return interceptor(ctx, in, info, handler) } func _Agent_HealthCheck_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(HealthCheckRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(AgentServer).HealthCheck(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Agent/HealthCheck", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(AgentServer).HealthCheck(ctx, req.(*HealthCheckRequest)) } return interceptor(ctx, in, info, handler) } // Agent_ServiceDesc is the grpc.ServiceDesc for Agent service. // It's only intended for direct use with grpc.RegisterService, // and not to be introspected or modified (even as a copy) var Agent_ServiceDesc = grpc.ServiceDesc{ ServiceName: "v1.Agent", HandlerType: (*AgentServer)(nil), Methods: []grpc.MethodDesc{ { MethodName: "GetNode", Handler: _Agent_GetNode_Handler, }, { MethodName: "PatchNode", Handler: _Agent_PatchNode_Handler, }, { MethodName: "UpdateNodeCapacity", Handler: _Agent_UpdateNodeCapacity_Handler, }, { MethodName: "HealthCheck", Handler: _Agent_HealthCheck_Handler, }, }, Streams: []grpc.StreamDesc{}, Metadata: "pkg/agent/api/v1/api.proto", } ================================================ FILE: pkg/agent/api/v1/constants.go ================================================ /* Copyright Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package v1 const ( // ConfigStatus queries the status of the last configuration push to resmgr. ConfigStatus = "config-status" ) ================================================ FILE: pkg/agent/config-updater.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package agent import ( "fmt" "net" "sync" "time" "context" "encoding/json" "google.golang.org/grpc" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" resmgr_v1 "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config/api/v1" "github.com/intel/cri-resource-manager/pkg/log" ) const ( // configuration update rate-limiting timeout rateLimitTimeout = 2 * time.Second // setConfigTimeout is the duration we wait at most for a SetConfig reply setConfigTimeout = 5 * time.Second // retryTimeout is the timeout after which we retry sending configuration updates upon failure retryTimeout = 5 * time.Second ) // configUpdater handles sending configuration to cri-resmgr type configUpdater interface { Start() error Stop() UpdateConfig(*resmgrConfig) UpdateAdjustment(*resmgrAdjustment) StatusChan() chan *resmgrStatus GetError() error } // updater implements configUpdater type updater struct { log.Logger resmgrCli resmgr_v1.ConfigClient newConfig chan *resmgrConfig newAdjustment chan *resmgrAdjustment newStatus chan *resmgrStatus cfgErrLock sync.RWMutex cfgErr error } func newConfigUpdater() (configUpdater, error) { u := &updater{Logger: log.NewLogger("config-updater")} c, err := newResmgrCli(opts.resmgrSocket) if err != nil { return nil, agentError("failed to create connection to cri-resmgr: %v", err) } u.resmgrCli = c u.newConfig = make(chan *resmgrConfig) u.newAdjustment = make(chan *resmgrAdjustment) u.newStatus = make(chan *resmgrStatus) return u, nil } func (u *updater) Start() error { u.Info("Starting config-updater") go func() { var pendingConfig *resmgrConfig var pendingAdjustment *resmgrAdjustment var ratelimit <-chan time.Time for { select { case cfg := <-u.newConfig: u.Info("scheduling update after %v rate-limiting timeout...", rateLimitTimeout) pendingConfig = cfg ratelimit = time.After(rateLimitTimeout) case adjust := <-u.newAdjustment: u.Info("scheduling update after %v rate-limiting timeout...", rateLimitTimeout) pendingAdjustment = adjust ratelimit = time.After(rateLimitTimeout) case <-ratelimit: if pendingConfig != nil { mgrErr, err := u.setConfig(pendingConfig) if err != nil { u.Error("failed to send configuration update: %v", err) ratelimit = time.After(retryTimeout) } else { if mgrErr != nil { u.Error("cri-resmgr configuration error: %v", mgrErr) } pendingConfig = nil ratelimit
= nil } } if pendingAdjustment != nil { errors, err := u.setAdjustment(pendingAdjustment) if err != nil { u.Error("failed to update adjustments: %+v", err) } if len(errors) > 0 { u.Error("some adjustment updates failed: %+v", errors) } u.newStatus <- &resmgrStatus{ request: err, errors: errors, } pendingAdjustment = nil ratelimit = nil } } } }() return nil } func (u *updater) Stop() { } func (u *updater) UpdateConfig(c *resmgrConfig) { u.newConfig <- c } func (u *updater) UpdateAdjustment(c *resmgrAdjustment) { u.newAdjustment <- c } func (u *updater) StatusChan() chan *resmgrStatus { return u.newStatus } func (u *updater) setError(err error) error { u.cfgErrLock.Lock() defer u.cfgErrLock.Unlock() u.cfgErr = err return err } func (u *updater) GetError() error { u.cfgErrLock.RLock() defer u.cfgErrLock.RUnlock() return u.cfgErr } func (u *updater) setConfig(cfg *resmgrConfig) (error, error) { ctx, cancel := context.WithTimeout(context.Background(), setConfigTimeout) defer cancel() req := &resmgr_v1.SetConfigRequest{NodeName: nodeName, Config: *cfg} u.Debug("sending SetConfig request to cri-resmgr") reply, err := u.resmgrCli.SetConfig(ctx, req, []grpc.CallOption{grpc.FailFast(false)}...) switch { case err != nil: return nil, u.setError(err) case reply.Error != "": return u.setError(fmt.Errorf("%s", reply.Error)), nil default: return u.setError(nil), nil } } func (u *updater) setAdjustment(adjust *resmgrAdjustment) (map[string]string, error) { ctx, cancel := context.WithTimeout(context.Background(), setConfigTimeout) defer cancel() specs := map[string]*resmgr.AdjustmentSpec{} for name, p := range *adjust { specs[name] = &resmgr.AdjustmentSpec{ Scope: p.Spec.NodeScope(nodeName), Resources: p.Spec.Resources, Classes: p.Spec.Classes, ToptierLimit: p.Spec.ToptierLimit, } } encoded, err := json.Marshal(specs) if err != nil { return nil, agentError("setAdjustment: failed to encode AdjustmentSpec: %v", err) } req := &resmgr_v1.SetAdjustmentRequest{NodeName: nodeName, Adjustment: string(encoded)} u.Debug("sending SetAdjustment request to cri-resmgr") reply, err := u.resmgrCli.SetAdjustment(ctx, req, []grpc.CallOption{grpc.FailFast(false)}...) if err != nil { return nil, err } return reply.Errors, nil } func newResmgrCli(socket string) (resmgr_v1.ConfigClient, error) { dialOpts := []grpc.DialOption{ grpc.WithInsecure(), grpc.WithDialer(func(sock string, timeout time.Duration) (net.Conn, error) { return net.Dial("unix", socket) }), } conn, err := grpc.Dial(socket, dialOpts...) if err != nil { return nil, agentError("failed to connect to cri-resmgr: %v", err) } return resmgr_v1.NewConfigClient(conn), nil } ================================================ FILE: pkg/agent/flags.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package agent import ( "flag" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" ) type options struct { kubeconfig string agentSocket string resmgrSocket string configNs string configMapName string labelName string } var opts = options{} func init() { flag.StringVar(&opts.agentSocket, "agent-socket", sockets.ResourceManagerAgent, "Socket for incoming requests from cri-resmgr") flag.StringVar(&opts.resmgrSocket, "cri-resmgr-socket", sockets.ResourceManagerConfig, "cri-resmgr socket to connect to") flag.StringVar(&opts.kubeconfig, "kubeconfig", "", "Kubeconfig to use, empty string implies in-cluster config (i.e. running inside a Pod)") flag.StringVar(&opts.configNs, "config-ns", "kube-system", "Kubernetes namespace where to look for config") flag.StringVar(&opts.configMapName, "configmap-name", "cri-resmgr-config", "Name of the K8s ConfigMap to watch") flag.StringVar(&opts.labelName, "label-name", kubernetes.ResmgrKey("group"), "Name of the label used to assign a node to a configuration group.") } ================================================ FILE: pkg/agent/kubernetes.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package agent import ( "context" "encoding/json" "fmt" "os" "time" core_v1 "k8s.io/api/core/v1" meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" k8swatch "k8s.io/apimachinery/pkg/watch" k8sclient "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" agent_v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1" ) type namespace string // nodeName contains the name of the k8s we're running on var nodeName string // getK8sClient initializes a new Kubernetes client func (a *agent) getK8sClient(kubeconfig string) (*k8sclient.Clientset, *resmgr.CriresmgrV1alpha1Client, error) { var config *rest.Config var err error if kubeconfig == "" { a.Info("using in-cluster kubeconfig") config, err = rest.InClusterConfig() } else { config, err = clientcmd.BuildConfigFromFlags("", kubeconfig) } if err != nil { return nil, nil, err } genCli, err := k8sclient.NewForConfig(config) if err != nil { return nil, nil, err } resmgr, err := resmgr.NewForConfig(config) if err != nil { return nil, nil, err } return genCli, resmgr, nil } // getNodeObject gets a k8s Node object func getNodeObject(cli *k8sclient.Clientset) (*core_v1.Node, error) { node, err := cli.CoreV1().Nodes().Get(context.TODO(), nodeName, meta_v1.GetOptions{}) if err != nil { return nil, agentError("failed to get node object for node %q: %v", nodeName, err) } return node, nil } // patchNodeObject is a helper for patching a k8s Node object func patchNode(cli *k8sclient.Clientset, patchList []*agent_v1.JsonPatch) error { // Convert patch list into bytes data, err := json.Marshal(patchList) if err != nil { return agentError("failed to marshal Node patches: %v", err) } // Patch our node pt := types.JSONPatchType _, err = cli.CoreV1().Nodes().Patch(context.TODO(), nodeName, pt, data, meta_v1.PatchOptions{}) if err != nil { return err } return nil } // patchNodeStatus is a helper for patching the status of a k8s Node object func patchNodeStatus(cli *k8sclient.Clientset, fields map[string]string) error { patch, sep := fmt.Sprintf(`{"status": {`), "" for f, v := range fields { patch += sep + fmt.Sprintf(`"%s": %s`, f, v) sep = "," } patch += "}}" _, err := cli.CoreV1().Nodes().PatchStatus(context.TODO(), nodeName, []byte(patch)) return err } // patchAdjustmentStatus is a helper for patching the status of a Adjustment CRD. func patchAdjustmentStatus(_ *resmgr.CriresmgrV1alpha1Client, _ *resmgrStatus, _ ...string) error { return nil } // watch is a wrapper around the k8s watch.Interface type watch struct { parent *watcher kind string ns namespace name string openfn func(namespace, string) (k8swatch.Interface, error) queryfn func(namespace, string) (interface{}, error) stop chan struct{} events chan k8swatch.Event } // openFn is the type for functions creating k8s watcher of a particular kind. type openFn func(ns namespace, name string) (k8swatch.Interface, error) // queryFn is the type for functions querying k8s objects being watched. type queryFn func(ns namespace, name string) (interface{}, error) const ( // SyntheticMissing is a synthetic initial event for currently non-existent object. 
SyntheticMissing = k8swatch.EventType("SyntheticMissing") ) func newWatch(parent *watcher, kind string, ns namespace, open openFn, query queryFn) *watch { return &watch{ parent: parent, kind: kind, ns: ns, stop: make(chan struct{}), events: make(chan k8swatch.Event), openfn: open, queryfn: query, } } // newNodeWatch creates a watch for k8s Node func newNodeWatch(parent *watcher) *watch { w := newWatch(parent, "Node", namespace(""), func(ns namespace, name string) (k8swatch.Interface, error) { selector := meta_v1.ListOptions{FieldSelector: "metadata.name=" + name} k8w, err := parent.k8sCli.CoreV1().Nodes().Watch(context.TODO(), selector) if err != nil { return nil, err } return k8w, nil }, func(ns namespace, name string) (interface{}, error) { noopts := meta_v1.GetOptions{} node, err := parent.k8sCli.CoreV1().Nodes().Get(context.TODO(), name, noopts) if err != nil { return nil, err } return node, nil }) w.Start(nodeName) return w } // newConfigMapWatch creates a watch for k8s ConfigMap func newConfigMapWatch(parent *watcher, name string, ns namespace) *watch { w := newWatch(parent, "ConfigMap", ns, func(ns namespace, name string) (k8swatch.Interface, error) { selector := meta_v1.ListOptions{FieldSelector: "metadata.name=" + name} k8w, err := parent.k8sCli.CoreV1().ConfigMaps(string(ns)).Watch(context.TODO(), selector) if err != nil { return nil, err } return k8w, nil }, func(ns namespace, name string) (interface{}, error) { noopts := meta_v1.GetOptions{} cm, err := parent.k8sCli.CoreV1().ConfigMaps(string(ns)).Get(context.TODO(), name, noopts) if err != nil { return nil, err } return cm, nil }) w.Start(name) return w } // newAdjustmentCRDWatch creates a watch for k8s Adjustment CRDs func newAdjustmentCRDWatch(parent *watcher, ns namespace) *watch { w := newWatch(parent, "AdjustmentCRD", ns, func(ns namespace, name string) (k8swatch.Interface, error) { k8w, err := parent.resmgrCli.Adjustments(string(ns)).Watch(meta_v1.ListOptions{}) if err != nil { return nil, err } return k8w, nil }, func(ns namespace, name string) (interface{}, error) { crds, err := parent.resmgrCli.Adjustments(string(ns)).List(meta_v1.ListOptions{}) if err != nil { return nil, err } if crds == nil || len(crds.Items) == 0 { crds = nil } return crds, nil }) w.Start("AdjustmentCRD") return w } func (w *watch) Name() string { ns, name := w.ns, w.name if ns != "" { ns += "/" } if name == "" { name = "" } return w.kind + ":" + string(ns) + name } // Query queries the object being watched. func (w *watch) Query() (interface{}, error) { if w.name == "" { return nil, nil } return w.queryfn(w.ns, w.name) } // Start watching an object. func (w *watch) Start(name string) { w.Stop() w.name = name if w.name == "" { return } // proxy events from a goroutine until we're told to stop.
go func() { var k8w k8swatch.Interface var events <-chan k8swatch.Event var ratelimit <-chan time.Time var err error // let the watcher know not to expect an initial event if objs, _ := w.queryfn(w.ns, w.name); objs == nil { w.events <- k8swatch.Event{Type: SyntheticMissing} } for { if events == nil { w.parent.Info("creating %s watch", w.Name()) if k8w, err = w.openfn(w.ns, w.name); err != nil { w.parent.Warn("failed to create %s watch: %v", w.Name(), err) ratelimit = time.After(1 * time.Second) } else { events = k8w.ResultChan() ratelimit = nil } } select { case <-w.stop: if events != nil { k8w.Stop() } return case e, ok := <-events: if ok { w.events <- e } else { w.parent.Warn("failed to get event from watch %s", w.Name()) k8w.Stop() events = nil } case <-ratelimit: } } }() } // Stop stops a watch. func (w *watch) Stop() { select { case w.stop <- struct{}{}: default: } } // ResultChan returns the event channel of the watch. func (w *watch) ResultChan() <-chan k8swatch.Event { return w.events } func init() { // Node name is expected to be set in an environment variable nodeName = os.Getenv("NODE_NAME") } ================================================ FILE: pkg/agent/server.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package agent import ( "context" "encoding/json" "fmt" "net" "os" "path/filepath" "strings" "google.golang.org/grpc" core_v1 "k8s.io/api/core/v1" k8sclient "k8s.io/client-go/kubernetes" v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" "github.com/intel/cri-resource-manager/pkg/log" ) // agentServer is the interface for our gRPC server. type agentServer interface { Start(string) error Stop() } // server implements agentServer. type server struct { log.Logger cli *k8sclient.Clientset // client for accessing k8s api server *grpc.Server // gRPC server instance cfg configInterface } // newAgentServer creates a new agentServer instance. func newAgentServer(cli *k8sclient.Clientset, cfg configInterface) (agentServer, error) { s := &server{ Logger: log.NewLogger("server"), cli: cli, cfg: cfg, } return s, nil } // Start runs the server instance. func (s *server) Start(socket string) error { // Make sure we have a directory for the socket. if err := os.MkdirAll(filepath.Dir(socket), sockets.DirPermissions); err != nil { return agentError("failed to create directory for socket %s: %v", socket, err) } // Remove any leftover sockets. if err := os.Remove(socket); err != nil && !os.IsNotExist(err) { return agentError("failed to unlink socket file: %s", err) } // Create server listening for local unix domain socket lis, err := net.Listen("unix", socket) if err != nil { return agentError("failed to listen to socket: %v", err) } serverOpts := []grpc.ServerOption{} s.server = grpc.NewServer(serverOpts...)
gs := &grpcServer{ Logger: s.Logger, cli: s.cli, cfg: s.cfg, } v1.RegisterAgentServer(s.server, gs) s.Info("starting gRPC server at socket %s", socket) go func() { defer lis.Close() err := s.server.Serve(lis) if err != nil { s.Fatal("grpc server died: %v", err) } }() return nil } // Stop agentServer instance func (s *server) Stop() { s.server.Stop() } // grpcServer implements v1.AgentServer type grpcServer struct { v1.UnimplementedAgentServer log.Logger cli *k8sclient.Clientset cfg configInterface } // GetNode gets K8s node object. func (g *grpcServer) GetNode(_ context.Context, req *v1.GetNodeRequest) (*v1.GetNodeReply, error) { g.Debug("received GetNodeRequest: %v", req) rpl := &v1.GetNodeReply{} node, err := getNodeObject(g.cli) if err != nil { return rpl, agentError("failed to get node object: %v", err) } serialized, err := json.Marshal(node) if err != nil { return rpl, agentError("failed to serialized node object: %v", err) } rpl.Node = string(serialized) return rpl, nil } // PatchNode patches the K8s node object. func (g *grpcServer) PatchNode(_ context.Context, req *v1.PatchNodeRequest) (*v1.PatchNodeReply, error) { g.Debug("received PatchNodeRequest: %v", req) rpl := &v1.PatchNodeReply{} // Apply patches if len(req.Patches) > 0 { err := patchNode(g.cli, req.Patches) if err != nil { return rpl, agentError("failed to patch node object: %v", err) } } return rpl, nil } // UpdateNodeCapacity updates capacity in Node status func (g *grpcServer) UpdateNodeCapacity(_ context.Context, req *v1.UpdateNodeCapacityRequest) (*v1.UpdateNodeCapacityReply, error) { g.Debug("received UpdateNodeCapacityRequest: %v", req) rpl := &v1.UpdateNodeCapacityReply{} capacity, sep := "", "" for name, count := range req.Capacities { if isNativeResource(name) { err := agentError("refusing to update capacity of native resource '%s'", name) return rpl, err } if !strings.Contains(name, ".") || !strings.Contains(name, "/") { err := agentError("invalid resource '%s' in capacity update", name) return rpl, err } capacity += sep + fmt.Sprintf(`"%s": "%s"`, name, count) sep = ", " } err := patchNodeStatus(g.cli, map[string]string{"capacity": "{" + capacity + "}"}) return rpl, err } // HealthCheck checks if the agent is in healthy state func (g *grpcServer) HealthCheck(_ context.Context, req *v1.HealthCheckRequest) (*v1.HealthCheckReply, error) { g.Debug("received HealthCheckRequest: %v", req) reply := &v1.HealthCheckReply{} if req.Query == v1.ConfigStatus { if err := g.cfg.getError(); err != nil { reply.Error = fmt.Sprintf("configuration error: %v", err) } } return reply, nil } func isNativeResource(name string) bool { switch { case name == string(core_v1.ResourceCPU), name == string(core_v1.ResourceMemory): return true case strings.HasPrefix(name, core_v1.ResourceHugePagesPrefix): return true default: return false } } ================================================ FILE: pkg/agent/watcher.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package agent import ( core_v1 "k8s.io/api/core/v1" k8swatch "k8s.io/apimachinery/pkg/watch" k8sclient "k8s.io/client-go/kubernetes" "sync" "time" "encoding/json" patch "github.com/evanphx/json-patch" pkgtypes "k8s.io/apimachinery/pkg/types" resmgrcli "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" "github.com/intel/cri-resource-manager/pkg/log" ) type cachedConfig struct { sync.RWMutex nodeCfg *resmgrConfig // node-specific configuration groupCfg *resmgrConfig // group-specific configuration group string // group name, "" for default inscope resmgrAdjustment // external adjustments that apply to this node ignored resmgrAdjustment // external adjustments that do not apply to this node status *resmgrStatus // latest adjustment update status } // k8sWatcher is our interface to K8s control plane watcher type k8sWatcher interface { // Start the watcher instance Start() error // Stop the watcher instance Stop() // Get a chan through which to receive configuration updates ConfigChan() <-chan resmgrConfig // Get up-to-date config GetConfig() resmgrConfig // Get a chan through which to receive adjustment updates AdjustmentChan() <-chan resmgrAdjustment // Update the node Status for adjustment updates. UpdateStatus(*resmgrStatus) error } // watcher implements k8sWatcher type watcher struct { log.Logger stop chan struct{} // channel to stop watcher goroutine k8sCli *k8sclient.Clientset // k8s client interface resmgrCli *resmgrcli.CriresmgrV1alpha1Client // adjustment CRD interface currentConfig cachedConfig // current configuration, cached configChan chan resmgrConfig // channel for config updates adjustmentChan chan resmgrAdjustment // channel for adjustment updates } // newK8sWatcher creates a new K8sWatcher instance func newK8sWatcher(k8sCli *k8sclient.Clientset, resmgrCli *resmgrcli.CriresmgrV1alpha1Client) (k8sWatcher, error) { w := &watcher{ Logger: log.NewLogger("watcher"), k8sCli: k8sCli, resmgrCli: resmgrCli, stop: make(chan struct{}, 1), currentConfig: newCachedConfig(), configChan: make(chan resmgrConfig, 1), adjustmentChan: make(chan resmgrAdjustment, 1), } return w, nil } // Start runs a k8sWatcher instance func (w *watcher) Start() error { w.Info("starting watcher...") if nodeName == "" { return agentError("node name not set, NODE_NAME env variable should be set to match the name of this k8s Node") } go func() { w.watch() }() return nil } // Stop stops a running k8sWatcher instance func (w *watcher) Stop() { select { case w.stop <- struct{}{}: default: w.Debug("stop already sent") } } // ConfigChan returns the chan for config updates func (w *watcher) ConfigChan() <-chan resmgrConfig { return w.configChan } // AdjustmentChan returns the chan for adjustment updates func (w *watcher) AdjustmentChan() <-chan resmgrAdjustment { return w.adjustmentChan } // GetConfig returns the current cri-resmgr configuration func (w *watcher) GetConfig() resmgrConfig { cfg, kind := w.currentConfig.getConfig() w.Info("giving %s configuration in reply to query", kind) return cfg } // UpdateStatus updates the node status for adjustment updates. func (w *watcher) UpdateStatus(status *resmgrStatus) error { w.currentConfig.setStatus(status) return w.PatchAdjustmentStatus(status) } // PatchAdjustmentStatus updates the node status for adjustment updates. 
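The channel-based k8sWatcher interface above leaves update multiplexing to the consumer, which drains ConfigChan and AdjustmentChan in a select loop. A minimal, self-contained sketch of that consumption pattern follows; fakeWatcher, the simplified resmgrConfig/resmgrAdjustment aliases and all values are illustrative stand-ins, not the package's real definitions. The PatchAdjustmentStatus implementation continues below.

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for the package's internal types.
type resmgrConfig map[string]string
type resmgrAdjustment map[string]string

// k8sWatcher mirrors only the channel-facing part of the interface above.
type k8sWatcher interface {
	ConfigChan() <-chan resmgrConfig
	AdjustmentChan() <-chan resmgrAdjustment
}

// fakeWatcher is a trivial in-memory implementation for the sketch.
type fakeWatcher struct {
	cfg chan resmgrConfig
	adj chan resmgrAdjustment
}

func (f *fakeWatcher) ConfigChan() <-chan resmgrConfig         { return f.cfg }
func (f *fakeWatcher) AdjustmentChan() <-chan resmgrAdjustment { return f.adj }

func main() {
	w := &fakeWatcher{cfg: make(chan resmgrConfig, 1), adj: make(chan resmgrAdjustment, 1)}
	w.cfg <- resmgrConfig{"policy": "topology-aware"}
	stop := time.After(100 * time.Millisecond)
	for {
		// Multiplex config and adjustment updates until stop fires.
		select {
		case cfg := <-w.ConfigChan():
			fmt.Printf("new configuration: %v\n", cfg)
		case adj := <-w.AdjustmentChan():
			fmt.Printf("new adjustments: %v\n", adj)
		case <-stop:
			return
		}
	}
}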
func (w *watcher) PatchAdjustmentStatus(status *resmgrStatus) error { errors := status.errors if errors == nil { errors = map[string]string{} } if status.request != nil { errors["request"] = status.request.Error() } inscope, ignored := w.currentConfig.getAdjustment() w.currentConfig.Lock() defer w.currentConfig.Unlock() errCnt := 0 for _, adjust := range inscope { if err := w.patchAdjustment(adjust, true, errors); err != nil { w.Error("%v", err) errCnt++ } } for _, adjust := range ignored { if err := w.patchAdjustment(adjust, false, errors); err != nil { w.Error("%v", err) errCnt++ } } if errCnt > 0 { return agentError("some adjustment status updates failed") } return nil } // patchAdjustment patches the status of an update to the given adjustment. func (w *watcher) patchAdjustment(adjust *resmgr.Adjustment, inscope bool, errors map[string]string) error { var pdata []byte var err error old, ok := adjust.Status.Nodes[nodeName] if !inscope { if !ok { w.Debug("adjustment %s does not need status patching...", adjust.Name) return nil } current := &resmgr.Adjustment{ Status: resmgr.AdjustmentStatus{ Nodes: map[string]resmgr.AdjustmentNodeStatus{ nodeName: old, }, }, } updated := &resmgr.Adjustment{ Status: resmgr.AdjustmentStatus{ Nodes: map[string]resmgr.AdjustmentNodeStatus{}, }, } oldData, _ := json.Marshal(current) newData, _ := json.Marshal(updated) pdata, err = patch.CreateMergePatch(oldData, newData) if err != nil { return agentError("failed to create adjustment status patch: %v", err) } } else { current := &resmgr.Adjustment{ Status: resmgr.AdjustmentStatus{ Nodes: map[string]resmgr.AdjustmentNodeStatus{}, }, } if ok { current.Status.Nodes[nodeName] = old } updated := &resmgr.Adjustment{ Status: resmgr.AdjustmentStatus{ Nodes: map[string]resmgr.AdjustmentNodeStatus{ nodeName: {Errors: errors}, }, }, } oldData, _ := json.Marshal(current) newData, _ := json.Marshal(updated) pdata, err = patch.CreateMergePatch(oldData, newData) if err != nil { return agentError("failed to create adjustment status patch: %v", err) } } ptype := pkgtypes.MergePatchType w.Debug("patching status of adjustment %s with %v...", adjust.Name, string(pdata)) if _, err := w.resmgrCli.Adjustments(opts.configNs).Patch(adjust.Name, ptype, pdata); err != nil { return agentError("failed to patch Adjustment CRD %q: %v", adjust.Name, err) } if inscope { if adjust.Status.Nodes == nil { adjust.Status.Nodes = make(map[string]resmgr.AdjustmentNodeStatus) } adjust.Status.Nodes[nodeName] = resmgr.AdjustmentNodeStatus{Errors: errors} } else { delete(adjust.Status.Nodes, nodeName) } return nil } // sendConfig sends the current configuration. func (w *watcher) sendConfig() { cfg, kind := w.currentConfig.getConfig() w.Info("pushing %s configuration to client", kind) w.configChan <- cfg } // sendAdjustment sends the current overridden policies.
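patchAdjustment above relies on JSON merge patches (RFC 7386): it marshals a minimal "current" and "updated" view of the status and lets the library compute the difference, so only this node's entry under status.nodes is touched on the server. A standalone sketch of the same CreateMergePatch call, with plain maps and a hypothetical node name in place of the real Adjustment types; sendAdjustment follows below.

package main

import (
	"encoding/json"
	"fmt"

	jsonpatch "github.com/evanphx/json-patch"
)

func main() {
	// Current state: this node ("node-1" is a placeholder) has a stale status entry.
	oldData, _ := json.Marshal(map[string]interface{}{
		"status": map[string]interface{}{
			"nodes": map[string]interface{}{
				"node-1": map[string]interface{}{"errors": map[string]string{"request": "stale"}},
			},
		},
	})
	// Desired state: the node's error map is reset.
	newData, _ := json.Marshal(map[string]interface{}{
		"status": map[string]interface{}{
			"nodes": map[string]interface{}{
				"node-1": map[string]interface{}{"errors": map[string]string{}},
			},
		},
	})
	// The resulting patch contains only the differing subtree.
	pdata, err := jsonpatch.CreateMergePatch(oldData, newData)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(pdata))
}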
func (w *watcher) sendAdjustment() { inscope, _ := w.currentConfig.getAdjustment() w.adjustmentChan <- inscope } func (w *watcher) watch() error { nodew := newNodeWatch(w) group := "" if node, err := nodew.Query(); err != nil { w.Warn("failed to query node %q: %v", nodeName, err) } else if node == nil { w.Warn("failed to query node %q, make sure that NODE_NAME is correctly set", nodeName) } else { group = node.(*core_v1.Node).Labels[opts.labelName] w.Info("configuration group is set to '%s'", group) } cfgw := newConfigMapWatch(w, opts.configMapName+".node."+nodeName, namespace(opts.configNs)) grpw := newConfigMapWatch(w, groupMapName(group), namespace(opts.configNs)) crdw := newAdjustmentCRDWatch(w, namespace(opts.configNs)) w.Info("watcher running") w.sendConfig() for { select { case <-w.stop: w.Info("stopping configuration watcher") nodew.Stop() cfgw.Stop() grpw.Stop() crdw.Stop() return nil case e, ok := <-nodew.ResultChan(): if ok { switch e.Type { case k8swatch.Added, k8swatch.Modified: w.Info("node (%s) configuration updated", nodeName) label, _ := e.Object.(*core_v1.Node).Labels[opts.labelName] if group != label { group = label w.Info("configuration group is set to '%s'", group) grpw.Start(groupMapName(group)) } case k8swatch.Deleted: w.Warn("Hmm, our node got removed...") } continue } case e, ok := <-cfgw.ResultChan(): if ok { switch e.Type { case k8swatch.Added, k8swatch.Modified: w.Info("node ConfigMap updated") cm := e.Object.(*core_v1.ConfigMap) w.currentConfig.setNode(&cm.Data) w.sendConfig() case k8swatch.Deleted, SyntheticMissing: w.Info("node ConfigMap deleted") w.currentConfig.setNode(nil) w.sendConfig() } continue } case e, ok := <-grpw.ResultChan(): if ok { switch e.Type { case k8swatch.Added, k8swatch.Modified: w.Info("group/default ConfigMap updated") cm := e.Object.(*core_v1.ConfigMap) if w.currentConfig.setGroup(group, &cm.Data) { w.sendConfig() } case k8swatch.Deleted, SyntheticMissing: w.Info("group/default ConfigMap deleted") if w.currentConfig.setGroup(group, nil) { w.sendConfig() } } continue } case e, ok := <-crdw.ResultChan(): if ok { switch e.Type { case k8swatch.Added, k8swatch.Modified: w.Info("Adjustment CRD(s) updated: %T, %+v", e.Object, e.Object) w.Info("Adjustment CRD(s): %+v", e.Object.(*resmgr.Adjustment).Spec) if w.currentConfig.setAdjustment(e.Object.(*resmgr.Adjustment)) { w.sendAdjustment() } case k8swatch.Deleted: w.Info("Adjustment CRD(s) (%T) deleted", e.Object) if w.currentConfig.deleteAdjustment(e.Object.(*resmgr.Adjustment)) { w.sendAdjustment() } case SyntheticMissing: w.Info("No Adjustment CRD(s)") w.sendAdjustment() } continue } } // shouldn't be necessary, but just in case avoid spinning on a closed channel time.Sleep(1 * time.Second) } } // groupMapName returns the name of our group ConfigMap, or the default one if we have no group. func groupMapName(group string) string { if group == "" { return opts.configMapName + ".default" } return opts.configMapName + ".group." + group } // newCachedConfig creates a new cachedConfig instance.
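To make the ConfigMap naming scheme used by watch() and groupMapName concrete: the watcher derives its watch targets from the -configmap-name option, the node name and the group label. A small sketch where the node and group names are purely illustrative; the cachedConfig constructor follows below.

package main

import "fmt"

// candidateMapNames restates the naming convention used above: a
// node-specific ConfigMap, plus either a group map or the shared
// default map when the node carries no group label.
func candidateMapNames(base, node, group string) (nodeMap, groupMap string) {
	nodeMap = base + ".node." + node
	if group == "" {
		groupMap = base + ".default"
	} else {
		groupMap = base + ".group." + group
	}
	return nodeMap, groupMap
}

func main() {
	// "worker-1" and "gold" are illustrative values only.
	n, g := candidateMapNames("cri-resmgr-config", "worker-1", "gold")
	fmt.Println(n) // cri-resmgr-config.node.worker-1
	fmt.Println(g) // cri-resmgr-config.group.gold
}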
func newCachedConfig() cachedConfig { return cachedConfig{ inscope: resmgrAdjustment{}, ignored: resmgrAdjustment{}, } } // getConfig is a helper method for getting the config data func (c *cachedConfig) getConfig() (resmgrConfig, string) { c.RLock() defer c.RUnlock() var cfg *resmgrConfig var kind string switch { case c.nodeCfg != nil: kind = "node" cfg = c.nodeCfg case c.group != "": kind = "group " + c.group cfg = c.groupCfg case c.groupCfg != nil: kind = "default" cfg = c.groupCfg default: kind = "fallback" } if cfg == nil { kind = "empty " + kind cfg = &resmgrConfig{} } return *cfg, kind } // getAdjustment is a helper method for getting a copy of external adjustments func (c *cachedConfig) getAdjustment() (resmgrAdjustment, resmgrAdjustment) { c.RLock() defer c.RUnlock() inscope := resmgrAdjustment{} for name, value := range c.inscope { inscope[name] = value } ignored := resmgrAdjustment{} for name, value := range c.ignored { ignored[name] = value } return inscope, ignored } // set node-specific configuration func (c *cachedConfig) setNode(data *map[string]string) bool { c.Lock() defer c.Unlock() c.nodeCfg = (*resmgrConfig)(data) return true } // set group-specific or default configuration func (c *cachedConfig) setGroup(group string, data *map[string]string) bool { c.Lock() defer c.Unlock() c.groupCfg = (*resmgrConfig)(data) c.group = group return c.nodeCfg == nil } // setAdjustment is a helper method for updating external adjustments func (c *cachedConfig) setAdjustment(adjust *resmgr.Adjustment) bool { var inscope, ignored bool var updated *resmgr.Adjustment c.Lock() defer c.Unlock() // // filter out updates // - for expired watches being recreated // - without any Spec changes (Status updates) // if updated, inscope = c.inscope[adjust.Name]; inscope { if adjust.HasSameVersion(updated) || adjust.Spec.Compare(&updated.Spec) { c.inscope[adjust.Name] = adjust return false } } else if updated, ignored = c.ignored[adjust.Name]; ignored { if adjust.HasSameVersion(updated) || adjust.Spec.Compare(&updated.Spec) { c.ignored[adjust.Name] = adjust return false } } // // we need to notify cri-resmgr if // - the adjustment applies to this node // - the adjustment used to apply to this node before the update // notify := false if adjust.Spec.IsNodeInScope(nodeName) { c.inscope[adjust.Name] = adjust if ignored { delete(c.ignored, adjust.Name) } notify = true } else { c.ignored[adjust.Name] = adjust if inscope { delete(c.inscope, adjust.Name) notify = true } } return notify } // deleteAdjustment is a helper method for updating external adjustments func (c *cachedConfig) deleteAdjustment(o *resmgr.Adjustment) bool { c.Lock() defer c.Unlock() // we need to notify cri-resmgr if the deleted adjustment used to apply to this node if _, ok := c.inscope[o.Name]; ok { delete(c.inscope, o.Name) return true } delete(c.ignored, o.Name) return false } // getAdjustmentNames returns the names of in scope and ignored adjustments. 
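The getConfig method above encodes a strict precedence: a node-specific ConfigMap always wins, then the group-specific (or shared default) map, with an empty configuration served only as a last resort. A compact restatement of that selection order with placeholder data; getAdjustmentNames follows below.

package main

import "fmt"

// pickConfig mirrors the precedence in cachedConfig.getConfig:
// node-specific data first, then group/default data, else empty.
func pickConfig(nodeCfg, groupCfg map[string]string, group string) (map[string]string, string) {
	var cfg map[string]string
	var kind string
	switch {
	case nodeCfg != nil:
		cfg, kind = nodeCfg, "node"
	case group != "":
		cfg, kind = groupCfg, "group "+group
	case groupCfg != nil:
		cfg, kind = groupCfg, "default"
	default:
		kind = "fallback"
	}
	if cfg == nil {
		cfg, kind = map[string]string{}, "empty "+kind
	}
	return cfg, kind
}

func main() {
	cfg, kind := pickConfig(nil, map[string]string{"policy": "balloons"}, "")
	fmt.Println(kind, cfg) // default map[policy:balloons]
}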
func (c *cachedConfig) getAdjustmentNames() ([]string, []string) { c.RLock() defer c.RUnlock() inscope := make([]string, 0, len(c.inscope)) ignored := make([]string, 0, len(c.ignored)) for name := range c.inscope { inscope = append(inscope, name) } for name := range c.ignored { ignored = append(ignored, name) } return inscope, ignored } // cache the status of the last adjustment update func (c *cachedConfig) setStatus(status *resmgrStatus) { c.Lock() defer c.Unlock() c.status = status } // get the last cached adjustment update status func (c *cachedConfig) getStatus() *resmgrStatus { c.RLock() defer c.RUnlock() return c.status } ================================================ FILE: pkg/apis/resmgr/expression.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( "fmt" "path" "path/filepath" "strings" logger "github.com/intel/cri-resource-manager/pkg/log" ) // Evaluable is the interface objects need to implement to be evaluable against Expressions. type Evaluable interface { Eval(string) interface{} } // Expression is used to describe criteria for selecting objects within a domain. type Expression struct { Key string `json:"key"` // key to check values of/against Op Operator `json:"operator"` // operator to apply to value of Key and Values Values []string `json:"values,omitempty"` // value(s) for domain key } const ( KeyPod = "pod" KeyID = "id" KeyUID = "uid" KeyName = "name" KeyNamespace = "namespace" KeyQOSClass = "qosclass" KeyLabels = "labels" KeyTags = "tags" ) // Operator defines the possible operators for an Expression. type Operator string const ( // Equals tests for equality with a single value. Equals Operator = "Equals" // NotEqual tests for inequality with a single value. NotEqual Operator = "NotEqual" // In tests if the key's value is one of the specified set. In Operator = "In" // NotIn tests if the key's value is not one of the specified set. NotIn Operator = "NotIn" // Exists evaluates to true if the named key exists. Exists Operator = "Exists" // NotExist evaluates to true if the named key does not exist. NotExist Operator = "NotExist" // AlwaysTrue always evaluates to true. AlwaysTrue Operator = "AlwaysTrue" // Matches tests if the key value matches the single given globbing pattern. Matches Operator = "Matches" // MatchesNot is true if Matches would be false for the same key and pattern. MatchesNot Operator = "MatchesNot" // MatchesAny tests if the key value matches any of the given globbing patterns. MatchesAny Operator = "MatchesAny" // MatchesNone is true if MatchesAny would be false for the same key and patterns. MatchesNone Operator = "MatchesNone" ) // Our logger instance. var log = logger.NewLogger("expression") // Validate checks the expression for (obvious) invalidity.
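Because Expression carries json struct tags, selection criteria are normally written as JSON (or YAML) and deserialized into the struct above. A sketch of that wire format using a local mirror of the type (the key and values are illustrative); Validate follows below.

package main

import (
	"encoding/json"
	"fmt"
)

// Expression mirrors the struct defined above, including its json tags.
type Expression struct {
	Key    string   `json:"key"`
	Op     string   `json:"operator"`
	Values []string `json:"values,omitempty"`
}

func main() {
	data := []byte(`{"key": "pod/qosclass", "operator": "In", "values": ["Burstable", "BestEffort"]}`)
	var e Expression
	if err := json.Unmarshal(data, &e); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", e) // {Key:pod/qosclass Op:In Values:[Burstable BestEffort]}
}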
func (e *Expression) Validate() error { if e == nil { return exprError("nil expression") } switch e.Op { case Equals, NotEqual: if len(e.Values) != 1 { return exprError("invalid expression, '%s' requires a single value", e.Op) } case Matches, MatchesNot: if len(e.Values) != 1 { return exprError("invalid expression, '%s' requires a single value", e.Op) } case Exists, NotExist: if e.Values != nil && len(e.Values) != 0 { return exprError("invalid expression, '%s' does not take any values", e.Op) } case In, NotIn: case MatchesAny, MatchesNone: case AlwaysTrue: default: return exprError("invalid expression, unknown operator: %q", e.Op) } return nil } // Evaluate evaluates an expression against a container. func (e *Expression) Evaluate(subject Evaluable) bool { log.Debug("evaluating %q @ %s...", *e, subject) if e.Op == AlwaysTrue { return true } value, ok := e.KeyValue(subject) result := false switch e.Op { case Equals: result = ok && (value == e.Values[0] || e.Values[0] == "*") case NotEqual: result = !ok || value != e.Values[0] case Matches, MatchesNot: match := false if ok { match, _ = filepath.Match(e.Values[0], value) } result = ok && match if e.Op == MatchesNot { result = !result } case In, NotIn: if ok { for _, v := range e.Values { if value == v || v == "*" { result = true } } } if e.Op == NotIn { result = !result } case MatchesAny, MatchesNone: if ok { for _, pattern := range e.Values { if match, _ := filepath.Match(pattern, value); match { result = true break } } } if e.Op == MatchesNone { result = !result } case Exists: result = ok case NotExist: result = !ok } log.Debug("%q @ %s => %v", *e, subject, result) return result } // KeyValue extracts the value of the expression key from a container. func (e *Expression) KeyValue(subject Evaluable) (string, bool) { log.Debug("looking up %q @ %s...", e.Key, subject) value := "" ok := false keys, vsep := splitKeys(e.Key) if len(keys) == 1 { value, ok, _ = ResolveRef(subject, keys[0]) } else { vals := make([]string, 0, len(keys)) for _, key := range keys { v, found, _ := ResolveRef(subject, key) vals = append(vals, v) ok = ok || found } value = strings.Join(vals, vsep) } log.Debug("%q @ %s => %q, %v", e.Key, subject, value, ok) return value, ok } func splitKeys(keys string) ([]string, string) { // joint key specs have two valid forms: // - ":<keylist>" (equivalent to ":::<keylist>") // - ":<ksep><vsep><keylist>" if len(keys) < 4 || keys[0] != ':' { return []string{keys}, "" } keys = keys[1:] ksep := keys[0:1] vsep := keys[1:2] if validSeparator(ksep[0]) && validSeparator(vsep[0]) { keys = keys[2:] } else { ksep = ":" vsep = ":" } return strings.Split(keys, ksep), vsep } func validSeparator(b byte) bool { switch { case '0' <= b && b <= '9': return false case 'a' <= b && b <= 'z': return false case 'A' <= b && b <= 'Z': return false case b == '/', b == '.': return false } return true } // ResolveRef walks an object trying to resolve a reference to a value.
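The joint-key syntax accepted by splitKeys is easy to misread: a key beginning with ':' optionally names a key separator and a value separator, then the separated key list, and KeyValue joins the resolved values with the value separator. A standalone restatement of that parsing (splitJointKey and isSep are local names for this sketch, not the package's functions); ResolveRef follows below.

package main

import (
	"fmt"
	"strings"
)

// isSep restates validSeparator: any byte that is not alphanumeric,
// '/' or '.' may act as a separator.
func isSep(b byte) bool {
	switch {
	case '0' <= b && b <= '9', 'a' <= b && b <= 'z', 'A' <= b && b <= 'Z', b == '/', b == '.':
		return false
	}
	return true
}

// splitJointKey restates splitKeys: ":<ksep><vsep><keylist>" picks explicit
// separators, while ":<keylist>" falls back to ':' for both.
func splitJointKey(spec string) ([]string, string) {
	if len(spec) < 4 || spec[0] != ':' {
		return []string{spec}, ""
	}
	rest := spec[1:]
	ksep, vsep := rest[0:1], rest[1:2]
	if isSep(ksep[0]) && isSep(vsep[0]) {
		rest = rest[2:]
	} else {
		ksep, vsep = ":", ":"
	}
	return strings.Split(rest, ksep), vsep
}

func main() {
	// ',' separates the keys, '-' joins the resolved values.
	keys, vsep := splitJointKey(":,-pod/qosclass,pod/namespace,pod/name,name")
	fmt.Println(keys, vsep) // [pod/qosclass pod/namespace pod/name name] -
}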
func ResolveRef(subject Evaluable, spec string) (string, bool, error) { var obj interface{} log.Debug("resolving %q @ %s...", spec, subject) spec = path.Clean(spec) ref := strings.Split(spec, "/") if len(ref) == 1 { if strings.Index(spec, ".") != -1 { ref = []string{"labels", spec} } } obj = subject for len(ref) > 0 { key := ref[0] log.Debug("resolve walking %q @ %s...", key, obj) switch v := obj.(type) { case string: obj = v case map[string]string: value, ok := v[key] if !ok { return "", false, nil } obj = value case error: return "", false, exprError("%s: failed to resolve %q: %v", subject, spec, v) default: e, ok := obj.(Evaluable) if !ok { return "", false, exprError("%s: failed to resolve %q, unexpected type %T", subject, spec, obj) } obj = e.Eval(key) } ref = ref[1:] } str, ok := obj.(string) if !ok { return "", false, exprError("%s: reference %q resolved to non-string: %T", subject, spec, obj) } log.Debug("resolved %q @ %s => %s", spec, subject, str) return str, true, nil } // String returns the expression as a string. func (e *Expression) String() string { return fmt.Sprintf("<%s %s %s>", e.Key, e.Op, strings.Join(e.Values, ",")) } // DeepCopy creates a deep copy of the expression. func (e *Expression) DeepCopy() *Expression { out := &Expression{} e.DeepCopyInto(out) return out } // DeepCopyInto copies the expression into another one. func (e *Expression) DeepCopyInto(out *Expression) { out.Key = e.Key out.Op = e.Op out.Values = make([]string, len(e.Values)) copy(out.Values, e.Values) } // exprError returns a formatted error specific to expressions. func exprError(format string, args ...interface{}) error { return fmt.Errorf("expression: "+format, args...) } ================================================ FILE: pkg/apis/resmgr/expression_test.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package resmgr import ( "fmt" "strings" "testing" logger "github.com/intel/cri-resource-manager/pkg/log" ) type evaluable struct { name string namespace string qosclass string labels map[string]string tags map[string]string parent Evaluable } func newEvaluable(name, ns, qos string, labels, tags map[string]string, p Evaluable) *evaluable { return &evaluable{ name: name, namespace: ns, qosclass: qos, labels: labels, tags: tags, parent: p, } } func (e *evaluable) Eval(key string) interface{} { switch key { case KeyName: return e.name case KeyNamespace: return e.namespace case KeyQOSClass: return e.qosclass case KeyLabels: return e.labels case KeyTags: return e.tags case KeyPod: if e.parent != nil { return e.parent } fallthrough default: return fmt.Errorf("evaluable: cannot evaluate %q", key) } } func (e *evaluable) String() string { s := fmt.Sprintf("{ name: %q, namespace: %q, qosclass: %q, ", e.name, e.namespace, e.qosclass) labels, t := "{", "" for k, v := range e.labels { labels += t + fmt.Sprintf("%q:%q", k, v) t = ", " } labels += "}" tags, t := "{", "" for k, v := range e.tags { tags += t + fmt.Sprintf("%q:%q", k, v) t = ", " } tags += "}" s = fmt.Sprintf("%s, labels: %s, tags: %s }", s, labels, tags) return s } func TestResolveRefAndKeyValue(t *testing.T) { defer logger.Flush() pod := newEvaluable("P1", "pns", "pqos", map[string]string{"l1": "plone", "l2": "pltwo", "l5": "plfive"}, nil, nil) tcases := []struct { name string subject Evaluable keys []string values []string ok []bool error []bool keyvalues []string }{ { name: "test resolving references", subject: newEvaluable("C1", "cns", "cqos", map[string]string{"l1": "clone", "l2": "cltwo", "l3": "clthree"}, map[string]string{"t1": "ctone", "t2": "cttwo", "t3": "ctthree"}, pod), keys: []string{ "name", "namespace", "qosclass", "labels/l1", "labels/l2", "labels/l3", "labels/l4", "tags/t1", "tags/t2", "tags/t3", "tags/t4", "pod/labels/l1", "pod/labels/l2", "pod/labels/l3", "pod/labels/l4", "pod/labels/l5", ":,-pod/qosclass,pod/namespace,pod/name,name", }, values: []string{ "C1", "cns", "cqos", "clone", "cltwo", "clthree", "", "ctone", "cttwo", "ctthree", "", "plone", "pltwo", "", "", "plfive", "", }, keyvalues: []string{ "C1", "cns", "cqos", "clone", "cltwo", "clthree", "", "ctone", "cttwo", "ctthree", "", "plone", "pltwo", "", "", "plfive", "pqos-pns-P1-C1", }, ok: []bool{ true, true, true, true, true, true, false, true, true, true, false, true, true, false, false, true, false, }, error: []bool{ false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { for i := range tc.keys { value, ok, err := ResolveRef(tc.subject, tc.keys[i]) if err != nil && !tc.error[i] { t.Errorf("ResolveRef %s/%q should have given %q, but failed: %v", tc.subject, tc.keys[i], tc.values[i], err) continue } if value != tc.values[i] || ok != tc.ok[i] { t.Errorf("ResolveRef %s@%q: expected %v, %v got %v, %v", tc.subject, tc.keys[i], tc.values[i], tc.ok[i], value, ok) continue } expr := &Expression{ Key: tc.keys[i], Op: Equals, Values: []string{}, } value, _ = expr.KeyValue(tc.subject) if value != tc.keyvalues[i] { t.Errorf("KeyValue %s@%q: expected %v, got %v", tc.subject, tc.keys[i], tc.keyvalues[i], value) } } }) } } func TestSimpleOperators(t *testing.T) { defer logger.Flush() pod := newEvaluable("P1", "pns", "pqos", map[string]string{"l1": "plone", "l2": "pltwo", "l5": "plfive"}, nil, nil) sub := newEvaluable("C1", "cns", "cqos", 
map[string]string{"l1": "clone", "l2": "cltwo", "l3": "clthree"}, map[string]string{"t1": "ctone", "t2": "cttwo", "t4": "ctfour"}, pod) tcases := []struct { name string subject Evaluable keys []string ops []Operator values [][][]string results [][]bool }{ { name: "test Equals, NotEqual, In, NotIn operators", subject: sub, keys: []string{ "name", "pod/name", "namespace", "pod/namespace", "qosclass", "pod/qosclass", "labels/l1", "labels/l2", "labels/l3", "labels/l4", "tags/t1", "tags/t2", "tags/t3", "tags/t4", "pod/labels/l1", "pod/labels/l2", "pod/labels/l3", "pod/labels/l4", "pod/labels/l5", }, ops: []Operator{Equals, NotEqual, In, NotIn}, values: [][][]string{ {{"C1"}, {"C1"}, {"foo", "C1"}, {"foo"}}, // name {{"P1"}, {"P1"}, {"foo", "P1"}, {"foo"}}, // pod/name {{"cns"}, {"cns"}, {"foo", "cns"}, {"foo"}}, // namespace {{"pns"}, {"pns"}, {"foo", "pns"}, {"pns"}}, // pod/namespace {{"cqos"}, {"cqos"}, {"foo", "cqos"}, {"foo"}}, // qosclass {{"pqos"}, {"pqos"}, {"foo", "pqos"}, {"pqos"}}, // pod/qosclass {{"clone"}, {"clone"}, {"foo", "clone"}, {"foo"}}, // labels/l1 {{"cltwo"}, {"cltwo"}, {"foo", "cltwo"}, {"foo"}}, // labels/l2 {{"clthree"}, {"clthree"}, {"foo", "clthree"}, {"clthree"}}, // labels/l3 {{"clfour"}, {"clfour"}, {"foo", "clfour"}, {"foo"}}, // labels/l4 {{"ctone"}, {"ctone"}, {"foo", "ctone"}, {"foo"}}, // tags/t1 {{"cttwo"}, {"cttwo"}, {"foo", "cttwo"}, {"foo"}}, // tags/t2 {{"ctthree"}, {"ctthree"}, {"foo", "ctthree"}, {"foo"}}, // tags/t3 {{"ctfour"}, {"ctfour"}, {"foo", "ctfour"}, {"ctfour"}}, // tags/t4 {{"plone"}, {"plone"}, {"foo", "plone"}, {"foo"}}, // pod/labels/l1 {{"pltwo"}, {"pltwo"}, {"foo", "pltwo"}, {"foo"}}, // pod/labels/l2 {{"plthree"}, {"plthree"}, {"foo", "plthree"}, {"foo"}}, // pod/labels/l3 {{"plfour"}, {"plfour"}, {"foo", "plfour"}, {"foo"}}, // pod/labels/l4 {{"plfive"}, {"plfive"}, {"foo", "plfive"}, {"foo"}}, // pod/labels/l5 }, results: [][]bool{ {true, false, true, true}, // name {true, false, true, true}, // pod/name {true, false, true, true}, // namespace {true, false, true, false}, // pod/namespace {true, false, true, true}, // qosclass {true, false, true, false}, // pod/qosclass {true, false, true, true}, // labels/l1 {true, false, true, true}, // labels/l2 {true, false, true, false}, // labels/l3 {false, true, false, true}, // labels/l4 {true, false, true, true}, // tags/t1 {true, false, true, true}, // tags/t2 {false, true, false, true}, // tags/t3 {true, false, true, false}, // tags/t4 {true, false, true, true}, // pod/labels/l1 {true, false, true, true}, // pod/labels/l2 {false, true, false, true}, // pod/labels/l3 {false, true, false, true}, // pod/labels/l4 {true, false, true, true}, // pod/labels/l5 }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { for k := range tc.keys { for o := range tc.ops { expr := &Expression{ Key: tc.keys[k], Op: tc.ops[o], Values: tc.values[k][o], } expect := tc.results[k][o] result := expr.Evaluate(tc.subject) if result != expect { t.Errorf("%s for %s: expected %v, got %v", expr, tc.subject, expect, result) } } } }) } } func TestMatching(t *testing.T) { defer logger.Flush() p1 := newEvaluable("P1", "pns1", "pqos1", map[string]string{"l1": "plv1", "l2": "plv2", "l5": "plv5"}, nil, nil) c11 := newEvaluable("C11", "cns1", "cqos11", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "tag2", "t4": "ctv4"}, p1) c12 := newEvaluable("C12", "cns1", "cqos12", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", 
"t2": "foo", "t4": "ctv4"}, p1) c13 := newEvaluable("C12", "cns1", "cqos13", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "ctv2", "t4": "ctv4"}, p1) p2 := newEvaluable("P2", "pns2", "pqos2", map[string]string{"l1": "plv1", "l2": "plv2", "l5": "plv5"}, nil, nil) c21 := newEvaluable("C21", "cns1", "cqos21", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "tag2", "t4": "ctv4"}, p2) c22 := newEvaluable("C22", "cns1", "cqos22", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "ctv2", "t4": "ctv4"}, p2) c23 := newEvaluable("C23", "cns1", "cqos23", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "foo", "t4": "ctv4"}, p2) p3 := newEvaluable("P3", "pns3", "pqos3", map[string]string{"l1": "plv1", "l2": "plv2", "l5": "plv5"}, nil, nil) c3 := newEvaluable("C3", "cns3", "cqos3", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "tag2", "t4": "ctv4"}, p3) tcases := []struct { name string subjects []Evaluable selectors []*Expression expected [][]string }{ { name: "test inverted membership operator", subjects: []Evaluable{c11, c12, c13, c21, c22, c23, c3}, selectors: []*Expression{ { Key: ":,:pod/qosclass,pod/namespace,pod/name,qosclass,name", Op: Matches, Values: []string{ "pqos2:*:*:*:*", }, }, { Key: "tags/t2", Op: Matches, Values: []string{"[tf][ao][go]*"}, }, }, expected: [][]string{ {"C21", "C22", "C23"}, {"C11", "C12", "C21", "C23", "C3"}, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { for i, expr := range tc.selectors { results := []string{} for _, s := range tc.subjects { if expr.Evaluate(s) { results = append(results, s.Eval("name").(string)) } } expected := strings.Join(tc.expected[i], ",") got := strings.Join(results, ",") if expected != got { t.Errorf("%s: expected %s, got %s", expr, expected, got) } } }) } } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/clientset.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package versioned import ( "fmt" criresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" discovery "k8s.io/client-go/discovery" rest "k8s.io/client-go/rest" flowcontrol "k8s.io/client-go/util/flowcontrol" ) type Interface interface { Discovery() discovery.DiscoveryInterface CriresmgrV1alpha1() criresmgrv1alpha1.CriresmgrV1alpha1Interface } // Clientset contains the clients for groups. Each group has exactly one // version included in a Clientset. 
type Clientset struct { *discovery.DiscoveryClient criresmgrV1alpha1 *criresmgrv1alpha1.CriresmgrV1alpha1Client } // CriresmgrV1alpha1 retrieves the CriresmgrV1alpha1Client func (c *Clientset) CriresmgrV1alpha1() criresmgrv1alpha1.CriresmgrV1alpha1Interface { return c.criresmgrV1alpha1 } // Discovery retrieves the DiscoveryClient func (c *Clientset) Discovery() discovery.DiscoveryInterface { if c == nil { return nil } return c.DiscoveryClient } // NewForConfig creates a new Clientset for the given config. // If config's RateLimiter is not set and QPS and Burst are acceptable, // NewForConfig will generate a rate-limiter in configShallowCopy. func NewForConfig(c *rest.Config) (*Clientset, error) { configShallowCopy := *c if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { if configShallowCopy.Burst <= 0 { return nil, fmt.Errorf("Burst is required to be greater than 0 when RateLimiter is not set and QPS is set to greater than 0") } configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) } var cs Clientset var err error cs.criresmgrV1alpha1, err = criresmgrv1alpha1.NewForConfig(&configShallowCopy) if err != nil { return nil, err } cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfig(&configShallowCopy) if err != nil { return nil, err } return &cs, nil } // NewForConfigOrDie creates a new Clientset for the given config and // panics if there is an error in the config. func NewForConfigOrDie(c *rest.Config) *Clientset { var cs Clientset cs.criresmgrV1alpha1 = criresmgrv1alpha1.NewForConfigOrDie(c) cs.DiscoveryClient = discovery.NewDiscoveryClientForConfigOrDie(c) return &cs } // New creates a new Clientset for the given RESTClient. func New(c rest.Interface) *Clientset { var cs Clientset cs.criresmgrV1alpha1 = criresmgrv1alpha1.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) return &cs } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // This package has the automatically generated clientset. package versioned ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/fake/clientset_generated.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package fake import ( clientset "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned" criresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" fakecriresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/fake" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/discovery" fakediscovery "k8s.io/client-go/discovery/fake" "k8s.io/client-go/testing" ) // NewSimpleClientset returns a clientset that will respond with the provided objects. // It's backed by a very simple object tracker that processes creates, updates and deletions as-is, // without applying any validations and/or defaults. It shouldn't be considered a replacement // for a real clientset and is mostly useful in simple unit tests. func NewSimpleClientset(objects ...runtime.Object) *Clientset { o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder()) for _, obj := range objects { if err := o.Add(obj); err != nil { panic(err) } } cs := &Clientset{tracker: o} cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} cs.AddReactor("*", "*", testing.ObjectReaction(o)) cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { gvr := action.GetResource() ns := action.GetNamespace() watch, err := o.Watch(gvr, ns) if err != nil { return false, nil, err } return true, watch, nil }) return cs } // Clientset implements clientset.Interface. Meant to be embedded into a // struct to get a default implementation. This makes faking out just the method // you want to test easier. type Clientset struct { testing.Fake discovery *fakediscovery.FakeDiscovery tracker testing.ObjectTracker } func (c *Clientset) Discovery() discovery.DiscoveryInterface { return c.discovery } func (c *Clientset) Tracker() testing.ObjectTracker { return c.tracker } var _ clientset.Interface = &Clientset{} // CriresmgrV1alpha1 retrieves the CriresmgrV1alpha1Client func (c *Clientset) CriresmgrV1alpha1() criresmgrv1alpha1.CriresmgrV1alpha1Interface { return &fakecriresmgrv1alpha1.FakeCriresmgrV1alpha1{Fake: &c.Fake} } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/fake/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // This package has the automatically generated fake clientset. package fake ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/fake/register.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package fake import ( criresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" ) var scheme = runtime.NewScheme() var codecs = serializer.NewCodecFactory(scheme) var parameterCodec = runtime.NewParameterCodec(scheme) var localSchemeBuilder = runtime.SchemeBuilder{ criresmgrv1alpha1.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. This allows composition // of clientsets, like in: // // import ( // "k8s.io/client-go/kubernetes" // clientsetscheme "k8s.io/client-go/kubernetes/scheme" // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" // ) // // kclientset, _ := kubernetes.NewForConfig(c) // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) // // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. var AddToScheme = localSchemeBuilder.AddToScheme func init() { v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"}) utilruntime.Must(AddToScheme(scheme)) } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/scheme/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // This package contains the scheme of the automatically generated clientset. package scheme ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/scheme/register.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package scheme import ( criresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" ) var Scheme = runtime.NewScheme() var Codecs = serializer.NewCodecFactory(Scheme) var ParameterCodec = runtime.NewParameterCodec(Scheme) var localSchemeBuilder = runtime.SchemeBuilder{ criresmgrv1alpha1.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. This allows composition // of clientsets, like in: // // import ( // "k8s.io/client-go/kubernetes" // clientsetscheme "k8s.io/client-go/kubernetes/scheme" // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" // ) // // kclientset, _ := kubernetes.NewForConfig(c) // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) // // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. var AddToScheme = localSchemeBuilder.AddToScheme func init() { v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"}) utilruntime.Must(AddToScheme(Scheme)) } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package v1alpha1 import ( "context" "time" scheme "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/scheme" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" rest "k8s.io/client-go/rest" ) // AdjustmentsGetter has a method to return a AdjustmentInterface. // A group's client should implement this interface. type AdjustmentsGetter interface { Adjustments(namespace string) AdjustmentInterface } // AdjustmentInterface has methods to work with Adjustment resources. 
type AdjustmentInterface interface { Create(*v1alpha1.Adjustment) (*v1alpha1.Adjustment, error) Update(*v1alpha1.Adjustment) (*v1alpha1.Adjustment, error) UpdateStatus(*v1alpha1.Adjustment) (*v1alpha1.Adjustment, error) Delete(name string, options *v1.DeleteOptions) error DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error Get(name string, options v1.GetOptions) (*v1alpha1.Adjustment, error) List(opts v1.ListOptions) (*v1alpha1.AdjustmentList, error) Watch(opts v1.ListOptions) (watch.Interface, error) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.Adjustment, err error) AdjustmentExpansion } // adjustments implements AdjustmentInterface type adjustments struct { client rest.Interface ns string } // newAdjustments returns a Adjustments func newAdjustments(c *CriresmgrV1alpha1Client, namespace string) *adjustments { return &adjustments{ client: c.RESTClient(), ns: namespace, } } // Get takes name of the adjustment, and returns the corresponding adjustment object, and an error if there is any. func (c *adjustments) Get(name string, options v1.GetOptions) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Get(). Namespace(c.ns). Resource("adjustments"). Name(name). VersionedParams(&options, scheme.ParameterCodec). Do(context.TODO()). Into(result) return } // List takes label and field selectors, and returns the list of Adjustments that match those selectors. func (c *adjustments) List(opts v1.ListOptions) (result *v1alpha1.AdjustmentList, err error) { var timeout time.Duration if opts.TimeoutSeconds != nil { timeout = time.Duration(*opts.TimeoutSeconds) * time.Second } result = &v1alpha1.AdjustmentList{} err = c.client.Get(). Namespace(c.ns). Resource("adjustments"). VersionedParams(&opts, scheme.ParameterCodec). Timeout(timeout). Do(context.TODO()). Into(result) return } // Watch returns a watch.Interface that watches the requested adjustments. func (c *adjustments) Watch(opts v1.ListOptions) (watch.Interface, error) { var timeout time.Duration if opts.TimeoutSeconds != nil { timeout = time.Duration(*opts.TimeoutSeconds) * time.Second } opts.Watch = true return c.client.Get(). Namespace(c.ns). Resource("adjustments"). VersionedParams(&opts, scheme.ParameterCodec). Timeout(timeout). Watch(context.TODO()) } // Create takes the representation of a adjustment and creates it. Returns the server's representation of the adjustment, and an error, if there is any. func (c *adjustments) Create(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Post(). Namespace(c.ns). Resource("adjustments"). Body(adjustment). Do(context.TODO()). Into(result) return } // Update takes the representation of a adjustment and updates it. Returns the server's representation of the adjustment, and an error, if there is any. func (c *adjustments) Update(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Put(). Namespace(c.ns). Resource("adjustments"). Name(adjustment.Name). Body(adjustment). Do(context.TODO()). Into(result) return } // UpdateStatus was generated because the type contains a Status member. // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). func (c *adjustments) UpdateStatus(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Put(). Namespace(c.ns). 
Resource("adjustments"). Name(adjustment.Name). SubResource("status"). Body(adjustment). Do(context.TODO()). Into(result) return } // Delete takes name of the adjustment and deletes it. Returns an error if one occurs. func (c *adjustments) Delete(name string, options *v1.DeleteOptions) error { return c.client.Delete(). Namespace(c.ns). Resource("adjustments"). Name(name). Body(options). Do(context.TODO()). Error() } // DeleteCollection deletes a collection of objects. func (c *adjustments) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { var timeout time.Duration if listOptions.TimeoutSeconds != nil { timeout = time.Duration(*listOptions.TimeoutSeconds) * time.Second } return c.client.Delete(). Namespace(c.ns). Resource("adjustments"). VersionedParams(&listOptions, scheme.ParameterCodec). Timeout(timeout). Body(options). Do(context.TODO()). Error() } // Patch applies the patch and returns the patched adjustment. func (c *adjustments) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Patch(pt). Namespace(c.ns). Resource("adjustments"). SubResource(subresources...). Name(name). Body(data). Do(context.TODO()). Into(result) return } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // This package has the automatically generated typed clients. package v1alpha1 ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/fake/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // Package fake has the automatically generated clients. package fake ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/fake/fake_adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package fake import ( v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" labels "k8s.io/apimachinery/pkg/labels" schema "k8s.io/apimachinery/pkg/runtime/schema" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" testing "k8s.io/client-go/testing" ) // FakeAdjustments implements AdjustmentInterface type FakeAdjustments struct { Fake *FakeCriresmgrV1alpha1 ns string } var adjustmentsResource = schema.GroupVersionResource{Group: "criresmgr.intel.com", Version: "v1alpha1", Resource: "adjustments"} var adjustmentsKind = schema.GroupVersionKind{Group: "criresmgr.intel.com", Version: "v1alpha1", Kind: "Adjustment"} // Get takes name of the adjustment, and returns the corresponding adjustment object, and an error if there is any. func (c *FakeAdjustments) Get(name string, options v1.GetOptions) (result *v1alpha1.Adjustment, err error) { obj, err := c.Fake. Invokes(testing.NewGetAction(adjustmentsResource, c.ns, name), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } // List takes label and field selectors, and returns the list of Adjustments that match those selectors. func (c *FakeAdjustments) List(opts v1.ListOptions) (result *v1alpha1.AdjustmentList, err error) { obj, err := c.Fake. Invokes(testing.NewListAction(adjustmentsResource, adjustmentsKind, c.ns, opts), &v1alpha1.AdjustmentList{}) if obj == nil { return nil, err } label, _, _ := testing.ExtractFromListOptions(opts) if label == nil { label = labels.Everything() } list := &v1alpha1.AdjustmentList{ListMeta: obj.(*v1alpha1.AdjustmentList).ListMeta} for _, item := range obj.(*v1alpha1.AdjustmentList).Items { if label.Matches(labels.Set(item.Labels)) { list.Items = append(list.Items, item) } } return list, err } // Watch returns a watch.Interface that watches the requested adjustments. func (c *FakeAdjustments) Watch(opts v1.ListOptions) (watch.Interface, error) { return c.Fake. InvokesWatch(testing.NewWatchAction(adjustmentsResource, c.ns, opts)) } // Create takes the representation of a adjustment and creates it. Returns the server's representation of the adjustment, and an error, if there is any. func (c *FakeAdjustments) Create(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { obj, err := c.Fake. Invokes(testing.NewCreateAction(adjustmentsResource, c.ns, adjustment), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } // Update takes the representation of a adjustment and updates it. Returns the server's representation of the adjustment, and an error, if there is any. func (c *FakeAdjustments) Update(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { obj, err := c.Fake. Invokes(testing.NewUpdateAction(adjustmentsResource, c.ns, adjustment), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } // UpdateStatus was generated because the type contains a Status member. 
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). func (c *FakeAdjustments) UpdateStatus(adjustment *v1alpha1.Adjustment) (*v1alpha1.Adjustment, error) { obj, err := c.Fake. Invokes(testing.NewUpdateSubresourceAction(adjustmentsResource, "status", c.ns, adjustment), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } // Delete takes name of the adjustment and deletes it. Returns an error if one occurs. func (c *FakeAdjustments) Delete(name string, options *v1.DeleteOptions) error { _, err := c.Fake. Invokes(testing.NewDeleteAction(adjustmentsResource, c.ns, name), &v1alpha1.Adjustment{}) return err } // DeleteCollection deletes a collection of objects. func (c *FakeAdjustments) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { action := testing.NewDeleteCollectionAction(adjustmentsResource, c.ns, listOptions) _, err := c.Fake.Invokes(action, &v1alpha1.AdjustmentList{}) return err } // Patch applies the patch and returns the patched adjustment. func (c *FakeAdjustments) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.Adjustment, err error) { obj, err := c.Fake. Invokes(testing.NewPatchSubresourceAction(adjustmentsResource, c.ns, name, pt, data, subresources...), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/fake/fake_resmgr_client.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package fake import ( v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" rest "k8s.io/client-go/rest" testing "k8s.io/client-go/testing" ) type FakeCriresmgrV1alpha1 struct { *testing.Fake } func (c *FakeCriresmgrV1alpha1) Adjustments(namespace string) v1alpha1.AdjustmentInterface { return &FakeAdjustments{c, namespace} } // RESTClient returns a RESTClient that is used to communicate // with API server by this client implementation. func (c *FakeCriresmgrV1alpha1) RESTClient() rest.Interface { var ret *rest.RESTClient return ret } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/generated_expansion.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package v1alpha1 type AdjustmentExpansion interface{} ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/resmgr_client.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package v1alpha1 import ( "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/scheme" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" rest "k8s.io/client-go/rest" ) type CriresmgrV1alpha1Interface interface { RESTClient() rest.Interface AdjustmentsGetter } // CriresmgrV1alpha1Client is used to interact with features provided by the criresmgr.intel.com group. type CriresmgrV1alpha1Client struct { restClient rest.Interface } func (c *CriresmgrV1alpha1Client) Adjustments(namespace string) AdjustmentInterface { return newAdjustments(c, namespace) } // NewForConfig creates a new CriresmgrV1alpha1Client for the given config. func NewForConfig(c *rest.Config) (*CriresmgrV1alpha1Client, error) { config := *c if err := setConfigDefaults(&config); err != nil { return nil, err } client, err := rest.RESTClientFor(&config) if err != nil { return nil, err } return &CriresmgrV1alpha1Client{client}, nil } // NewForConfigOrDie creates a new CriresmgrV1alpha1Client for the given config and // panics if there is an error in the config. func NewForConfigOrDie(c *rest.Config) *CriresmgrV1alpha1Client { client, err := NewForConfig(c) if err != nil { panic(err) } return client } // New creates a new CriresmgrV1alpha1Client for the given RESTClient. func New(c rest.Interface) *CriresmgrV1alpha1Client { return &CriresmgrV1alpha1Client{c} } func setConfigDefaults(config *rest.Config) error { gv := v1alpha1.SchemeGroupVersion config.GroupVersion = &gv config.APIPath = "/apis" config.NegotiatedSerializer = scheme.Codecs.WithoutConversion() if config.UserAgent == "" { config.UserAgent = rest.DefaultKubernetesUserAgent() } return nil } // RESTClient returns a RESTClient that is used to communicate // with API server by this client implementation. func (c *CriresmgrV1alpha1Client) RESTClient() rest.Interface { if c == nil { return nil } return c.restClient } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/factory.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. 
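A minimal usage sketch (not a file in this repository) for the typed client above; the *rest.Config is assumed to come from rest.InClusterConfig() or a kubeconfig loader, and the "default" namespace is illustrative.

package main

import (
	"fmt"

	criv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/rest"
)

// listAdjustments lists all Adjustments in one namespace with the typed client.
func listAdjustments(cfg *rest.Config) error {
	client, err := criv1alpha1.NewForConfig(cfg)
	if err != nil {
		return err
	}
	list, err := client.Adjustments("default").List(metav1.ListOptions{})
	if err != nil {
		return err
	}
	for _, adj := range list.Items {
		fmt.Printf("adjustment %s/%s\n", adj.Namespace, adj.Name)
	}
	return nil
}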
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package externalversions import ( reflect "reflect" sync "sync" time "time" versioned "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned" internalinterfaces "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/resmgr" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" ) // SharedInformerOption defines the functional option type for SharedInformerFactory. type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory type sharedInformerFactory struct { client versioned.Interface namespace string tweakListOptions internalinterfaces.TweakListOptionsFunc lock sync.Mutex defaultResync time.Duration customResync map[reflect.Type]time.Duration informers map[reflect.Type]cache.SharedIndexInformer // startedInformers is used for tracking which informers have been started. // This allows Start() to be called multiple times safely. startedInformers map[reflect.Type]bool } // WithCustomResyncConfig sets a custom resync period for the specified informer types. func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption { return func(factory *sharedInformerFactory) *sharedInformerFactory { for k, v := range resyncConfig { factory.customResync[reflect.TypeOf(k)] = v } return factory } } // WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory. func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption { return func(factory *sharedInformerFactory) *sharedInformerFactory { factory.tweakListOptions = tweakListOptions return factory } } // WithNamespace limits the SharedInformerFactory to the specified namespace. func WithNamespace(namespace string) SharedInformerOption { return func(factory *sharedInformerFactory) *sharedInformerFactory { factory.namespace = namespace return factory } } // NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces. func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory { return NewSharedInformerFactoryWithOptions(client, defaultResync) } // NewFilteredSharedInformerFactory constructs a new instance of sharedInformerFactory. // Listers obtained via this SharedInformerFactory will be subject to the same filters // as specified here. 
// Deprecated: Please use NewSharedInformerFactoryWithOptions instead func NewFilteredSharedInformerFactory(client versioned.Interface, defaultResync time.Duration, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerFactory { return NewSharedInformerFactoryWithOptions(client, defaultResync, WithNamespace(namespace), WithTweakListOptions(tweakListOptions)) } // NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options. func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, options ...SharedInformerOption) SharedInformerFactory { factory := &sharedInformerFactory{ client: client, namespace: v1.NamespaceAll, defaultResync: defaultResync, informers: make(map[reflect.Type]cache.SharedIndexInformer), startedInformers: make(map[reflect.Type]bool), customResync: make(map[reflect.Type]time.Duration), } // Apply all options for _, opt := range options { factory = opt(factory) } return factory } // Start initializes all requested informers. func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) { f.lock.Lock() defer f.lock.Unlock() for informerType, informer := range f.informers { if !f.startedInformers[informerType] { go informer.Run(stopCh) f.startedInformers[informerType] = true } } } // WaitForCacheSync waits for all started informers' cache were synced. func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool { informers := func() map[reflect.Type]cache.SharedIndexInformer { f.lock.Lock() defer f.lock.Unlock() informers := map[reflect.Type]cache.SharedIndexInformer{} for informerType, informer := range f.informers { if f.startedInformers[informerType] { informers[informerType] = informer } } return informers }() res := map[reflect.Type]bool{} for informType, informer := range informers { res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced) } return res } // InternalInformerFor returns the SharedIndexInformer for obj using an internal // client. func (f *sharedInformerFactory) InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer { f.lock.Lock() defer f.lock.Unlock() informerType := reflect.TypeOf(obj) informer, exists := f.informers[informerType] if exists { return informer } resyncPeriod, exists := f.customResync[informerType] if !exists { resyncPeriod = f.defaultResync } informer = newFunc(f.client, resyncPeriod) f.informers[informerType] = informer return informer } // SharedInformerFactory provides shared informers for resources in all known // API group versions. type SharedInformerFactory interface { internalinterfaces.SharedInformerFactory ForResource(resource schema.GroupVersionResource) (GenericInformer, error) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool Criresmgr() resmgr.Interface } func (f *sharedInformerFactory) Criresmgr() resmgr.Interface { return resmgr.New(f, f.namespace, f.tweakListOptions) } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/generic.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
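A minimal wiring sketch (not a file in this repository) for the factory above. It assumes the standard client-gen constructor versioned.NewForConfig from clientset.go; the 30-second resync period and the empty handler bodies are placeholders.

package main

import (
	"time"

	versioned "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned"
	externalversions "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/cache"
)

// watchAdjustments runs a shared Adjustment informer until stopCh is closed.
func watchAdjustments(cfg *rest.Config, stopCh <-chan struct{}) error {
	cs, err := versioned.NewForConfig(cfg)
	if err != nil {
		return err
	}
	factory := externalversions.NewSharedInformerFactory(cs, 30*time.Second)
	informer := factory.Criresmgr().V1alpha1().Adjustments().Informer()
	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    func(obj interface{}) { /* react to a new Adjustment */ },
		DeleteFunc: func(obj interface{}) { /* react to a removed Adjustment */ },
	})
	// Start is non-blocking: each requested informer runs in its own goroutine.
	factory.Start(stopCh)
	factory.WaitForCacheSync(stopCh)
	<-stopCh
	return nil
}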
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package externalversions import ( "fmt" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" ) // GenericInformer is type of SharedIndexInformer which will locate and delegate to other // sharedInformers based on type type GenericInformer interface { Informer() cache.SharedIndexInformer Lister() cache.GenericLister } type genericInformer struct { informer cache.SharedIndexInformer resource schema.GroupResource } // Informer returns the SharedIndexInformer. func (f *genericInformer) Informer() cache.SharedIndexInformer { return f.informer } // Lister returns the GenericLister. func (f *genericInformer) Lister() cache.GenericLister { return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) } // ForResource gives generic access to a shared informer of the matching type // TODO extend this to unknown resources with a client pool func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { switch resource { // Group=criresmgr.intel.com, Version=v1alpha1 case v1alpha1.SchemeGroupVersion.WithResource("adjustments"): return &genericInformer{resource: resource.GroupResource(), informer: f.Criresmgr().V1alpha1().Adjustments().Informer()}, nil } return nil, fmt.Errorf("no informer found for %v", resource) } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces/factory_interfaces.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package internalinterfaces import ( time "time" versioned "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" cache "k8s.io/client-go/tools/cache" ) // NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer // SharedInformerFactory a small interface to allow for adding an informer without an import cycle type SharedInformerFactory interface { Start(stopCh <-chan struct{}) InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer } // TweakListOptionsFunc is a function that transforms a v1.ListOptions. 
type TweakListOptionsFunc func(*v1.ListOptions) ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/resmgr/interface.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package resmgr import ( internalinterfaces "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/resmgr/v1alpha1" ) // Interface provides access to each of this group's versions. type Interface interface { // V1alpha1 provides access to shared informers for resources in V1alpha1. V1alpha1() v1alpha1.Interface } type group struct { factory internalinterfaces.SharedInformerFactory namespace string tweakListOptions internalinterfaces.TweakListOptionsFunc } // New returns a new Interface. func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} } // V1alpha1 returns a new v1alpha1.Interface. func (g *group) V1alpha1() v1alpha1.Interface { return v1alpha1.New(g.factory, g.namespace, g.tweakListOptions) } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/resmgr/v1alpha1/adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package v1alpha1 import ( time "time" versioned "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned" internalinterfaces "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/listers/resmgr/v1alpha1" resmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" watch "k8s.io/apimachinery/pkg/watch" cache "k8s.io/client-go/tools/cache" ) // AdjustmentInformer provides access to a shared informer and lister for // Adjustments. 
type AdjustmentInformer interface { Informer() cache.SharedIndexInformer Lister() v1alpha1.AdjustmentLister } type adjustmentInformer struct { factory internalinterfaces.SharedInformerFactory tweakListOptions internalinterfaces.TweakListOptionsFunc namespace string } // NewAdjustmentInformer constructs a new informer for Adjustment type. // Always prefer using an informer factory to get a shared informer instead of getting an independent // one. This reduces memory footprint and number of connections to the server. func NewAdjustmentInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { return NewFilteredAdjustmentInformer(client, namespace, resyncPeriod, indexers, nil) } // NewFilteredAdjustmentInformer constructs a new informer for Adjustment type. // Always prefer using an informer factory to get a shared informer instead of getting an independent // one. This reduces memory footprint and number of connections to the server. func NewFilteredAdjustmentInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { return cache.NewSharedIndexInformer( &cache.ListWatch{ ListFunc: func(options v1.ListOptions) (runtime.Object, error) { if tweakListOptions != nil { tweakListOptions(&options) } return client.CriresmgrV1alpha1().Adjustments(namespace).List(options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { tweakListOptions(&options) } return client.CriresmgrV1alpha1().Adjustments(namespace).Watch(options) }, }, &resmgrv1alpha1.Adjustment{}, resyncPeriod, indexers, ) } func (f *adjustmentInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { return NewFilteredAdjustmentInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) } func (f *adjustmentInformer) Informer() cache.SharedIndexInformer { return f.factory.InformerFor(&resmgrv1alpha1.Adjustment{}, f.defaultInformer) } func (f *adjustmentInformer) Lister() v1alpha1.AdjustmentLister { return v1alpha1.NewAdjustmentLister(f.Informer().GetIndexer()) } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/resmgr/v1alpha1/interface.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package v1alpha1 import ( internalinterfaces "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces" ) // Interface provides access to all the informers in this group version. type Interface interface { // Adjustments returns a AdjustmentInformer. 
Adjustments() AdjustmentInformer } type version struct { factory internalinterfaces.SharedInformerFactory namespace string tweakListOptions internalinterfaces.TweakListOptionsFunc } // New returns a new Interface. func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} } // Adjustments returns a AdjustmentInformer. func (v *version) Adjustments() AdjustmentInformer { return &adjustmentInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} } ================================================ FILE: pkg/apis/resmgr/generated/listers/resmgr/v1alpha1/adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by lister-gen. DO NOT EDIT. package v1alpha1 import ( v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/tools/cache" ) // AdjustmentLister helps list Adjustments. type AdjustmentLister interface { // List lists all Adjustments in the indexer. List(selector labels.Selector) (ret []*v1alpha1.Adjustment, err error) // Adjustments returns an object that can list and get Adjustments. Adjustments(namespace string) AdjustmentNamespaceLister AdjustmentListerExpansion } // adjustmentLister implements the AdjustmentLister interface. type adjustmentLister struct { indexer cache.Indexer } // NewAdjustmentLister returns a new AdjustmentLister. func NewAdjustmentLister(indexer cache.Indexer) AdjustmentLister { return &adjustmentLister{indexer: indexer} } // List lists all Adjustments in the indexer. func (s *adjustmentLister) List(selector labels.Selector) (ret []*v1alpha1.Adjustment, err error) { err = cache.ListAll(s.indexer, selector, func(m interface{}) { ret = append(ret, m.(*v1alpha1.Adjustment)) }) return ret, err } // Adjustments returns an object that can list and get Adjustments. func (s *adjustmentLister) Adjustments(namespace string) AdjustmentNamespaceLister { return adjustmentNamespaceLister{indexer: s.indexer, namespace: namespace} } // AdjustmentNamespaceLister helps list and get Adjustments. type AdjustmentNamespaceLister interface { // List lists all Adjustments in the indexer for a given namespace. List(selector labels.Selector) (ret []*v1alpha1.Adjustment, err error) // Get retrieves the Adjustment from the indexer for a given namespace and name. Get(name string) (*v1alpha1.Adjustment, error) AdjustmentNamespaceListerExpansion } // adjustmentNamespaceLister implements the AdjustmentNamespaceLister // interface. type adjustmentNamespaceLister struct { indexer cache.Indexer namespace string } // List lists all Adjustments in the indexer for a given namespace. 
func (s adjustmentNamespaceLister) List(selector labels.Selector) (ret []*v1alpha1.Adjustment, err error) { err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { ret = append(ret, m.(*v1alpha1.Adjustment)) }) return ret, err } // Get retrieves the Adjustment from the indexer for a given namespace and name. func (s adjustmentNamespaceLister) Get(name string) (*v1alpha1.Adjustment, error) { obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) if err != nil { return nil, err } if !exists { return nil, errors.NewNotFound(v1alpha1.Resource("adjustment"), name) } return obj.(*v1alpha1.Adjustment), nil } ================================================ FILE: pkg/apis/resmgr/generated/listers/resmgr/v1alpha1/expansion_generated.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by lister-gen. DO NOT EDIT. package v1alpha1 // AdjustmentListerExpansion allows custom methods to be added to // AdjustmentLister. type AdjustmentListerExpansion interface{} // AdjustmentNamespaceListerExpansion allows custom methods to be added to // AdjustmentNamespaceLister. type AdjustmentNamespaceListerExpansion interface{} ================================================ FILE: pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml ================================================ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: adjustments.criresmgr.intel.com spec: group: criresmgr.intel.com names: kind: Adjustment singular: adjustment plural: adjustments scope: Namespaced versions: - name: v1alpha1 served: true storage: true schema: # openAPI V3 Schema for validating adjustments openAPIV3Schema: type: object required: [ spec ] properties: spec: type: object required: [ scope ] properties: scope: type: array items: type: object properties: nodes: type: array items: type: string containers: type: array items: type: object properties: key: type: string operator: type: string values: type: array items: type: string resources: type: object properties: requests: type: object properties: cpu: type: string memory: type: string limits: type: object properties: cpu: type: string memory: type: string classes: type: object properties: rdt: type: string blockio: type: string toptierLimit: type: string status: type: object properties: nodes: type: object additionalProperties: type: object properties: errors: type: object additionalProperties: type: string ================================================ FILE: pkg/apis/resmgr/v1alpha1/adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
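Once an informer has synced, the lister above serves reads from the local indexer instead of the API server. A minimal sketch (not a file in this repository), reusing a factory wired up as in the informer example earlier; the namespace is illustrative.

package main

import (
	"fmt"

	externalversions "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions"
	"k8s.io/apimachinery/pkg/labels"
)

// printCachedAdjustments lists Adjustments from the shared informer cache.
func printCachedAdjustments(factory externalversions.SharedInformerFactory) error {
	lister := factory.Criresmgr().V1alpha1().Adjustments().Lister()
	adjs, err := lister.Adjustments("default").List(labels.Everything())
	if err != nil {
		return err
	}
	for _, adj := range adjs {
		fmt.Println(adj.Name)
	}
	return nil
}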
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1alpha1

import (
	"fmt"
	"strings"

	resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr"
	corev1 "k8s.io/api/core/v1"
)

// HasSameVersion checks if the adjustment has the same version as the other.
func (a *Adjustment) HasSameVersion(o *Adjustment) bool {
	if a.ResourceVersion != o.ResourceVersion {
		return false
	}
	if a.Generation != o.Generation {
		return false
	}
	return true
}

// NodeScope returns the sub-slice of scopes that apply to the given node.
func (spec *AdjustmentSpec) NodeScope(node string) []AdjustmentScope {
	filtered := []AdjustmentScope{}
	for _, scope := range spec.Scope {
		if scope.IsNodeInScope(node) {
			filtered = append(filtered, scope)
		}
	}
	return filtered
}

// GetResourceRequirements returns the k8s resource requirements for this adjustment.
func (spec *AdjustmentSpec) GetResourceRequirements() (corev1.ResourceRequirements, bool) {
	if spec.Resources != nil {
		return *spec.Resources, true
	}
	return corev1.ResourceRequirements{}, false
}

// GetRDTClass returns the RDT class for this adjustment.
func (spec *AdjustmentSpec) GetRDTClass() (string, bool) {
	if spec.Classes == nil || spec.Classes.RDT == nil {
		return "", false
	}
	return *spec.Classes.RDT, true
}

// GetBlockIOClass returns the Block I/O class for this adjustment.
func (spec *AdjustmentSpec) GetBlockIOClass() (string, bool) {
	if spec.Classes == nil || spec.Classes.BlockIO == nil {
		return "", false
	}
	return *spec.Classes.BlockIO, true
}

// IsNodeInScope tests if the node is within the scope of this spec.
func (spec *AdjustmentSpec) IsNodeInScope(node string) bool {
	if len(spec.Scope) == 0 {
		return true
	}
	for _, s := range spec.Scope {
		if s.IsNodeInScope(node) {
			return true
		}
	}
	return false
}

// IsContainerInScope tests if the container is within the scope of this spec.
func (spec *AdjustmentSpec) IsContainerInScope(container resmgr.Evaluable) bool {
	if len(spec.Scope) == 0 {
		return true
	}
	for _, s := range spec.Scope {
		if s.IsContainerInScope(container) {
			return true
		}
	}
	return false
}

// Compare checks if this spec is identical to another.
func (spec *AdjustmentSpec) Compare(other *AdjustmentSpec) bool {
	switch {
	case !CompareScopes(spec.Scope, other.Scope):
		return false
	case !spec.compareResources(other):
		return false
	case !spec.Classes.Compare(other.Classes):
		return false
	case spec.ToptierLimit == nil && other.ToptierLimit != nil:
		return false
	case spec.ToptierLimit != nil && other.ToptierLimit == nil:
		return false
	case spec.ToptierLimit != nil && spec.ToptierLimit.Value() != other.ToptierLimit.Value():
		return false
	}
	return true
}

// Verify checks the given spec for obvious errors.
func (spec *AdjustmentSpec) Verify() error {
	if err := spec.verifyResources(); err != nil {
		return err
	}
	if err := spec.verifyToptierLimit(); err != nil {
		return err
	}
	return nil
}

// Check if the resources in this spec are identical to another one.
func (spec *AdjustmentSpec) compareResources(other *AdjustmentSpec) bool {
	switch {
	case spec == nil && other == nil:
		return true
	// a nil spec is only identical to another nil spec
	case spec != nil && other == nil:
		return false
	case spec == nil && other != nil:
		return false
	case spec.Resources == nil && other.Resources == nil:
		return true
	case spec.Resources != nil && other.Resources == nil:
		return false
	case spec.Resources == nil && other.Resources != nil:
		return false
	}
	r := *spec.Resources
	o := *other.Resources
	if len(r.Requests) != len(o.Requests) {
		return false
	}
	if len(r.Limits) != len(o.Limits) {
		return false
	}
	for name, qty := range r.Requests {
		oqty, ok := o.Requests[name]
		if !ok || qty.Cmp(oqty) != 0 {
			return false
		}
	}
	for name, qty := range r.Limits {
		oqty, ok := o.Limits[name]
		if !ok || qty.Cmp(oqty) != 0 {
			return false
		}
	}
	return true
}

// verifyResources verifies the resource requirements of this spec.
func (spec *AdjustmentSpec) verifyResources() error {
	if spec.Resources == nil {
		return nil
	}
	r := *spec.Resources
	if r.Requests == nil {
		r.Requests = corev1.ResourceList{}
	}
	if r.Limits == nil {
		r.Limits = corev1.ResourceList{}
	}
	req, rok := r.Requests[corev1.ResourceCPU]
	lim, lok := r.Limits[corev1.ResourceCPU]
	switch {
	case !rok && lok:
		r.Requests[corev1.ResourceCPU] = lim
	case rok && lok:
		if lim.Cmp(req) < 0 {
			return apiError("invalid CPU limit %q < request %q", lim, req)
		}
	}
	req, rok = r.Requests[corev1.ResourceMemory]
	lim, lok = r.Limits[corev1.ResourceMemory]
	switch {
	case !rok && lok:
		r.Requests[corev1.ResourceMemory] = lim
	case rok && lok:
		if lim.Cmp(req) < 0 {
			return apiError("invalid memory limit %q < request %q", lim, req)
		}
	}
	for name := range r.Requests {
		switch name {
		case corev1.ResourceCPU, corev1.ResourceMemory:
		default:
			return apiError("invalid resource requests: unsupported resource %v", name)
		}
	}
	for name := range r.Limits {
		switch name {
		case corev1.ResourceCPU, corev1.ResourceMemory:
		default:
			return apiError("invalid resource limits: unsupported resource %v", name)
		}
	}
	return nil
}

// verifyToptierLimit verifies the top tier memory limit settings of this spec.
func (spec *AdjustmentSpec) verifyToptierLimit() error {
	if spec.ToptierLimit == nil {
		return nil
	}
	l := spec.ToptierLimit.Value()
	if l < 0 {
		return apiError("invalid ToptierLimit %v", l)
	}
	return nil
}

// IsNodeInScope tests if the node is within this scope.
func (scope *AdjustmentScope) IsNodeInScope(node string) bool {
	if len(scope.Nodes) == 0 {
		return true
	}
	for _, n := range scope.Nodes {
		if matches(n, node) {
			return true
		}
	}
	return false
}

// IsContainerInScope tests if the container is within this scope.
func (scope *AdjustmentScope) IsContainerInScope(container resmgr.Evaluable) bool {
	if len(scope.Containers) == 0 {
		return true
	}
	for _, expr := range scope.Containers {
		if expr.Evaluate(container) {
			return true
		}
	}
	return false
}

// match a string against a primitive pattern with a single optional trailing '*'.
func matches(pattern, name string) bool {
	if pattern == "" {
		return true
	}
	if !strings.HasSuffix(pattern, "*") {
		return pattern == name
	}
	return strings.HasPrefix(name, pattern[0:len(pattern)-1])
}

// CompareScopes checks if two slices of scopes are (syntactically) identical.
func CompareScopes(scopes []AdjustmentScope, others []AdjustmentScope) bool {
	if len(scopes) != len(others) {
		return false
	}
	for idx, s := range scopes {
		o := others[idx]
		if !s.Compare(&o) {
			return false
		}
	}
	return true
}

// Compare checks if the scope is identical to another one.
func (scope *AdjustmentScope) Compare(other *AdjustmentScope) bool {
	if len(scope.Nodes) != len(other.Nodes) || len(scope.Containers) != len(other.Containers) {
		return false
	}
	for idx, n := range scope.Nodes {
		if other.Nodes[idx] != n {
			return false
		}
	}
	for idx, c := range scope.Containers {
		if other.Containers[idx] != c {
			return false
		}
	}
	return true
}

// Compare checks if the classes are identical to another set.
func (c *Classes) Compare(o *Classes) bool {
	switch {
	case c == nil && o == nil:
		return true
	case c != nil && o == nil, c == nil && o != nil:
		return false
	case c.RDT != nil && o.RDT == nil, c.RDT == nil && o.RDT != nil:
		return false
	case c.BlockIO != nil && o.BlockIO == nil, c.BlockIO == nil && o.BlockIO != nil:
		return false
	case c.RDT == nil && c.BlockIO == nil:
		return true
	}
	// Dereference each class only when it is set; dereferencing both
	// unconditionally would panic when only one of them is assigned.
	if c.RDT != nil && *c.RDT != *o.RDT {
		return false
	}
	if c.BlockIO != nil && *c.BlockIO != *o.BlockIO {
		return false
	}
	return true
}

// apiError returns a format error specific to this API.
func apiError(format string, args ...interface{}) error {
	return fmt.Errorf("adjustment API error: "+format, args...)
}

================================================
FILE: pkg/apis/resmgr/v1alpha1/doc.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +k8s:deepcopy-gen=package
// +groupName=criresmgr.intel.com
package v1alpha1

================================================
FILE: pkg/apis/resmgr/v1alpha1/register.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1alpha1

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

var (
	// SchemeBuilder initializes a scheme builder
	SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)
	// AddToScheme is a global function that registers this API group & version to a scheme
	AddToScheme = SchemeBuilder.AddToScheme
)

// SchemeGroupVersion is group version used to register these objects.
var SchemeGroupVersion = schema.GroupVersion{
	Group:   GroupName,
	Version: Version,
}

func Resource(resource string) schema.GroupResource {
	return SchemeGroupVersion.WithResource(resource).GroupResource()
}

// Adds the list of known types to api.Scheme.
func addKnownTypes(scheme *runtime.Scheme) error { scheme.AddKnownTypes(SchemeGroupVersion, &Adjustment{}, &AdjustmentList{}, ) metav1.AddToGroupVersion(scheme, SchemeGroupVersion) return nil } ================================================ FILE: pkg/apis/resmgr/v1alpha1/types.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package v1alpha1 import ( corev1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr" ) const ( GroupName string = "criresmgr.intel.com" // GroupName is the group of our CRD. Version string = "v1alpha1" // Version is the API version of our CRD. Kind string = "Adjustment" // Kind is the object kind of our CRD. Plural string = "adjustments" // Plural is Kind in plural form. Singular string = "adjustment" // Singular is Kind in singular form. Name string = Plural + "." + GroupName // Name is the full name of our CRD. ) // +genclient // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // Adjustment is a CRD used to externally adjust containers resource assignments. type Adjustment struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` Spec AdjustmentSpec `json:"spec"` Status AdjustmentStatus `json:"status"` } // AdjustmentSpec specifies the scope for an external adjustment. type AdjustmentSpec struct { Scope []AdjustmentScope `json:"scope"` Resources *corev1.ResourceRequirements `json:"resources"` Classes *Classes `json:"classes"` ToptierLimit *resapi.Quantity `json:"toptierLimit"` } // AdjustmentStatus represents the status of applying an adjustment. type AdjustmentStatus struct { Nodes map[string]AdjustmentNodeStatus `json:"nodes"` } // AdjustmentNodeStatus represents the status of an adjustment on a node. type AdjustmentNodeStatus struct { Errors map[string]string `json:"errors"` } // AdjustmentScope defines the scope for an adjustment. type AdjustmentScope struct { Nodes []string `json:"nodes"` Containers []*resmgr.Expression `json:"containers"` } // Classes defines RDT and BlockIO class assignments. type Classes struct { BlockIO *string `json:"blockio"` RDT *string `json:"rdt"` } // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // AdjustmentList is a list of Adjustments. type AdjustmentList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata"` Items []Adjustment `json:"items"` } ================================================ FILE: pkg/apis/resmgr/v1alpha1/zz_generated.deepcopy.go ================================================ //go:build !ignore_autogenerated // +build !ignore_autogenerated // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
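Putting the types above together with the helper methods from adjustment.go: a minimal sketch (not a file in this repository) that constructs a hypothetical AdjustmentSpec, validates it with Verify(), and exercises the trailing-'*' node pattern implemented by matches().

package main

import (
	"fmt"

	v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func exampleSpec() error {
	rdt := "gold" // hypothetical RDT class name
	spec := &v1alpha1.AdjustmentSpec{
		// "worker-*" matches any node whose name starts with "worker-".
		Scope: []v1alpha1.AdjustmentScope{{Nodes: []string{"worker-*"}}},
		Resources: &corev1.ResourceRequirements{
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("500m"),
				corev1.ResourceMemory: resource.MustParse("256Mi"),
			},
		},
		Classes: &v1alpha1.Classes{RDT: &rdt},
	}
	if err := spec.Verify(); err != nil {
		return err
	}
	fmt.Println(spec.IsNodeInScope("worker-1")) // true
	fmt.Println(spec.IsNodeInScope("master-1")) // false
	return nil
}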
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by deepcopy-gen. DO NOT EDIT. package v1alpha1 import ( resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr" v1 "k8s.io/api/core/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Adjustment) DeepCopyInto(out *Adjustment) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Spec.DeepCopyInto(&out.Spec) in.Status.DeepCopyInto(&out.Status) return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Adjustment. func (in *Adjustment) DeepCopy() *Adjustment { if in == nil { return nil } out := new(Adjustment) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. func (in *Adjustment) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } return nil } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentList) DeepCopyInto(out *AdjustmentList) { *out = *in out.TypeMeta = in.TypeMeta in.ListMeta.DeepCopyInto(&out.ListMeta) if in.Items != nil { in, out := &in.Items, &out.Items *out = make([]Adjustment, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentList. func (in *AdjustmentList) DeepCopy() *AdjustmentList { if in == nil { return nil } out := new(AdjustmentList) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. func (in *AdjustmentList) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } return nil } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentNodeStatus) DeepCopyInto(out *AdjustmentNodeStatus) { *out = *in if in.Errors != nil { in, out := &in.Errors, &out.Errors *out = make(map[string]string, len(*in)) for key, val := range *in { (*out)[key] = val } } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentNodeStatus. func (in *AdjustmentNodeStatus) DeepCopy() *AdjustmentNodeStatus { if in == nil { return nil } out := new(AdjustmentNodeStatus) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentScope) DeepCopyInto(out *AdjustmentScope) { *out = *in if in.Nodes != nil { in, out := &in.Nodes, &out.Nodes *out = make([]string, len(*in)) copy(*out, *in) } if in.Containers != nil { in, out := &in.Containers, &out.Containers *out = make([]*resmgr.Expression, len(*in)) for i := range *in { if (*in)[i] != nil { in, out := &(*in)[i], &(*out)[i] *out = (*in).DeepCopy() } } } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentScope. 
func (in *AdjustmentScope) DeepCopy() *AdjustmentScope { if in == nil { return nil } out := new(AdjustmentScope) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentSpec) DeepCopyInto(out *AdjustmentSpec) { *out = *in if in.Scope != nil { in, out := &in.Scope, &out.Scope *out = make([]AdjustmentScope, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } if in.Resources != nil { in, out := &in.Resources, &out.Resources *out = new(v1.ResourceRequirements) (*in).DeepCopyInto(*out) } if in.Classes != nil { in, out := &in.Classes, &out.Classes *out = new(Classes) (*in).DeepCopyInto(*out) } if in.ToptierLimit != nil { in, out := &in.ToptierLimit, &out.ToptierLimit x := (*in).DeepCopy() *out = &x } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentSpec. func (in *AdjustmentSpec) DeepCopy() *AdjustmentSpec { if in == nil { return nil } out := new(AdjustmentSpec) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentStatus) DeepCopyInto(out *AdjustmentStatus) { *out = *in if in.Nodes != nil { in, out := &in.Nodes, &out.Nodes *out = make(map[string]AdjustmentNodeStatus, len(*in)) for key, val := range *in { (*out)[key] = *val.DeepCopy() } } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentStatus. func (in *AdjustmentStatus) DeepCopy() *AdjustmentStatus { if in == nil { return nil } out := new(AdjustmentStatus) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Classes) DeepCopyInto(out *Classes) { *out = *in if in.BlockIO != nil { in, out := &in.BlockIO, &out.BlockIO *out = new(string) **out = **in } if in.RDT != nil { in, out := &in.RDT, &out.RDT *out = new(string) **out = **in } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Classes. func (in *Classes) DeepCopy() *Classes { if in == nil { return nil } out := new(Classes) in.DeepCopyInto(out) return out }
================================================ FILE: pkg/avx/collector.go ================================================
/* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package avx //go:generate go run elfdump.go import ( "bytes" "flag" "fmt" "os" "path/filepath" "regexp" "strconv" "strings" "sync" "syscall" "unsafe" bpf "github.com/cilium/ebpf" "github.com/intel/cri-resource-manager/pkg/cgroups" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "golang.org/x/sys/unix" ) const ( // LastCPUName is the Prometheus Gauge name for last CPU with AVX512 instructions.
LastCPUName = "last_cpu_avx_task_switches" // AVXSwitchCountName is the Prometheus Gauge name for AVX switch count per cgroup. AVXSwitchCountName = "avx_switch_count_per_cgroup" // AllSwitchCountName is the Prometheus Gauge name for all switch count per cgroup. AllSwitchCountName = "all_switch_count_per_cgroup" // LastUpdateNs is the Prometheus Gauge name for the per-cgroup AVX512 activity timestamp. LastUpdateNs = "last_update_ns" // Path to kernel tracepoints kernelTracepointPath = "/sys/kernel/debug/tracing/events" // rlimit value (512k) needed to lock map data in memory mapMemLockLimit = 524288 ) // Prometheus Metric descriptor indices and descriptor table const ( lastCPUDesc = iota avxSwitchCountDesc allSwitchCountDesc lastUpdateNsDesc numDescriptors ) var descriptors = [numDescriptors]*prometheus.Desc{ lastCPUDesc: prometheus.NewDesc( LastCPUName, "Number of task switches on the CPU where AVX512 instructions were used.", []string{ "cpu_id", }, nil, ), avxSwitchCountDesc: prometheus.NewDesc( AVXSwitchCountName, "Number of task switches where AVX512 instructions were used in a particular cgroup.", []string{ "container_id", }, nil, ), allSwitchCountDesc: prometheus.NewDesc( AllSwitchCountName, "Total number of task switches in a particular cgroup.", []string{ "container_id", }, nil, ), lastUpdateNsDesc: prometheus.NewDesc( LastUpdateNs, "Time since last AVX512 activity in a particular cgroup.", []string{ "container_id", }, nil, ), } var ( bpfBinaryName = "avx512.o" bpfInstallpath = "/usr/libexec/bpf" // our logger instance log = logger.NewLogger("avx") ) type collector struct { root string ebpf *bpf.Collection fds []int } func enablePerfTracepoint(prog *bpf.Program, tracepoint string) (int, error) { id, err := os.ReadFile(filepath.Join(kernelTracepointPath, tracepoint, "id")) if err != nil { return -1, errors.Wrap(err, "unable to read tracepoint ID") } tid, err := strconv.Atoi(strings.TrimSpace(string(id))) if err != nil { return -1, errors.New("unable to convert tracepoint ID") } attr := unix.PerfEventAttr{ Type: unix.PERF_TYPE_TRACEPOINT, Config: uint64(tid), // tracepoint id Sample_type: unix.PERF_SAMPLE_RAW, Sample: 1, Wakeup: 1, } pfd, err := unix.PerfEventOpen(&attr, -1, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) if err != nil { return -1, errors.Wrap(err, "unable to open perf events") } if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(pfd), unix.PERF_EVENT_IOC_ENABLE, 0); errno != 0 { return -1, errors.Errorf("unable to set up perf events: %s", errno) } if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(pfd), unix.PERF_EVENT_IOC_SET_BPF, uintptr(prog.FD())); errno != 0 { return -1, errors.Errorf("unable to attach bpf program to perf events: %s", errno) } return pfd, nil } func getKernelVersion() uint32 { var uts unix.Utsname err := unix.Uname(&uts) if err != nil { return 0 } str := string(bytes.SplitN(uts.Release[:], []byte{0}, 2)[0]) ver := strings.SplitN(str, ".", 3) major, err := strconv.ParseUint(ver[0], 10, 8) if err != nil { return 0 } minor, err := strconv.ParseUint(ver[1], 10, 8) if err != nil { return uint32(major << 16) } // ignore patch version return uint32(major<<16 + minor<<8) } func kernelVersionStr(v uint32) string { return fmt.Sprintf("%d.%d.0", v>>16, (v>>8)&0xff) } // NewCollector creates new Prometheus collector for AVX metrics func NewCollector() (prometheus.Collector, error) { // Set rlimit to be able to lock map values in memory memlockLimit := &unix.Rlimit{ Cur: mapMemLockLimit, Max: mapMemLockLimit, } err := unix.Setrlimit(unix.RLIMIT_MEMLOCK,
memlockLimit) if err != nil { return nil, errors.Wrap(err, "unable to set rlimit") } spec, err := bpf.LoadCollectionSpec(filepath.Join(bpfInstallpath, bpfBinaryName)) if err != nil { log.Info("Unable to load user eBPF (%v). Using default CollectionSpec from ELF program bytes", err) spec, err = bpf.LoadCollectionSpecFromReader(bytes.NewReader(program[:])) if err != nil { return nil, errors.Wrap(err, "unable to load default CollectionSpec from ELF program bytes") } } hostVer := getKernelVersion() progVer := spec.Programs["tracepoint__x86_fpu_regs_deactivated"].KernelVersion if hostVer < progVer { return nil, errors.Errorf("host kernel version (v%s) is too old to run the AVX512 collector program, minimum version is v%s", kernelVersionStr(hostVer), kernelVersionStr(progVer)) } collection, err := bpf.NewCollection(spec) if err != nil { return nil, errors.Wrap(err, "unable to create new Collection") } ffd, err := enablePerfTracepoint(collection.Programs["tracepoint__x86_fpu_regs_deactivated"], "x86_fpu/x86_fpu_regs_deactivated") if err != nil { return nil, errors.Wrap(err, "unable to enable fpu tracepoint") } sfd, err := enablePerfTracepoint(collection.Programs["tracepoint__sched_switch"], "sched/sched_switch") if err != nil { return nil, errors.Wrap(err, "unable to enable sched tracepoint") } return &collector{ root: cgroups.GetV2Dir(), ebpf: collection, fds: []int{ffd, sfd}, }, nil } // Describe implements prometheus.Collector interface func (c *collector) Describe(ch chan<- *prometheus.Desc) { for _, d := range descriptors { ch <- d } } // from iovisor/gobpf: bpf.NowNanoseconds() // nowNanoseconds returns a time that can be compared to bpf_ktime_get_ns() func nowNanoseconds() uint64 { var ts syscall.Timespec syscall.Syscall(syscall.SYS_CLOCK_GETTIME, 1 /* CLOCK_MONOTONIC */, uintptr(unsafe.Pointer(&ts)), 0) sec, nsec := ts.Unix() return 1000*1000*1000*uint64(sec) + uint64(nsec) } // Collect implements prometheus.Collector interface func (c collector) Collect(ch chan<- prometheus.Metric) { var ( wg sync.WaitGroup key uint64 perCPUVal []uint32 ) cgroupids := make(map[uint64]uint32) lastCPUs := make(map[string]uint32) cg := cgroups.NewCgroupID(c.root) m := c.ebpf.Maps["avx_context_switch_count_hash"] iter := m.Iterate() for iter.Next(&key, &perCPUVal) { var sum uint32 for cpuID, count := range perCPUVal { if count == 0 { continue } sum = sum + count cpuX := fmt.Sprintf("CPU%d", cpuID) lastCPUs[cpuX] = lastCPUs[cpuX] + count } cgroupids[key] = sum log.Debug("cgroupid %d => counter %d", key, sum) // reset the counter by deleting the key err := m.Delete(key) if err != nil { log.Error("%+v", err) } } if iter.Err() != nil { log.Error("unable to iterate all elements of avx_context_switch_count: %+v", iter.Err()) } for lastCPU, count := range lastCPUs { ch <- prometheus.MustNewConstMetric( descriptors[lastCPUDesc], prometheus.GaugeValue, float64(count), lastCPU) } for cgroupid, counter := range cgroupids { wg.Add(1) go func(cgroupid_ uint64, counter_ uint32) { var allCount uint32 var lastUpdate uint64 defer wg.Done() path, err := cg.Find(cgroupid_) if err != nil { log.Error("failed to find cgroup by id: %v", err) return } re := regexp.MustCompile(`[a-z0-9]{64}`) matches := re.FindStringSubmatch(filepath.Base(path)) if len(matches) == 0 { return } ch <- prometheus.MustNewConstMetric( descriptors[avxSwitchCountDesc], prometheus.GaugeValue, float64(counter_), matches[0]) if err := c.ebpf.Maps["all_context_switch_count_hash"].Lookup(uint64(cgroupid_), &allCount); err != nil { log.Error("unable to
find 'all' context switch count: %+v", err) return } log.Debug("all: %d", allCount) if err := c.ebpf.Maps["last_update_ns_hash"].Lookup(uint64(cgroupid_), &lastUpdate); err != nil { log.Error("unable to find last update timestamp: %+v", err) return } log.Debug("last: %d", lastUpdate) ch <- prometheus.MustNewConstMetric( descriptors[allSwitchCountDesc], prometheus.GaugeValue, float64(allCount), re.FindStringSubmatch(filepath.Base(path))[0]) ch <- prometheus.MustNewConstMetric( descriptors[lastUpdateNsDesc], prometheus.GaugeValue, float64(nowNanoseconds()-lastUpdate), re.FindStringSubmatch(filepath.Base(path))[0]) }(cgroupid, counter) } // We need to wait so that the response channel doesn't get closed. wg.Wait() m = c.ebpf.Maps["all_context_switch_count_hash"] iter = m.Iterate() var val uint32 for iter.Next(&key, &val) { // reset the counter by deleting the key err := m.Delete(key) if err != nil { log.Error("%+v", err) } } if iter.Err() != nil { log.Error("unable to reset all elements of all_context_switch_count: %+v", iter.Err()) } } func init() { flag.StringVar(&bpfInstallpath, "bpf-install-path", bpfInstallpath, "Path to eBPF install directory") } ================================================ FILE: pkg/avx/elfdump.go ================================================ //go:build ignore // +build ignore /* Copyright 2020 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package main import ( "encoding/hex" "fmt" "os" "strings" "text/template" ) const ( blocksPerRow = 12 ) type Program struct { ProgramLines []string } func main() { f, err := os.ReadFile("../../libexec/avx512.o") if err != nil { fmt.Println("Note: AVX512 eBPF ELF not available.") } enc := make([]byte, hex.EncodedLen(len(f))) enclen := hex.Encode(enc, f) var j int var row strings.Builder program := make([]string, 0) for i := 0; i < enclen-1; i = i + 2 { fmt.Fprintf(&row, "0x%s, ", enc[i:i+2]) j++ if j%blocksPerRow == 0 { program = append(program, row.String()) row.Reset() } } // flush last row program = append(program, row.String()) p := Program{ ProgramLines: program, } template := template.Must(template.New("").Parse(`// Code generated by go generate; DO NOT EDIT. package avx var program = [...]byte{ {{- range .ProgramLines }} {{ printf "%s" . }} {{- end }} } `)) outfile, err := os.Create("programbytes_gendata.go") if err != nil { fmt.Println("elfdump:", err) os.Exit(1) } defer outfile.Close() err = template.Execute(outfile, p) if err != nil { fmt.Println("elfdump:", err) } } ================================================ FILE: pkg/avx/register.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !noavx // +build !noavx package avx import ( "github.com/intel/cri-resource-manager/pkg/metrics" ) func init() { err := metrics.RegisterCollector("avx", NewCollector) if err != nil { log.Error("Failed to register AVX collector: %v", err) } }
================================================ FILE: pkg/blockio/blockio.go ================================================
/* Copyright 2020 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package blockio import ( "errors" "fmt" "os" "path/filepath" "sort" "strings" "syscall" "golang.org/x/sys/unix" "k8s.io/apimachinery/pkg/api/resource" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // ConfigModuleName is the configuration section of blockio class definitions ConfigModuleName = "blockio" // sysfsBlockDeviceIOSchedulerPaths expands (with glob) to block device scheduler files. // If modified, check how to parse device node from expanded paths. sysfsBlockDeviceIOSchedulerPaths = "/sys/block/*/queue/scheduler" ) // Class represents a block I/O class, a class name together with its associated // parameters, essentially a single key/value pair from staticOciBlockIO below. // This type is only used for querying all (static) block I/O classes in a sorting-friendly // form. type Class struct { Name string Parameters cgroups.OciBlockIOParameters } // BlockDeviceInfo holds information on a block device to be configured. // As users can specify block devices using wildcards ("/dev/disk/by-id/*SSD*") // BlockDeviceInfo.Origin is maintained for traceability: why this // block device is included in configuration. // BlockDeviceInfo.DevNode contains the resolved device node, like "/dev/sda". type BlockDeviceInfo struct { Major int64 Minor int64 DevNode string Origin string } // Our logger instance. var log logger.Logger = logger.NewLogger("blockio") // staticOciBlockIO connects user-defined block I/O classes to // corresponding OCI BlockIO parameters. "Static" means that // new/current block devices matching device wildcards in these // classes are not expanded every time new containers are assigned to // these classes. Devices are scanned only at the beginning and on // blockio configuration changes. var staticOciBlockIO = map[string]cgroups.OciBlockIOParameters{} // currentIOSchedulers contains io-schedulers (found in // sysfsBlockDeviceIOSchedulerPaths) of device nodes: // {"/dev/sda": "bfq"} var currentIOSchedulers map[string]string
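Before the definitions that follow, a hedged sketch of how this query API might be used once UpdateOciConfig (below) has populated staticOciBlockIO; logAvailableClasses is a hypothetical helper, not part of the package:

// logAvailableClasses enumerates the configured block I/O classes via
// GetClasses (defined below) and logs each class and its default weight.
func logAvailableClasses() {
	for _, class := range GetClasses() {
		log.Info("block I/O class %q: weight %d, %d per-device weight(s)",
			class.Name, class.Parameters.Weight, len(class.Parameters.WeightDevice))
	}
}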
// GetClasses returns block I/O class names and associated parameters in a sorted slice. func GetClasses() []*Class { classes := make([]*Class, 0, len(staticOciBlockIO)) for name, params := range staticOciBlockIO { classes = append(classes, &Class{Name: name, Parameters: params}) } sort.Slice(classes, func(i, j int) bool { return strings.Compare(classes[i].Name, classes[j].Name) < 0 }) return classes } // UpdateOciConfig converts the configuration in the opt variable into staticOciBlockIO func UpdateOciConfig(ignoreErrors bool) error { var ioSchedulerDetectionError error currentIOSchedulers, ioSchedulerDetectionError = getCurrentIOSchedulers() if ioSchedulerDetectionError != nil { log.Warn("configuration validation partly disabled due to IO scheduler detection error %#v", ioSchedulerDetectionError.Error()) } staticOciBlockIO = map[string]cgroups.OciBlockIOParameters{} // Create static OCI BlockIO structures for each blockio class for class := range opt.Classes { ociBlockIO, err := devicesParametersToOci(opt.Classes[class], currentIOSchedulers) if err != nil { if ignoreErrors { log.Error("ignoring: %v", err) } else { return err } } // Handle all configurations as static for now. That // is, the list of block devices matching Devices // wildcards will not be updated without new // configNotify(). class.DynamicDevices not supported // yet. staticOciBlockIO[class] = ociBlockIO } return nil } // SetContainerClass assigns a container to a blockio class. func SetContainerClass(c cache.Container, class string) error { ociBlockIO, classIsStatic := staticOciBlockIO[class] if !classIsStatic { return blockioError("no OCI BlockIO parameters for class %#v", class) } blkioCgroupRoot := cgroups.Blkio.Path() containerCgroupDir := c.GetCgroupDir() if containerCgroupDir == "" { return blockioError("failed to find cgroup directory for container %s under %#v, container id %#v", c.PrettyName(), blkioCgroupRoot, c.GetID()) } containerCgroupPath := filepath.Join(blkioCgroupRoot, containerCgroupDir) err := cgroups.ResetBlkioParameters(containerCgroupPath, ociBlockIO) if err != nil { return blockioError("assigning container %v to class %#v failed: %w", c.PrettyName(), class, err) } return nil } // getCurrentIOSchedulers returns the currently active io-scheduler for each block device in the system. func getCurrentIOSchedulers() (map[string]string, error) { var ios = map[string]string{} schedulerFiles, err := filepath.Glob(sysfsBlockDeviceIOSchedulerPaths) if err != nil { return ios, blockioError("error in IO scheduler wildcards %#v: %w", sysfsBlockDeviceIOSchedulerPaths, err) } for _, schedulerFile := range schedulerFiles { devName := strings.SplitN(schedulerFile, "/", 5)[3] schedulerDataB, err := os.ReadFile(schedulerFile) if err != nil { // A block device may be disconnected. Continue without error. log.Error("failed to read current IO scheduler %#v: %v", schedulerFile, err) continue } schedulerData := strings.Trim(string(schedulerDataB), "\n") currentScheduler := "" if strings.IndexByte(schedulerData, ' ') == -1 { currentScheduler = schedulerData } else { openB := strings.Index(schedulerData, "[") closeB := strings.Index(schedulerData, "]") if -1 < openB && openB < closeB { currentScheduler = schedulerData[openB+1 : closeB] } } if currentScheduler == "" { return ios, blockioError("could not parse current scheduler in %#v", schedulerFile) } ios["/dev/"+devName] = currentScheduler } return ios, nil }
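A hedged sketch of the conversion implemented next; the device node, io-scheduler map, and values are assumptions for illustration (DevicesParameters is defined in config.go further below):

// exampleDeviceThrottling is a hypothetical helper showing the inputs and
// outputs of devicesParametersToOci.
func exampleDeviceThrottling() (cgroups.OciBlockIOParameters, error) {
	dps := []DevicesParameters{{
		Devices:         []string{"/dev/sda"}, // assumed device node
		ThrottleReadBps: "1G",                 // resource.ParseQuantity: "1G" => 1000000000
		Weight:          "100",                // validated range below: 10..1000
	}}
	// With bfq active on /dev/sda, the weight applies without warnings; the
	// result carries one WeightDevice and one ThrottleReadBpsDevice entry
	// keyed by /dev/sda's major:minor numbers.
	return devicesParametersToOci(dps, map[string]string{"/dev/sda": "bfq"})
}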
// devicesParametersToOci converts the parameters of a single blockio class into an OCI BlockIO structure. func devicesParametersToOci(dps []DevicesParameters, currentIOSchedulers map[string]string) (cgroups.OciBlockIOParameters, error) { errs := []error{} oci := cgroups.NewOciBlockIOParameters() for _, dp := range dps { var err error var weight, throttleReadBps, throttleWriteBps, throttleReadIOPS, throttleWriteIOPS int64 weight, err = parseAndValidateInt64("Weight", dp.Weight, -1, 10, 1000) errs = append(errs, err) throttleReadBps, err = parseAndValidateInt64("ThrottleReadBps", dp.ThrottleReadBps, -1, 0, -1) errs = append(errs, err) throttleWriteBps, err = parseAndValidateInt64("ThrottleWriteBps", dp.ThrottleWriteBps, -1, 0, -1) errs = append(errs, err) throttleReadIOPS, err = parseAndValidateInt64("ThrottleReadIOPS", dp.ThrottleReadIOPS, -1, 0, -1) errs = append(errs, err) throttleWriteIOPS, err = parseAndValidateInt64("ThrottleWriteIOPS", dp.ThrottleWriteIOPS, -1, 0, -1) errs = append(errs, err) if dp.Devices == nil { if weight > -1 { oci.Weight = weight } if throttleReadBps > -1 || throttleWriteBps > -1 || throttleReadIOPS > -1 || throttleWriteIOPS > -1 { errs = append(errs, fmt.Errorf("ignoring throttling (rbps=%#v wbps=%#v riops=%#v wiops=%#v): Devices not listed", dp.ThrottleReadBps, dp.ThrottleWriteBps, dp.ThrottleReadIOPS, dp.ThrottleWriteIOPS)) } } else { blockDevices, err := currentPlatform.configurableBlockDevices(dp.Devices) if err != nil { // Problems in matching block device wildcards and resolving symlinks // are worth reporting, but must not block configuring blkio where possible. log.Error(err.Error()) } if len(blockDevices) == 0 { log.Warn("no matches on any of Devices: %v, parameters ignored", dp.Devices) } for _, blockDeviceInfo := range blockDevices { if weight != -1 { if ios, found := currentIOSchedulers[blockDeviceInfo.DevNode]; found { if ios != "bfq" && ios != "cfq" { log.Warn("weight has no effect on device %#v due to "+ "incompatible io-scheduler %#v (bfq or cfq required)", blockDeviceInfo.DevNode, ios) } } oci.WeightDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, weight) } if throttleReadBps != -1 { oci.ThrottleReadBpsDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleReadBps) } if throttleWriteBps != -1 { oci.ThrottleWriteBpsDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleWriteBps) } if throttleReadIOPS != -1 { oci.ThrottleReadIOPSDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleReadIOPS) } if throttleWriteIOPS != -1 { oci.ThrottleWriteIOPSDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleWriteIOPS) } } } } return oci, errors.Join(errs...) } // parseAndValidateInt64 parses quantities, like "64 M", and validates that they are in a given range. func parseAndValidateInt64(fieldName string, fieldContent string, defaultValue int64, min int64, max int64) (int64, error) { // Return the default value for empty field content. if fieldContent == "" { return defaultValue, nil } qty, err := resource.ParseQuantity(fieldContent) if err != nil { return defaultValue, fmt.Errorf("syntax error in %#v (%#v)", fieldName, fieldContent) } value := qty.Value() if min != -1 && min > value { return defaultValue, fmt.Errorf("value of %#v (%#v) smaller than minimum (%#v)", fieldName, value, min) } if max != -1 && value > max { return defaultValue, fmt.Errorf("value of %#v (%#v) bigger than maximum (%#v)", fieldName, value, max) } return value, nil } // platformInterface includes functions that access the system. Enables mocking the system.
type platformInterface interface { configurableBlockDevices(devWildcards []string) ([]BlockDeviceInfo, error) } // defaultPlatform versions of platformInterface functions access the underlying system. type defaultPlatform struct{} // currentPlatform defines which platformInterface is used: defaultPlatform or a mock, for instance. var currentPlatform platformInterface = defaultPlatform{} // configurableBlockDevices finds major:minor numbers for device filenames (wildcards allowed) func (dpm defaultPlatform) configurableBlockDevices(devWildcards []string) ([]BlockDeviceInfo, error) { // Returns a BlockDeviceInfo for each matching block device. // Example: {Major:8, Minor:0, DevNode:"/dev/sda", Origin:"from symlink /dev/disk/by-id/ata-VendorXSSD from wildcard /dev/disk/by-id/*SSD*"} errs := []error{} blockDevices := []BlockDeviceInfo{} var origin string // 1. Expand wildcards to device filenames (may be symlinks) // Example: devMatches["/dev/disk/by-id/ata-VendorSSD"] == "from wildcard \"/dev/disk/by-id/*SSD*\"" devMatches := map[string]string{} // {devNodeOrSymlink: origin} for _, devWildcard := range devWildcards { devWildcardMatches, err := filepath.Glob(devWildcard) if err != nil { errs = append(errs, fmt.Errorf("bad device wildcard %#v: %w", devWildcard, err)) continue } if len(devWildcardMatches) == 0 { errs = append(errs, fmt.Errorf("device wildcard %#v does not match any device nodes", devWildcard)) continue } for _, devMatch := range devWildcardMatches { if devMatch != devWildcard { origin = fmt.Sprintf("from wildcard %#v", devWildcard) } else { origin = "" } devMatches[devMatch] = strings.TrimSpace(fmt.Sprintf("%v %v", devMatches[devMatch], origin)) } } // 2. Find out real device nodes behind symlinks // Example: devRealpaths["/dev/sda"] == "from symlink \"/dev/disk/by-id/ata-VendorSSD\"" devRealpaths := map[string]string{} // {devNode: origin} for devMatch, devOrigin := range devMatches { realDevNode, err := filepath.EvalSymlinks(devMatch) if err != nil { errs = append(errs, fmt.Errorf("cannot filepath.EvalSymlinks(%#v): %w", devMatch, err)) continue } if realDevNode != devMatch { origin = fmt.Sprintf("from symlink %#v %v", devMatch, devOrigin) } else { origin = devOrigin } devRealpaths[realDevNode] = strings.TrimSpace(fmt.Sprintf("%v %v", devRealpaths[realDevNode], origin)) }
// 3. Keep only block devices that are not partitions // Example: blockDevices[0] == {Major: 8, Minor: 0, DevNode: "/dev/sda", Origin: "..."} for devRealpath, devOrigin := range devRealpaths { origin := "" if devOrigin != "" { origin = fmt.Sprintf(" (origin: %s)", devOrigin) } fileInfo, err := os.Stat(devRealpath) if err != nil { errs = append(errs, fmt.Errorf("cannot os.Stat(%#v): %w%s", devRealpath, err, origin)) continue } fileMode := fileInfo.Mode() if fileMode&os.ModeDevice == 0 { errs = append(errs, fmt.Errorf("file %#v is not a device%s", devRealpath, origin)) continue } if fileMode&os.ModeCharDevice != 0 { errs = append(errs, fmt.Errorf("file %#v is a character device%s", devRealpath, origin)) continue } sys, ok := fileInfo.Sys().(*syscall.Stat_t) if !ok { errs = append(errs, fmt.Errorf("cannot get syscall stat_t from %#v%s", devRealpath, origin)) continue } major := unix.Major(sys.Rdev) minor := unix.Minor(sys.Rdev) if minor&0xf != 0 { errs = append(errs, fmt.Errorf("skipping %#v: cannot weight/throttle partitions%s", devRealpath, origin)) continue } blockDevices = append(blockDevices, BlockDeviceInfo{ Major: int64(major), Minor: int64(minor), DevNode: devRealpath, Origin: devOrigin, }) } return blockDevices, errors.Join(errs...) } // blockioError creates a formatted error message. func blockioError(format string, args ...interface{}) error { return fmt.Errorf(format, args...) }
================================================ FILE: pkg/blockio/blockio_test.go ================================================
// Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
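Before the unit tests that follow, a hedged usage sketch of the device resolution defined above; the wildcard is an assumption for illustration:

// resolveExampleDevices is a hypothetical helper: it expands a wildcard to
// whole block devices, resolving symlinks and rejecting partitions and
// character devices (errors for rejected paths are joined into err).
func resolveExampleDevices() {
	devs, err := defaultPlatform{}.configurableBlockDevices(
		[]string{"/dev/disk/by-id/*SSD*"}) // assumed wildcard
	if err != nil {
		log.Warn("some device wildcards did not fully resolve: %v", err)
	}
	for _, d := range devs {
		log.Info("resolved %s (%d:%d), origin: %s", d.DevNode, d.Major, d.Minor, d.Origin)
	}
}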
package blockio import ( "fmt" "path/filepath" "strings" "testing" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/testutils" ) var knownIOSchedulers = map[string]bool{ "bfq": true, "cfq": true, "deadline": true, "kyber": true, "mq-deadline": true, "none": true, "noop": true, } // TestGetCurrentIOSchedulers: unit test for getCurrentIOSchedulers() func TestGetCurrentIOSchedulers(t *testing.T) { currentIOSchedulers, err := getCurrentIOSchedulers() testutils.VerifyError(t, err, 0, nil) for blockDev, ioScheduler := range currentIOSchedulers { s, ok := knownIOSchedulers[ioScheduler] if !ok || !s { t.Errorf("unknown io scheduler %#v on block device %#v", ioScheduler, blockDev) } } } // TestConfigurableBlockDevices: unit tests for configurableBlockDevices() func TestConfigurableBlockDevices(t *testing.T) { sysfsBlockDevs, err := filepath.Glob("/sys/block/*") if err != nil { sysfsBlockDevs = []string{} } devBlockDevs := []string{} for _, sysfsBlockDev := range sysfsBlockDevs { if strings.HasPrefix(sysfsBlockDev, "/sys/block/sd") || strings.HasPrefix(sysfsBlockDev, "/sys/block/vd") { devBlockDevs = append(devBlockDevs, strings.Replace(sysfsBlockDev, "/sys/block/", "/dev/", 1)) } } devPartitions := []string{} for _, devBlockDev := range devBlockDevs { devPartitions, _ = filepath.Glob(devBlockDev + "[0-9]") if len(devPartitions) > 0 { break } } t.Logf("test real block devices: %v", devBlockDevs) t.Logf("test partitions: %v", devPartitions) tcases := []struct { name string devWildcards []string expectedErrorCount int expectedErrorSubstrings []string expectedMatches int disabled bool disabledReason string }{ { name: "no device wildcards", devWildcards: nil, expectedErrorCount: 0, }, { name: "bad wildcard", devWildcards: []string{"/[-/verybadwildcard]"}, expectedErrorCount: 1, expectedErrorSubstrings: []string{"verybadwildcard", "syntax error"}, }, { name: "not matching wildcard", devWildcards: []string{"/dev/path that should not exist/*"}, expectedErrorCount: 1, expectedErrorSubstrings: []string{"does not match any"}, }, { name: "two wildcards: empty string and a character device", devWildcards: []string{"/dev/null", ""}, expectedErrorCount: 2, expectedErrorSubstrings: []string{"\"/dev/null\" is a character device", "\"\" does not match any"}, }, { name: "not a device or even a file", devWildcards: []string{"/proc", "/proc/meminfo", "/proc/notexistingfile"}, expectedErrorCount: 3, expectedErrorSubstrings: []string{"\"/proc\" is not a device", "\"/proc/meminfo\" is not a device"}, }, { name: "real block devices", devWildcards: devBlockDevs, expectedMatches: len(devBlockDevs), }, { name: "partition", devWildcards: devPartitions, expectedErrorCount: len(devPartitions), expectedErrorSubstrings: []string{"cannot weight/throttle partitions"}, disabled: len(devPartitions) == 0, disabledReason: "no block device partitions found", }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skip(tc.disabledReason) } realPlatform := defaultPlatform{} bdis, err := realPlatform.configurableBlockDevices(tc.devWildcards) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if len(bdis) != tc.expectedMatches { t.Errorf("expected %d matching block devices, got %d", tc.expectedMatches, len(bdis)) } }) } } // TestDevicesParametersToOci: unit tests for devicesParametersToOci func TestDevicesParametersToOci(t *testing.T) { // switch real devicesParametersToOci to call mockPlatform.configurableBlockDevices currentPlatform 
= mockPlatform{} tcases := []struct { name string dps []DevicesParameters iosched map[string]string expectedOci *cgroups.OciBlockIOParameters expectedErrorCount int expectedErrorSubstrings []string }{ { name: "all OCI fields", dps: []DevicesParameters{ { Weight: "144", }, { Devices: []string{"/dev/sda"}, ThrottleReadBps: "1G", ThrottleWriteBps: "2M", ThrottleReadIOPS: "3k", ThrottleWriteIOPS: "4", Weight: "50", }, }, iosched: map[string]string{"/dev/sda": "bfq"}, expectedOci: &cgroups.OciBlockIOParameters{ Weight: 144, WeightDevice: cgroups.OciDeviceWeights{ {Major: 11, Minor: 12, Weight: 50}, }, ThrottleReadBpsDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 1000000000}, }, ThrottleWriteBpsDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 2000000}, }, ThrottleReadIOPSDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 3000}, }, ThrottleWriteIOPSDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 4}, }, }, }, { name: "later match overrides value", dps: []DevicesParameters{ { Devices: []string{"/dev/sda", "/dev/sdb", "/dev/sdc"}, ThrottleReadBps: "100", Weight: "110", }, { Devices: []string{"/dev/sdb", "/dev/sdc"}, ThrottleReadBps: "300", Weight: "330", }, { Devices: []string{"/dev/sdb"}, ThrottleReadBps: "200", Weight: "220", }, }, iosched: map[string]string{"/dev/sda": "bfq", "/dev/sdb": "bfq", "/dev/sdc": "cfq"}, expectedOci: &cgroups.OciBlockIOParameters{ Weight: -1, WeightDevice: cgroups.OciDeviceWeights{ {Major: 11, Minor: 12, Weight: 110}, {Major: 21, Minor: 22, Weight: 220}, {Major: 31, Minor: 32, Weight: 330}, }, ThrottleReadBpsDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 100}, {Major: 21, Minor: 22, Rate: 200}, {Major: 31, Minor: 32, Rate: 300}, }, }, }, { name: "invalid weights, many errors in different parameter sets", dps: []DevicesParameters{ { Weight: "99999", }, { Devices: []string{"/dev/sda"}, Weight: "1", }, { Devices: []string{"/dev/sdb"}, Weight: "-2", }, }, expectedErrorCount: 3, expectedErrorSubstrings: []string{ "(99999) bigger than maximum", "(1) smaller than minimum", "(-2) smaller than minimum", }, }, { name: "throttling without listing Devices", dps: []DevicesParameters{ { ThrottleReadBps: "100M", ThrottleWriteIOPS: "20k", }, }, expectedErrorCount: 1, expectedErrorSubstrings: []string{ "Devices not listed", "\"100M\"", "\"20k\"", }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { oci, err := devicesParametersToOci(tc.dps, tc.iosched) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if tc.expectedOci != nil { testutils.VerifyDeepEqual(t, "OCI parameters", *tc.expectedOci, oci) } }) } } // mockPlatform implements mock versions of platformInterface functions. type mockPlatform struct{} // configurableBlockDevices mock always returns a set of block devices. 
func (mpf mockPlatform) configurableBlockDevices(devWildcards []string) ([]BlockDeviceInfo, error) { blockDevices := []BlockDeviceInfo{} for _, devWildcard := range devWildcards { if devWildcard == "/dev/sda" { blockDevices = append(blockDevices, BlockDeviceInfo{ Major: 11, Minor: 12, DevNode: devWildcard, Origin: fmt.Sprintf("from wildcards %v", devWildcard), }) } else if devWildcard == "/dev/sdb" { blockDevices = append(blockDevices, BlockDeviceInfo{ Major: 21, Minor: 22, DevNode: devWildcard, Origin: fmt.Sprintf("from wildcards %v", devWildcard), }) } else if devWildcard == "/dev/sdc" { blockDevices = append(blockDevices, BlockDeviceInfo{ Major: 31, Minor: 32, DevNode: devWildcard, Origin: fmt.Sprintf("from wildcards %v", devWildcard), }) } } return blockDevices, nil } ================================================ FILE: pkg/blockio/config.go ================================================ /* Copyright 2020 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package blockio import ( pkgcfg "github.com/intel/cri-resource-manager/pkg/config" ) // options captures our configurable parameters. type options struct { // Classes define weights and throttling parameters for sets of devices. Classes map[string][]DevicesParameters `json:",omitempty"` } // DevicesParameters defines Block IO parameters for a set of devices. type DevicesParameters struct { Devices []string `json:",omitempty"` ThrottleReadBps string `json:",omitempty"` ThrottleWriteBps string `json:",omitempty"` ThrottleReadIOPS string `json:",omitempty"` ThrottleWriteIOPS string `json:",omitempty"` Weight string `json:",omitempty"` } // Currently active set of "raw" options var opt = defaultOptions().(*options) // defaultOptions returns a new instance of "raw" options set to their defaults func defaultOptions() interface{} { return &options{} } func init() { pkgcfg.Register(ConfigModuleName, "Block I/O class control", opt, defaultOptions) } ================================================ FILE: pkg/cgroups/cgroupblkio.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroups import ( "errors" "fmt" "os" "path/filepath" "strconv" "strings" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( blkioCgroupDir = "/sys/fs/cgroup/blkio/" ) // logger var log logger.Logger = logger.NewLogger("cgroupblkio") // cgroups blkio parameter filenames. 
var blkioWeightFiles = []string{"blkio.bfq.weight", "blkio.weight"} var blkioWeightDeviceFiles = []string{"blkio.bfq.weight_device", "blkio.weight_device"} var blkioThrottleReadBpsFiles = []string{"blkio.throttle.read_bps_device"} var blkioThrottleWriteBpsFiles = []string{"blkio.throttle.write_bps_device"} var blkioThrottleReadIOPSFiles = []string{"blkio.throttle.read_iops_device"} var blkioThrottleWriteIOPSFiles = []string{"blkio.throttle.write_iops_device"} // OciBlockIOParameters contains OCI standard configuration of cgroups blkio parameters. // // Effects of Weight and Rate values in SetBlkioParameters(): // Value | Effect // -------+------------------------------------------------------------------- // // -1 | Do not write to cgroups, value is missing // 0 | Write to cgroups, will remove the setting as specified in cgroups blkio interface // other | Write to cgroups, sets the value type OciBlockIOParameters struct { Weight int64 WeightDevice OciDeviceWeights ThrottleReadBpsDevice OciDeviceRates ThrottleWriteBpsDevice OciDeviceRates ThrottleReadIOPSDevice OciDeviceRates ThrottleWriteIOPSDevice OciDeviceRates } // OciDeviceWeight contains values for // - blkio.[io-scheduler].weight type OciDeviceWeight struct { Major int64 Minor int64 Weight int64 } // OciDeviceRate contains values for // - blkio.throttle.read_bps_device // - blkio.throttle.write_bps_device // - blkio.throttle.read_iops_device // - blkio.throttle.write_iops_device type OciDeviceRate struct { Major int64 Minor int64 Rate int64 } // OciDeviceWeights contains weights for devices type OciDeviceWeights []OciDeviceWeight // OciDeviceRates contains throttling rates for devices type OciDeviceRates []OciDeviceRate // OciDeviceParameters interface provides functions common to OciDeviceWeights and OciDeviceRates type OciDeviceParameters interface { Append(maj, min, val int64) Update(maj, min, val int64) } // Append appends (major, minor, value) to OciDeviceWeights slice. func (w *OciDeviceWeights) Append(maj, min, val int64) { *w = append(*w, OciDeviceWeight{Major: maj, Minor: min, Weight: val}) } // Append appends (major, minor, value) to OciDeviceRates slice. func (r *OciDeviceRates) Append(maj, min, val int64) { *r = append(*r, OciDeviceRate{Major: maj, Minor: min, Rate: val}) } // Update updates device weight in OciDeviceWeights slice, or appends it if not found. func (w *OciDeviceWeights) Update(maj, min, val int64) { for index, devWeight := range *w { if devWeight.Major == maj && devWeight.Minor == min { (*w)[index].Weight = val return } } w.Append(maj, min, val) } // Update updates device rate in OciDeviceRates slice, or appends it if not found. func (r *OciDeviceRates) Update(maj, min, val int64) { for index, devRate := range *r { if devRate.Major == maj && devRate.Minor == min { (*r)[index].Rate = val return } } r.Append(maj, min, val) } // NewOciBlockIOParameters creates new OciBlockIOParameters instance. func NewOciBlockIOParameters() OciBlockIOParameters { return OciBlockIOParameters{ Weight: -1, } } // NewOciDeviceWeight creates new OciDeviceWeight instance. func NewOciDeviceWeight() OciDeviceWeight { return OciDeviceWeight{ Major: -1, Minor: -1, Weight: -1, } } // NewOciDeviceRate creates new OciDeviceRate instance. func NewOciDeviceRate() OciDeviceRate { return OciDeviceRate{ Major: -1, Minor: -1, Rate: -1, } } // GetBlkioDir returns the cgroups blkio controller directory. 
func GetBlkioDir() string { return blkioCgroupDir } type devMajMin struct { Major int64 Minor int64 } // ResetBlkioParameters adds new, changes existing and removes missing blockIO parameters in cgroupsDir func ResetBlkioParameters(cgroupsDir string, blockIO OciBlockIOParameters) error { errs := []error{} oldBlockIO, getErr := GetBlkioParameters(cgroupsDir) errs = append(errs, getErr) newBlockIO := NewOciBlockIOParameters() newBlockIO.Weight = blockIO.Weight // Set new device weights seenDev := map[devMajMin]bool{} for _, ociWDP := range blockIO.WeightDevice { seenDev[devMajMin{ociWDP.Major, ociWDP.Minor}] = true newBlockIO.WeightDevice = append(newBlockIO.WeightDevice, ociWDP) } // Reset old device weights that were missing from blockIO.WeightDevice for _, ociWDP := range oldBlockIO.WeightDevice { if !seenDev[devMajMin{ociWDP.Major, ociWDP.Minor}] { newBlockIO.WeightDevice = append(newBlockIO.WeightDevice, OciDeviceWeight{ociWDP.Major, ociWDP.Minor, 0}) } } newBlockIO.ThrottleReadBpsDevice = resetDevRates(oldBlockIO.ThrottleReadBpsDevice, blockIO.ThrottleReadBpsDevice) newBlockIO.ThrottleWriteBpsDevice = resetDevRates(oldBlockIO.ThrottleWriteBpsDevice, blockIO.ThrottleWriteBpsDevice) newBlockIO.ThrottleReadIOPSDevice = resetDevRates(oldBlockIO.ThrottleReadIOPSDevice, blockIO.ThrottleReadIOPSDevice) newBlockIO.ThrottleWriteIOPSDevice = resetDevRates(oldBlockIO.ThrottleWriteIOPSDevice, blockIO.ThrottleWriteIOPSDevice) errs = append(errs, SetBlkioParameters(cgroupsDir, newBlockIO)) return errors.Join(errs...) } // resetDevRates adds the wanted rate parameters and resets unwanted old rates func resetDevRates(old, wanted []OciDeviceRate) []OciDeviceRate { rates := []OciDeviceRate{} seenDev := map[devMajMin]bool{} for _, rdp := range wanted { rates = append(rates, rdp) seenDev[devMajMin{rdp.Major, rdp.Minor}] = true } for _, rdp := range old { if !seenDev[devMajMin{rdp.Major, rdp.Minor}] { rates = append(rates, OciDeviceRate{rdp.Major, rdp.Minor, 0}) } } return rates } // GetBlkioParameters returns OCI BlockIO parameters from files in cgroups blkio controller directory. func GetBlkioParameters(cgroupsDir string) (OciBlockIOParameters, error) { errs := []error{} blockIO := NewOciBlockIOParameters() content, err := readFromFileInDir(cgroupsDir, blkioWeightFiles) if err == nil { weight, err := strconv.ParseInt(strings.TrimSuffix(content, "\n"), 10, 64) if err == nil { blockIO.Weight = weight } else { errs = append(errs, fmt.Errorf("parsing weight from %#v failed: %w", content, err)) } } else { errs = append(errs, err) } errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioWeightDeviceFiles, &blockIO.WeightDevice)) errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioThrottleReadBpsFiles, &blockIO.ThrottleReadBpsDevice)) errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioThrottleWriteBpsFiles, &blockIO.ThrottleWriteBpsDevice)) errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioThrottleReadIOPSFiles, &blockIO.ThrottleReadIOPSDevice)) errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioThrottleWriteIOPSFiles, &blockIO.ThrottleWriteIOPSDevice)) return blockIO, errors.Join(errs...)
} // readOciDeviceParameters parses device lines used for weights and throttling rates func readOciDeviceParameters(baseDir string, filenames []string, params OciDeviceParameters) error { errs := []error{} contents, err := readFromFileInDir(baseDir, filenames) if err != nil { return err } for _, line := range strings.Split(contents, "\n") { // Device weight files may have "default NNN" line at the beginning. Skip it. if line == "" || strings.HasPrefix(line, "default ") { continue } // Expect syntax MAJOR:MINOR VALUE devVal := strings.Split(line, " ") if len(devVal) != 2 { errs = append(errs, fmt.Errorf("invalid line %q, single space expected", line)) continue } majMin := strings.Split(devVal[0], ":") if len(majMin) != 2 { errs = append(errs, fmt.Errorf("invalid line %q, single colon expected before space", line)) continue } major, majErr := strconv.ParseInt(majMin[0], 10, 64) minor, minErr := strconv.ParseInt(majMin[1], 10, 64) value, valErr := strconv.ParseInt(devVal[1], 10, 64) if majErr != nil || minErr != nil || valErr != nil { errs = append(errs, fmt.Errorf("invalid number when parsing \"major:minor value\" from \"%s:%s %s\"", majMin[0], majMin[1], devVal[1])) continue } params.Append(major, minor, value) } return errors.Join(errs...) } // readFromFileInDir returns content from the first successfully read file. func readFromFileInDir(baseDir string, filenames []string) (string, error) { errs := []error{} // If reading all the files fails, return list of read errors. for _, filename := range filenames { filepath := filepath.Join(baseDir, filename) content, err := currentPlatform.readFromFile(filepath) if err == nil { return content, nil } errs = append(errs, err) } err := errors.Join(errs...) if err != nil { return "", fmt.Errorf("could not read any of files %q: %w", filenames, err) } return "", nil } // SetBlkioParameters writes OCI BlockIO parameters to files in cgroups blkio controller directory. func SetBlkioParameters(cgroupsDir string, blockIO OciBlockIOParameters) error { log.Debug("configuring cgroups blkio controller in directory %#v with parameters %+v", cgroupsDir, blockIO) errs := []error{} if blockIO.Weight >= 0 { errs = append(errs, writeToFileInDir(cgroupsDir, blkioWeightFiles, strconv.FormatInt(blockIO.Weight, 10))) } for _, weightDevice := range blockIO.WeightDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioWeightDeviceFiles, weightDevice.Major, weightDevice.Minor, weightDevice.Weight)) } for _, rateDevice := range blockIO.ThrottleReadBpsDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioThrottleReadBpsFiles, rateDevice.Major, rateDevice.Minor, rateDevice.Rate)) } for _, rateDevice := range blockIO.ThrottleWriteBpsDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioThrottleWriteBpsFiles, rateDevice.Major, rateDevice.Minor, rateDevice.Rate)) } for _, rateDevice := range blockIO.ThrottleReadIOPSDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioThrottleReadIOPSFiles, rateDevice.Major, rateDevice.Minor, rateDevice.Rate)) } for _, rateDevice := range blockIO.ThrottleWriteIOPSDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioThrottleWriteIOPSFiles, rateDevice.Major, rateDevice.Minor, rateDevice.Rate)) } return errors.Join(errs...)
} // writeDevValueToFileInDir writes MAJOR:MINOR VALUE to the first existing file under baseDir func writeDevValueToFileInDir(baseDir string, filenames []string, major, minor, value int64) error { content := fmt.Sprintf("%d:%d %d", major, minor, value) return writeToFileInDir(baseDir, filenames, content) } // writeToFileInDir writes content to the first existing file in the list under baseDir. func writeToFileInDir(baseDir string, filenames []string, content string) error { errs := []error{} // On failure, returns an error joining the errors from all attempted writes; returns nil on success. for _, filename := range filenames { filepath := filepath.Join(baseDir, filename) err := currentPlatform.writeToFile(filepath, content) if err == nil { return nil } errs = append(errs, err) } err := errors.Join(errs...) if err != nil { return fmt.Errorf("could not write content %#v to any of files %q: %w", content, filenames, err) } return nil } // platformInterface includes functions that access the system. Enables mocking the platform. type platformInterface interface { readFromFile(filename string) (string, error) writeToFile(filename string, content string) error } // defaultPlatform versions of platformInterface functions access the underlying system. type defaultPlatform struct{} // currentPlatform defines which platformInterface is used: defaultPlatform or a mock, for instance. var currentPlatform platformInterface = defaultPlatform{} // readFromFile returns file contents as a string. func (dpm defaultPlatform) readFromFile(filename string) (string, error) { content, err := os.ReadFile(filename) return string(content), err } // writeToFile writes content to an existing file. func (dpm defaultPlatform) writeToFile(filename string, content string) error { f, err := os.OpenFile(filename, os.O_WRONLY, 0666) if err != nil { return err } defer f.Close() _, err = f.Write([]byte(content)) return err }
================================================ FILE: pkg/cgroups/cgroupblkio_test.go ================================================
// Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
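Before the tests that follow, a hedged sketch of the write format produced above; the cgroup path and device numbers are assumptions for illustration:

// exampleThrottleWrite is a hypothetical helper: throttling reads on device
// 8:0 to 1 MB/s makes SetBlkioParameters write the line "8:0 1000000" into
// the first existing file of blkioThrottleReadBpsFiles under the given
// cgroup directory. Weight stays at -1, so no weight file is written.
func exampleThrottleWrite() error {
	params := NewOciBlockIOParameters()
	params.ThrottleReadBpsDevice.Update(8, 0, 1000000)
	return SetBlkioParameters("/sys/fs/cgroup/blkio/kubepods/pod-uid/ctr-id", params)
}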
package cgroups import ( "fmt" "testing" "github.com/intel/cri-resource-manager/pkg/testutils" ) func TestUpdateAppend(t *testing.T) { tcases := []struct { name string inputMajMinVals [][]int64 inputItem []int64 expectedMajMinVal [][]int64 expectedErrorCount int expectedErrorSubstrings []string }{ { name: "update empty list", inputItem: []int64{1, 2, 3}, expectedMajMinVal: [][]int64{{1, 2, 3}}, }, { name: "update appends non-existing element", inputMajMinVals: [][]int64{{10, 20, 30}, {40, 50, 60}}, inputItem: []int64{1, 2, 3}, expectedMajMinVal: [][]int64{{10, 20, 30}, {40, 50, 60}, {1, 2, 3}}, }, { name: "update the first existing element", inputMajMinVals: [][]int64{{10, 20, 30}, {40, 50, 60}, {40, 50, 60}}, inputItem: []int64{40, 50, 66}, expectedMajMinVal: [][]int64{{10, 20, 30}, {40, 50, 66}, {40, 50, 60}}, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { devWeights := OciDeviceWeights{} devRates := OciDeviceRates{} expDevWeights := OciDeviceWeights{} expDevRates := OciDeviceRates{} for _, item := range tc.inputMajMinVals { devWeights.Append(item[0], item[1], item[2]) devRates.Append(item[0], item[1], item[2]) } devWeights.Update(tc.inputItem[0], tc.inputItem[1], tc.inputItem[2]) devRates.Update(tc.inputItem[0], tc.inputItem[1], tc.inputItem[2]) for _, item := range tc.expectedMajMinVal { expDevWeights = append(expDevWeights, OciDeviceWeight{item[0], item[1], item[2]}) expDevRates = append(expDevRates, OciDeviceRate{item[0], item[1], item[2]}) } testutils.VerifyDeepEqual(t, "device weights", expDevWeights, devWeights) testutils.VerifyDeepEqual(t, "device rates", expDevRates, devRates) }) } } // TestResetBlkioParameters: unit test for ResetBlkioParameters() func TestResetBlkioParameters(t *testing.T) { tcases := []struct { name string cgroupsDir string blockIO OciBlockIOParameters fsContent map[string]string expectedFsWrites map[string]string expectedBlockIO *OciBlockIOParameters expectedErrorCount int expectedErrorSubstrings []string }{ { name: "write to clean cgroups", cgroupsDir: "/write/to/clean", blockIO: OciBlockIOParameters{ Weight: 222, WeightDevice: OciDeviceWeights{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, ThrottleReadBpsDevice: OciDeviceRates{{11, 12, 13}, {111, 112, 113}}, ThrottleWriteBpsDevice: OciDeviceRates{{21, 22, 23}, {221, 222, 223}}, ThrottleReadIOPSDevice: OciDeviceRates{{31, 32, 33}, {331, 332, 333}}, ThrottleWriteIOPSDevice: OciDeviceRates{{41, 42, 43}, {441, 442, 443}}, }, fsContent: map[string]string{ "/write/to/clean/blkio.bfq.weight": "100\n", "/write/to/clean/blkio.bfq.weight_device": "", "/write/to/clean/blkio.throttle.read_bps_device": "", "/write/to/clean/blkio.throttle.write_bps_device": "", "/write/to/clean/blkio.throttle.read_iops_device": "", "/write/to/clean/blkio.throttle.write_iops_device": "", }, expectedFsWrites: map[string]string{ "/write/to/clean/blkio.bfq.weight": "222", "/write/to/clean/blkio.bfq.weight_device": "1:2 3+4:5 6+7:8 9", "/write/to/clean/blkio.throttle.read_bps_device": "11:12 13+111:112 113", "/write/to/clean/blkio.throttle.write_bps_device": "21:22 23+221:222 223", "/write/to/clean/blkio.throttle.read_iops_device": "31:32 33+331:332 333", "/write/to/clean/blkio.throttle.write_iops_device": "41:42 43+441:442 443", }, }, { name: "reset all existing", cgroupsDir: "/reset/all", blockIO: NewOciBlockIOParameters(), fsContent: map[string]string{ "/reset/all/blkio.bfq.weight": "200\n", "/reset/all/blkio.bfq.weight_device": "default 200\n1:2 3\n4:5 6\n", "/reset/all/blkio.throttle.read_bps_device": "11:12 13\n14:15 16\n", 
"/reset/all/blkio.throttle.write_bps_device": "21:22 23\n", "/reset/all/blkio.throttle.read_iops_device": "31:32 33\n", "/reset/all/blkio.throttle.write_iops_device": "41:42 43\n", }, expectedFsWrites: map[string]string{ "/reset/all/blkio.bfq.weight_device": "1:2 0+4:5 0", "/reset/all/blkio.throttle.read_bps_device": "11:12 0+14:15 0", "/reset/all/blkio.throttle.write_bps_device": "21:22 0", "/reset/all/blkio.throttle.read_iops_device": "31:32 0", "/reset/all/blkio.throttle.write_iops_device": "41:42 0", }, }, { name: "merge", cgroupsDir: "/merge", blockIO: OciBlockIOParameters{ Weight: 80, WeightDevice: OciDeviceWeights{{1, 2, 1113}, {7, 8, 9}}, // drop middle, update first, keep last ThrottleReadBpsDevice: OciDeviceRates{{11, 12, 13}}, // keep the first entry ThrottleWriteBpsDevice: OciDeviceRates{{24, 25, 26}}, // keep the last entry ThrottleReadIOPSDevice: OciDeviceRates{{31, 32, 33}, {331, 332, 333}}, // keep all ThrottleWriteIOPSDevice: OciDeviceRates{{41, 42, 430}, {441, 442, 4430}}, // change all }, fsContent: map[string]string{ "/merge/blkio.bfq.weight": "200\n", "/merge/blkio.bfq.weight_device": "default 200\n1:2 3\n4:5 6\n7:8 9", "/merge/blkio.throttle.read_bps_device": "11:12 13\n14:15 16\n", "/merge/blkio.throttle.write_bps_device": "21:22 23\n24:25 26\n", "/merge/blkio.throttle.read_iops_device": "31:32 33\n331:332 333\n", "/merge/blkio.throttle.write_iops_device": "41:42 43\n441:442 443\n", }, expectedFsWrites: map[string]string{ "/merge/blkio.bfq.weight": "80", "/merge/blkio.bfq.weight_device": "1:2 1113+7:8 9+4:5 0", "/merge/blkio.throttle.read_bps_device": "11:12 13+14:15 0", "/merge/blkio.throttle.write_bps_device": "24:25 26+21:22 0", "/merge/blkio.throttle.read_iops_device": "31:32 33+331:332 333", "/merge/blkio.throttle.write_iops_device": "41:42 430+441:442 4430", }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { mpf := mockPlatform{ fsOrigContent: tc.fsContent, fsWrites: make(map[string]string), } currentPlatform = &mpf err := ResetBlkioParameters(tc.cgroupsDir, tc.blockIO) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if tc.expectedFsWrites != nil { testutils.VerifyDeepEqual(t, "filesystem writes", tc.expectedFsWrites, mpf.fsWrites) } }) } } // TestGetBlkioParameters: unit test for GetBlkioParameters() func TestGetBlkioParameters(t *testing.T) { tcases := []struct { name string cgroupsDir string readsFail int fsContent map[string]string expectedBlockIO *OciBlockIOParameters expectedErrorCount int expectedErrorSubstrings []string }{ { name: "empty files", cgroupsDir: "/empty/ok", fsContent: map[string]string{ "/empty/ok/blkio.bfq.weight": "", "/empty/ok/blkio.bfq.weight_device": "", "/empty/ok/blkio.throttle.read_bps_device": "", "/empty/ok/blkio.throttle.write_bps_device": "", "/empty/ok/blkio.throttle.read_iops_device": "", "/empty/ok/blkio.throttle.write_iops_device": "", }, expectedBlockIO: &OciBlockIOParameters{Weight: -1}, expectedErrorCount: 1, // weight is not expected to be empty expectedErrorSubstrings: []string{"parsing weight"}, }, { name: "everything defined", cgroupsDir: "/read/ok", fsContent: map[string]string{ "/read/ok/blkio.bfq.weight": "1", // test weight_device file with real "default" line "/read/ok/blkio.bfq.weight_device": "default 10\n1:2 3\n", // test parsing two lines and skipping empty lines "/read/ok/blkio.throttle.read_bps_device": "\n11:22 33\n\n111:222 333\n", // test single line file "/read/ok/blkio.throttle.write_bps_device": "1111:2222 3333\n", // test single line, missing LF 
at the end "/read/ok/blkio.throttle.read_iops_device": "11111:22222 33333", // test small and large values "/read/ok/blkio.throttle.write_iops_device": "0:0 0\n4294967296:4294967297 9223372036854775807\n", }, expectedBlockIO: &OciBlockIOParameters{ Weight: 1, WeightDevice: OciDeviceWeights{{1, 2, 3}}, ThrottleReadBpsDevice: OciDeviceRates{{11, 22, 33}, {111, 222, 333}}, ThrottleWriteBpsDevice: OciDeviceRates{{1111, 2222, 3333}}, ThrottleReadIOPSDevice: OciDeviceRates{{11111, 22222, 33333}}, ThrottleWriteIOPSDevice: OciDeviceRates{{0, 0, 0}, {4294967296, 4294967297, 9223372036854775807}}, }, }, { name: "test bad lines", cgroupsDir: "read/bad", fsContent: map[string]string{ "read/bad/blkio.bfq.weight": "xyz", // test bad line in the middle "read/bad/blkio.bfq.weight_device": "default 10\n1:2 3\nbad\n4:5 6\n", // test no spaces "read/bad/blkio.throttle.read_bps_device": "11:22:33", // test too many spaces "read/bad/blkio.throttle.write_bps_device": "1111 2222 3333 \n", // test no colons "read/bad/blkio.throttle.read_iops_device": "1111122222 33333", // test missing number "read/bad/blkio.throttle.write_iops_device": "0: 0\n", }, expectedErrorCount: 6, expectedErrorSubstrings: []string{"bad", "xyz", "11:22:33", "1111 2222 3333 ", "1111122222 33333", "0: 0"}, expectedBlockIO: &OciBlockIOParameters{ Weight: -1, WeightDevice: OciDeviceWeights{{1, 2, 3}, {4, 5, 6}}, }, }, { name: "all files missing", cgroupsDir: "/missing/err", fsContent: map[string]string{}, expectedBlockIO: &OciBlockIOParameters{Weight: -1}, expectedErrorCount: 6, expectedErrorSubstrings: []string{ "file not found", "blkio.bfq.weight", "blkio.bfq.weight_device", "blkio.throttle.read_bps_device", "blkio.throttle.write_bps_device", "blkio.throttle.read_iops_device", "blkio.throttle.write_iops_device", }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { mpf := mockPlatform{ fsOrigContent: tc.fsContent, readsFail: tc.readsFail, } currentPlatform = &mpf blockIO, err := GetBlkioParameters(tc.cgroupsDir) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if tc.expectedBlockIO != nil { testutils.VerifyDeepEqual(t, "blockio parameters", *tc.expectedBlockIO, blockIO) } }) } } // TestSetBlkioParameters: unit test for SetBlkioParameters() func TestSetBlkioParameters(t *testing.T) { tcases := []struct { name string cgroupsDir string blockIO OciBlockIOParameters writesFail int expectedFsWrites map[string]string expectedErrorCount int expectedErrorSubstrings []string }{ { name: "write full OCI struct", cgroupsDir: "/my/full", blockIO: OciBlockIOParameters{ Weight: 10, WeightDevice: OciDeviceWeights{{Major: 1, Minor: 2, Weight: 3}}, ThrottleReadBpsDevice: OciDeviceRates{{Major: 11, Minor: 12, Rate: 13}}, ThrottleWriteBpsDevice: OciDeviceRates{{Major: 21, Minor: 22, Rate: 23}}, ThrottleReadIOPSDevice: OciDeviceRates{{Major: 31, Minor: 32, Rate: 33}}, ThrottleWriteIOPSDevice: OciDeviceRates{{Major: 41, Minor: 42, Rate: 43}}, }, expectedFsWrites: map[string]string{ "/my/full/blkio.bfq.weight": "10", "/my/full/blkio.bfq.weight_device": "1:2 3", "/my/full/blkio.throttle.read_bps_device": "11:12 13", "/my/full/blkio.throttle.write_bps_device": "21:22 23", "/my/full/blkio.throttle.read_iops_device": "31:32 33", "/my/full/blkio.throttle.write_iops_device": "41:42 43", }, }, { name: "write empty struct", cgroupsDir: "/my/empty", blockIO: OciBlockIOParameters{}, expectedFsWrites: map[string]string{ "/my/empty/blkio.bfq.weight": "0", }, }, { name: "multidevice weight and throttling, no weight write on -1", 
cgroupsDir: "/my/multidev", blockIO: OciBlockIOParameters{ Weight: -1, WeightDevice: OciDeviceWeights{{1, 2, 3}, {4, 5, 6}}, ThrottleReadBpsDevice: OciDeviceRates{{11, 12, 13}, {111, 112, 113}}, ThrottleWriteBpsDevice: OciDeviceRates{{21, 22, 23}, {221, 222, 223}}, ThrottleReadIOPSDevice: OciDeviceRates{{31, 32, 33}, {331, 332, 333}}, ThrottleWriteIOPSDevice: OciDeviceRates{{41, 42, 43}, {441, 442, 443}}, }, expectedFsWrites: map[string]string{ "/my/multidev/blkio.bfq.weight_device": "1:2 3+4:5 6", "/my/multidev/blkio.throttle.read_bps_device": "11:12 13+111:112 113", "/my/multidev/blkio.throttle.write_bps_device": "21:22 23+221:222 223", "/my/multidev/blkio.throttle.read_iops_device": "31:32 33+331:332 333", "/my/multidev/blkio.throttle.write_iops_device": "41:42 43+441:442 443", }, }, { name: "no bfq.weight", cgroupsDir: "/my/nobfq", blockIO: OciBlockIOParameters{Weight: 100}, writesFail: 1, expectedFsWrites: map[string]string{"/my/nobfq/blkio.weight": "100"}, }, { name: "all writes fail", cgroupsDir: "/my/writesfail", blockIO: OciBlockIOParameters{ Weight: -1, WeightDevice: OciDeviceWeights{{1, 0, 100}}, }, writesFail: 9999, expectedErrorCount: 1, expectedErrorSubstrings: []string{ "could not write content \"1:0 100\" to any of files", "\"blkio.bfq.weight_device\"", "\"blkio.weight_device\"", }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { mpf := mockPlatform{ fsWrites: make(map[string]string), writesFail: tc.writesFail, } currentPlatform = &mpf err := SetBlkioParameters(tc.cgroupsDir, tc.blockIO) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if tc.expectedFsWrites != nil { testutils.VerifyDeepEqual(t, "filesystem writes", tc.expectedFsWrites, mpf.fsWrites) } }) } } // mockPlatform implements mock versions of platformInterface functions. type mockPlatform struct { fsOrigContent map[string]string fsWrites map[string]string readsFail int writesFail int } func (mpf *mockPlatform) readFromFile(filename string) (string, error) { if mpf.readsFail > 0 { mpf.readsFail-- return "", fmt.Errorf("mockPlatofrm: reading from %#v failed", filename) } if content, ok := mpf.fsOrigContent[filename]; ok { return content, nil } return "", fmt.Errorf("mockPlatform: file not found %#v", filename) } func (mpf *mockPlatform) writeToFile(filename string, content string) error { var newContent string if mpf.writesFail > 0 { mpf.writesFail-- return fmt.Errorf("mockPlatform: writing to %#v failed", filename) } if oldContent, ok := mpf.fsWrites[filename]; ok { newContent = fmt.Sprintf("%s+%s", oldContent, content) } else { newContent = content } mpf.fsWrites[filename] = newContent return nil } ================================================ FILE: pkg/cgroups/cgroupcontrol.go ================================================ // Copyright 2020-2021 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cgroups import ( "bufio" "errors" "fmt" "os" "path" "strings" "syscall" ) // Controller is our enumerated type for cgroup controllers. type Controller int // Group represents a control group. type Group string // nolint const ( // UnkownController represents a controller of unknown type. UnknownController Controller = iota // blkio cgroup controller. Blkio // cpu cgroup controller. Cpu // cpuacct cgroup controller. Cpuacct // cpuset cgroup controller. Cpuset // devices cgroup controller. Devices // freezer cgroup controller. Freezer // hugetlb cgroup controller. Hugetlb // memory cgroup controller. Memory // net_cls cgroup controller. NetCls // net_prio cgroup controller. NetPrio // per_event cgroup controller. PerfEvent // pids cgroup controller. Pids ) var ( // controllerNames maps controllers to names/relative paths. controllerNames = map[Controller]string{ Blkio: "blkio", Cpu: "cpu", Cpuacct: "cpuacct", Cpuset: "cpuset", Devices: "devices", Freezer: "freezer", Hugetlb: "hugetlb", Memory: "memory", NetCls: "net_cls", NetPrio: "net_prio", PerfEvent: "perf_event", Pids: "pids", } // controllerNames maps controllers to names/relative paths. controllerDirs = map[string]Controller{ "blkio": Blkio, "cpu": Cpu, "cpuacct": Cpuacct, "cpuset": Cpuset, "devices": Devices, "freezer": Freezer, "hugetlb": Hugetlb, "memory": Memory, "net_cls": NetCls, "net_prio": NetPrio, "perf_event": PerfEvent, "pids": Pids, } ) // String returns the name of the given controller. func (c Controller) String() string { if name, ok := controllerNames[c]; ok { return name } return "unknown" } // Path returns the absolute path of the given controller. func (c Controller) Path() string { DetectSystemCgroupVersion() if systemCgroupVersion == 2 { return GetMountDir() } return path.Join(mountDir, c.String()) } // RelPath returns the relative path of the given controller. func (c Controller) RelPath() string { DetectSystemCgroupVersion() if systemCgroupVersion == 2 { return "" } return c.String() } // Group returns the given group for the controller. func (c Controller) Group(group string) Group { return Group(path.Join(c.Path(), group)) } // AsGroup returns the group for the given absolute directory path. func AsGroup(absDir string) Group { return Group(absDir) } // Controller returns the controller for the group. func (g Group) Controller() Controller { DetectSystemCgroupVersion() if systemCgroupVersion == 2 { return UnknownController } relPath := strings.TrimPrefix(string(g), mountDir+"/") split := strings.SplitN(relPath, "/", 2) if len(split) > 0 { return controllerDirs[split[0]] } return UnknownController } // GetTasks reads the pids of threads currently assigned to the group. func (g Group) GetTasks() ([]string, error) { return g.readPids(Tasks) } // GetProcesses reads the pids of processes currently assigned to the group. func (g Group) GetProcesses() ([]string, error) { return g.readPids(Procs) } // AddTasks writes the given thread pids to the group. func (g Group) AddTasks(pids ...string) error { return g.writePids(Tasks, pids...) } // AddProcesses writes the given process pids to the group. func (g Group) AddProcesses(pids ...string) error { return g.writePids(Procs, pids...) } // Write writes the formatted data to the groups entry. 
func (g Group) Write(entry, format string, args ...interface{}) error { entryPath := path.Join(string(g), entry) f, err := os.OpenFile(entryPath, os.O_WRONLY, 0644) if err != nil { return g.errorf("%q: failed to open: %v", entry, err) } defer f.Close() data := fmt.Sprintf(format, args...) if _, err := f.Write([]byte(data)); err != nil { return g.errorf("%q: failed to write %q: %v", entry, data, err) } return nil } // readPids reads pids from a cgroup's tasks or procs entry. func (g Group) readPids(entry string) ([]string, error) { var pids []string pidFile := path.Join(string(g), entry) f, err := os.OpenFile(pidFile, os.O_RDONLY, 0644) if err != nil { return nil, g.errorf("failed to open %q: %v", entry, err) } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { pids = append(pids, s.Text()) } if err := s.Err(); err != nil { return nil, g.errorf("failed to read %q: %v", entry, err) } return pids, nil } // writePids writes pids to a cgroup's tasks or procs entry. func (g Group) writePids(entry string, pids ...string) error { pidFile := path.Join(string(g), entry) f, err := os.OpenFile(pidFile, os.O_WRONLY, 0644) if err != nil { return g.errorf("failed to open %q for writing pids: %v", pidFile, err) } defer f.Close() for _, pid := range pids { if _, err := f.Write([]byte(pid)); err != nil { if !errors.Is(err, syscall.ESRCH) { return g.errorf("failed to write pid %s to %q: %v", pid, pidFile, err) } } } return nil } // errorf returns a formatted group-specific error. func (g Group) errorf(format string, args ...interface{}) error { name := strings.TrimPrefix(string(g), mountDir+"/") return fmt.Errorf("cgroup "+name+": "+format, args...) } ================================================ FILE: pkg/cgroups/cgroupid.go ================================================ package cgroups import ( "encoding/binary" "fmt" "os" "path/filepath" "sync" "golang.org/x/sys/unix" ) // CgroupID implements mapping kernel cgroup IDs to cgroupfs paths with transparent caching. type CgroupID struct { root string cache map[uint64]string sync.Mutex } // NewCgroupID creates a new CgroupID map/cache. func NewCgroupID(root string) *CgroupID { return &CgroupID{ root: root, cache: make(map[uint64]string), } } func getID(path string) uint64 { h, _, err := unix.NameToHandleAt(unix.AT_FDCWD, path, 0) if err != nil { return 0 } return binary.LittleEndian.Uint64(h.Bytes()) } // Find finds the path for the given cgroup id. func (cgid *CgroupID) Find(id uint64) (string, error) { found := false var p string cgid.Lock() defer cgid.Unlock() if path, ok := cgid.cache[id]; ok { return path, nil } err := filepath.Walk(cgid.root, func(path string, info os.FileInfo, err error) error { if err != nil { if os.IsNotExist(err) { return nil } fmt.Printf("WalkFunc called with an error (path %q): %v\n", path, err) return err } if found { return filepath.SkipDir } if info.IsDir() && id == getID(path) { found = true p = path return filepath.SkipDir } return nil }) if err != nil { return "", err } else if !found { return "", fmt.Errorf("cgroupid %v not found", id) } cgid.cache[id] = p return p, nil }
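Taken together, cgroupcontrol.go and cgroupid.go provide two complementary lookups: from a controller and group name to a cgroupfs directory, and from a kernel cgroup ID back to a path. A minimal usage sketch, assuming a cgroup v1 host; the group name and the IDs are hypothetical:

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
)

func main() {
	// Resolve the cpuset controller directory for a (hypothetical) group
	// and move a process into it.
	grp := cgroups.Cpuset.Group("kubepods.slice/mypod")
	if err := grp.AddProcesses("12345"); err != nil {
		fmt.Println("failed to add process:", err)
	}

	// Map a kernel cgroup ID (e.g. from a BPF event) back to its path.
	cgid := cgroups.NewCgroupID(cgroups.GetMountDir())
	if path, err := cgid.Find(4711); err == nil {
		fmt.Println("cgroup id 4711 lives at", path)
	}
}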
================================================ FILE: pkg/cgroups/cgrouppath.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroups import ( "flag" "os" "path" "path/filepath" ) // nolint const ( // Tasks is a cgroup's "tasks" entry. Tasks = "tasks" // Procs is a cgroup's "cgroup.procs" entry. Procs = "cgroup.procs" // CpuShares is the cpu controller's "cpu.shares" entry. CpuShares = "cpu.shares" // CpuPeriod is the cpu controller's "cpu.cfs_period_us" entry. CpuPeriod = "cpu.cfs_period_us" // CpuQuota is the cpu controller's "cpu.cfs_quota_us" entry. CpuQuota = "cpu.cfs_quota_us" // CpusetCpus is the cpuset controller's cpuset.cpus entry. CpusetCpus = "cpuset.cpus" // CpusetMems is the cpuset controller's cpuset.mems entry. CpusetMems = "cpuset.mems" // Controllers is the cgroup v2 controllers file. Controllers = "cgroup.controllers" ) var ( // mountDir is the parent directory for per-controller cgroupfs mounts. mountDir = "/sys/fs/cgroup" // v2Dir is the cgroup v2 unified mount directory. v2Dir = path.Join(mountDir, "unified") // KubeletRoot is the --cgroup-root option the kubelet is running with. KubeletRoot = "" // systemCgroupVersion is the detected system cgroup version, 0 if undetected. systemCgroupVersion = 0 ) // GetMountDir returns the common mount point for cgroup v1 controllers. func GetMountDir() string { return mountDir } // SetMountDir sets the common mount point for the cgroup v1 controllers. func SetMountDir(dir string) { v2, _ := filepath.Rel(mountDir, v2Dir) mountDir = dir if v2 != "" { v2Dir = path.Join(mountDir, v2) } } // GetV2Dir returns the cgroup v2 unified mount directory. func GetV2Dir() string { return v2Dir } // SetV2Dir sets the unified cgroup v2 mount directory. func SetV2Dir(dir string) { if dir[0] == '/' { v2Dir = dir } else { v2Dir = path.Join(mountDir, dir) } } func init() { flag.StringVar(&mountDir, "cgroup-mount", mountDir, "directory under which cgroup v1 controllers are mounted") flag.StringVar(&v2Dir, "cgroup-v2-dir", v2Dir, "cgroup v2 unified mount directory") flag.StringVar(&KubeletRoot, "kubelet-cgroup-root", KubeletRoot, "--cgroup-root option the kubelet is running with") } // DetectSystemCgroupVersion detects and caches the system cgroup version (1 or 2). func DetectSystemCgroupVersion() int { if systemCgroupVersion == 0 { if _, err := os.Stat(path.Join(GetMountDir(), Controllers)); err == nil { systemCgroupVersion = 2 } else { systemCgroupVersion = 1 } } return systemCgroupVersion }
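The version detection above changes what Controller.Path returns: on a cgroup v2 host every controller resolves to the unified mount point, while on v1 each controller gets its own subdirectory. A small sketch of the difference, assuming the default /sys/fs/cgroup mount:

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
)

func main() {
	v := cgroups.DetectSystemCgroupVersion()
	fmt.Println("detected cgroup version:", v)

	// On v1 this prints /sys/fs/cgroup/blkio, on v2 just /sys/fs/cgroup.
	fmt.Println("blkio controller path:", cgroups.Blkio.Path())

	// RelPath is the controller-relative part: "blkio" on v1, "" on v2.
	fmt.Println("blkio relative path:", cgroups.Blkio.RelPath())
}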
package cgroups import ( "fmt" "os" "path" "path/filepath" "strconv" "strings" "github.com/intel/cri-resource-manager/pkg/sysfs" ) // BlkioDeviceBytes contains a single operations line of blkio.throttle.io_service_bytes_recursive file type BlkioDeviceBytes struct { Major int Minor int Operations map[string]int64 } // BlkioThrottleBytes has parsed contents of blkio.throttle.io_service_bytes_recursive file type BlkioThrottleBytes struct { DeviceBytes []*BlkioDeviceBytes TotalBytes int64 } // CPUAcctUsage has a parsed line of cpuacct.usage_all file type CPUAcctUsage struct { CPU int User int64 System int64 } // HugetlbUsage has parsed contents of huge pages usage in bytes. type HugetlbUsage struct { Size string Bytes int64 MaxBytes int64 } // MemoryUsage has parsed contents of memory usage in bytes. type MemoryUsage struct { Bytes int64 MaxBytes int64 } // NumaLine represents one line in the NUMA statistics file. type NumaLine struct { Total int64 Nodes map[string]int64 } // NumaStat has parsed contets of a NUMA statistics file. type NumaStat struct { Total NumaLine File NumaLine Anon NumaLine Unevictable NumaLine HierarchicalTotal NumaLine HierarchicalFile NumaLine HierarchicalAnon NumaLine HierarchicalUnevictable NumaLine } // GlobalNumaStats has the statistics from one global NUMA nodestats file. type GlobalNumaStats struct { NumaHit int64 NumaMiss int64 NumaForeign int64 InterleaveHit int64 LocalNode int64 OtherNode int64 } func readCgroupFileLines(filePath string) ([]string, error) { f, err := os.ReadFile(filePath) if err != nil { return nil, err } data := string(f) rawLines := strings.Split(data, "\n") lines := make([]string, 0) // Sanitize the lines and remove empty ones. for _, rawLine := range rawLines { if len(strings.TrimSpace(rawLine)) > 0 { lines = append(lines, rawLine) } } return lines, nil } func readCgroupSingleNumber(filePath string) (int64, error) { // File looks like this: // // 4 lines, err := readCgroupFileLines(filePath) if err != nil { return 0, err } if len(lines) != 1 { return 0, fmt.Errorf("error parsing file") } number, err := strconv.ParseInt(lines[0], 10, 64) if err != nil { return 0, err } return number, nil } // GetBlkioThrottleBytes returns amount of bytes transferred to/from the disk. 
func GetBlkioThrottleBytes(cgroupPath string) (BlkioThrottleBytes, error) { const ( cgroupEntry = "blkio.throttle.io_service_bytes_recursive" ) // File looks like this: // // 8:16 Read 4223325184 // 8:16 Write 3207528448 // 8:16 Sync 5387592704 // 8:16 Async 2043260928 // 8:16 Discard 0 // 8:16 Total 7430853632 // 8:0 Read 5246572032 // 8:0 Write 2361737216 // 8:0 Sync 5575892480 // 8:0 Async 2032416768 // 8:0 Discard 0 // 8:0 Total 7608309248 // Total 15039162880 entry := path.Join(cgroupPath, cgroupEntry) lines, err := readCgroupFileLines(entry) if err != nil { return BlkioThrottleBytes{}, err } if len(lines) == 1 && lines[0] == "Total 0" { return BlkioThrottleBytes{}, nil } result := BlkioThrottleBytes{DeviceBytes: make([]*BlkioDeviceBytes, 0)} devidx := map[string]int{} for _, line := range lines { split := strings.Split(line, " ") key := split[0] if key == "Total" { if len(split) != 2 { continue } totalBytes, err := strconv.ParseInt(split[1], 10, 64) if err != nil { return BlkioThrottleBytes{}, err } result.TotalBytes = totalBytes } else { var dev *BlkioDeviceBytes majmin := strings.Split(key, ":") if len(majmin) != 2 { return BlkioThrottleBytes{}, fmt.Errorf("error parsing file %s", entry) } maj64, err := strconv.ParseInt(string(majmin[0]), 10, 32) if err != nil { return BlkioThrottleBytes{}, err } min64, err := strconv.ParseInt(string(majmin[1]), 10, 32) if err != nil { return BlkioThrottleBytes{}, err } major := int(maj64) minor := int(min64) idx, ok := devidx[split[0]] if ok { dev = result.DeviceBytes[idx] } else { dev = &BlkioDeviceBytes{ Major: major, Minor: minor, Operations: make(map[string]int64), } idx = len(result.DeviceBytes) devidx[key] = idx result.DeviceBytes = append(result.DeviceBytes, dev) } op, count := split[1], split[2] bytes, err := strconv.ParseInt(count, 10, 64) if err != nil { return BlkioThrottleBytes{}, err } dev.Operations[op] = bytes } } return result, nil } // GetCPUAcctStats retrieves CPU account statistics for a given cgroup. func GetCPUAcctStats(cgroupPath string) ([]CPUAcctUsage, error) { // File looks like this: // // cpu user system // 0 3723082232186 2456599218 // 1 3748398003001 1149546796 lines, err := readCgroupFileLines(path.Join(cgroupPath, "cpuacct.usage_all")) if err != nil { return nil, err } result := make([]CPUAcctUsage, 0, len(lines)-1) for _, line := range lines[1:] { tokens := strings.Split(line, " ") if len(tokens) != 3 { continue } cpu, err := strconv.ParseInt(tokens[0], 10, 32) if err != nil { return nil, err } user, err := strconv.ParseInt(tokens[1], 10, 64) if err != nil { return nil, err } system, err := strconv.ParseInt(tokens[2], 10, 64) if err != nil { return nil, err } result = append(result, CPUAcctUsage{CPU: int(cpu), User: user, System: system}) } return result, nil } // GetCPUSetMemoryMigrate returns boolean indicating whether memory migration is enabled. func GetCPUSetMemoryMigrate(cgroupPath string) (bool, error) { // File looks like this: // // 0 number, err := readCgroupSingleNumber(path.Join(cgroupPath, "cpuset.memory_migrate")) if err != nil { return false, err } if number == 0 { return false, nil } else if number == 1 { return true, nil } return false, fmt.Errorf("error parsing file") } // GetHugetlbUsage retrieves huge pages statistics for a given cgroup. func GetHugetlbUsage(cgroupPath string) ([]HugetlbUsage, error) { const ( prefix = "/hugetlb." 
usageSuffix = ".usage_in_bytes" maxUsageSuffix = ".max_usage_in_bytes" ) // Files look like this: // // 124 usageFiles, err := filepath.Glob(path.Join(cgroupPath, prefix+"*"+usageSuffix)) if err != nil { return nil, err } result := make([]HugetlbUsage, 0, len(usageFiles)) for _, file := range usageFiles { if strings.Contains(filepath.Base(file), ".rsvd") { // Skip reservations files. continue } size := strings.SplitN(filepath.Base(file), ".", 3)[1] bytes, err := readCgroupSingleNumber(file) if err != nil { return nil, err } max, err := readCgroupSingleNumber(strings.TrimSuffix(file, usageSuffix) + maxUsageSuffix) if err != nil { return nil, err } result = append(result, HugetlbUsage{ Size: size, Bytes: bytes, MaxBytes: max, }) } return result, nil } // GetMemoryUsage retrieves cgroup memory usage. func GetMemoryUsage(cgroupPath string) (MemoryUsage, error) { // Files look like this: // // 142 usage, err := readCgroupSingleNumber(path.Join(cgroupPath, "memory.usage_in_bytes")) if err != nil { return MemoryUsage{}, err } maxUsage, err := readCgroupSingleNumber(path.Join(cgroupPath, "memory.max_usage_in_bytes")) if err != nil { return MemoryUsage{}, err } result := MemoryUsage{ Bytes: usage, MaxBytes: maxUsage, } return result, nil } // GetNumaStats returns parsed cgroup NUMA statistics. func GetNumaStats(cgroupPath string) (NumaStat, error) { const ( cgroupEntry = "memory.numa_stat" ) // File looks like this: // // total=44611 N0=32631 N1=7501 N2=1982 N3=2497 // file=44428 N0=32614 N1=7335 N2=1982 N3=2497 // anon=183 N0=17 N1=166 N2=0 N3=0 // unevictable=0 N0=0 N1=0 N2=0 N3=0 // hierarchical_total=768133 N0=509113 N1=138887 N2=20464 N3=99669 // hierarchical_file=722017 N0=496516 N1=119997 N2=20181 N3=85323 // hierarchical_anon=46096 N0=12597 N1=18890 N2=283 N3=14326 // hierarchical_unevictable=20 N0=0 N1=0 N2=0 N3=20 entry := path.Join(cgroupPath, cgroupEntry) lines, err := readCgroupFileLines(entry) if err != nil { return NumaStat{}, err } result := NumaStat{} for _, line := range lines { split := strings.Split(line, " ") if len(line) < 2 { return NumaStat{}, fmt.Errorf("error parsing file %s", entry) } keytotal := strings.Split(split[0], "=") if len(keytotal) != 2 { return NumaStat{}, fmt.Errorf("error parsing file %s", entry) } key, tot := keytotal[0], keytotal[1] total, err := strconv.ParseInt(tot, 10, 64) if err != nil { return NumaStat{}, fmt.Errorf("error parsing file %s: %v", entry, err) } nodes := make(map[string]int64) for _, nodeEntry := range split[1:] { nodeamount := strings.Split(nodeEntry, "=") if len(nodeamount) != 2 { return NumaStat{}, fmt.Errorf("error parsing file %s", entry) } node, amount := nodeamount[0], nodeamount[1] number, err := strconv.ParseInt(amount, 10, 64) if err != nil { return NumaStat{}, fmt.Errorf("error parsing file %s: %v", entry, err) } nodes[node] = number } switch key { case "total": result.Total.Total = total result.Total.Nodes = nodes case "file": result.File.Total = total result.File.Nodes = nodes case "anon": result.Anon.Total = total result.Anon.Nodes = nodes case "unevictable": result.Unevictable.Total = total result.Unevictable.Nodes = nodes case "hierarchical_total": result.HierarchicalTotal.Total = total result.HierarchicalTotal.Nodes = nodes case "hierarchical_file": result.HierarchicalFile.Total = total result.HierarchicalFile.Nodes = nodes case "hierarchical_anon": result.HierarchicalAnon.Total = total result.HierarchicalAnon.Nodes = nodes case "hierarchical_unevictable": result.HierarchicalUnevictable.Total = total 
result.HierarchicalUnevictable.Nodes = nodes default: return NumaStat{}, fmt.Errorf("error parsing file, unknown key %s", key) } } return result, nil } // GetGlobalNumaStats returns the global (non-cgroup) NUMA statistics per node. func GetGlobalNumaStats() (map[int]GlobalNumaStats, error) { const ( prefix = "/sys/devices/system/node/node" ) // Files look like this: // // numa_hit 1851614569 // numa_miss 0 // numa_foreign 0 // interleave_hit 49101 // local_node 1851614569 // other_node 0 result := make(map[int]GlobalNumaStats) nodeDirs, err := filepath.Glob(prefix + "*") if err != nil { return map[int]GlobalNumaStats{}, err } for _, dir := range nodeDirs { id := strings.TrimPrefix(dir, prefix) node, err := strconv.ParseInt(id, 10, 0) if err != nil { return map[int]GlobalNumaStats{}, fmt.Errorf("error parsing directory name") } nodeStat := GlobalNumaStats{} numastat := path.Join(dir, "numastat") err = sysfs.ParseFileEntries(numastat, map[string]interface{}{ "numa_hit": &nodeStat.NumaHit, "numa_miss": &nodeStat.NumaMiss, "numa_foreign": &nodeStat.NumaForeign, "interleave_hit": &nodeStat.InterleaveHit, "local_node": &nodeStat.LocalNode, "other_node": &nodeStat.OtherNode, }, func(line string) (string, string, error) { fields := strings.Fields(strings.TrimSpace(line)) if len(fields) != 2 { return "", "", fmt.Errorf("failed to parse line '%s'", line) } return fields[0], fields[1], nil }, ) if err != nil { return map[int]GlobalNumaStats{}, fmt.Errorf("error parsing numastat file: %v", err) } result[int(node)] = nodeStat } return result, nil } ================================================ FILE: pkg/cgroupstats/collector.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
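The stats readers above all follow the same shape: take a cgroup directory, parse one well-known file, return a typed struct. A hedged sketch of reading memory and NUMA statistics for one container cgroup (the directory is hypothetical):

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
)

func main() {
	// Hypothetical memory cgroup directory of a container.
	dir := "/sys/fs/cgroup/memory/kubepods.slice/mycontainer"

	if mem, err := cgroups.GetMemoryUsage(dir); err == nil {
		fmt.Printf("memory: %d bytes (max %d)\n", mem.Bytes, mem.MaxBytes)
	}

	if numa, err := cgroups.GetNumaStats(dir); err == nil {
		for node, pages := range numa.Total.Nodes {
			fmt.Printf("node %s: %d pages total\n", node, pages)
		}
	}
}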
package cgroupstats import ( "flag" "os" "path/filepath" "regexp" "strconv" "strings" "sync" "github.com/intel/cri-resource-manager/pkg/cgroups" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/metrics" "github.com/prometheus/client_golang/prometheus" ) // Prometheus Metric descriptor indices and descriptor table const ( numaStatsDesc = iota memoryUsageDesc memoryMigrateDesc cpuAcctUsageDesc hugeTlbUsageDesc blkioDeviceUsageDesc numDescriptors ) var descriptors = [numDescriptors]*prometheus.Desc{ numaStatsDesc: prometheus.NewDesc( "cgroup_numa_stats", "NUMA statistics for a given container and pod.", []string{ // cgroup path "container_id", // NUMA node ID "numa_node_id", // NUMA memory type "type", }, nil, ), memoryUsageDesc: prometheus.NewDesc( "cgroup_memory_usage", "Memory usage statistics for a given container and pod.", []string{ "container_id", "type", }, nil, ), memoryMigrateDesc: prometheus.NewDesc( "cgroup_memory_migrate", "Memory migrate status for a given container and pod.", []string{ "container_id", }, nil, ), cpuAcctUsageDesc: prometheus.NewDesc( "cgroup_cpu_acct", "CPU accounting for a given container and pod.", []string{ "container_id", // CPU ID "cpu", "type", }, nil, ), hugeTlbUsageDesc: prometheus.NewDesc( "cgroup_hugetlb_usage", "Hugepages usage for a given container and pod.", []string{ "container_id", "size", "type", }, nil, ), blkioDeviceUsageDesc: prometheus.NewDesc( "cgroup_blkio_device_usage", "Blkio Device bytes usage for a given container and pod.", []string{ "container_id", "major", "minor", "operation", }, nil, ), } var ( // cgroupRoot is the mount point for the cgroup (v1) filesystem cgroupRoot = "/sys/fs/cgroup" // our logger instance log = logger.NewLogger("cgroupstats") ) const ( kubepodsDir = "kubepods.slice" ) type collector struct { } // NewCollector creates new Prometheus collector func NewCollector() (prometheus.Collector, error) { return &collector{}, nil } // Describe implements prometheus.Collector interface func (c *collector) Describe(ch chan<- *prometheus.Desc) { for _, d := range descriptors { ch <- d } } func updateCPUAcctUsageMetric(ch chan<- prometheus.Metric, path string, metric []cgroups.CPUAcctUsage) { for i, acct := range metric { ch <- prometheus.MustNewConstMetric( descriptors[cpuAcctUsageDesc], prometheus.CounterValue, float64(acct.CPU), path, strconv.FormatInt(int64(i), 10), "CPU", ) ch <- prometheus.MustNewConstMetric( descriptors[cpuAcctUsageDesc], prometheus.CounterValue, float64(acct.User), path, strconv.FormatInt(int64(i), 10), "User", ) ch <- prometheus.MustNewConstMetric( descriptors[cpuAcctUsageDesc], prometheus.CounterValue, float64(acct.System), path, strconv.FormatInt(int64(i), 10), "System", ) } } func updateMemoryMigrateMetric(ch chan<- prometheus.Metric, path string, migrate bool) { migrateValue := 0 if migrate { migrateValue = 1 } ch <- prometheus.MustNewConstMetric( descriptors[memoryMigrateDesc], prometheus.GaugeValue, float64(migrateValue), path, ) } func updateMemoryUsageMetric(ch chan<- prometheus.Metric, path string, metric cgroups.MemoryUsage) { ch <- prometheus.MustNewConstMetric( descriptors[memoryUsageDesc], prometheus.GaugeValue, float64(metric.Bytes), path, "Bytes", ) ch <- prometheus.MustNewConstMetric( descriptors[memoryUsageDesc], prometheus.GaugeValue, float64(metric.MaxBytes), path, "MaxBytes", ) } func updateNumaStatMetric(ch chan<- prometheus.Metric, path string, metric cgroups.NumaStat) { // TODO: use "reflect" to iterate through the struct fields of 
NumaStat? for key, value := range metric.Total.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "Total", ) } for key, value := range metric.File.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "File", ) } for key, value := range metric.Anon.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "Anon", ) } for key, value := range metric.Unevictable.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "Unevictable", ) } for key, value := range metric.HierarchicalTotal.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "HierarchicalTotal", ) } for key, value := range metric.HierarchicalFile.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "HierarchicalFile", ) } for key, value := range metric.HierarchicalAnon.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "HierarchicalAnon", ) } for key, value := range metric.HierarchicalUnevictable.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "HierarchicalUnevictable", ) } } func updateHugeTlbUsageMetric(ch chan<- prometheus.Metric, path string, metric []cgroups.HugetlbUsage) { // One HugeTlbUsage for each size. for _, hugeTlbUsage := range metric { ch <- prometheus.MustNewConstMetric( descriptors[hugeTlbUsageDesc], prometheus.GaugeValue, float64(hugeTlbUsage.Bytes), path, hugeTlbUsage.Size, "Bytes", ) ch <- prometheus.MustNewConstMetric( descriptors[hugeTlbUsageDesc], prometheus.GaugeValue, float64(hugeTlbUsage.MaxBytes), path, hugeTlbUsage.Size, "MaxBytes", ) } } func updateBlkioDeviceUsageMetric(ch chan<- prometheus.Metric, path string, metric cgroups.BlkioThrottleBytes) { for _, deviceBytes := range metric.DeviceBytes { for operation, val := range deviceBytes.Operations { ch <- prometheus.MustNewConstMetric( descriptors[blkioDeviceUsageDesc], prometheus.CounterValue, float64(val), path, strconv.FormatInt(int64(deviceBytes.Major), 10), strconv.FormatInt(int64(deviceBytes.Minor), 10), operation, ) } } } func walkCgroups() []string { // XXX TODO: add support for kubelet cgroupfs cgroup driver. 
containerDirs := []string{} cpuset := filepath.Join(cgroupRoot, "cpuset") filepath.Walk(filepath.Join(cpuset, kubepodsDir), func(path string, info os.FileInfo, err error) error { if err != nil { if os.IsNotExist(err) { return nil } return err } if !info.IsDir() { return nil } dir := info.Name() if !strings.HasSuffix(dir, ".scope") { return nil } switch { case strings.HasPrefix(dir, "cri-containerd-"): break case strings.HasPrefix(dir, "crio-"): break case strings.HasPrefix(dir, "docker-"): break default: return filepath.SkipDir } path = strings.TrimPrefix(path, cpuset+"/") containerDirs = append(containerDirs, path) return nil }) return containerDirs } func cgroupPath(controller, path string) string { return filepath.Join(cgroupRoot, controller, path) } // Collect implements prometheus.Collector interface func (c collector) Collect(ch chan<- prometheus.Metric) { var wg sync.WaitGroup // We don't bail out on errors because those can happen if there is a race condition between // the destruction of a container and us getting to read the cgroup data. We just don't report // the values we don't get. collectors := []func(string, *regexp.Regexp){ func(path string, re *regexp.Regexp) { defer wg.Done() numa, err := cgroups.GetNumaStats(cgroupPath("memory", path)) if err == nil { updateNumaStatMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], numa) } else { log.Error("failed to collect NUMA stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() memory, err := cgroups.GetMemoryUsage(cgroupPath("memory", path)) if err == nil { updateMemoryUsageMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], memory) } else { log.Error("failed to collect memory usage stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() migrate, err := cgroups.GetCPUSetMemoryMigrate(cgroupPath("cpuset", path)) if err == nil { updateMemoryMigrateMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], migrate) } else { log.Error("failed to collect memory migration stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() cpuAcctUsage, err := cgroups.GetCPUAcctStats(cgroupPath("cpuacct", path)) if err == nil { updateCPUAcctUsageMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], cpuAcctUsage) } else { log.Error("failed to collect CPU accounting stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() hugeTlbUsage, err := cgroups.GetHugetlbUsage(cgroupPath("hugetlb", path)) if err == nil { updateHugeTlbUsageMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], hugeTlbUsage) } else { log.Error("failed to collect hugetlb stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() blkioDeviceUsage, err := cgroups.GetBlkioThrottleBytes(cgroupPath("blkio", path)) if err == nil { updateBlkioDeviceUsageMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], blkioDeviceUsage) } else { log.Error("failed to collect blkio stats for %s: %v", path, err) } }, } containerIDRegexp := regexp.MustCompile(`[a-z0-9]{64}`) for _, path := range walkCgroups() { wg.Add(len(collectors)) for _, fn := range collectors { go fn(path, containerIDRegexp) } } // We need to wait so that the response channel doesn't get closed. 
wg.Wait() } func init() { flag.StringVar(&cgroupRoot, "cgroup-path", cgroupRoot, "Path to cgroup filesystem mountpoint") err := metrics.RegisterCollector("cgroupstats", NewCollector) if err != nil { log.Error("failed to register cgroupstats collector: %v", err) } }
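Because the collector implements the plain prometheus.Collector interface, it can also be exercised outside the metrics package, which is handy for ad-hoc debugging. A sketch under that assumption; this is not how cri-resmgr itself wires it up, which goes through metrics.RegisterCollector above:

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/cgroupstats"
	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	c, err := cgroupstats.NewCollector()
	if err != nil {
		panic(err)
	}
	reg := prometheus.NewRegistry()
	reg.MustRegister(c)

	// Gather walks the kubepods cgroup tree and emits one metric family
	// per descriptor that produced samples (none on a non-k8s machine).
	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		fmt.Println(mf.GetName(), "samples:", len(mf.GetMetric()))
	}
}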
================================================ FILE: pkg/config/config.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "reflect" "sigs.k8s.io/yaml" "strings" ) const ( // MainModule is the default parent for all configuration. MainModule = "main" ) // GetConfigFn is used to query a module for its default configuration. type GetConfigFn func() interface{} // NotifyFn is used to notify a module about configuration changes. type NotifyFn func(Event, Source) error // Event describes what triggered an invocation of a configuration notification callback. type Event string const ( // UpdateEvent corresponds to a normal configuration update. UpdateEvent = "update" // RevertEvent corresponds to a configuration rollback in case of errors. RevertEvent = "rollback" ) // Source describes where configuration originates from. type Source string const ( // ConfigFile is a YAML/JSON file configuration source. ConfigFile Source = "configuration file" // ConfigExternal is an external configuration source, for instance a node agent. ConfigExternal Source = "external configuration" // ConfigBackup is a backup of the previous configuration. ConfigBackup Source = "backed up configuration" ) // Module is a logical unit of configuration, declared using Register(). type Module struct { path string // fully qualified path in dotted notation, parent.name description string // short one-line module description help string // verbose description/help about this module ptr interface{} // pointer to module configuration data parent *Module // parent module name string // name relative to parent, last part of path children map[string]*Module // modules nested under this module getdefault GetConfigFn // getter for default configuration notifiers []NotifyFn // update notification callbacks noValidate bool // omit data validation } // main is the root of our configuration. var main = &Module{ path: MainModule, name: MainModule, children: make(map[string]*Module), } // GetConfig returns the current configuration. func GetConfig() (Data, error) { return main.getconfig() } // SetConfig updates the configuration using data from an external source. func SetConfig(cfg map[string]string) error { data, err := DataFromStringMap(cfg) if err != nil { return configError("failed to update configuration: %v", err) } return setconfig(data, ConfigExternal) } // SetConfigFromFile updates the configuration from the given file. func SetConfigFromFile(path string) error { data, err := DataFromFile(path) if err != nil { return configError("failed to apply configuration from file: %v", err) } return setconfig(data, ConfigFile) } // GetModule looks up the module for the given path, implicitly creating it if necessary. func GetModule(path string) *Module { return lookup(path) } // AddNotify attaches the given update notification callback to the module. func (m *Module) AddNotify(fn NotifyFn) error { return WithNotify(fn).apply(m) } // Register registers a unit of configuration data to be handled by this package. func Register(path, description string, ptr interface{}, getfn GetConfigFn, opts ...Option) *Module { m := lookup(path) if !m.isImplicit() { log.Fatal("module %s: conflicting module with same path already declared (%s)", path, m.description) } m.setDescription(description) m.ptr = ptr m.getdefault = getfn m.check() foreign := m.notifiers m.notifiers = nil for _, opt := range opts { opt.apply(m) } m.notifiers = append(m.notifiers, foreign...) return m } // setconfig updates the configuration, notifies all modules, and does a rollback if necessary. func setconfig(data Data, source Source) error { snapshot, err := main.getconfig() if err != nil { return configError("pre-update configuration snapshot failed: %v", err) } log.Info("validating configuration...") err = main.validate(data) if err != nil { return err } log.Info("applying configuration...") err = main.configure(data, false) if err != nil { revertconfig(snapshot, false) return err } log.Info("activating configuration...") err = main.notify(UpdateEvent, source) if err != nil { log.Error("configuration rejected: %v", err) revertconfig(snapshot, true) return err } return nil } // revertconfig reverts the configuration using a previously taken snapshot. func revertconfig(snapshot Data, notify bool) { err := main.configure(snapshot, true) if err != nil { log.Error("failed to revert configuration: %v", err) } if !notify { return } err = main.notify(RevertEvent, ConfigBackup) if err != nil { log.Error("reverted configuration rejected: %v", err) } } // getconfig returns the configuration for the given module and its submodules. func (m *Module) getconfig() (Data, error) { var mcfg, ccfg Data var err error if m.isImplicit() { mcfg = make(Data) } else { mcfg, err = DataFromObject(m.ptr) if err != nil { return nil, configError("module %s: failed to get configuration: %v", m.path, err) } } for name, child := range m.children { ccfg, err = child.getconfig() if err != nil { return nil, configError("module %s: failed to get child configuration for %s: %v", m.path, child.path, err) } mcfg[name] = ccfg } return mcfg, nil } // isImplicit returns true if the module has not been explicitly declared. func (m *Module) isImplicit() bool { return m.description == "" } // hasChild checks if the module has a child with the given name. func (m *Module) hasChild(name string) bool { _, ok := m.children[name] return ok } // configure reconfigures the given module and its submodules with the provided data.
func (m *Module) configure(data Data, force bool) error { log.Debug("module %s: reconfiguring...", m.path) modcfg, subcfg := data.split(m.hasChild) if err := m.apply(modcfg); err != nil { if !force { return err } log.Error("%v", err) } for name, child := range m.children { childcfg, err := subcfg.pick(name, true) if err != nil { err = configError("module %s: failed to pick configuration: %v", child.path, err) if !force { return err } log.Error("%v", err) } err = child.configure(childcfg, force) if err != nil { if !force { return err } log.Error("%v", err) } } return nil } // apply applies the given module-local configuration to the module. func (m *Module) apply(cfg Data) error { if m.isImplicit() { return nil } log.Debug("module %s: applying module configuration...", m.path) // First, reset module config to defaults defcfg, err := DataFromObject(m.getdefault()) if err != nil { return configError("module %s: failed to retrieve default configuration: %v", m.path, err) } raw, err := yaml.Marshal(defcfg) if err != nil { return configError("module %s: failed to marshal default configuration: %v", m.path, err) } if err = yaml.Unmarshal(raw, m.ptr); err != nil { return configError("module %s: failed to pre-reset to default configuration: %v", m.path, err) } // Second, apply the given config on top of the defaults if len(cfg) > 0 { raw, err = yaml.Marshal(cfg) if err != nil { return configError("module %s: failed to marshal configuration: %v", m.path, err) } if err = yaml.Unmarshal(raw, m.ptr); err != nil { return configError("module %s: failed to apply configuration: %v", m.path, err) } } return nil } // notify notifies this module and its children about a configuration change. func (m *Module) notify(event Event, source Source) error { for _, child := range m.children { if err := child.notify(event, source); err != nil { return err } } for _, fn := range m.notifiers { if err := fn(event, source); err != nil { return configError("module %s rejected %v configuration: %v", m.path, event, err) } } return nil } // check performs basic sanity checks on the module. func (m *Module) check() { ptrType := reflect.TypeOf(m.ptr) ptr := reflect.ValueOf(m.ptr).Elem() if ptrType.Kind() != reflect.Ptr || ptr.Kind() != reflect.Struct { log.Fatal("module %s: configuration data must be a pointer to a struct, not %T", m.path, m.ptr) } if m.parent == nil || m.parent.isImplicit() { return } ptr = reflect.ValueOf(m.parent.ptr).Elem() for i := 0; i < ptr.NumField(); i++ { field := ptr.Type().Field(i) if m.name == fieldName(field) { log.Fatal("module %s: parent has configuration data with conflicting field", m.name) } } } // getFields does a deep discovery of all fields of a struct, including embedded (struct composition) fields. func getFields(typ reflect.Type) map[string]struct{} { fields := make(map[string]struct{}) var get func(t reflect.Type) get = func(t reflect.Type) { for i := 0; i < t.NumField(); i++ { f := t.Field(i) if f.Type.Kind() == reflect.Struct && f.Anonymous { get(f.Type) } else { fields[fieldName(f)] = struct{}{} } } } get(typ) return fields } // validate checks that each field of data refers to either module data or a submodule.
func (m *Module) validate(data Data) error { log.Debug("validating data for module %s...", m.path) modcfg, subcfg := data.split(m.hasChild) fields := map[string]struct{}{} if m.isImplicit() { if len(modcfg) > 0 { names := []string{} for name := range modcfg { names = append(names, name) } if !m.noValidate { return configError("implicit module %s: given configuration data %s", m.path, strings.Join(names, ",")) } log.Error("implicit module %s: given configuration data %s", m.path, strings.Join(names, ",")) } } else { fields = getFields(reflect.TypeOf(m.ptr).Elem()) } for field := range modcfg { if _, ok := fields[field]; !ok { if !m.noValidate { return configError("module %s: given unknown configuration data %s", m.path, field) } log.Error("module %s: given unknown configuration data %s", m.path, field) } } subcfg = subcfg.copy() for name, child := range m.children { childcfg, err := subcfg.pick(name, true) if err != nil { return configError("module %s: failed to pick configuration for child %s: %v", m.path, child.path, err) } err = child.validate(childcfg) if err != nil { return err } } if len(subcfg) > 0 { unconsumed := []string{} for name := range subcfg { unconsumed = append(unconsumed, name) } return configError("module %s: no child corresponding to data %s", m.path, strings.Join(unconsumed, ",")) } return nil } // fieldName returns the name used to refer to the struct field in JSON/YAML encoding. func fieldName(f reflect.StructField) string { val, ok := f.Tag.Lookup("json") if !ok { return f.Name } tags := strings.Split(val, ",") if len(tags) < 1 { return f.Name } name := tags[0] if name == "" { return f.Name } return name } // lookup finds/creates a module corresponding to the given dotted module path. func lookup(path string) *Module { names := strings.Split(path, ".") path = "" module := main for _, name := range names { if path != "" { path += "." + name } else { path = name } m, ok := module.children[name] if !ok { m = &Module{ path: path, parent: module, name: name, children: make(map[string]*Module), } module.children[name] = m } module = m } return module } // Print prints the current configuration, using the given function or fmt.Printf. func Print(printfn func(string, ...interface{})) { data, err := GetConfig() if err != nil { log.Error("error: failed to get configuration: %v", err) return } data.Print(printfn) }
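To make the registration flow concrete, here is a hedged sketch of how a module typically hooks into this package. The module name and config struct are made up for illustration; Register, GetConfigFn and NotifyFn are used exactly as declared above:

package mymodule // hypothetical module using pkg/config

import (
	"github.com/intel/cri-resource-manager/pkg/config"
)

// options is this module's configuration data; json tags pick the YAML keys.
type options struct {
	Enabled bool            `json:"Enabled"`
	Timeout config.Duration `json:"Timeout"`
}

var opt = defaultOptions().(*options)

// defaultOptions returns a fresh copy of the defaults (the GetConfigFn).
func defaultOptions() interface{} {
	return &options{Enabled: true}
}

func init() {
	// Register under a dotted path; configuration for "mymodule" in the
	// cri-resmgr config now lands in opt, with validation and rollback.
	m := config.Register("mymodule", "example module configuration", opt, defaultOptions)
	_ = m.AddNotify(func(event config.Event, source config.Source) error {
		// React to configuration changes here; returning an error rejects
		// the new configuration and triggers a revert.
		return nil
	})
}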
================================================ FILE: pkg/config/data.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "fmt" "os" "sigs.k8s.io/yaml" "strings" ) // Data is our internal representation of configuration data. type Data map[string]interface{} // DataFromObject remarshals the given object into configuration data. func DataFromObject(obj interface{}) (Data, error) { raw, err := yaml.Marshal(obj) if err != nil { return nil, configError("failed to marshal object %T to data: %v", obj, err) } data := make(Data) if err = yaml.Unmarshal(raw, &data); err != nil { return nil, configError("failed to unmarshal object %T to data: %v", obj, err) } return data, nil } // DataFromStringMap remarshals the given map into configuration data. func DataFromStringMap(smap map[string]string) (Data, error) { data := make(Data) for key, val := range smap { var obj interface{} if err := yaml.Unmarshal([]byte(val), &obj); err != nil { return nil, configError("failed to unmarshal data from map: %v", err) } data[key] = obj } return data, nil } // DataFromFile unmarshals the content of the given file into configuration data. func DataFromFile(path string) (Data, error) { raw, err := os.ReadFile(path) if err != nil { return nil, configError("failed to read file %q: %v", path, err) } data := make(Data) if err := yaml.Unmarshal(raw, &data); err != nil { return nil, configError("failed to load configuration from file %q: %v", path, err) } return data, nil } // copy does a shallow copy of the given data. func (d Data) copy() Data { data := make(Data) for key, value := range d { data[key] = value } return data } // split splits up the given data to module- and child-specific parts. func (d Data) split(hasChild func(string) bool) (Data, Data) { mod, sub := make(Data), make(Data) for key, val := range d { if hasChild(key) || strings.IndexByte(key, '.') != -1 { sub[key] = val } else { mod[key] = val } } return mod, sub } // pick picks data for the given key. func (d Data) pick(key string, removePicked bool) (Data, error) { var data Data var err error if obj, ok := d[key]; ok { data, err = DataFromObject(obj) if err != nil { return nil, err } if removePicked { delete(d, key) } } // pick/remove data for all dotted keys matching the key being picked for k, v := range d { split := strings.Split(k, ".") if len(split) > 1 && split[0] == key { if data == nil { data = make(Data) } subkey := strings.Join(split[1:], ".") if _, ok := data[subkey]; ok { return nil, configError("dotted key %q conflicts with nested key %q", k, subkey) } data[subkey] = v if removePicked { delete(d, k) } } } return data, nil } // String returns configuration data as a string. func (d Data) String() string { raw, err := yaml.Marshal(d) if err != nil { return fmt.Sprintf("<failed to marshal configuration data: %v>", err) } return string(raw) } // Print prints the configuration data using the given function or fmt.Printf. func (d Data) Print(fn func(string, ...interface{})) { if fn == nil { fn = func(format string, args ...interface{}) { fmt.Printf(format+"\n", args...) } } for _, line := range strings.Split(d.String(), "\n") { fn("%s", line) } } ================================================ FILE: pkg/config/duration.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
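One detail of the Data helpers worth showing is the dotted-key handling in pick: a flat external key such as "logger.Debug" is routed to the nested module during validation and configure. A small sketch using only the exported entry points; the keys and values are hypothetical:

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/config"
)

func main() {
	// External sources (e.g. the node agent) hand configuration over as a
	// flat string map; each value is parsed as YAML.
	data, err := config.DataFromStringMap(map[string]string{
		"policy":       "Active: topology-aware",
		"logger.Debug": "cri-resmgr", // dotted key, routed to a nested module
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(data.String())
}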
package config import ( "fmt" "time" ) // Duration is a time.Duration which implements JSON marshalling/unmarshalling. type Duration time.Duration // MarshalJSON is the JSON marshaller for (time.)Duration. func (d Duration) MarshalJSON() ([]byte, error) { return []byte("\"" + time.Duration(d).String() + "\""), nil } // UnmarshalJSON is the JSON unmarshaller for (time.)Duration. func (d *Duration) UnmarshalJSON(data []byte) error { if len(data) < 2 { return fmt.Errorf("invalid Duration data") } parsed, err := time.ParseDuration(string(data[1 : len(data)-1])) if err != nil { return err } *d = Duration(parsed) return nil } // String returns the value of Duration as a string. func (d *Duration) String() string { return time.Duration(*d).String() } ================================================ FILE: pkg/config/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "fmt" ) // configError creates a formatted configuration-specific error. func configError(format string, args ...interface{}) error { return fmt.Errorf("config error: "+format, args...) } ================================================ FILE: pkg/config/help.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "fmt" "reflect" "sort" "strings" ) // Describe provides help about configuration of the given modules. func Describe(names ...string) { modules := findModules(names, nil) if len(modules) == 0 { fmt.Printf("No matching modules found.\n") return } for _, m := range modules { m.showHelp() fmt.Printf("\n\n") } } func (m *Module) setDescription(description string) { description = strings.Trim(description, "\n") if description == "" { m.description = "Module " + m.path + " has no description." 
return } if strings.IndexByte(description, '\n') == -1 { m.description = description } else { lines := strings.Split(description, "\n") m.description = lines[0] m.help = strings.Trim(strings.Join(lines[1:], "\n"), "\n") } } func (m *Module) showHelp() { kind := "module" if m.isImplicit() { kind = "implicit module" } fmt.Printf("- %s %s: %s\n", kind, m.name, m.description) fmt.Printf(" full path: %s\n", m.path) if len(m.children) > 0 { submodules, sep := "", "" for _, child := range m.children { submodules += sep + child.path sep = ", " } fmt.Printf(" sub-modules: %s\n", submodules) } fmt.Printf(" description:\n") if m.help != "" { fmt.Printf("\n") for _, line := range strings.Split(m.help, "\n") { fmt.Printf(" %s\n", line) } } else { m.describeData() } } func (m *Module) describeData() { if m.isImplicit() { return } cfg := reflect.ValueOf(m.ptr).Elem() fmt.Printf(" No runtime configuration documentation for this package...\n") fmt.Printf(" Package runtime configuration data type: %s %s.\n", cfg.Type().Kind().String(), cfg.Type().String()) } func findModules(names []string, m *Module) []*Module { if m == nil { m = main } matches := []*Module{} if len(names) == 0 { matches = append(matches, m) } else { for _, name := range names { switch { case name == m.name || name == m.path: matches = append(matches, m) case name[0] == '.' && name[len(name)-1] == '.' && strings.Index(m.path, name) > 0: matches = append(matches, m) case name[0] == '.' && strings.HasSuffix(m.path, name): matches = append(matches, m) case name[len(name)-1] == '.' && strings.HasPrefix(m.path, name): matches = append(matches, m) } } } children := []*Module{} for _, child := range m.children { children = append(children, child) } sort.Slice(children, func(i, j int) bool { return strings.Compare(children[i].path, children[j].path) < 0 }, ) for _, child := range children { matches = append(matches, findModules(names, child)...) } return matches } ================================================ FILE: pkg/config/log.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "fmt" "os" ) // // Notes: // Unless we split the Logger interface (pkg/log.Logger) from its actual implementation // we cannot import it here. pkg/log itself implements its runtime configurability // using this module so we would end up with an import cycle. As a workaround for now we // let our logger be set externally and we set it from pkg/log. // // Logger is our set of logging functions. type Logger struct { DebugEnabled func() bool Debug func(string, ...interface{}) Info func(string, ...interface{}) Warning func(string, ...interface{}) Error func(string, ...interface{}) Fatal func(string, ...interface{}) Panic func(string, ...interface{}) } // log is our Logger. var log = defaultLogger()
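// NOTE: editor-added usage sketch, not part of the original source. Per the
// notes above, pkg/log injects its real implementation through SetLogger.
// Only non-nil fields override the defaults, so a partial Logger is enough
// to redirect, say, debug messages:
func exampleSetLogger() {
	SetLogger(Logger{
		Debug: func(format string, args ...interface{}) {
			fmt.Printf("D: [custom] "+format+"\n", args...)
		},
	})
}

// SetLogger sets our logger.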
func SetLogger(logger Logger) { if logger.DebugEnabled != nil { log.DebugEnabled = logger.DebugEnabled } if logger.Debug != nil { log.Debug = logger.Debug } if logger.Info != nil { log.Info = logger.Info } if logger.Warning != nil { log.Warning = logger.Warning } if logger.Error != nil { log.Error = logger.Error } if logger.Panic != nil { log.Panic = logger.Panic } if logger.Fatal != nil { log.Fatal = logger.Fatal } } func defaultLogger() Logger { return Logger{ DebugEnabled: debugEnabled, Debug: debugmsg, Info: infomsg, Warning: warningmsg, Error: errormsg, Fatal: fatalmsg, Panic: panicmsg, } } func debugEnabled() bool { return true } func debugmsg(format string, args ...interface{}) { fmt.Printf("D: [config] "+format+"\n", args...) } func infomsg(format string, args ...interface{}) { fmt.Printf("I: [config] "+format+"\n", args...) } func warningmsg(format string, args ...interface{}) { fmt.Printf("W: [config] "+format+"\n", args...) } func errormsg(format string, args ...interface{}) { fmt.Printf("E: [config] "+format+"\n", args...) } func fatalmsg(format string, args ...interface{}) { fmt.Printf("E: [config] fatal error: "+format+"\n", args...) os.Exit(1) } func panicmsg(format string, args ...interface{}) { errormsg(format, args...) panic(fmt.Sprintf("fatal error: "+format+"\n", args...)) } ================================================ FILE: pkg/config/options.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config // WithNotify specifies a notification function to be called after configuration updates. func WithNotify(fn NotifyFn) Option { return newFuncOption(func(o interface{}) error { switch o.(type) { case *Module: m := o.(*Module) m.notifiers = append(m.notifiers, fn) default: return configError("WithNotify is not valid for object of type %T", o) } return nil }) } // WithoutDataValidation specifies that data passed to this module should not be validated. func WithoutDataValidation() Option { return newFuncOption(func(o interface{}) error { switch o.(type) { case *Module: m := o.(*Module) m.noValidate = true default: return configError("WithoutDataValidation is not valid for object of type %T", o) } return nil }) } // Option is the generic interface for any option applicable to a Module or Config. type Option interface { apply(interface{}) error } // funcOption is a generic functional option. type funcOption struct { f func(interface{}) error } // apply applies a functional option to an object. func (fo *funcOption) apply(o interface{}) error { return fo.f(o) } // newFuncOption creates a new option instance. func newFuncOption(f func(interface{}) error) *funcOption { return &funcOption{ f: f, } } ================================================ FILE: pkg/cpuallocator/allocator.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cpuallocator import ( "fmt" "sort" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" "github.com/intel/goresctrl/pkg/sst" idset "github.com/intel/goresctrl/pkg/utils" ) // AllocFlag represents CPU allocation preferences. type AllocFlag uint const ( // AllocIdlePackages requests allocation of full idle packages. AllocIdlePackages AllocFlag = 1 << iota // AllocIdleNodes requests allocation of full idle NUMA nodes. AllocIdleNodes // AllocIdleCores requests allocation of full idle cores (all threads in core). AllocIdleCores // AllocDefault is the default set of allocation preferences. AllocDefault = AllocIdlePackages | AllocIdleCores logSource = "cpuallocator" ) // allocatorHelper encapsulates state for allocating CPUs. type allocatorHelper struct { logger.Logger // allocatorHelper logger instance sys sysfs.System // sysfs CPU and topology information topology topologyCache // cached topology information flags AllocFlag // allocation preferences from cpuset.CPUSet // set of CPUs to allocate from prefer CPUPriority // CPU priority to prefer cnt int // number of CPUs to allocate result cpuset.CPUSet // set of CPUs allocated pkgs []sysfs.CPUPackage // physical CPU packages, sorted by preference cpus []sysfs.CPU // CPU cores, sorted by preference } // CPUAllocator is an interface for a generic CPU allocator type CPUAllocator interface { AllocateCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) ReleaseCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) } type CPUPriority int const ( PriorityHigh CPUPriority = iota PriorityNormal PriorityLow NumCPUPriorities PriorityNone = NumCPUPriorities ) type cpuAllocator struct { logger.Logger sys sysfs.System // wrapped sysfs.System instance topologyCache topologyCache // topology lookups } // topologyCache caches topology lookups type topologyCache struct { pkg map[idset.ID]cpuset.CPUSet node map[idset.ID]cpuset.CPUSet core map[idset.ID]cpuset.CPUSet cpuPriorities cpuPriorities // CPU priority mapping } type cpuPriorities [NumCPUPriorities]cpuset.CPUSet // IDFilter helps filter IDs. type IDFilter func(idset.ID) bool // IDSorter helps sort IDs. type IDSorter func(int, int) bool // our logger instance var log = logger.NewLogger(logSource) // NewCPUAllocator returns a new cpuAllocator instance func NewCPUAllocator(sys sysfs.System) CPUAllocator { ca := cpuAllocator{ Logger: log, sys: sys, topologyCache: newTopologyCache(sys), } return &ca } // Pick packages, nodes or CPUs by filtering according to a function. func pickIds(idSlice []idset.ID, f IDFilter) []idset.ID { ids := make([]idset.ID, len(idSlice)) idx := 0 for _, id := range idSlice { if f == nil || f(id) { ids[idx] = id idx++ } } return ids[0:idx] }
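// NOTE: editor-added usage sketch, not part of the original source. A typical
// caller carves CPUs out of a free pool; the pool is shrunk in place and the
// allocated set is returned. The sysfs.System instance is assumed to come
// from sysfs discovery (the package tests use sysfs.DiscoverSystemAt).
func exampleAllocate(sys sysfs.System) (cpuset.CPUSet, error) {
	pool := cpuset.New(0, 1, 2, 3, 4, 5, 6, 7) // free CPUs to allocate from
	ca := NewCPUAllocator(sys)
	// Take two CPUs, preferring high-priority ones; pool loses the taken CPUs.
	return ca.AllocateCpus(&pool, 2, PriorityHigh)
}

// newAllocatorHelper creates a new CPU allocatorHelper.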
func newAllocatorHelper(sys sysfs.System, topo topologyCache) *allocatorHelper { a := &allocatorHelper{ Logger: log, sys: sys, topology: topo, flags: AllocDefault, } return a } // Allocate full idle CPU packages. func (a *allocatorHelper) takeIdlePackages() { a.Debug("* takeIdlePackages()...") offline := a.sys.Offlined() // pick idle packages pkgs := pickIds(a.sys.PackageIDs(), func(id idset.ID) bool { cset := a.topology.pkg[id].Difference(offline) return cset.Intersection(a.from).Equals(cset) }) // sorted by number of preferred cpus and then by cpu id sort.Slice(pkgs, func(i, j int) bool { if res := a.topology.cpuPriorities.cmpCPUSet(a.topology.pkg[pkgs[i]], a.topology.pkg[pkgs[j]], a.prefer, -1); res != 0 { return res > 0 } return pkgs[i] < pkgs[j] }) a.Debug(" => idle packages sorted by preference: %v", pkgs) // take as many idle packages as we need/can for _, id := range pkgs { cset := a.topology.pkg[id].Difference(offline) a.Debug(" => considering package %v (#%s)...", id, cset) if a.cnt >= cset.Size() { a.Debug(" => taking package %v...", id) a.result = a.result.Union(cset) a.from = a.from.Difference(cset) a.cnt -= cset.Size() if a.cnt == 0 { break } } } } // Allocate full idle CPU cores. func (a *allocatorHelper) takeIdleCores() { a.Debug("* takeIdleCores()...") offline := a.sys.Offlined() // pick (first id for all) idle cores cores := pickIds(a.sys.CPUIDs(), func(id idset.ID) bool { cset := a.topology.core[id].Difference(offline) if cset.IsEmpty() { return false } return cset.Intersection(a.from).Equals(cset) && cset.List()[0] == int(id) }) // sorted by number of preferred cpus and then by core id sort.Slice(cores, func(i, j int) bool { if res := a.topology.cpuPriorities.cmpCPUSet(a.topology.core[cores[i]], a.topology.core[cores[j]], a.prefer, -1); res != 0 { return res > 0 } return cores[i] < cores[j] }) a.Debug(" => idle cores sorted by preference: %v", cores) // take as many idle cores as we can for _, id := range cores { cset := a.topology.core[id].Difference(offline) a.Debug(" => considering core %v (#%s)...", id, cset) if a.cnt >= cset.Size() { a.Debug(" => taking core %v...", id) a.result = a.result.Union(cset) a.from = a.from.Difference(cset) a.cnt -= cset.Size() if a.cnt == 0 { break } } } } // Allocate idle CPU hyperthreads.
func (a *allocatorHelper) takeIdleThreads() { offline := a.sys.Offlined() // pick all threads with free capacity cores := pickIds(a.sys.CPUIDs(), func(id idset.ID) bool { return a.from.Difference(offline).Contains(int(id)) }) a.Debug(" => idle threads unsorted: %v", cores) // sorted for preference by id, mimicking cpus_assignment.go for now: // IOW, prefer CPUs // - from packages with higher number of CPUs/cores already in a.result // - from packages having larger number of available cpus with preferred priority // - from a single package // - from the list of cpus with preferred priority // - from packages with fewer remaining free CPUs/cores in a.from // - from cores with fewer remaining free CPUs/cores in a.from // - from packages with lower id // - with lower id sort.Slice(cores, func(i, j int) bool { iCore := cores[i] jCore := cores[j] iPkg := a.sys.CPU(iCore).PackageID() jPkg := a.sys.CPU(jCore).PackageID() iCoreSet := a.topology.core[iCore] jCoreSet := a.topology.core[jCore] iPkgSet := a.topology.pkg[iPkg] jPkgSet := a.topology.pkg[jPkg] iPkgColo := iPkgSet.Intersection(a.result).Size() jPkgColo := jPkgSet.Intersection(a.result).Size() if iPkgColo != jPkgColo { return iPkgColo > jPkgColo } // Always sort cores in package order if res := a.topology.cpuPriorities.cmpCPUSet(iPkgSet.Intersection(a.from), jPkgSet.Intersection(a.from), a.prefer, a.cnt); res != 0 { return res > 0 } if iPkg != jPkg { return iPkg < jPkg } iCset := cpuset.New(int(cores[i])) jCset := cpuset.New(int(cores[j])) if res := a.topology.cpuPriorities.cmpCPUSet(iCset, jCset, a.prefer, 0); res != 0 { return res > 0 } iPkgFree := iPkgSet.Intersection(a.from).Size() jPkgFree := jPkgSet.Intersection(a.from).Size() if iPkgFree != jPkgFree { return iPkgFree < jPkgFree } iCoreFree := iCoreSet.Intersection(a.from).Size() jCoreFree := jCoreSet.Intersection(a.from).Size() if iCoreFree != jCoreFree { return iCoreFree < jCoreFree } return iCore < jCore }) a.Debug(" => idle threads sorted: %v", cores) // take as many idle threads as we need for _, id := range cores { cset := a.topology.core[id].Difference(offline) a.Debug(" => considering thread %v (#%s)...", id, cset) cset = cpuset.New(int(id)) a.result = a.result.Union(cset) a.from = a.from.Difference(cset) a.cnt -= cset.Size() if a.cnt == 0 { break } } } // takeAny is a dummy allocator not dependent on sysfs topology information func (a *allocatorHelper) takeAny() { a.Debug("* takeAny()...") cpus := a.from.List() if len(cpus) >= a.cnt { cset := cpuset.New(cpus[0:a.cnt]...) a.result = a.result.Union(cset) a.from = a.from.Difference(cset) a.cnt = 0 } }
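// NOTE: editor-added sketch, not part of the original source. allocate()
// below walks the stages gated by the helper's flags: whole idle packages
// first (AllocIdlePackages), then whole idle cores (AllocIdleCores), then
// individual idle threads as the final fallback. Clearing a flag skips that
// stage; e.g. a helper that never grabs whole packages:
func exampleCoreGranularityHelper(sys sysfs.System, topo topologyCache) *allocatorHelper {
	a := newAllocatorHelper(sys, topo)
	a.flags = AllocIdleCores // no AllocIdlePackages: skip the package stage
	return a
}

// Perform CPU allocation.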
func (a *allocatorHelper) allocate() cpuset.CPUSet { if a.sys != nil { if (a.flags & AllocIdlePackages) != 0 { a.takeIdlePackages() } if a.cnt > 0 && (a.flags&AllocIdleCores) != 0 { a.takeIdleCores() } if a.cnt > 0 { a.takeIdleThreads() } } else { a.takeAny() } if a.cnt == 0 { return a.result } return cpuset.New() } func (ca *cpuAllocator) allocateCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) { var result cpuset.CPUSet var err error switch { case from.Size() < cnt: result, err = cpuset.New(), fmt.Errorf("cpuset %s does not have %d CPUs", from, cnt) case from.Size() == cnt: result, err, *from = from.Clone(), nil, cpuset.New() default: a := newAllocatorHelper(ca.sys, ca.topologyCache) a.from = from.Clone() a.cnt = cnt a.prefer = prefer result, err, *from = a.allocate(), nil, a.from.Clone() a.Debug("%d cpus from #%v (preferring #%v) => #%v", cnt, from.Union(result), a.prefer, result) } return result, err } // AllocateCpus allocates a number of CPUs from the given set. func (ca *cpuAllocator) AllocateCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) { result, err := ca.allocateCpus(from, cnt, prefer) return result, err } // ReleaseCpus releases a number of CPUs from the given set. func (ca *cpuAllocator) ReleaseCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) { oset := from.Clone() result, err := ca.allocateCpus(from, from.Size()-cnt, prefer) ca.Debug("ReleaseCpus(#%s, %d) => kept: #%s, released: #%s", oset, cnt, from, result) return result, err } func newTopologyCache(sys sysfs.System) topologyCache { c := topologyCache{ pkg: make(map[idset.ID]cpuset.CPUSet), node: make(map[idset.ID]cpuset.CPUSet), core: make(map[idset.ID]cpuset.CPUSet)} if sys != nil { for _, id := range sys.PackageIDs() { c.pkg[id] = sys.Package(id).CPUSet() } for _, id := range sys.NodeIDs() { c.node[id] = sys.Node(id).CPUSet() } for _, id := range sys.CPUIDs() { c.core[id] = sys.CPU(id).ThreadCPUSet() } } c.discoverCPUPriorities(sys) return c } func (c *topologyCache) discoverCPUPriorities(sys sysfs.System) { if sys == nil { return } var prio cpuPriorities // Discover on per-package basis for id := range c.pkg { cpuPriorities, sstActive := c.discoverSstCPUPriority(sys, id) if !sstActive { cpuPriorities = c.discoverCpufreqPriority(sys, id) } for p, cpus := range cpuPriorities { source := map[bool]string{true: "sst", false: "cpufreq"}[sstActive] cset := sysfs.CPUSetFromIDSet(idset.NewIDSet(cpus...)) log.Debug("package #%d (%s): %d %s priority cpus (%v)", id, source, len(cpus), CPUPriority(p), cset) prio[p] = prio[p].Union(cset) } } c.cpuPriorities = prio } func (c *topologyCache) discoverSstCPUPriority(sys sysfs.System, pkgID idset.ID) ([NumCPUPriorities][]idset.ID, bool) { active := false pkg := sys.Package(pkgID) sst := pkg.SstInfo() cpuIDs := c.pkg[pkgID].List() prios := make(map[idset.ID]CPUPriority, len(cpuIDs)) // Determine SST-based priority. Based on experimentation there is some // hierarchy between the SST features. Without trying to be too smart // we follow the principles below: // 1. SST-TF has highest preference, mastering over SST-BF and making most // of SST-CP settings ineffective // 2. SST-CP dictates over SST-BF // 3. 
SST-BF is meaningful if neither SST-TF nor SST-CP is enabled switch { case sst == nil: case sst.TFEnabled: log.Debug("package #%d: using SST-TF based CPU prioritization", pkgID) // We only look at the CLOS id as SST-TF (seems to) follow ordered CLOS priority for _, i := range cpuIDs { id := idset.ID(i) p := PriorityLow // First two CLOSes are prioritized by SST if sys.CPU(id).SstClos() < 2 { p = PriorityHigh } prios[id] = p } active = true case sst.CPEnabled: closPrio := c.sstClosPriority(sys, pkgID) log.Debug("package #%d: using SST-CP based CPU prioritization with CLOS mapping %v", pkgID, closPrio) active = false for _, i := range cpuIDs { id := idset.ID(i) clos := sys.CPU(id).SstClos() p := closPrio[clos] if p != PriorityNormal { active = true } prios[id] = p } } if !active && sst != nil && sst.BFEnabled { log.Debug("package #%d: using SST-BF based CPU prioritization", pkgID) for _, i := range cpuIDs { id := idset.ID(i) p := PriorityLow if sst.BFCores.Has(id) { p = PriorityHigh } prios[id] = p } active = true } var ret [NumCPUPriorities][]idset.ID for cpu, prio := range prios { ret[prio] = append(ret[prio], cpu) } return ret, active } func (c *topologyCache) sstClosPriority(sys sysfs.System, pkgID idset.ID) map[int]CPUPriority { sortedKeys := func(m map[int]int) []int { keys := make([]int, 0, len(m)) for k := range m { keys = append(keys, k) } sort.Ints(keys) return keys } pkg := sys.Package(pkgID) sstinfo := pkg.SstInfo() // Get a list of unique CLOS proportional priority values closPps := make(map[int]int) closIds := make(map[int]int) for _, cpuID := range c.pkg[pkgID].List() { clos := sys.CPU(idset.ID(cpuID)).SstClos() pp := sstinfo.ClosInfo[clos].ProportionalPriority closPps[pp] = clos closIds[clos] = 0 // 0 is a dummy value here } // Form a list of (active) CLOS ids in sorted order var closSorted []int if sstinfo.CPPriority == sst.Ordered { // In ordered mode the priority is simply the CLOS id closSorted = sortedKeys(closIds) log.Debug("package #%d, ordered SST-CP priority with CLOS ids %v", pkgID, closSorted) } else { // In proportional mode we sort by the proportional priority parameter closPpSorted := sortedKeys(closPps) for _, pp := range closPpSorted { closSorted = append(closSorted, closPps[pp]) } log.Debug("package #%d, proportional SST-CP priority with PP-to-CLOS mapping %v", pkgID, closPps) } // Map from CLOS id to cpuallocator CPU priority closPriority := make(map[int]CPUPriority, len(closSorted)) for _, id := range closSorted { // Default to normal priority closPriority[id] = PriorityNormal } if len(closSorted) > 1 { // First CLOS in the sorted order maps to high CPU priority, the last one to low closPriority[closSorted[0]] = PriorityHigh closPriority[closSorted[len(closSorted)-1]] = PriorityLow } return closPriority } func (c *topologyCache) discoverCpufreqPriority(sys sysfs.System, pkgID idset.ID) [NumCPUPriorities][]idset.ID { var prios [NumCPUPriorities][]idset.ID // Group cpus by base frequency and energy performance profile freqs := map[uint64][]idset.ID{} epps := map[sysfs.EPP][]idset.ID{} cpuIDs := c.pkg[pkgID].List() for _, num := range cpuIDs { id := idset.ID(num) cpu := sys.CPU(id) bf := cpu.BaseFrequency() freqs[bf] = append(freqs[bf], id) epp := cpu.EPP() epps[epp] = append(epps[epp], id) } // Construct sorted lists of detected frequencies and epp values freqList := []uint64{} for freq := range freqs { if freq > 0 { freqList = append(freqList, freq) } } utils.SortUint64s(freqList) eppList := []int{} for e := range epps { if e != sysfs.EPPUnknown { eppList = append(eppList, int(e)) } }
sort.Ints(eppList) // Finally, determine priority of each CPU for _, num := range cpuIDs { id := idset.ID(num) cpu := sys.CPU(id) p := PriorityNormal if len(freqList) > 1 { bf := cpu.BaseFrequency() // All cpus NOT in the lowest base frequency bin are considered high prio if bf > freqList[0] { p = PriorityHigh } else { p = PriorityLow } } // All cpus NOT in the lowest performance epp are considered high prio // NOTE: higher EPP value denotes lower performance preference if len(eppList) > 1 { epp := cpu.EPP() if int(epp) < eppList[len(eppList)-1] { p = PriorityHigh } else { p = PriorityLow } } prios[p] = append(prios[p], id) } return prios } func (p CPUPriority) String() string { switch p { case PriorityHigh: return "high" case PriorityNormal: return "normal" case PriorityLow: return "low" } return "none" } // cmpCPUSet compares two cpusets in terms of preferred cpu priority. Returns: // // > 0 if cpuset A is preferred // < 0 if cpuset B is preferred // 0 if cpusets A and B are equal in terms of cpu priority func (c *cpuPriorities) cmpCPUSet(csetA, csetB cpuset.CPUSet, prefer CPUPriority, cpuCnt int) int { if prefer == PriorityNone { return 0 } // Favor cpuset having CPUs with priorities equal to or lower than what was requested for prio := prefer; prio < NumCPUPriorities; prio++ { prefA := csetA.Intersection(c[prio]).Size() prefB := csetB.Intersection(c[prio]).Size() if cpuCnt > 0 && prio == prefer && prefA >= cpuCnt && prefB >= cpuCnt { // Prefer the tightest fitting if both cpusets satisfy the // requested amount of CPUs with the preferred priority return prefB - prefA } if prefA != prefB { return prefA - prefB } } // Repel cpuset having CPUs with higher priority than what was requested for prio := PriorityHigh; prio < prefer; prio++ { nonprefA := csetA.Intersection(c[prio]).Size() nonprefB := csetB.Intersection(c[prio]).Size() if nonprefA != nonprefB { return nonprefB - nonprefA } } return 0 } ================================================ FILE: pkg/cpuallocator/cpuallocator_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cpuallocator import ( "os" "path" "testing" "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) func TestAllocatorHelper(t *testing.T) { // Create tmpdir and decompress testdata there tmpdir, err := os.MkdirTemp("", "cri-resource-manager-test-") if err != nil { t.Fatalf("failed to create tmpdir: %v", err) } defer os.RemoveAll(tmpdir) if err := utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), tmpdir); err != nil { t.Fatalf("failed to decompress testdata: %v", err) } // Discover mock system from the testdata sys, err := sysfs.DiscoverSystemAt(path.Join(tmpdir, "sysfs", "2-socket-4-node-40-core", "sys")) if err != nil { t.Fatalf("failed to discover mock system: %v", err) } topoCache := newTopologyCache(sys) // Fake cpu priorities: 5 cores from pkg #0 as high prio // Package CPUs: #0: [0-19,40-59], #1: [20-39,60-79] topoCache.cpuPriorities = [NumCPUPriorities]cpuset.CPUSet{ cpuset.MustParse("2,5,8,15,17,42,45,48,55,57"), cpuset.MustParse("20-39,60-79"), cpuset.MustParse("0,1,3,4,6,7,9-14,16,18,19,40,41,43,44,46,47,49-54,56,58,59"), } tcs := []struct { description string from cpuset.CPUSet prefer CPUPriority cnt int expected cpuset.CPUSet }{ { description: "too few available CPUs", from: cpuset.MustParse("2,3,10-14,20"), prefer: PriorityNormal, cnt: 9, expected: cpuset.New(), }, { description: "request all available CPUs", from: cpuset.MustParse("2,3,10-14,20"), prefer: PriorityNormal, cnt: 8, expected: cpuset.MustParse("2,3,10-14,20"), }, { description: "prefer high priority cpus", from: cpuset.MustParse("2,3,10-25"), prefer: PriorityHigh, cnt: 4, expected: cpuset.New(2, 3, 15, 17), }, } // Run tests for _, tc := range tcs { t.Run(tc.description, func(t *testing.T) { a := newAllocatorHelper(sys, topoCache) a.from = tc.from a.prefer = tc.prefer a.cnt = tc.cnt result := a.allocate() if !result.Equals(tc.expected) { t.Errorf("expected %q, result was %q", tc.expected, result) } }) } } ================================================ FILE: pkg/cri/client/client.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package client import ( "context" "fmt" "net" "os" "syscall" "time" "google.golang.org/grpc" "google.golang.org/grpc/connectivity" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/instrumentation" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/utils" v1 "github.com/intel/cri-resource-manager/pkg/cri/client/v1" ) // DialNotifyFn is a function to call after a successful net.Dial[Timeout](). type DialNotifyFn func(string, int, int, os.FileMode, error) // Options contains the configurable options of our CRI client. type Options struct { // ImageSocket is the socket path for the CRI image service. ImageSocket string // RuntimeSocket is the socket path for the CRI runtime service. 
RuntimeSocket string // DialNotify is an optional function to notify after net.Dial returns for a socket. DialNotify DialNotifyFn } // ConnectOptions contains options for connecting to the server. type ConnectOptions struct { // Wait indicates whether Connect() should wait (indefinitely) for the server. Wait bool // Reconnect indicates whether CheckConnection() should attempt to Connect(). Reconnect bool } // Client is the interface we expose to our CRI client. type Client interface { // Connect tries to connect the client to the specified image and runtime services. Connect(ConnectOptions) error // Close closes any existing client connections. Close() // CheckConnection checks if we have (un-Close()'d as opposed to working) connections. CheckConnection(ConnectOptions) error // HasRuntimeService checks if the client is configured with runtime services. HasRuntimeService() bool // We expose full image and runtime client services. criv1.ImageServiceClient criv1.RuntimeServiceClient } type criClient interface { criv1.ImageServiceClient criv1.RuntimeServiceClient } // client is the implementation of Client. type client struct { logger.Logger criv1.ImageServiceClient criv1.RuntimeServiceClient options Options // client options icc *grpc.ClientConn // our gRPC connection to the image service rcc *grpc.ClientConn // our gRPC connection to the runtime service client criClient } const ( // DontConnect is used to mark a socket to not be connected. DontConnect = "-" ) // NewClient creates a new client instance. func NewClient(options Options) (Client, error) { if options.ImageSocket == DontConnect && options.RuntimeSocket == DontConnect { return nil, clientError("neither image nor runtime socket specified") } c := &client{ Logger: logger.NewLogger("cri/client"), options: options, } return c, nil } // Connect attempts to establish gRPC client connections to the configured services. func (c *client) Connect(options ConnectOptions) error { var err error kind, socket := "image services", c.options.ImageSocket if c.icc, err = c.connect(kind, socket, options); err != nil { return err } kind, socket = "runtime services", c.options.RuntimeSocket if socket == c.options.ImageSocket { c.rcc = c.icc } else { if c.rcc, err = c.connect(kind, socket, options); err != nil { c.icc = nil return err } } client, err := v1.Connect(c.rcc, c.icc) if err != nil { return err } c.client = client return nil } // Close any open service connection. func (c *client) Close() { if c.icc != nil { c.Debug("closing image service connection...") c.icc.Close() } if c.rcc != nil { c.Debug("closing runtime service connection...") if c.rcc != c.icc { c.rcc.Close() } } c.icc = nil c.rcc = nil } // Check if the connection to CRI services is up, try to reconnect if requested. func (c *client) CheckConnection(options ConnectOptions) error { if (c.icc == nil || c.icc.GetState() == connectivity.Ready) && (c.rcc == nil || c.rcc.GetState() == connectivity.Ready) { return nil } c.Close() if options.Reconnect { c.Warn("client connections are down") if err := c.Connect(ConnectOptions{Wait: false}); err == nil { return nil } } return clientError("client connections are down") }
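// NOTE: editor-added usage sketch, not part of the original source. A
// relay-style caller typically connects both CRI services over a single
// runtime socket and waits for the server to come up. The socket path here
// is illustrative only.
func exampleClient() (Client, error) {
	c, err := NewClient(Options{
		ImageSocket:   "/var/run/runtime.sock",
		RuntimeSocket: "/var/run/runtime.sock",
	})
	if err != nil {
		return nil, err
	}
	if err := c.Connect(ConnectOptions{Wait: true}); err != nil {
		return nil, err
	}
	return c, nil
}

// HasRuntimeService checks if the client is configured with runtime services.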
func (c *client) HasRuntimeService() bool { return c.options.RuntimeSocket != "" && c.options.RuntimeSocket != DontConnect } func (c *client) checkRuntimeService() error { if c.client == nil || c.rcc == nil { return clientError("no CRI RuntimeService client") } return nil } func (c *client) checkImageService() error { if c.client == nil || c.icc == nil { return clientError("no CRI ImageService client") } return nil } // connect attempts to create a gRPC client connection to the given socket. func (c *client) connect(kind, socket string, options ConnectOptions) (*grpc.ClientConn, error) { var cc *grpc.ClientConn var err error if socket == DontConnect { return nil, nil } dialOpts := instrumentation.InjectGrpcClientTrace( grpc.WithInsecure(), grpc.WithBlock(), grpc.FailOnNonTempDialError(true), grpc.WithDialer(func(socket string, timeout time.Duration) (net.Conn, error) { conn, err := net.DialTimeout("unix", socket, timeout) if err != nil { return conn, err } c.dialNotify(socket) return conn, err })) if options.Wait { c.Info("waiting for %s on socket %s...", kind, socket) if err = utils.WaitForServer(socket, -1, dialOpts, &cc); err != nil { return nil, clientError("failed to connect to %s: %v", kind, err) } } else { if cc, err = grpc.Dial(socket, dialOpts...); err != nil { return nil, clientError("failed to connect to %s: %v", kind, err) } } return cc, nil } func (c *client) dialNotify(socket string) { if c.options.DialNotify == nil { return } info, err := os.Stat(socket) if err != nil { c.options.DialNotify(socket, -1, -1, 0, err) return } st, ok := info.Sys().(*syscall.Stat_t) if !ok { err := clientError("no syscall stat info available for socket %q", socket) c.options.DialNotify(socket, -1, -1, 0, err) return } uid, gid := int(st.Uid), int(st.Gid) mode := info.Mode() & os.ModePerm c.options.DialNotify(socket, uid, gid, mode, nil) } func (c *client) Version(ctx context.Context, in *criv1.VersionRequest, _ ...grpc.CallOption) (*criv1.VersionResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.Version(ctx, in) } func (c *client) RunPodSandbox(ctx context.Context, in *criv1.RunPodSandboxRequest, _ ...grpc.CallOption) (*criv1.RunPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.RunPodSandbox(ctx, in) } func (c *client) StopPodSandbox(ctx context.Context, in *criv1.StopPodSandboxRequest, _ ...grpc.CallOption) (*criv1.StopPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.StopPodSandbox(ctx, in) } func (c *client) RemovePodSandbox(ctx context.Context, in *criv1.RemovePodSandboxRequest, _ ...grpc.CallOption) (*criv1.RemovePodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.RemovePodSandbox(ctx, in) } func (c *client) PodSandboxStatus(ctx context.Context, in *criv1.PodSandboxStatusRequest, _ ...grpc.CallOption) (*criv1.PodSandboxStatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.PodSandboxStatus(ctx, in) } func (c *client) ListPodSandbox(ctx context.Context, in *criv1.ListPodSandboxRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ListPodSandbox(ctx, in) } func (c *client) CreateContainer(ctx context.Context, in *criv1.CreateContainerRequest, _ ...grpc.CallOption) (*criv1.CreateContainerResponse, error) { if err :=
c.checkRuntimeService(); err != nil { return nil, err } return c.client.CreateContainer(ctx, in) } func (c *client) StartContainer(ctx context.Context, in *criv1.StartContainerRequest, _ ...grpc.CallOption) (*criv1.StartContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.StartContainer(ctx, in) } func (c *client) StopContainer(ctx context.Context, in *criv1.StopContainerRequest, _ ...grpc.CallOption) (*criv1.StopContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.StopContainer(ctx, in) } func (c *client) RemoveContainer(ctx context.Context, in *criv1.RemoveContainerRequest, _ ...grpc.CallOption) (*criv1.RemoveContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.RemoveContainer(ctx, in) } func (c *client) ListContainers(ctx context.Context, in *criv1.ListContainersRequest, _ ...grpc.CallOption) (*criv1.ListContainersResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ListContainers(ctx, in) } func (c *client) ContainerStatus(ctx context.Context, in *criv1.ContainerStatusRequest, _ ...grpc.CallOption) (*criv1.ContainerStatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ContainerStatus(ctx, in) } func (c *client) UpdateContainerResources(ctx context.Context, in *criv1.UpdateContainerResourcesRequest, _ ...grpc.CallOption) (*criv1.UpdateContainerResourcesResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.UpdateContainerResources(ctx, in) } func (c *client) ReopenContainerLog(ctx context.Context, in *criv1.ReopenContainerLogRequest, _ ...grpc.CallOption) (*criv1.ReopenContainerLogResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ReopenContainerLog(ctx, in) } func (c *client) ExecSync(ctx context.Context, in *criv1.ExecSyncRequest, _ ...grpc.CallOption) (*criv1.ExecSyncResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ExecSync(ctx, in) } func (c *client) Exec(ctx context.Context, in *criv1.ExecRequest, _ ...grpc.CallOption) (*criv1.ExecResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.Exec(ctx, in) } func (c *client) Attach(ctx context.Context, in *criv1.AttachRequest, _ ...grpc.CallOption) (*criv1.AttachResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.Attach(ctx, in) } func (c *client) PortForward(ctx context.Context, in *criv1.PortForwardRequest, _ ...grpc.CallOption) (*criv1.PortForwardResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.PortForward(ctx, in) } func (c *client) ContainerStats(ctx context.Context, in *criv1.ContainerStatsRequest, _ ...grpc.CallOption) (*criv1.ContainerStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ContainerStats(ctx, in) } func (c *client) ListContainerStats(ctx context.Context, in *criv1.ListContainerStatsRequest, _ ...grpc.CallOption) (*criv1.ListContainerStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ListContainerStats(ctx, in) } func (c *client) PodSandboxStats(ctx context.Context, in *criv1.PodSandboxStatsRequest, _ ...grpc.CallOption) (*criv1.PodSandboxStatsResponse, 
error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.PodSandboxStats(ctx, in) } func (c *client) ListPodSandboxStats(ctx context.Context, in *criv1.ListPodSandboxStatsRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ListPodSandboxStats(ctx, in) } func (c *client) UpdateRuntimeConfig(ctx context.Context, in *criv1.UpdateRuntimeConfigRequest, _ ...grpc.CallOption) (*criv1.UpdateRuntimeConfigResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.UpdateRuntimeConfig(ctx, in) } func (c *client) Status(ctx context.Context, in *criv1.StatusRequest, _ ...grpc.CallOption) (*criv1.StatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.Status(ctx, in) } func (c *client) CheckpointContainer(ctx context.Context, in *criv1.CheckpointContainerRequest, _ ...grpc.CallOption) (*criv1.CheckpointContainerResponse, error) { return c.client.CheckpointContainer(ctx, in) } func (c *client) GetContainerEvents(ctx context.Context, in *criv1.GetEventsRequest, _ ...grpc.CallOption) (criv1.RuntimeService_GetContainerEventsClient, error) { return c.client.GetContainerEvents(ctx, in) } func (c *client) ListMetricDescriptors(ctx context.Context, in *criv1.ListMetricDescriptorsRequest, _ ...grpc.CallOption) (*criv1.ListMetricDescriptorsResponse, error) { return c.client.ListMetricDescriptors(ctx, in) } func (c *client) ListPodSandboxMetrics(ctx context.Context, in *criv1.ListPodSandboxMetricsRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxMetricsResponse, error) { return c.client.ListPodSandboxMetrics(ctx, in) } func (c *client) RuntimeConfig(ctx context.Context, in *criv1.RuntimeConfigRequest, _ ...grpc.CallOption) (*criv1.RuntimeConfigResponse, error) { return c.client.RuntimeConfig(ctx, in) } func (c *client) ListImages(ctx context.Context, in *criv1.ListImagesRequest, _ ...grpc.CallOption) (*criv1.ListImagesResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.ListImages(ctx, in) } func (c *client) ImageStatus(ctx context.Context, in *criv1.ImageStatusRequest, _ ...grpc.CallOption) (*criv1.ImageStatusResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.ImageStatus(ctx, in) } func (c *client) PullImage(ctx context.Context, in *criv1.PullImageRequest, _ ...grpc.CallOption) (*criv1.PullImageResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.PullImage(ctx, in) } func (c *client) RemoveImage(ctx context.Context, in *criv1.RemoveImageRequest, _ ...grpc.CallOption) (*criv1.RemoveImageResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.RemoveImage(ctx, in) } func (c *client) ImageFsInfo(ctx context.Context, in *criv1.ImageFsInfoRequest, _ ...grpc.CallOption) (*criv1.ImageFsInfoResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.ImageFsInfo(ctx, in) } // Return a formatted client-specific error. func clientError(format string, args ...interface{}) error { return fmt.Errorf("cri/client: "+format, args...) } ================================================ FILE: pkg/cri/client/v1/client.go ================================================ // Copyright Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package v1 import ( "context" "fmt" "google.golang.org/grpc" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" logger "github.com/intel/cri-resource-manager/pkg/log" ) type Client interface { criv1.ImageServiceClient criv1.RuntimeServiceClient } type client struct { logger.Logger isc criv1.ImageServiceClient rsc criv1.RuntimeServiceClient rcc *grpc.ClientConn icc *grpc.ClientConn } // Connect connects the CRI v1 RuntimeService and ImageService clients. func Connect(runtime, image *grpc.ClientConn) (Client, error) { c := &client{ Logger: logger.Get("cri/client"), rcc: runtime, icc: image, } if c.rcc != nil { c.Info("probing CRI v1 RuntimeService client...") c.rsc = criv1.NewRuntimeServiceClient(c.rcc) _, err := c.rsc.Version(context.Background(), &criv1.VersionRequest{}) if err != nil { return nil, err } } if c.icc != nil { c.Info("probing CRI v1 ImageService client...") c.isc = criv1.NewImageServiceClient(c.icc) _, err := c.isc.ListImages(context.Background(), &criv1.ListImagesRequest{}) if err != nil { return nil, err } } return c, nil } func (c *client) checkRuntimeService() error { if c.rcc == nil { return fmt.Errorf("no CRI v1 RuntimeService client") } return nil } func (c *client) checkImageService() error { if c.icc == nil { return fmt.Errorf("no CRI v1 ImageService client") } return nil } func (c *client) Version(ctx context.Context, in *criv1.VersionRequest, _ ...grpc.CallOption) (*criv1.VersionResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.Version(ctx, in) } func (c *client) RunPodSandbox(ctx context.Context, in *criv1.RunPodSandboxRequest, _ ...grpc.CallOption) (*criv1.RunPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.RunPodSandbox(ctx, in) } func (c *client) StopPodSandbox(ctx context.Context, in *criv1.StopPodSandboxRequest, _ ...grpc.CallOption) (*criv1.StopPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.StopPodSandbox(ctx, in) } func (c *client) RemovePodSandbox(ctx context.Context, in *criv1.RemovePodSandboxRequest, _ ...grpc.CallOption) (*criv1.RemovePodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.RemovePodSandbox(ctx, in) } func (c *client) PodSandboxStatus(ctx context.Context, in *criv1.PodSandboxStatusRequest, _ ...grpc.CallOption) (*criv1.PodSandboxStatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.PodSandboxStatus(ctx, in) } func (c *client) ListPodSandbox(ctx context.Context, in *criv1.ListPodSandboxRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListPodSandbox(ctx, in) } func (c *client) CreateContainer(ctx context.Context, in *criv1.CreateContainerRequest, _ ...grpc.CallOption) (*criv1.CreateContainerResponse, error) { if err := c.checkRuntimeService(); err !=
nil { return nil, err } return c.rsc.CreateContainer(ctx, in) } func (c *client) StartContainer(ctx context.Context, in *criv1.StartContainerRequest, _ ...grpc.CallOption) (*criv1.StartContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.StartContainer(ctx, in) } func (c *client) StopContainer(ctx context.Context, in *criv1.StopContainerRequest, _ ...grpc.CallOption) (*criv1.StopContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.StopContainer(ctx, in) } func (c *client) RemoveContainer(ctx context.Context, in *criv1.RemoveContainerRequest, _ ...grpc.CallOption) (*criv1.RemoveContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.RemoveContainer(ctx, in) } func (c *client) ListContainers(ctx context.Context, in *criv1.ListContainersRequest, _ ...grpc.CallOption) (*criv1.ListContainersResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListContainers(ctx, in) } func (c *client) ContainerStatus(ctx context.Context, in *criv1.ContainerStatusRequest, _ ...grpc.CallOption) (*criv1.ContainerStatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ContainerStatus(ctx, in) } func (c *client) UpdateContainerResources(ctx context.Context, in *criv1.UpdateContainerResourcesRequest, _ ...grpc.CallOption) (*criv1.UpdateContainerResourcesResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.UpdateContainerResources(ctx, in) } func (c *client) ReopenContainerLog(ctx context.Context, in *criv1.ReopenContainerLogRequest, _ ...grpc.CallOption) (*criv1.ReopenContainerLogResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ReopenContainerLog(ctx, in) } func (c *client) ExecSync(ctx context.Context, in *criv1.ExecSyncRequest, _ ...grpc.CallOption) (*criv1.ExecSyncResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ExecSync(ctx, in) } func (c *client) Exec(ctx context.Context, in *criv1.ExecRequest, _ ...grpc.CallOption) (*criv1.ExecResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.Exec(ctx, in) } func (c *client) Attach(ctx context.Context, in *criv1.AttachRequest, _ ...grpc.CallOption) (*criv1.AttachResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.Attach(ctx, in) } func (c *client) PortForward(ctx context.Context, in *criv1.PortForwardRequest, _ ...grpc.CallOption) (*criv1.PortForwardResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.PortForward(ctx, in) } func (c *client) ContainerStats(ctx context.Context, in *criv1.ContainerStatsRequest, _ ...grpc.CallOption) (*criv1.ContainerStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ContainerStats(ctx, in) } func (c *client) ListContainerStats(ctx context.Context, in *criv1.ListContainerStatsRequest, _ ...grpc.CallOption) (*criv1.ListContainerStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListContainerStats(ctx, in) } func (c *client) PodSandboxStats(ctx context.Context, in *criv1.PodSandboxStatsRequest, _ ...grpc.CallOption) (*criv1.PodSandboxStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } 
return c.rsc.PodSandboxStats(ctx, in) } func (c *client) ListPodSandboxStats(ctx context.Context, in *criv1.ListPodSandboxStatsRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListPodSandboxStats(ctx, in) } func (c *client) UpdateRuntimeConfig(ctx context.Context, in *criv1.UpdateRuntimeConfigRequest, _ ...grpc.CallOption) (*criv1.UpdateRuntimeConfigResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.UpdateRuntimeConfig(ctx, in) } func (c *client) Status(ctx context.Context, in *criv1.StatusRequest, _ ...grpc.CallOption) (*criv1.StatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.Status(ctx, in) } func (c *client) CheckpointContainer(ctx context.Context, in *criv1.CheckpointContainerRequest, _ ...grpc.CallOption) (*criv1.CheckpointContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.CheckpointContainer(ctx, in) } func (c *client) GetContainerEvents(ctx context.Context, in *criv1.GetEventsRequest, _ ...grpc.CallOption) (criv1.RuntimeService_GetContainerEventsClient, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } eventsClient, err := c.rsc.GetContainerEvents(ctx, in) if err != nil { return nil, err } return eventsClient, err } func (c *client) ListMetricDescriptors(ctx context.Context, in *criv1.ListMetricDescriptorsRequest, _ ...grpc.CallOption) (*criv1.ListMetricDescriptorsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListMetricDescriptors(ctx, in) } func (c *client) ListPodSandboxMetrics(ctx context.Context, in *criv1.ListPodSandboxMetricsRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxMetricsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListPodSandboxMetrics(ctx, in) } func (c *client) RuntimeConfig(ctx context.Context, in *criv1.RuntimeConfigRequest, _ ...grpc.CallOption) (*criv1.RuntimeConfigResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.RuntimeConfig(ctx, in) } func (c *client) ListImages(ctx context.Context, in *criv1.ListImagesRequest, _ ...grpc.CallOption) (*criv1.ListImagesResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.ListImages(ctx, in) } func (c *client) ImageStatus(ctx context.Context, in *criv1.ImageStatusRequest, _ ...grpc.CallOption) (*criv1.ImageStatusResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.ImageStatus(ctx, in) } func (c *client) PullImage(ctx context.Context, in *criv1.PullImageRequest, _ ...grpc.CallOption) (*criv1.PullImageResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.PullImage(ctx, in) } func (c *client) RemoveImage(ctx context.Context, in *criv1.RemoveImageRequest, _ ...grpc.CallOption) (*criv1.RemoveImageResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.RemoveImage(ctx, in) } func (c *client) ImageFsInfo(ctx context.Context, in *criv1.ImageFsInfoRequest, _ ...grpc.CallOption) (*criv1.ImageFsInfoResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.ImageFsInfo(ctx, in) } // Return a formatted client-specific error. 
func clientError(format string, args ...interface{}) error { return fmt.Errorf("cri/client: "+format, args...) } ================================================ FILE: pkg/cri/relay/image-service.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package relay import ( "context" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" ) func (r *relay) ListImages(ctx context.Context, req *criv1.ListImagesRequest) (*criv1.ListImagesResponse, error) { return r.client.ListImages(ctx, req) } func (r *relay) ImageStatus(ctx context.Context, req *criv1.ImageStatusRequest) (*criv1.ImageStatusResponse, error) { return r.client.ImageStatus(ctx, req) } func (r *relay) PullImage(ctx context.Context, req *criv1.PullImageRequest) (*criv1.PullImageResponse, error) { return r.client.PullImage(ctx, req) } func (r *relay) RemoveImage(ctx context.Context, req *criv1.RemoveImageRequest) (*criv1.RemoveImageResponse, error) { return r.client.RemoveImage(ctx, req) } func (r *relay) ImageFsInfo(ctx context.Context, req *criv1.ImageFsInfoRequest) (*criv1.ImageFsInfoResponse, error) { return r.client.ImageFsInfo(ctx, req) } ================================================ FILE: pkg/cri/relay/relay.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package relay import ( "fmt" "os" "sync" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/server" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // DisableService is used to mark a socket/service to not be connected. DisableService = client.DontConnect // DefaultImageSocket uses the runtime socket for the image service, too. DefaultImageSocket = "default" ) // Options contains the configurable options of our CRI relay. type Options struct { // RelaySocket is the socket path for the CRI relay services. RelaySocket string // ImageSocket is the socket path for the (real) CRI image services. ImageSocket string // RuntimeSocket is the socket path for the (real) CRI runtime services. RuntimeSocket string // QualifyReqFn produces context for disambiguating a CRI request/reply. QualifyReqFn func(interface{}) string } // Relay is the interface we expose for controlling our CRI relay. type Relay interface { // Setup prepares the relay to start processing CRI requests.
	Setup() error
	// Start starts the relay.
	Start() error
	// Stop stops the relay.
	Stop()
	// Client returns the relay's client interface.
	Client() client.Client
	// Server returns the relay's server interface.
	Server() server.Server
}

// relay is the implementation of Relay.
type relay struct {
	logger.Logger
	sync.Mutex
	options   Options       // relay options
	client    client.Client // relay CRI client
	server    server.Server // relay CRI server
	evtClient criv1.RuntimeService_GetContainerEventsClient
	evtChans  map[*criv1.GetEventsRequest]chan *criv1.ContainerEventResponse
}

// NewRelay creates a new relay instance.
func NewRelay(options Options) (Relay, error) {
	var err error

	r := &relay{
		Logger:   logger.NewLogger("cri/relay"),
		options:  options,
		evtChans: map[*criv1.GetEventsRequest]chan *criv1.ContainerEventResponse{},
	}

	imageSocket := r.options.ImageSocket
	if imageSocket == DefaultImageSocket {
		imageSocket = r.options.RuntimeSocket
	}
	cltopts := client.Options{
		ImageSocket:   imageSocket,
		RuntimeSocket: r.options.RuntimeSocket,
		DialNotify:    r.dialNotify,
	}
	if r.client, err = client.NewClient(cltopts); err != nil {
		return nil, relayError("failed to create relay client: %v", err)
	}

	srvopts := server.Options{
		Socket:       r.options.RelaySocket,
		User:         -1,
		Group:        -1,
		Mode:         0660,
		QualifyReqFn: r.options.QualifyReqFn,
	}
	if r.server, err = server.NewServer(srvopts); err != nil {
		return nil, relayError("failed to create relay server: %v", err)
	}

	return r, nil
}

// Setup prepares the relay to start processing requests.
func (r *relay) Setup() error {
	if err := r.client.Connect(client.ConnectOptions{Wait: true}); err != nil {
		return relayError("client connection failed: %v", err)
	}
	if r.options.ImageSocket != DisableService {
		if err := r.server.RegisterImageService(r); err != nil {
			return relayError("failed to register image service: %v", err)
		}
	}
	if r.options.RuntimeSocket != DisableService {
		if err := r.server.RegisterRuntimeService(r); err != nil {
			return relayError("failed to register runtime service: %v", err)
		}
	}
	return nil
}

// Start starts the relay's request processing goroutine.
func (r *relay) Start() error {
	if err := r.server.Start(); err != nil {
		return relayError("failed to start relay: %v", err)
	}
	return nil
}

// Stop stops the relay.
func (r *relay) Stop() {
	r.client.Close()
	r.server.Stop()
}

// Client returns the relay's Client interface.
func (r *relay) Client() client.Client {
	return r.client
}

// Server returns the relay's Server interface.
func (r *relay) Server() server.Server {
	return r.server
}

func (r *relay) dialNotify(socket string, uid int, gid int, mode os.FileMode, err error) {
	if err != nil {
		r.Error("failed to determine permissions/ownership of client socket %q: %v", socket, err)
		return
	}

	// Notes:
	//   Kubelet has separate configuration/command line options for the container
	//   runtime's Image and Runtime Services. Hence, in principle it is possible
	//   that we run with two separate sockets for those. However, we always expose
	//   both services over our single relay socket. Since we cannot set two sets of
	//   ownership and permissions on a single socket, if this situation arises in
	//   practice we choose to go with the runtime socket's properties.
if r.options.ImageSocket != r.options.RuntimeSocket { if socket != r.options.RuntimeSocket && r.options.RuntimeSocket != client.DontConnect { r.Warn("ignoring ownership/permissions of dedicated CR Image Service socket...") return } } if err := r.server.Chown(uid, gid); err != nil { r.Error("server socket ownership change request failed: %v", err) } else { if err := r.server.Chmod(mode); err != nil { r.Error("server socket permissions change request failed: %v", err) } } } // relayError creates a formatted relay-specific error. func relayError(format string, args ...interface{}) error { return fmt.Errorf("cri/relay: "+format, args...) } ================================================ FILE: pkg/cri/relay/runtime-service.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package relay import ( "context" "fmt" "time" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/dump" ) func (r *relay) dump(method string, req interface{}) { if r.DebugEnabled() { qualif := r.qualifier(req) dump.RequestMessage("relayed", method, qualif, req, true) } } // qualifier pulls a qualifier for disambiguation from a CRI request message. func (r *relay) qualifier(msg interface{}) string { if fn := r.options.QualifyReqFn; fn != nil { return fn(msg) } return "" } func (r *relay) Version(ctx context.Context, req *criv1.VersionRequest) (*criv1.VersionResponse, error) { r.dump("Version", req) return r.client.Version(ctx, req) } func (r *relay) RunPodSandbox(ctx context.Context, req *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) { r.dump("RunPodSandbox", req) return r.client.RunPodSandbox(ctx, req) } func (r *relay) StopPodSandbox(ctx context.Context, req *criv1.StopPodSandboxRequest) (*criv1.StopPodSandboxResponse, error) { r.dump("StopPodSandbox", req) return r.client.StopPodSandbox(ctx, req) } func (r *relay) RemovePodSandbox(ctx context.Context, req *criv1.RemovePodSandboxRequest) (*criv1.RemovePodSandboxResponse, error) { r.dump("RemovePodSandbox", req) return r.client.RemovePodSandbox(ctx, req) } func (r *relay) PodSandboxStatus(ctx context.Context, req *criv1.PodSandboxStatusRequest) (*criv1.PodSandboxStatusResponse, error) { r.dump("PodSandboxStatus", req) return r.client.PodSandboxStatus(ctx, req) } func (r *relay) ListPodSandbox(ctx context.Context, req *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) { r.dump("ListPodSandbox", req) return r.client.ListPodSandbox(ctx, req) } func (r *relay) CreateContainer(ctx context.Context, req *criv1.CreateContainerRequest) (*criv1.CreateContainerResponse, error) { r.dump("CreateContainer", req) return r.client.CreateContainer(ctx, req) } func (r *relay) StartContainer(ctx context.Context, req *criv1.StartContainerRequest) (*criv1.StartContainerResponse, error) { r.dump("StartContainer", req) return r.client.StartContainer(ctx, req) } func (r *relay) StopContainer(ctx context.Context, req 
*criv1.StopContainerRequest) (*criv1.StopContainerResponse, error) { r.dump("StopContainer", req) return r.client.StopContainer(ctx, req) } func (r *relay) RemoveContainer(ctx context.Context, req *criv1.RemoveContainerRequest) (*criv1.RemoveContainerResponse, error) { r.dump("RemoveContainer", req) return r.client.RemoveContainer(ctx, req) } func (r *relay) ListContainers(ctx context.Context, req *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) { r.dump("ListContainers", req) return r.client.ListContainers(ctx, req) } func (r *relay) ContainerStatus(ctx context.Context, req *criv1.ContainerStatusRequest) (*criv1.ContainerStatusResponse, error) { r.dump("ContainerStatus", req) return r.client.ContainerStatus(ctx, req) } func (r *relay) UpdateContainerResources(ctx context.Context, req *criv1.UpdateContainerResourcesRequest) (*criv1.UpdateContainerResourcesResponse, error) { r.dump("UpdateContainerResources", req) return r.client.UpdateContainerResources(ctx, req) } func (r *relay) ReopenContainerLog(ctx context.Context, req *criv1.ReopenContainerLogRequest) (*criv1.ReopenContainerLogResponse, error) { r.dump("ReopenContainerLog", req) return r.client.ReopenContainerLog(ctx, req) } func (r *relay) ExecSync(ctx context.Context, req *criv1.ExecSyncRequest) (*criv1.ExecSyncResponse, error) { r.dump("ExecSync", req) return r.client.ExecSync(ctx, req) } func (r *relay) Exec(ctx context.Context, req *criv1.ExecRequest) (*criv1.ExecResponse, error) { r.dump("Exec", req) return r.client.Exec(ctx, req) } func (r *relay) Attach(ctx context.Context, req *criv1.AttachRequest) (*criv1.AttachResponse, error) { r.dump("Attach", req) return r.client.Attach(ctx, req) } func (r *relay) PortForward(ctx context.Context, req *criv1.PortForwardRequest) (*criv1.PortForwardResponse, error) { r.dump("PortForward", req) return r.client.PortForward(ctx, req) } func (r *relay) ContainerStats(ctx context.Context, req *criv1.ContainerStatsRequest) (*criv1.ContainerStatsResponse, error) { r.dump("ContainerStats", req) return r.client.ContainerStats(ctx, req) } func (r *relay) ListContainerStats(ctx context.Context, req *criv1.ListContainerStatsRequest) (*criv1.ListContainerStatsResponse, error) { r.dump("ListContainerStats", req) return r.client.ListContainerStats(ctx, req) } func (r *relay) PodSandboxStats(ctx context.Context, req *criv1.PodSandboxStatsRequest) (*criv1.PodSandboxStatsResponse, error) { r.dump("PodSandboxStats", req) return r.client.PodSandboxStats(ctx, req) } func (r *relay) ListPodSandboxStats(ctx context.Context, req *criv1.ListPodSandboxStatsRequest) (*criv1.ListPodSandboxStatsResponse, error) { r.dump("ListPodSandboxStats", req) return r.client.ListPodSandboxStats(ctx, req) } func (r *relay) UpdateRuntimeConfig(ctx context.Context, req *criv1.UpdateRuntimeConfigRequest) (*criv1.UpdateRuntimeConfigResponse, error) { r.dump("UpdateRuntimeConfig", req) return r.client.UpdateRuntimeConfig(ctx, req) } func (r *relay) Status(ctx context.Context, req *criv1.StatusRequest) (*criv1.StatusResponse, error) { r.dump("Status", req) return r.client.Status(ctx, req) } func (r *relay) CheckpointContainer(ctx context.Context, req *criv1.CheckpointContainerRequest) (*criv1.CheckpointContainerResponse, error) { r.dump("CheckpointContainer", req) return r.client.CheckpointContainer(ctx, req) } func (r *relay) GetContainerEvents(req *criv1.GetEventsRequest, srv criv1.RuntimeService_GetContainerEventsServer) error { evtC := r.addEventServer(req) if err := r.startEventRelay(req); err != nil { 
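// Starting the upstream event relay failed, so unregister the event
// channel we just added above; otherwise it would leak in evtChans.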
r.delEventServer(req) return err } for evt := range evtC { if err := srv.Send(evt); err != nil { r.Errorf("failed to relay/send container event: %v", err) r.delEventServer(req) return err } } return nil } func (r *relay) ListMetricDescriptors(ctx context.Context, req *criv1.ListMetricDescriptorsRequest) (*criv1.ListMetricDescriptorsResponse, error) { r.dump("ListMetricDescriptors", req) return r.client.ListMetricDescriptors(ctx, req) } func (r *relay) ListPodSandboxMetrics(ctx context.Context, req *criv1.ListPodSandboxMetricsRequest) (*criv1.ListPodSandboxMetricsResponse, error) { r.dump("ListPodSandboxMetrics", req) return r.client.ListPodSandboxMetrics(ctx, req) } func (r *relay) RuntimeConfig(ctx context.Context, req *criv1.RuntimeConfigRequest) (*criv1.RuntimeConfigResponse, error) { r.dump("RuntimeConfig", req) return r.client.RuntimeConfig(ctx, req) } const ( eventRelayTimeout = 1 * time.Second ) func (r *relay) addEventServer(req *criv1.GetEventsRequest) chan *criv1.ContainerEventResponse { r.Lock() defer r.Unlock() evtC := make(chan *criv1.ContainerEventResponse, 128) r.evtChans[req] = evtC return evtC } func (r *relay) delEventServer(req *criv1.GetEventsRequest) chan *criv1.ContainerEventResponse { r.Lock() defer r.Unlock() evtC := r.evtChans[req] delete(r.evtChans, req) return evtC } func (r *relay) startEventRelay(req *criv1.GetEventsRequest) error { r.Lock() defer r.Unlock() if r.evtClient != nil { return nil } c, err := r.client.GetContainerEvents(context.Background(), req) if err != nil { return fmt.Errorf("failed to create container event client: %w", err) } r.evtClient = c go r.relayEvents() return nil } func (r *relay) relayEvents() { for { evt, err := r.evtClient.Recv() if err != nil { r.Errorf("failed to relay/receive container event: %v", err) } r.Lock() if err != nil { for req, evtC := range r.evtChans { delete(r.evtChans, req) close(evtC) } r.evtClient = nil } else { for req, evtC := range r.evtChans { select { case evtC <- evt: case _ = <-time.After(eventRelayTimeout): delete(r.evtChans, req) close(evtC) } } } r.Unlock() if err != nil { return } } } ================================================ FILE: pkg/cri/resource-manager/agent/agent.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

package agent

import (
	"context"
	"encoding/json"
	"fmt"
	"net"
	"strings"
	"time"

	"google.golang.org/grpc"
	core_v1 "k8s.io/api/core/v1"

	agent_v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1"
)

const (
	// SocketDisabled is the reserved socket path for disabling the agent interface.
	SocketDisabled = "disabled"
)

// Interface describes the interface to the cri-resource-manager agent.
type Interface interface {
	IsDisabled() bool
	GetNode(time.Duration) (core_v1.Node, error)
	PatchNode([]*agent_v1.JsonPatch, time.Duration) error
	UpdateNodeCapacity(map[string]string, time.Duration) error
	GetLabels(time.Duration) (map[string]string, error)
	SetLabels(map[string]string, time.Duration) error
	RemoveLabels([]string, time.Duration) error
	GetAnnotations(time.Duration) (map[string]string, error)
	SetAnnotations(map[string]string, time.Duration) error
	RemoveAnnotations([]string, time.Duration) error
	GetTaints(time.Duration) ([]core_v1.Taint, error)
	SetTaints([]core_v1.Taint, time.Duration) error
	RemoveTaints([]core_v1.Taint, time.Duration) error
	FindTaintIndex([]core_v1.Taint, *core_v1.Taint) (int, bool)
}

// agentInterface implements Interface.
type agentInterface struct {
	socket string
	cli    agent_v1.AgentClient
}

// NewAgentInterface connects to the cri-resource-manager-agent gRPC server
// and returns a new Interface.
func NewAgentInterface(socket string) (Interface, error) {
	a := &agentInterface{
		socket: socket,
	}
	if a.IsDisabled() {
		return a, nil
	}

	dialOpts := []grpc.DialOption{
		// grpc.WithBlock(),
		// grpc.WithTimeout(10 * time.Second),
		grpc.WithInsecure(),
		// grpc.FailOnNonTempDialError(true),
		grpc.WithDialer(func(sock string, timeout time.Duration) (net.Conn, error) {
			return net.Dial("unix", sock)
		}),
	}
	conn, err := grpc.Dial(socket, dialOpts...)
	if err != nil {
		return nil, agentError("failed to connect to cri-resmgr agent: %v", err)
	}
	a.cli = agent_v1.NewAgentClient(conn)

	return a, nil
}

// IsDisabled returns true if the agent interface is disabled.
func (a *agentInterface) IsDisabled() bool {
	return a.socket == SocketDisabled || a.socket == ""
}

func (a *agentInterface) GetNode(timeout time.Duration) (core_v1.Node, error) {
	if a.IsDisabled() {
		return core_v1.Node{}, agentError("agent interface is disabled")
	}

	ctx, cancel, callOpts := prepareCall(timeout)
	defer cancel()

	req := &agent_v1.GetNodeRequest{}
	node := core_v1.Node{}
	rsp, err := a.cli.GetNode(ctx, req, callOpts...)
	if err != nil {
		return node, agentError("failed to get node object: %v", err)
	}
	if err = json.Unmarshal([]byte(rsp.Node), &node); err != nil {
		return node, agentError("invalid response, failed to unmarshal v1.Node: %v", err)
	}
	return node, nil
}

func (a *agentInterface) PatchNode(patches []*agent_v1.JsonPatch, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}

	ctx, cancel, callOpts := prepareCall(timeout)
	defer cancel()

	req := &agent_v1.PatchNodeRequest{
		Patches: patches,
	}
	_, err := a.cli.PatchNode(ctx, req, callOpts...)
	if err != nil {
		return agentError("failed to patch node object: %v", err)
	}
	return nil
}

func (a *agentInterface) UpdateNodeCapacity(caps map[string]string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}

	ctx, cancel, callOpts := prepareCall(timeout)
	defer cancel()

	req := &agent_v1.UpdateNodeCapacityRequest{
		Capacities: caps,
	}
	_, err := a.cli.UpdateNodeCapacity(ctx, req, callOpts...)
	if err != nil {
		return agentError("failed to update node capacities: %v", err)
	}
	return nil
}

const (
	// PatchAdd specifies an add operation.
	PatchAdd string = "add"
	// PatchRemove specifies a remove operation.
	PatchRemove string = "remove"
	// PatchReplace specifies a replace operation.
	PatchReplace string = "replace"
)

// patchPath returns the JSON patch path for a metadata key, escaping '/' in
// the key as '~1' per JSON Pointer (RFC 6901).
func patchPath(class, key string) string {
	return "/metadata/" + class + "/" + strings.Replace(key, "/", "~1", -1)
}

func labelPatchPath(key string) string {
	return patchPath("labels", key)
}

func annotationPatchPath(key string) string {
	return patchPath("annotations", key)
}

func taintPatchPath(idx int) string {
	return fmt.Sprintf("/spec/taints/%d", idx)
}

func (a *agentInterface) GetLabels(timeout time.Duration) (map[string]string, error) {
	if a.IsDisabled() {
		return nil, agentError("agent interface is disabled")
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return nil, err
	}
	return node.Labels, nil
}

func (a *agentInterface) SetLabels(labels map[string]string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}
	if len(labels) == 0 {
		return nil
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return err
	}

	patches := []*agent_v1.JsonPatch{}
	for key, val := range labels {
		patch := &agent_v1.JsonPatch{
			Path: labelPatchPath(key),
			// Value is supposed to be in marshalled JSON format. Thus, we need
			// to add quotes so that it will be interpreted as a string.
			Value: "\"" + val + "\"",
		}
		if _, ok := node.Labels[key]; ok {
			patch.Op = PatchReplace
		} else {
			patch.Op = PatchAdd
		}
		patches = append(patches, patch)
	}
	return a.PatchNode(patches, timeout)
}

func (a *agentInterface) RemoveLabels(keys []string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}
	if len(keys) == 0 {
		return nil
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return err
	}

	patches := []*agent_v1.JsonPatch{}
	for _, key := range keys {
		if _, ok := node.Labels[key]; !ok {
			continue
		}
		patch := &agent_v1.JsonPatch{
			Op:   PatchRemove,
			Path: labelPatchPath(key),
		}
		patches = append(patches, patch)
	}
	if len(patches) == 0 {
		return nil
	}
	return a.PatchNode(patches, timeout)
}

func (a *agentInterface) GetAnnotations(timeout time.Duration) (map[string]string, error) {
	if a.IsDisabled() {
		return nil, agentError("agent interface is disabled")
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return nil, err
	}
	return node.Annotations, nil
}

func (a *agentInterface) SetAnnotations(annotations map[string]string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}
	if len(annotations) == 0 {
		return nil
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return err
	}

	patches := []*agent_v1.JsonPatch{}
	for key, val := range annotations {
		patch := &agent_v1.JsonPatch{
			Path:  annotationPatchPath(key),
			Value: val,
		}
		if _, ok := node.Annotations[key]; ok {
			patch.Op = PatchReplace
		} else {
			patch.Op = PatchAdd
		}
		patches = append(patches, patch)
	}
	return a.PatchNode(patches, timeout)
}

func (a *agentInterface) RemoveAnnotations(keys []string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}
	if len(keys) == 0 {
		return nil
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return err
	}

	patches := []*agent_v1.JsonPatch{}
	for _, key := range keys {
		if _, ok := node.Annotations[key]; !ok {
			continue
		}
		patch := &agent_v1.JsonPatch{
			Op:   PatchRemove,
			Path: annotationPatchPath(key),
		}
		patches = append(patches, patch)
	}
	if len(patches) == 0 {
		return nil
	}
	return a.PatchNode(patches, timeout)
}

func (a *agentInterface) GetTaints(timeout time.Duration) ([]core_v1.Taint, error) {
	if a.IsDisabled() {
		return nil, agentError("agent interface is disabled")
	}

	node,
err := a.GetNode(timeout) if err != nil { return nil, err } return node.Spec.Taints, nil } func (a *agentInterface) SetTaints(taints []core_v1.Taint, timeout time.Duration) error { if a.IsDisabled() { return agentError("agent interface is disabled") } if len(taints) == 0 { return nil } node, err := a.GetNode(timeout) if err != nil { return err } patches := []*agent_v1.JsonPatch{} if node.Spec.Taints == nil { patch := &agent_v1.JsonPatch{ Op: PatchAdd, Path: "/spec/taints", Value: "[]"} patches = append(patches, patch) } for _, t := range taints { value, err := json.Marshal(t) if err != nil { return agentError("BUG: failed to marshal taint %v: %v", t, err) } idx, found := findTaintIndex(node.Spec.Taints, &t) patch := &agent_v1.JsonPatch{Value: string(value)} patch.Path = taintPatchPath(idx) if !found { patch.Op = PatchAdd } else { patch.Op = PatchReplace } patches = append(patches, patch) } return a.PatchNode(patches, timeout) } func (a *agentInterface) RemoveTaints(taints []core_v1.Taint, timeout time.Duration) error { if a.IsDisabled() { return agentError("agent interface is disabled") } if len(taints) == 0 { return nil } node, err := a.GetNode(timeout) if err != nil { return err } if node.Spec.Taints == nil { return nil } patches := []*agent_v1.JsonPatch{} for _, t := range taints { idx, found := findTaintIndex(node.Spec.Taints, &t) if found { patch := &agent_v1.JsonPatch{ Op: "remove", Path: taintPatchPath(idx), } patches = append(patches, patch) } } if len(patches) == 0 { return nil } return a.PatchNode(patches, timeout) } func findTaintIndex(taints []core_v1.Taint, taint *core_v1.Taint) (int, bool) { for idx, t := range taints { if t.Key == taint.Key && t.Value == taint.Value && t.Effect == taint.Effect { return idx, true } } return 0, false } func (a *agentInterface) FindTaintIndex(taints []core_v1.Taint, taint *core_v1.Taint) (int, bool) { return findTaintIndex(taints, taint) } func agentError(format string, args ...interface{}) error { return fmt.Errorf("agent-client: "+format, args...) } func prepareCall(timeout time.Duration) (context.Context, context.CancelFunc, []grpc.CallOption) { callOpts := []grpc.CallOption{grpc.FailFast(false)} ctx := context.Background() cancel := func() {} if timeout >= 0 { ctx, cancel = context.WithTimeout(context.Background(), timeout) } return ctx, cancel, callOpts } ================================================ FILE: pkg/cri/resource-manager/builtin-policies.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
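
// The file below enables builtin policies purely through blank imports: each
// policy package is expected to register itself from an init() function when
// imported. A minimal sketch of that pattern, with hypothetical names (the
// actual registration API lives in the policy package and may differ):
//
//	package mypolicy
//
//	import "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
//
//	func init() {
//		// Registering here makes a blank import sufficient to enable the policy.
//		policy.Register("my-policy", "an example policy", NewMyPolicy)
//	}
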
package resmgr import ( // List of builtin policies _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/balloons" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/dynamic-pools" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/none" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/podpools" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/static" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/static-plus" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/static-pools" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/topology-aware" ) // TODO: add unit tests to verify that all builtin policies are found ================================================ FILE: pkg/cri/resource-manager/cache/affinity.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "fmt" "sigs.k8s.io/yaml" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" ) const ( // annotation key for specifying container affinity rules keyAffinity = "affinity" // annotation key for specifying container anti-affinity rules keyAntiAffinity = "anti-affinity" ) // Expression is used to describe affinity container scope and matching criteria. type Expression struct { resmgr.Expression } // simpleAffinity is an alternative, simplified syntax for intra-pod container affinity. type simpleAffinity map[string][]string // PodContainerAffinity defines a set of per-container affinities and anti-affinities. type podContainerAffinity map[string][]*Affinity // Affinity specifies a single container affinity. type Affinity struct { Scope *resmgr.Expression `json:"scope,omitempty"` // scope for evaluating this affinity Match *resmgr.Expression `json:"match"` // affinity expression Weight int32 `json:"weight,omitempty"` // (optional) weight for this affinity } const ( // UserWeightCutoff is the cutoff we clamp user-provided weights to. UserWeightCutoff = 1000 // DefaultWeight is the default assigned weight if omitted in annotations. DefaultWeight int32 = 1 ) // ImplicitAffinity can implicitly inject affinities to containers. type ImplicitAffinity func(Container, bool) *Affinity // Validate checks the affinity for (obvious) invalidity. func (a *Affinity) Validate() error { if err := a.Scope.Validate(); err != nil { return cacheError("invalid affinity scope: %v", err) } if err := a.Match.Validate(); err != nil { return cacheError("invalid affinity match: %v", err) } switch { case a.Weight > UserWeightCutoff: a.Weight = UserWeightCutoff case a.Weight < -UserWeightCutoff: a.Weight = -UserWeightCutoff } return nil } // EvaluateAffinity evaluates the given affinity against all known in-scope containers. 
func (cch *cache) EvaluateAffinity(a *Affinity) map[string]int32 {
	results := make(map[string]int32)
	for _, c := range cch.FilterScope(a.Scope) {
		if a.Match.Evaluate(c) {
			id := c.GetCacheID()
			results[id] += a.Weight
		}
	}
	return results
}

// FilterScope returns the containers selected by the scope expression.
func (cch *cache) FilterScope(scope *resmgr.Expression) []Container {
	cch.Debug("calculating scope %s", scope.String())
	result := []Container{}
	for _, c := range cch.GetContainers() {
		if scope.Evaluate(c) {
			cch.Debug(" + container %s: IN scope", c.PrettyName())
			result = append(result, c)
		} else {
			cch.Debug(" - container %s: NOT IN scope", c.PrettyName())
		}
	}
	return result
}

// String returns the affinity as a string.
func (a *Affinity) String() string {
	kind := ""
	if a.Weight < 0 {
		kind = "anti-"
	}
	return fmt.Sprintf("<%saffinity: scope %s %s => %d>",
		kind, a.Scope.String(), a.Match.String(), a.Weight)
}

// Try to parse affinities in simplified notation from the given annotation value.
func (pca *podContainerAffinity) parseSimple(pod *pod, value string, weight int32) bool {
	parsed := simpleAffinity{}
	if err := yaml.UnmarshalStrict([]byte(value), &parsed); err != nil {
		return false
	}

	podScope := pod.ScopeExpression()

	//
	// Notes:
	//   We turn affinities given in the simple notation into a symmetric set of
	//   affinities. IOW, if X has affinity on Y with weight W, then Y will have
	//   affinity on X with W as well. In practice this is done by
	//     1) ensuring there is an affinity Y: X for every affinity X: Y
	//     2) generating an affinity expression for every container with affinities
	//   The generated expression uses the operator Equals or In depending on
	//   whether the container has affinities on exactly one container in the
	//   symmetric set.
	//

	symmetric := map[string]map[string]struct{}{}
	for name, values := range parsed {
		for _, v := range values {
			forw, ok := symmetric[name]
			if !ok {
				forw = map[string]struct{}{}
				symmetric[name] = forw
			}
			back, ok := symmetric[v]
			if !ok {
				back = map[string]struct{}{}
				symmetric[v] = back
			}
			forw[v], back[name] = struct{}{}, struct{}{}
		}
	}

	var op resmgr.Operator
	for name, affinities := range symmetric {
		others := []string{}
		for o := range affinities {
			others = append(others, o)
		}
		if len(others) == 1 {
			op = resmgr.Equals
		} else {
			op = resmgr.In
		}
		(*pca)[name] = append((*pca)[name],
			&Affinity{
				Scope: podScope,
				Match: &resmgr.Expression{
					Key:    kubernetes.ContainerNameLabel,
					Op:     op,
					Values: others,
				},
				Weight: weight,
			})
	}

	return true
}

// Try to parse affinities in full notation from the given annotation value.
func (pca *podContainerAffinity) parseFull(pod *pod, value string, weight int32) error {
	parsed := podContainerAffinity{}
	if err := yaml.UnmarshalStrict([]byte(value), &parsed); err != nil {
		return cacheError("failed to parse affinity annotation '%s': %v", value, err)
	}

	podScope := pod.ScopeExpression()
	for name, pa := range parsed {
		ca, ok := (*pca)[name]
		if !ok {
			ca = make([]*Affinity, 0, len(pa))
		}
		for _, a := range pa {
			if a.Scope == nil {
				a.Scope = podScope
			}
			if a.Weight == 0 {
				a.Weight = weight
			} else {
				if weight < 0 {
					a.Weight *= -1
				}
			}
			if err := a.Validate(); err != nil {
				return err
			}
			ca = append(ca, a)
		}
		(*pca)[name] = ca
	}

	return nil
}

// GlobalAffinity creates an affinity with all containers in scope.
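
// For reference, the two annotation notations fed to parseSimple and
// parseFull above look roughly like this (a sketch based on the unit tests
// below; the full annotation keys carry the resmgr annotation namespace):
//
//	affinity: |            # simple notation: symmetric affinities
//	  c1: [ c2, c3 ]
//
//	affinity: |            # full notation: explicit scope/match/weight
//	  c1:
//	    - match:
//	        key: name
//	        operator: In
//	        values: [ c2, c3 ]
//	      weight: 10
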
func GlobalAffinity(key string, weight int32) *Affinity { return &Affinity{ Scope: &resmgr.Expression{ Op: resmgr.AlwaysTrue, // evaluate against all containers }, Match: &resmgr.Expression{ Key: key, Op: resmgr.Exists, }, Weight: weight, } } // GlobalAntiAffinity creates an anti-affinity with all containers in scope. func GlobalAntiAffinity(key string, weight int32) *Affinity { return GlobalAffinity(key, -weight) } // AddImplicitAffinities registers a set of implicit affinities. func (cch *cache) AddImplicitAffinities(implicit map[string]ImplicitAffinity) error { for name := range implicit { if _, ok := cch.implicit[name]; ok { return cacheError("implicit affinity %s already defined", name) } } for name, a := range implicit { cch.implicit[name] = a } return nil } // DeleteImplicitAffinities removes a previously registered set of implicit affinities. func (cch *cache) DeleteImplicitAffinities(names []string) { for _, name := range names { delete(cch.implicit, name) } } ================================================ FILE: pkg/cri/resource-manager/cache/affinity_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "testing" ) func TestSimpleParsingSymmetry(t *testing.T) { c1, c2, c3, c4, c5 := "c1", "c2", "c3", "c4", "c5" tcases := []struct { name string source string result map[string][]string }{ { name: "trivial 2 by 2", source: `c1: [ c2 ]`, result: map[string][]string{ c1: {c2}, c2: {c1}, }, }, { name: "simple", source: `c1: [ c2, c3, c4, c5 ]`, result: map[string][]string{ c1: {c2, c3, c4, c5}, c2: {c1}, c3: {c1}, c4: {c1}, c5: {c1}, }, }, { name: "a bit more complex", source: ` c1: [ c2 ] c2: [ c3, c4, c5 ] c4: [ c5 ] `, result: map[string][]string{ c1: {c2}, c2: {c1, c3, c4, c5}, c3: {c2}, c4: {c2, c5}, c5: {c2, c4}, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { pca := podContainerAffinity{} if !pca.parseSimple(&pod{Name: "testpod"}, tc.source, 1) { t.Errorf("failed to parse simple container affinity %q", tc.source) return } found := map[string]map[string]struct{}{} for name, affinities := range pca { for _, a := range affinities { for _, o := range a.Match.Values { forw, ok := found[name] if !ok { forw = map[string]struct{}{} found[name] = forw } back, ok := found[o] if !ok { back = map[string]struct{}{} found[o] = back } forw[o] = struct{}{} back[name] = struct{}{} } } } for name, others := range tc.result { for _, o := range others { if _, ok := found[name][o]; !ok { t.Errorf("simple affinity %q did not produce %s: %s", tc.source, name, o) } else { delete(found[name], o) if len(found[name]) == 0 { delete(found, name) } } } } for name, others := range found { val := "" sep := "" for o := range others { val += sep + o sep = ", " } t.Errorf("simple affinity %q produced unexpected %s: [ %s ]", tc.source, name, val) } }) } } func TestStrictParsing(t *testing.T) { tcases := []struct { name string source string invalid bool }{ { name: 
"invalid annotation", source: ` memtier-benchmark: - scope: key: pod/name operator: Matches values: - redis-* match: key: name operator: Equals values: - redis weight: 10 `, invalid: true, }, { name: "valid annotation", source: ` memtier-benchmark: - scope: key: pod/name operator: Matches values: - redis-* match: key: name operator: Equals values: - redis weight: 10 `, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { pca := podContainerAffinity{} err := pca.parseFull(&pod{Name: "testpod"}, tc.source, 1) if tc.invalid && err == nil { t.Errorf("parsing invalid affinity expression should have failed") return } if !tc.invalid && err != nil { t.Errorf("parsing valid affinity expression should not fail") } }) } } ================================================ FILE: pkg/cri/resource-manager/cache/cache.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "encoding/json" "errors" "fmt" "os" "path/filepath" "strconv" "strings" "sync" v1 "k8s.io/api/core/v1" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/topology" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) const ( // CPU marks changes that can be applied by the CPU controller. CPU = "cpu" // CRI marks changes that can be applied by the CRI controller. CRI = "cri" // RDT marks changes that can be applied by the RDT controller. RDT = "rdt" // BlockIO marks changes that can be applied by the BlockIO controller. BlockIO = "blockio" // Memory marks changes that can be applied by the Memory controller. Memory = "memory" // PageMigration marks changes that can be applied by the PageMigration controller. PageMigration = "page-migration" // TagAVX512 tags containers that use AVX512 instructions. TagAVX512 = "AVX512" // RDTClassKey is the pod annotation key for specifying a container RDT class. RDTClassKey = "rdtclass" + "." + kubernetes.ResmgrKeyNamespace // BlockIOClassKey is the pod annotation key for specifying a container Block I/O class. BlockIOClassKey = "blockioclass" + "." + kubernetes.ResmgrKeyNamespace // ToptierLimitKey is the pod annotation key for specifying container top tier memory limits. ToptierLimitKey = "toptierlimit" + "." + kubernetes.ResmgrKeyNamespace // RDTClassPodQoS denotes that the RDTClass should be taken from PodQosClass RDTClassPodQoS = "/PodQos" // ToptierLimitUnset is the reserved value for indicating unset top tier limits. ToptierLimitUnset int64 = -1 // TopologyHintsKey can be used to opt out from automatic topology hint generation. TopologyHintsKey = "topologyhints" + "." 
	+ kubernetes.ResmgrKeyNamespace
)

// allControllers is a slice of all controller domains.
var allControllers = []string{CPU, CRI, RDT, BlockIO, Memory}

// PodState is the pod state in the runtime.
type PodState int32

const (
	// PodStateReady marks a pod ready.
	PodStateReady = PodState(int32(criv1.PodSandboxState_SANDBOX_READY))
	// PodStateNotReady marks a pod as not ready.
	PodStateNotReady = PodState(int32(criv1.PodSandboxState_SANDBOX_NOTREADY))
	// PodStateStale marks a pod as removed.
	PodStateStale = PodState(int32(PodStateNotReady) + 1)
)

// PodResourceRequirements are per container resource requirements, annotated by our webhook.
type PodResourceRequirements struct {
	// InitContainers is the resource requirements by init containers.
	InitContainers map[string]v1.ResourceRequirements `json:"initContainers"`
	// Containers is the resource requirements by normal containers.
	Containers map[string]v1.ResourceRequirements `json:"containers"`
}

// PodStatus wraps a PodSandboxStatus response for data extraction.
type PodStatus struct {
	CgroupParent string // extracted CgroupParent
}

// Pod is the exposed interface from a cached pod.
type Pod interface {
	resmgr.Evaluable
	fmt.Stringer
	// GetInitContainers returns the init containers of the pod.
	GetInitContainers() []Container
	// GetContainers returns the (non-init) containers of the pod.
	GetContainers() []Container
	// GetContainer returns the named container of the pod.
	GetContainer(string) (Container, bool)
	// GetID returns the pod id of the pod.
	GetID() string
	// GetUID returns the (kubernetes) unique id of the pod.
	GetUID() string
	// GetName returns the name of the pod.
	GetName() string
	// GetNamespace returns the namespace of the pod.
	GetNamespace() string
	// GetState returns the PodState of the pod.
	GetState() PodState
	// GetQOSClass returns the PodQOSClass of the pod.
	GetQOSClass() v1.PodQOSClass
	// GetLabelKeys returns the keys of all pod labels as a string slice.
	GetLabelKeys() []string
	// GetLabel returns the value of the given label and whether it was found.
	GetLabel(string) (string, bool)
	// GetResmgrLabelKeys returns pod label keys (without the namespace
	// part) in cri-resource-manager namespace.
	GetResmgrLabelKeys() []string
	// GetResmgrLabel returns the value of a pod label from the
	// cri-resource-manager namespace.
	GetResmgrLabel(string) (string, bool)
	// GetAnnotationKeys returns the keys of all pod annotations as a string slice.
	GetAnnotationKeys() []string
	// GetAnnotation returns the value of the given annotation and whether it was found.
	GetAnnotation(key string) (string, bool)
	// GetAnnotationObject decodes the value of the given annotation with the given function.
	GetAnnotationObject(key string, objPtr interface{},
		decode func([]byte, interface{}) error) (bool, error)
	// GetResmgrAnnotationKeys returns pod annotation keys (without the
	// namespace part) in cri-resource-manager namespace as a string slice.
	GetResmgrAnnotationKeys() []string
	// GetResmgrAnnotation returns the value of a pod annotation from the
	// cri-resource-manager namespace and whether it was found.
	GetResmgrAnnotation(key string) (string, bool)
	// GetResmgrAnnotationObject decodes the value of the given annotation in the
	// cri-resource-manager namespace.
	GetResmgrAnnotationObject(key string, objPtr interface{},
		decode func([]byte, interface{}) error) (bool, error)
	// GetEffectiveAnnotation returns the effective annotation for a container.
	// For any given key $K and container $C it will look for annotations in
	// this order:
	//     $K/container.$C
	//     $K/pod
	//     $K
	// and return the value of the first key found.
	GetEffectiveAnnotation(key, container string) (string, bool)
	// GetCgroupParentDir returns the pod's cgroup parent directory.
	GetCgroupParentDir() string
	// GetPodResourceRequirements returns container resource requirements if the
	// necessary associated annotation put in place by the CRI resource manager
	// webhook was found.
	GetPodResourceRequirements() PodResourceRequirements
	// GetContainerAffinity returns the affinity expressions for the named container.
	GetContainerAffinity(string) ([]*Affinity, error)
	// ScopeExpression returns an affinity expression for defining this pod as the scope.
	ScopeExpression() *resmgr.Expression
	// GetProcesses returns the pids of all processes in the pod either excluding
	// container processes, if called with false, or including those if called
	// with true.
	GetProcesses(bool) ([]string, error)
	// GetTasks returns the pids of all threads in the pod either excluding
	// container processes, if called with false, or including those if called
	// with true.
	GetTasks(bool) ([]string, error)
}

// A cached pod.
type pod struct {
	cache        *cache                   // our cache of objects
	ID           string                   // pod sandbox runtime id
	UID          string                   // (k8s) unique id
	Name         string                   // pod sandbox name
	Namespace    string                   // pod namespace
	State        PodState                 // ready/not ready
	QOSClass     v1.PodQOSClass           // pod QoS class
	Labels       map[string]string        // pod labels
	Annotations  map[string]string        // pod annotations
	CgroupParent string                   // cgroup parent directory
	containers   map[string]string        // container name to ID map
	Resources    *PodResourceRequirements // annotated resource requirements
	Affinity     *podContainerAffinity    // annotated container affinity
}

// ContainerState is the container state in the runtime.
type ContainerState int32

const (
	// ContainerStateCreated marks a container created, not running.
	ContainerStateCreated = ContainerState(int32(criv1.ContainerState_CONTAINER_CREATED))
	// ContainerStateRunning marks a container created, running.
	ContainerStateRunning = ContainerState(int32(criv1.ContainerState_CONTAINER_RUNNING))
	// ContainerStateExited marks a container exited.
	ContainerStateExited = ContainerState(int32(criv1.ContainerState_CONTAINER_EXITED))
	// ContainerStateUnknown marks a container to be in an unknown state.
	ContainerStateUnknown = ContainerState(int32(criv1.ContainerState_CONTAINER_UNKNOWN))
	// ContainerStateCreating marks a container as being created.
	ContainerStateCreating = ContainerState(int32(ContainerStateUnknown) + 1)
	// ContainerStateStale marks a container removed.
	ContainerStateStale = ContainerState(int32(ContainerStateUnknown) + 2)
)

// Container is the exposed interface from a cached container.
type Container interface {
	resmgr.Evaluable
	fmt.Stringer
	// PrettyName returns the user-friendly <pod>:<container> name for the container.
	PrettyName() string
	// GetPod returns the pod of the container and a boolean indicating if there was one.
	GetPod() (Pod, bool)
	// GetID returns the ID of the container.
	GetID() string
	// GetPodID returns the pod ID of the container.
	GetPodID() string
	// GetCacheID returns the cacheID of the container.
	GetCacheID() string
	// GetName returns the name of the container.
	GetName() string
	// GetNamespace returns the namespace of the container.
	GetNamespace() string
	// UpdateState updates the state of the container.
	UpdateState(ContainerState)
	// GetState returns the ContainerState of the container.
	GetState() ContainerState
	// GetQOSClass returns the QoS class the pod would have if this was its only container.
	GetQOSClass() v1.PodQOSClass
	// GetImage returns the image of the container.
	GetImage() string
	// GetCommand returns the container command.
	GetCommand() []string
	// GetArgs returns the container command arguments.
	GetArgs() []string
	// GetLabelKeys returns the keys of all labels of the container.
	GetLabelKeys() []string
	// GetLabel returns the value of a container label.
	GetLabel(string) (string, bool)
	// GetLabels returns a copy of all container labels.
	GetLabels() map[string]string
	// GetResmgrLabelKeys returns container label keys (without the namespace
	// part) in cri-resource-manager namespace.
	GetResmgrLabelKeys() []string
	// GetResmgrLabel returns the value of a container label from the
	// cri-resource-manager namespace.
	GetResmgrLabel(string) (string, bool)
	// GetAnnotationKeys returns the keys of all annotations of the container.
	GetAnnotationKeys() []string
	// GetAnnotation returns the value of a container annotation.
	GetAnnotation(key string, objPtr interface{}) (string, bool)
	// GetResmgrAnnotationKeys returns container annotation keys (without the
	// namespace part) in cri-resource-manager namespace.
	GetResmgrAnnotationKeys() []string
	// GetResmgrAnnotation returns the value of a container annotation from the
	// cri-resource-manager namespace.
	GetResmgrAnnotation(key string, objPtr interface{}) (string, bool)
	// GetEffectiveAnnotation returns the effective annotation for the container from the pod.
	GetEffectiveAnnotation(key string) (string, bool)
	// GetAnnotations returns a copy of all container annotations.
	GetAnnotations() map[string]string
	// GetEnvKeys returns the keys of all container environment variables.
	GetEnvKeys() []string
	// GetEnv returns the value of a container environment variable.
	GetEnv(string) (string, bool)
	// GetMounts returns all the mounts of the container.
	GetMounts() []Mount
	// GetMountByHost returns the container path corresponding to the host path.
	// XXX We should remove this as it might not be unique.
	GetMountByHost(string) *Mount
	// GetMountByContainer returns the host path mounted to a container path.
	GetMountByContainer(string) *Mount
	// GetDevices returns the devices of the container.
	GetDevices() []Device
	// GetDeviceByHost returns the device for a host path.
	GetDeviceByHost(string) *Device
	// GetDeviceByContainer returns the device for a container path.
	GetDeviceByContainer(string) *Device
	// GetResourceRequirements returns the webhook-annotated requirements for this container.
	GetResourceRequirements() v1.ResourceRequirements
	// GetLinuxResources returns the CRI linux resource request of the container.
	GetLinuxResources() *criv1.LinuxContainerResources
	// SetCommand sets the container command.
	SetCommand([]string)
	// SetArgs sets the container command arguments.
	SetArgs([]string)
	// SetLabel sets the value for a container label.
	SetLabel(string, string)
	// DeleteLabel removes a container label.
	DeleteLabel(string)
	// SetAnnotation sets the value for a container annotation.
	SetAnnotation(string, string)
	// DeleteAnnotation removes a container annotation.
	DeleteAnnotation(string)
	// SetEnv sets a container environment variable.
	SetEnv(string, string)
	// UnsetEnv unsets a container environment variable.
	UnsetEnv(string)
	// InsertMount inserts a mount into the container.
	InsertMount(*Mount)
	// DeleteMount removes a mount from the container.
	DeleteMount(string)
	// InsertDevice inserts a device into the container.
	InsertDevice(*Device)
	// DeleteDevice removes a device from the container.
	DeleteDevice(string)
	// GetTopologyHints returns any attached topology hints.
	GetTopologyHints() topology.Hints
	// GetCPUPeriod gets the CFS CPU period of the container.
	GetCPUPeriod() int64
	// GetCPUQuota gets the CFS CPU quota of the container.
	GetCPUQuota() int64
	// GetCPUShares gets the CFS CPU shares of the container.
	GetCPUShares() int64
	// GetMemoryLimit gets the memory limit in bytes for the container.
	GetMemoryLimit() int64
	// GetOomScoreAdj gets the OOM score adjustment for the container.
	GetOomScoreAdj() int64
	// GetCpusetCpus gets the cgroup cpuset.cpus of the container.
	GetCpusetCpus() string
	// GetCpusetMems gets the cgroup cpuset.mems of the container.
	GetCpusetMems() string
	// SetLinuxResources sets the Linux-specific resource request of the container.
	SetLinuxResources(*criv1.LinuxContainerResources)
	// SetCPUPeriod sets the CFS CPU period of the container.
	SetCPUPeriod(int64)
	// SetCPUQuota sets the CFS CPU quota of the container.
	SetCPUQuota(int64)
	// SetCPUShares sets the CFS CPU shares of the container.
	SetCPUShares(int64)
	// SetMemoryLimit sets the memory limit in bytes for the container.
	SetMemoryLimit(int64)
	// SetOomScoreAdj sets the OOM score adjustment for the container.
	SetOomScoreAdj(int64)
	// SetCpusetCpus sets the cgroup cpuset.cpus of the container.
	SetCpusetCpus(string)
	// SetCpusetMems sets the cgroup cpuset.mems of the container.
	SetCpusetMems(string)
	// GetAffinity returns the annotated affinity expressions for this container.
	GetAffinity() ([]*Affinity, error)
	// GetCgroupDir returns the relative path of the cgroup directory for the container.
	GetCgroupDir() string
	// SetRDTClass assigns this container to the given RDT class.
	SetRDTClass(string)
	// GetRDTClass returns the RDT class for this container.
	GetRDTClass() string
	// SetBlockIOClass assigns this container to the given BlockIO class.
	SetBlockIOClass(string)
	// GetBlockIOClass returns the BlockIO class for this container.
	GetBlockIOClass() string
	// SetToptierLimit sets the top tier memory limit for the container.
	SetToptierLimit(int64)
	// GetToptierLimit returns the top tier memory limit for the container.
	GetToptierLimit() int64
	// SetPageMigration sets the page migration policy/options for the container.
	SetPageMigration(*PageMigrate)
	// GetPageMigration returns the current page migration policy/options for the container.
	GetPageMigration() *PageMigrate
	// GetProcesses returns the pids of processes in the container.
	GetProcesses() ([]string, error)
	// GetTasks returns the pids of threads in the container.
	GetTasks() ([]string, error)
	// SetCRIRequest sets the current pending CRI request of the container.
	SetCRIRequest(req interface{}) error
	// GetCRIRequest returns the current pending CRI request of the container.
	GetCRIRequest() (interface{}, bool)
	// ClearCRIRequest clears and returns the current pending CRI request of the container.
	ClearCRIRequest() (interface{}, bool)
	// GetCRIEnvs returns container environment variables.
	GetCRIEnvs() []*criv1.KeyValue
	// GetCRIMounts returns container mounts.
	GetCRIMounts() []*criv1.Mount
	// GetCRIDevices returns container devices.
	GetCRIDevices() []*criv1.Device
	// GetPending gets the names of the controllers with pending changes.
	GetPending() []string
	// HasPending checks if the container has pending changes for the given controller.
	HasPending(string) bool
	// ClearPending clears the pending change marker for the given controller.
	ClearPending(string)
	// GetTag gets the value of the given tag.
	GetTag(string) (string, bool)
	// SetTag sets the value of the given tag and returns its previous value.
	SetTag(string, string) (string, bool)
	// DeleteTag deletes the given tag, returning its deleted value.
	DeleteTag(string) (string, bool)
}

// A cached container.
type container struct {
	cache         *cache                         // our cache of objects
	ID            string                         // container runtime id
	PodID         string                         // associated pod's runtime id
	CacheID       string                         // our cache id
	Name          string                         // container name
	Namespace     string                         // container namespace
	State         ContainerState                 // created/running/exited/unknown
	Image         string                         // container image
	Command       []string                       // command to run in container
	Args          []string                       // arguments for command
	Labels        map[string]string              // container labels
	Annotations   map[string]string              // container annotations
	Env           map[string]string              // environment variables
	Mounts        map[string]*Mount              // mounts
	Devices       map[string]*Device             // devices
	TopologyHints topology.Hints                 // set of topology hints for all containers within the pod
	Tags          map[string]string              // container tags (local dynamic labels)
	Adjustment    string                         // name of applicable external adjustment, if any
	Resources     v1.ResourceRequirements        // container resources (from webhook annotation)
	LinuxReq      *criv1.LinuxContainerResources // used to estimate Resources if we lack annotations
	req           *interface{}                   // pending CRI request
	CgroupDir     string                         // cgroup directory relative to a(ny) controller
	RDTClass      string                         // RDT class this container is assigned to
	BlockIOClass  string                         // Block I/O class this container is assigned to
	ToptierLimit  int64                          // top tier memory limit
	PageMigrate   *PageMigrate                   // page migration policy/options for this container
	pending       map[string]struct{}            // controllers with pending changes for this container
	prettyName    string                         // cached PrettyName()
}

// MountType is a propagation type.
type MountType int32

const (
	// MountPrivate is a private container mount.
	MountPrivate MountType = MountType(criv1.MountPropagation_PROPAGATION_PRIVATE)
	// MountHostToContainer is a host-to-container mount.
	MountHostToContainer MountType = MountType(criv1.MountPropagation_PROPAGATION_HOST_TO_CONTAINER)
	// MountBidirectional is a bidirectional mount.
	MountBidirectional MountType = MountType(criv1.MountPropagation_PROPAGATION_BIDIRECTIONAL)
)

// Mount is a filesystem entry mounted inside a container.
type Mount struct {
	// Container is the path inside the container.
	Container string
	// Host is the path on the host.
	Host string
	// Readonly specifies if the mount is read-only or read-write.
	Readonly bool
	// Relabel denotes SELinux relabeling.
	Relabel bool
	// Propagation identifies the mount propagation type.
	Propagation MountType
}

// Device is a device exposed to a container.
type Device struct {
	// Container is the device path inside the container.
	Container string
	// Host is the device path on the host side.
	Host string
	// Permissions specify the device permissions for the container.
	Permissions string
}

// PageMigrate contains the policy/preferences for container page migration.
type PageMigrate struct {
	SourceNodes idset.IDSet // idle memory pages on these NUMA nodes
	TargetNodes idset.IDSet // should be migrated to these NUMA nodes
}

// Clone creates a copy of the page migration policy/preferences.
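// Clone is nil-safe: invoked on a nil *PageMigrate it returns nil, so callers
// need not special-case containers without a migration policy.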
func (pm *PageMigrate) Clone() *PageMigrate {
	if pm == nil {
		return nil
	}
	c := &PageMigrate{}
	if pm.SourceNodes != nil {
		c.SourceNodes = pm.SourceNodes.Clone()
	}
	if pm.TargetNodes != nil {
		c.TargetNodes = pm.TargetNodes.Clone()
	}
	return c
}

// Cachable is an interface opaque cachable data must implement.
type Cachable interface {
	// Set value (via a pointer receiver) to the object.
	Set(value interface{})
	// Get the object that should be cached.
	Get() interface{}
}

// Cache is the primary interface exposed for tracking pods and containers.
//
// Cache tracks pods and containers in the runtime, mostly by processing CRI
// requests and responses which the cache is fed as these are being processed.
// Cache also saves its state to secondary storage upon changes and restores
// itself upon startup.
type Cache interface {
	// InsertPod inserts a pod into the cache, using a runtime request or reply.
	InsertPod(id string, msg interface{}, status *PodStatus) (Pod, error)
	// DeletePod deletes a pod from the cache.
	DeletePod(id string) Pod
	// LookupPod looks up a pod in the cache.
	LookupPod(id string) (Pod, bool)
	// InsertContainer inserts a container into the cache, using a runtime request or reply.
	InsertContainer(msg interface{}) (Container, error)
	// UpdateContainerID updates a container's runtime id.
	UpdateContainerID(cacheID string, msg interface{}) (Container, error)
	// DeleteContainer deletes a container from the cache.
	DeleteContainer(id string) Container
	// LookupContainer looks up a container in the cache.
	LookupContainer(id string) (Container, bool)
	// LookupContainerByCgroup looks up a container for the given cgroup path.
	LookupContainerByCgroup(path string) (Container, bool)
	// GetPendingContainers returns all containers with pending changes.
	GetPendingContainers() []Container
	// GetPods returns all the pods known to the cache.
	GetPods() []Pod
	// GetContainers returns all the containers known to the cache.
	GetContainers() []Container
	// GetContainerCacheIds returns the cache ids of all containers.
	GetContainerCacheIds() []string
	// GetContainerIds returns the ids of all containers.
	GetContainerIds() []string
	// FilterScope returns the containers selected by the scope expression.
	FilterScope(*resmgr.Expression) []Container
	// EvaluateAffinity evaluates the given affinity against all known in-scope containers.
	EvaluateAffinity(*Affinity) map[string]int32
	// AddImplicitAffinities adds a set of implicit affinities (added to all containers).
	AddImplicitAffinities(map[string]ImplicitAffinity) error
	// GetActivePolicy returns the name of the active policy stored in the cache.
	GetActivePolicy() string
	// SetActivePolicy updates the name of the active policy stored in the cache.
	SetActivePolicy(string) error
	// ResetActivePolicy clears the active policy and any policy-specific data from the cache.
	ResetActivePolicy() error
	// SetPolicyEntry sets the policy entry for a key.
	SetPolicyEntry(string, interface{})
	// GetPolicyEntry gets the policy entry for a key.
	GetPolicyEntry(string, interface{}) bool
	// SetConfig caches the given configuration.
	SetConfig(*config.RawConfig) error
	// GetConfig returns the current/cached configuration.
	GetConfig() *config.RawConfig
	// ResetConfig clears any stored configuration from the cache.
	ResetConfig() error
	// SetAdjustment updates external adjustments and containers based on this.
	SetAdjustment(*config.Adjustment) (bool, map[string]error)
	// Save requests a cache save.
	Save() error
	// RefreshPods purges/inserts stale/new pods/containers using a pod sandbox list response.
RefreshPods(*criv1.ListPodSandboxResponse, map[string]*PodStatus) ([]Pod, []Pod, []Container) // RefreshContainers purges/inserts stale/new containers using a container list response. RefreshContainers(*criv1.ListContainersResponse) ([]Container, []Container) // Get the container (data) directory for a container. ContainerDirectory(string) string // OpenFile opens the named container data file, creating it if necessary. OpenFile(string, string, os.FileMode) (*os.File, error) // WriteFile writes a container data file, creating it if necessary. WriteFile(string, string, os.FileMode, []byte) error } const ( // CacheVersion is the running version of the cache. CacheVersion = "1" ) // permissions describe preferred/expected ownership and permissions for a file or directory. type permissions struct { prefer os.FileMode // permissions to create file/directory with reject os.FileMode // bits that cause rejection to use an existing entry } // permissions to create with/check against var ( cacheDirPerm = &permissions{prefer: 0710, reject: 0022} cacheFilePerm = &permissions{prefer: 0644, reject: 0022} dataDirPerm = &permissions{prefer: 0755, reject: 0022} dataFilePerm = &permissions{prefer: 0644, reject: 0022} ) // Our cache of objects. type cache struct { sync.Mutex `json:"-"` // we're lockable logger.Logger `json:"-"` // cache logger instance filePath string // where to store to/load from dataDir string // container data directory Pods map[string]*pod // known/cached pods Containers map[string]*container // known/cached containers NextID uint64 // next container cache id to use Cfg *config.RawConfig // cached/current configuration External *config.Adjustment // cached/current external adjustments PolicyName string // name of the active policy policyData map[string]interface{} // opaque policy data PolicyJSON map[string]string // ditto in raw, marshaled (JSON) form pending map[string]struct{} // cache IDs of containers with pending changes implicit map[string]ImplicitAffinity // implicit affinities } // Make sure cache implements Cache. var _ Cache = &cache{} // Options contains the configurable cache options. type Options struct { // CacheDir is the directory the cache should save its state in. CacheDir string } // NewCache instantiates a new cache, loading it from the given path if it exists. func NewCache(options Options) (Cache, error) { cch := &cache{ filePath: filepath.Join(options.CacheDir, "cache"), dataDir: filepath.Join(options.CacheDir, "containers"), Logger: logger.NewLogger("cache"), Pods: make(map[string]*pod), Containers: make(map[string]*container), NextID: 1, policyData: make(map[string]interface{}), PolicyJSON: make(map[string]string), implicit: make(map[string]ImplicitAffinity), } if _, err := cch.checkPerm("cache", cch.filePath, false, cacheFilePerm); err != nil { return nil, cacheError("refusing to use existing cache file: %v", err) } if err := cch.mkdirAll("cache", options.CacheDir, cacheDirPerm); err != nil { return nil, err } if err := cch.mkdirAll("container", cch.dataDir, dataDirPerm); err != nil { return nil, err } if err := cch.Load(); err != nil { return nil, err } return cch, nil } // GetActivePolicy returns the name of the active policy stored in the cache. func (cch *cache) GetActivePolicy() string { return cch.PolicyName }
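// Illustrative sketch (not part of the original source): typical cache bring-up
// as the resource manager or a test might do it. NewCache, Options and the
// policy bookkeeping methods are the real package APIs defined in this file;
// the function name and the "topology-aware" policy name are made up.
func exampleCacheBringUp(stateDir string) (Cache, error) {
	cch, err := NewCache(Options{CacheDir: stateDir})
	if err != nil {
		return nil, err
	}
	// A restored cache remembers which policy saved its state; a policy
	// switch is the natural point to drop stale policy-specific data.
	if active := cch.GetActivePolicy(); active != "" && active != "topology-aware" {
		if err := cch.ResetActivePolicy(); err != nil {
			return nil, err
		}
	}
	if err := cch.SetActivePolicy("topology-aware"); err != nil {
		return nil, err
	}
	return cch, nil
}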
// SetActivePolicy updates the name of the active policy stored in the cache. func (cch *cache) SetActivePolicy(policy string) error { cch.PolicyName = policy return cch.Save() } // ResetActivePolicy clears the active policy and any policy-specific data from the cache. func (cch *cache) ResetActivePolicy() error { cch.Warn("clearing all data for active policy (%q) from cache...", cch.PolicyName) cch.PolicyName = "" cch.policyData = make(map[string]interface{}) cch.PolicyJSON = make(map[string]string) return cch.Save() } // SetConfig caches the given configuration. func (cch *cache) SetConfig(cfg *config.RawConfig) error { old := cch.Cfg cch.Cfg = cfg if err := cch.Save(); err != nil { cch.Cfg = old return err } return nil } // GetConfig returns the current/cached configuration. func (cch *cache) GetConfig() *config.RawConfig { return cch.Cfg } // ResetConfig clears any stored configuration from the cache. func (cch *cache) ResetConfig() error { old := cch.Cfg cch.Cfg = nil if err := cch.Save(); err != nil { cch.Cfg = old return err } return nil } // SetAdjustment updates external adjustments and containers based on this. func (cch *cache) SetAdjustment(external *config.Adjustment) (bool, map[string]error) { effective := map[*container]string{} // collect per container external adjustments, checking for obvious errors errors := map[string]error{} for id, c := range cch.Containers { if id != c.GetCacheID() { continue } adjustments := cch.getApplicableAdjustments(external, c) if len(adjustments) == 0 { continue } // conflict: multiple adjustments per container if len(adjustments) > 1 { errors[c.GetID()] = cacheError("conflicting adjustments for %s: %s", c.PrettyName(), strings.Join(adjustments, ",")) continue } adjust := external.Adjustments[adjustments[0]] // error: trying to override resources for BestEffort container if c.GetQOSClass() == v1.PodQOSBestEffort { if adjust.Resources != nil { errors[c.GetID()] = cacheError("%s: can't override resources for BestEffort %s", adjustments[0], c.PrettyName()) continue } } effective[c] = adjustments[0] } if len(errors) > 0 { return false, errors } // update per container external adjustments, mark all containers with pending changes for id, c := range cch.Containers { if id != c.GetCacheID() { continue } uptodate := effective[c] previous := c.setEffectiveAdjustment(uptodate) effective[c] = previous if previous != uptodate { cch.Info("%s effective external adjustment changed from %q to %q", c.PrettyName(), previous, uptodate) } c.markPending(allControllers...) } if err := cch.Save(); err != nil { for id, c := range cch.Containers { if id != c.GetCacheID() { continue } c.setEffectiveAdjustment(effective[c]) } return false, map[string]error{"cache": err} } cch.External = external return true, nil } // Get all external adjustments applicable to the given container. func (cch *cache) getApplicableAdjustments(ext *config.Adjustment, c *container) []string { if ext == nil { return []string{} } applicable := []string{} for name, adjust := range ext.Adjustments { if adjust.IsContainerInScope(c) { applicable = append(applicable, name) } } return applicable }
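// Illustrative sketch (not part of the original source): applying an external
// adjustment update with SetAdjustment above. The update is all-or-nothing:
// on a conflict (or a failed cache save) it returns false with per-container
// errors, and the previous adjustments stay in effect. The function name is
// hypothetical.
func exampleApplyAdjustment(cch Cache, adjust *config.Adjustment) error {
	ok, errs := cch.SetAdjustment(adjust)
	if ok {
		return nil
	}
	// Surface one of the per-container (or cache save) errors.
	for id, err := range errs {
		return cacheError("external adjustment update rejected (%s): %v", id, err)
	}
	return cacheError("external adjustment update rejected")
}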
// setEffectiveAdjustment updates the effective adjustments of all containers. func (cch *cache) setEffectiveAdjustment(effective map[*container]string) { for id, c := range cch.Containers { if id != c.GetCacheID() { continue } uptodate := effective[c] previous := c.setEffectiveAdjustment(uptodate) if previous != uptodate { cch.Info("%s effective external adjustment changed from %q to %q", c.PrettyName(), previous, uptodate) } // we forcibly mark the container as updated in all controller domains for _, ctrl := range allControllers { c.markPending(ctrl) } } } // Derive cache id using pod uid, or allocate a new unused local cache id. func (cch *cache) createCacheID(c *container) string { if pod, ok := c.cache.LookupPod(c.PodID); ok { uid := pod.GetUID() if uid != "" { return uid + ":" + c.Name } } cch.Warn("can't find unique id for pod %s, assigning local cache id", c.PodID) id := "cache:" + strconv.FormatUint(cch.NextID, 16) cch.NextID++ return id } // Insert a pod into the cache. func (cch *cache) InsertPod(id string, msg interface{}, status *PodStatus) (Pod, error) { var err error p := &pod{cache: cch, ID: id} switch msg.(type) { case *criv1.RunPodSandboxRequest: err = p.fromRunRequest(msg.(*criv1.RunPodSandboxRequest)) case *criv1.PodSandbox: err = p.fromListResponse(msg.(*criv1.PodSandbox), status) default: err = fmt.Errorf("cannot create pod from message %T", msg) } if err != nil { cch.Error("failed to insert pod %s: %v", id, err) return nil, err } cch.Pods[p.ID] = p cch.Save() return p, nil } // Delete a pod from the cache. func (cch *cache) DeletePod(id string) Pod { p, ok := cch.Pods[id] if !ok { return nil } cch.Debug("removing pod %s (%s)", p.Name, p.ID) delete(cch.Pods, id) cch.Save() return p } // Look up a pod in the cache. func (cch *cache) LookupPod(id string) (Pod, bool) { p, ok := cch.Pods[id] return p, ok } // Insert a container into the cache. func (cch *cache) InsertContainer(msg interface{}) (Container, error) { var err error c := &container{ cache: cch, } switch msg.(type) { case *criv1.CreateContainerRequest: err = c.fromCreateRequest(msg.(*criv1.CreateContainerRequest)) case *criv1.Container: err = c.fromListResponse(msg.(*criv1.Container)) default: err = fmt.Errorf("cannot create container from message %T", msg) } if err != nil { return nil, cacheError("failed to insert container %s: %v", c.CacheID, err) } c.CacheID = cch.createCacheID(c) cch.Containers[c.CacheID] = c if c.ID != "" { cch.Containers[c.ID] = c } cch.createContainerDirectory(c.CacheID) adjustments := cch.getApplicableAdjustments(cch.External, c) switch { case len(adjustments) > 1: cch.Error("conflicting adjustments for %s: %s", c.PrettyName(), strings.Join(adjustments, ",")) case len(adjustments) == 1: c.setEffectiveAdjustment(adjustments[0]) } cch.Save() return c, nil } // UpdateContainerID updates a container's runtime id. func (cch *cache) UpdateContainerID(cacheID string, msg interface{}) (Container, error) { c, ok := cch.Containers[cacheID] if !ok { return nil, cacheError("%s: failed to update ID, container not found", cacheID) } reply, ok := msg.(*criv1.CreateContainerResponse) if !ok { return nil, cacheError("%s: failed to update ID from message %T", c.PrettyName(), msg) } c.ID = reply.ContainerId cch.Containers[c.ID] = c cch.Save() return c, nil }
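// Illustrative sketch (not part of the original source): the relation between
// cache IDs and runtime IDs. InsertContainer indexes a container under its
// stable cache ID ("<pod UID>:<container name>"); once the runtime assigns
// the real ID, UpdateContainerID indexes it under that as well, so both IDs
// resolve to the same container. The function name is hypothetical.
func exampleContainerIDs(cch Cache, req *criv1.CreateContainerRequest, reply *criv1.CreateContainerResponse) (Container, error) {
	c, err := cch.InsertContainer(req) // indexed under the cache ID only
	if err != nil {
		return nil, err
	}
	return cch.UpdateContainerID(c.GetCacheID(), reply) // now also under the runtime ID
}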
// Delete a container from the cache. func (cch *cache) DeleteContainer(id string) Container { c, ok := cch.Containers[id] if !ok { return nil } cch.Debug("removing container %s", c.PrettyName()) cch.removeContainerDirectory(c.CacheID) delete(cch.Containers, c.ID) delete(cch.Containers, c.CacheID) cch.Save() return c } // Look up a container in the cache. func (cch *cache) LookupContainer(id string) (Container, bool) { c, ok := cch.Containers[id] return c, ok } // LookupContainerByCgroup looks up the container for the given cgroup path. func (cch *cache) LookupContainerByCgroup(path string) (Container, bool) { cch.Debug("resolving %s to a container...", path) for id, c := range cch.Containers { if id != c.CacheID { continue } parent := "" if pod, ok := c.GetPod(); ok { parent = pod.GetCgroupParentDir() } if parent == "" { continue } if !strings.HasPrefix(path, parent+"/") { continue } if strings.Contains(path, c.GetID()) { return c, true } } return nil, false } // RefreshPods purges/inserts stale/new pods/containers using a pod sandbox list response. func (cch *cache) RefreshPods(msg *criv1.ListPodSandboxResponse, status map[string]*PodStatus) ([]Pod, []Pod, []Container) { valid := make(map[string]struct{}) add := []Pod{} del := []Pod{} containers := []Container{} for _, item := range msg.Items { valid[item.Id] = struct{}{} if _, ok := cch.Pods[item.Id]; !ok { cch.Debug("inserting discovered pod %s...", item.Id) pod, err := cch.InsertPod(item.Id, item, status[item.Id]) if err != nil { cch.Error("failed to insert discovered pod %s to cache: %v", item.Id, err) } else { add = append(add, pod) } } } for _, pod := range cch.Pods { if _, ok := valid[pod.ID]; !ok { cch.Debug("purging stale pod %s...", pod.ID) pod.State = PodStateStale del = append(del, cch.DeletePod(pod.ID)) } } for id, c := range cch.Containers { if _, ok := valid[c.PodID]; !ok { cch.Debug("purging container %s of stale pod %s...", c.CacheID, c.PodID) cch.DeleteContainer(c.CacheID) c.State = ContainerStateStale if id == c.CacheID { containers = append(containers, c) } } } return add, del, containers } // RefreshContainers purges/inserts stale/new containers using a container list response. func (cch *cache) RefreshContainers(msg *criv1.ListContainersResponse) ([]Container, []Container) { valid := make(map[string]struct{}) add := []Container{} del := []Container{} for _, c := range msg.Containers { if ContainerState(c.State) == ContainerStateExited { continue } valid[c.Id] = struct{}{} if _, ok := cch.Containers[c.Id]; !ok { cch.Debug("inserting discovered container %s...", c.Id) inserted, err := cch.InsertContainer(c) if err != nil { cch.Error("failed to insert discovered container %s to cache: %v", c.Id, err) } else { add = append(add, inserted) } } } for id, c := range cch.Containers { if _, ok := valid[c.ID]; !ok { cch.Debug("purging stale container %s (state: %v)...", c.CacheID, c.GetState()) cch.DeleteContainer(c.CacheID) c.State = ContainerStateStale if id == c.CacheID { del = append(del, c) } } } return add, del } // Mark a container as having pending changes. func (cch *cache) markPending(c *container) { if cch.pending == nil { cch.pending = make(map[string]struct{}) } cch.pending[c.CacheID] = struct{}{} } // Get all containers with pending changes. func (cch *cache) GetPendingContainers() []Container { pending := make([]Container, 0, len(cch.pending)) for id := range cch.pending { c, ok := cch.LookupContainer(id) if ok { pending = append(pending, c) } } return pending }
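// Illustrative sketch (not part of the original source): draining pending
// changes. Mutating a container (SetCpusetCpus, SetRDTClass, and so on)
// marks it pending in the corresponding controller domains; a controller
// loop like the hypothetical one below enforces the changes and clears
// the marks with ClearPending.
func exampleDrainPending(cch Cache) {
	for _, c := range cch.GetPendingContainers() {
		for _, controller := range c.GetPending() {
			// ... enforce the pending change in this controller domain ...
			c.ClearPending(controller)
		}
	}
}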
// clear the pending state of the given container. func (cch *cache) clearPending(c *container) { delete(cch.pending, c.CacheID) } // Get the cache ids of all cached containers. func (cch *cache) GetContainerCacheIds() []string { ids := make([]string, len(cch.Containers)) idx := 0 for id, c := range cch.Containers { if id != c.CacheID { continue } ids[idx] = c.CacheID idx++ } return ids[0:idx] } // Get the ids of all cached containers. func (cch *cache) GetContainerIds() []string { ids := make([]string, len(cch.Containers)) idx := 0 for id, c := range cch.Containers { if id == c.CacheID { continue } ids[idx] = c.ID idx++ } return ids[0:idx] } // GetPods returns all pods present in the cache. func (cch *cache) GetPods() []Pod { pods := make([]Pod, 0, len(cch.Pods)) for _, pod := range cch.Pods { pods = append(pods, pod) } return pods } // GetContainers returns all the containers present in the cache. func (cch *cache) GetContainers() []Container { containers := make([]Container, 0, len(cch.Containers)/2) for id, container := range cch.Containers { if id != container.CacheID { continue } containers = append(containers, container) } return containers } // Set the policy entry for a key. func (cch *cache) SetPolicyEntry(key string, obj interface{}) { cch.policyData[key] = obj if cch.DebugEnabled() { if data, err := marshalEntry(obj); err != nil { cch.Error("marshalling of policy entry '%s' failed: %v", key, err) } else { cch.Debug("policy entry '%s' set to '%s'", key, string(data)) } } } // Get the policy entry for a key. func (cch *cache) GetPolicyEntry(key string, ptr interface{}) bool { // // Notes: // We try to serve requests from the unmarshaled cache (policyData). // If that fails (may be a first access since load) we look for the // entry in the marshaled cache (PolicyJSON), unmarshal it, and cache // the result if found. // Note the quirk: in the latter case we first directly unmarshal to // the pointer provided by the caller, only then Get() and cache the // result. // obj, ok := cch.policyData[key] if !ok { entry, ok := cch.PolicyJSON[key] if !ok { return false } // first access to key since startup if err := unmarshalEntry([]byte(entry), ptr); err != nil { cch.Fatal("failed to unmarshal '%s' policy entry for key '%s' (%T): %v", cch.PolicyName, key, ptr, err) } if err := cch.cacheEntry(key, ptr); err != nil { cch.Fatal("failed to cache '%s' policy entry for key '%s': %v", cch.PolicyName, key, err) } } else { // subsequent accesses to key if err := cch.setEntry(ptr, obj); err != nil { cch.Fatal("failed to use cached entry for key '%s' of policy '%s': %v", key, cch.PolicyName, err) } } return true } // Marshal an opaque policy entry, special-casing cpusets and maps of cpusets. func marshalEntry(obj interface{}) ([]byte, error) { switch obj.(type) { case cpuset.CPUSet: return []byte("\"" + obj.(cpuset.CPUSet).String() + "\""), nil case map[string]cpuset.CPUSet: dst := make(map[string]string) for key, cset := range obj.(map[string]cpuset.CPUSet) { dst[key] = cset.String() } return json.Marshal(dst) default: return json.Marshal(obj) } }
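// Illustrative sketch (not part of the original source): persisting opaque
// policy state through the cache. cpuset.CPUSet values are special-cased by
// marshalEntry/unmarshalEntry, so they survive Save/Load cycles. The
// function name and the "reserved" key are hypothetical.
func examplePolicyEntry(cch Cache) (cpuset.CPUSet, error) {
	reserved, err := cpuset.Parse("0-1")
	if err != nil {
		return cpuset.CPUSet{}, err
	}
	cch.SetPolicyEntry("reserved", reserved)
	// ... later, possibly after a restart and cache reload ...
	var cset cpuset.CPUSet
	if !cch.GetPolicyEntry("reserved", &cset) {
		return cpuset.CPUSet{}, cacheError("no cached policy entry for key %q", "reserved")
	}
	return cset, nil
}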
// Unmarshal an opaque policy entry, special-casing cpusets and maps of cpusets. func unmarshalEntry(data []byte, ptr interface{}) error { switch ptr.(type) { case *cpuset.CPUSet: cset, err := cpuset.Parse(string(data[1 : len(data)-1])) if err != nil { return err } *ptr.(*cpuset.CPUSet) = cset return nil case *map[string]cpuset.CPUSet: src := make(map[string]string) if err := json.Unmarshal(data, &src); err != nil { return cacheError("failed to unmarshal map[string]cpuset.CPUSet: %v", err) } dst := make(map[string]cpuset.CPUSet) for key, str := range src { cset, err := cpuset.Parse(str) if err != nil { return cacheError("failed to unmarshal cpuset.CPUSet '%s': %v", str, err) } dst[key] = cset } *ptr.(*map[string]cpuset.CPUSet) = dst return nil default: err := json.Unmarshal(data, ptr) return err } } // Cache an unmarshaled opaque policy entry, special-casing some simple/common types. func (cch *cache) cacheEntry(key string, ptr interface{}) error { if cachable, ok := ptr.(Cachable); ok { cch.policyData[key] = cachable.Get() return nil } switch ptr.(type) { case *cpuset.CPUSet: cch.policyData[key] = *ptr.(*cpuset.CPUSet) case *map[string]cpuset.CPUSet: cch.policyData[key] = *ptr.(*map[string]cpuset.CPUSet) case *map[string]string: cch.policyData[key] = *ptr.(*map[string]string) case *string: cch.policyData[key] = *ptr.(*string) case *bool: cch.policyData[key] = *ptr.(*bool) case *int32: cch.policyData[key] = *ptr.(*int32) case *uint32: cch.policyData[key] = *ptr.(*uint32) case *int64: cch.policyData[key] = *ptr.(*int64) case *uint64: cch.policyData[key] = *ptr.(*uint64) case *int: cch.policyData[key] = *ptr.(*int) case *uint: cch.policyData[key] = *ptr.(*uint) default: return cacheError("can't handle policy data of type %T", ptr) } return nil } // Serve an unmarshaled opaque policy entry, special-casing some simple/common types. func (cch *cache) setEntry(ptr, obj interface{}) error { if cachable, ok := ptr.(Cachable); ok { cachable.Set(obj) return nil } switch ptr.(type) { case *cpuset.CPUSet: *ptr.(*cpuset.CPUSet) = obj.(cpuset.CPUSet) case *map[string]cpuset.CPUSet: *ptr.(*map[string]cpuset.CPUSet) = obj.(map[string]cpuset.CPUSet) case *map[string]string: *ptr.(*map[string]string) = obj.(map[string]string) case *string: *ptr.(*string) = obj.(string) case *bool: *ptr.(*bool) = obj.(bool) case *int32: *ptr.(*int32) = obj.(int32) case *uint32: *ptr.(*uint32) = obj.(uint32) case *int64: *ptr.(*int64) = obj.(int64) case *uint64: *ptr.(*uint64) = obj.(uint64) case *int: *ptr.(*int) = obj.(int) case *uint: *ptr.(*uint) = obj.(uint) default: return cacheError("can't handle policy data of type %T", ptr) } return nil }
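// Illustrative sketch (not part of the original source): a policy-defined
// type implementing the Cachable interface. cacheEntry and setEntry above
// short-circuit for Cachable values, so policies can store state of types
// the built-in switches don't cover. The type is hypothetical; only the
// Set/Get contract comes from the interface.
type exampleAllocations struct {
	PoolByContainer map[string]string // cache ID -> assigned pool
}

// Set copies a cached value back into the receiver.
func (a *exampleAllocations) Set(value interface{}) {
	if v, ok := value.(*exampleAllocations); ok {
		*a = *v
	}
}

// Get returns the object that should be cached.
func (a *exampleAllocations) Get() interface{} {
	return a
}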
// checkPerm checks permissions of an already existing file or directory. func (cch *cache) checkPerm(what, path string, isDir bool, p *permissions) (bool, error) { if isDir { what += " directory" } info, err := os.Stat(path) if err != nil { if !errors.Is(err, os.ErrNotExist) { return true, cacheError("failed to os.Stat() %s %q: %v", what, path, err) } return false, nil } // check expected file type if isDir { if !info.IsDir() { return true, cacheError("%s %q exists, but is not a directory", what, path) } } else { if info.Mode()&os.ModeType != 0 { return true, cacheError("%s %q exists, but is not a regular file", what, path) } } existing := info.Mode().Perm() expected := p.prefer rejected := p.reject if ((expected | rejected) &^ os.ModePerm) != 0 { cch.Panic("internal error: current permissions check only handles permission bits (rwx)") } // check that we don't have any of the rejectable permission bits set if existing&rejected != 0 { return true, cacheError("existing %s %q has disallowed permissions set: %v", what, path, existing&rejected) } // warn if permissions are less strict than the preferred defaults if (existing | expected) != expected { cch.Warn("existing %s %q has less strict permissions %v than expected %v", what, path, existing, expected) } return true, nil } // mkdirAll creates a directory, checking permissions if it already exists. func (cch *cache) mkdirAll(what, path string, p *permissions) error { exists, err := cch.checkPerm(what, path, true, p) if err != nil { return err } if exists { return nil } if err := os.MkdirAll(path, p.prefer); err != nil { return cacheError("failed to create %s directory %q: %v", what, path, err) } return nil } // snapshot is used to serialize the cache into a saveable/loadable state. type snapshot struct { Version string Pods map[string]*pod Containers map[string]*container NextID uint64 Cfg *config.RawConfig PolicyName string PolicyJSON map[string]string } // Snapshot takes a restorable snapshot of the current state of the cache. func (cch *cache) Snapshot() ([]byte, error) { s := snapshot{ Version: CacheVersion, Pods: make(map[string]*pod), Containers: make(map[string]*container), Cfg: cch.Cfg, NextID: cch.NextID, PolicyName: cch.PolicyName, PolicyJSON: cch.PolicyJSON, } for id, p := range cch.Pods { s.Pods[id] = p } for id, c := range cch.Containers { if id == c.CacheID { s.Containers[c.CacheID] = c } } for key, obj := range cch.policyData { data, err := marshalEntry(obj) if err != nil { return nil, cacheError("failed to marshal policy entry '%s': %v", key, err) } s.PolicyJSON[key] = string(data) } data, err := json.Marshal(s) if err != nil { return nil, cacheError("failed to marshal cache: %v", err) } return data, nil }
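// Illustrative sketch (not part of the original source): Snapshot and the
// Restore function below form a serialization round trip, which Save/Load
// build on for persistence. Restoring a snapshot with a matching
// CacheVersion yields an equivalent cache. The function name is hypothetical.
func exampleSnapshotRoundTrip(cch *cache) error {
	data, err := cch.Snapshot()
	if err != nil {
		return err
	}
	return cch.Restore(data)
}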
// Restore restores a previously taken snapshot of the cache. func (cch *cache) Restore(data []byte) error { s := snapshot{ Pods: make(map[string]*pod), Containers: make(map[string]*container), PolicyJSON: make(map[string]string), } if err := json.Unmarshal(data, &s); err != nil { return cacheError("failed to unmarshal snapshot data: %v", err) } if s.Version != CacheVersion { return cacheError("can't restore snapshot, version '%s' != running version %s", s.Version, CacheVersion) } cch.Pods = s.Pods cch.Containers = s.Containers cch.Cfg = s.Cfg cch.NextID = s.NextID cch.PolicyJSON = s.PolicyJSON cch.PolicyName = s.PolicyName cch.policyData = make(map[string]interface{}) for _, p := range cch.Pods { p.cache = cch p.containers = make(map[string]string) } for _, c := range cch.Containers { c.cache = cch cch.Containers[c.CacheID] = c if c.ID != "" { cch.Containers[c.ID] = c } } return nil } // Save the state of the cache. func (cch *cache) Save() error { cch.Debug("saving cache to file '%s'...", cch.filePath) data, err := cch.Snapshot() if err != nil { return cacheError("failed to save cache: %v", err) } tmpPath := cch.filePath + ".saving" if err = os.WriteFile(tmpPath, data, cacheFilePerm.prefer); err != nil { return cacheError("failed to write cache to file %q: %v", tmpPath, err) } if err := os.Rename(tmpPath, cch.filePath); err != nil { return cacheError("failed to rename %q to %q: %v", tmpPath, cch.filePath, err) } return nil } // Load loads the last saved state of the cache. func (cch *cache) Load() error { cch.Debug("loading cache from file '%s'...", cch.filePath) data, err := os.ReadFile(cch.filePath) switch { case os.IsNotExist(err): cch.Debug("no cache file '%s', nothing to restore", cch.filePath) return nil case err != nil: // check other read errors before the empty-data case, so they are not silently ignored return cacheError("failed to load cache from file '%s': %v", cch.filePath, err) case len(data) == 0: cch.Debug("empty cache file '%s', nothing to restore", cch.filePath) return nil } return cch.Restore(data) } func (cch *cache) ContainerDirectory(id string) string { c, ok := cch.Containers[id] if !ok { return "" } return filepath.Join(cch.dataDir, strings.Replace(c.CacheID, ":", "-", 1)) } func (cch *cache) createContainerDirectory(id string) error { dir := cch.ContainerDirectory(id) if dir == "" { return cacheError("failed to determine container directory path for container %s", id) } return cch.mkdirAll("container directory", dir, dataDirPerm) } func (cch *cache) removeContainerDirectory(id string) error { dir := cch.ContainerDirectory(id) if dir == "" { return cacheError("failed to delete directory for container %s", id) } return os.RemoveAll(dir) } func (cch *cache) OpenFile(id string, name string, perm os.FileMode) (*os.File, error) { dir := cch.ContainerDirectory(id) if dir == "" { return nil, cacheError("failed to determine data directory for container %s", id) } if err := cch.mkdirAll("container directory", dir, dataDirPerm); err != nil { return nil, cacheError("container %s: can't create data file %q: %v", id, name, err) } path := filepath.Join(dir, name) if _, err := cch.checkPerm("container", path, false, dataFilePerm); err != nil { return nil, err } file, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm) if err != nil { return nil, cacheError("container %s: can't open data file %q: %v", id, path, err) } return file, nil } func (cch *cache) WriteFile(id string, name string, perm os.FileMode, data []byte) error { file, err := cch.OpenFile(id, name, perm) if err != nil { return err } defer file.Close() _, err = file.Write(data) return err } ================================================ FILE:
pkg/cri/resource-manager/cache/cache_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "fmt" "os" "strings" "testing" v1 "k8s.io/api/core/v1" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" ) var nextFakePodID = 1 var nextFakeContainerID = 1 type fakePod struct { name string uid string id string qos v1.PodQOSClass labels map[string]string annotations map[string]string podCfg *criv1.PodSandboxConfig } type fakeContainer struct { fakePod *fakePod name string id string labels map[string]string annotations map[string]string resources criv1.LinuxContainerResources } func createTmpCache() (Cache, string, error) { dir, err := os.MkdirTemp("", "cache-test") if err != nil { return nil, "", err } cch, err := NewCache(Options{CacheDir: dir}) if err != nil { return nil, "", err } return cch, dir, nil } func removeTmpCache(dir string) { if dir != "" { os.RemoveAll(dir) } } func createFakePod(cch Cache, fp *fakePod) (Pod, error) { if fp.labels == nil { fp.labels = make(map[string]string) } fp.id = fmt.Sprintf("pod%4.4d", nextFakePodID) fp.uid = fmt.Sprintf("poduid%4.4d", nextFakePodID) fp.labels[kubernetes.PodUIDLabel] = fp.uid nextFakePodID++ if string(fp.qos) == "" { fp.qos = v1.PodQOSBurstable } cgroupPath := "" if fp.qos != v1.PodQOSGuaranteed { pathClass := "kubepods-" + strings.ToLower(string(fp.qos)) cgroupPath = "/kubepods.slice/" + pathClass + ".slice/" + pathClass + "-pod" + fp.uid } else { cgroupPath = "/kubepods.slice/kubepods-pod" + strings.ReplaceAll(fp.uid, "-", "_") } req := &criv1.RunPodSandboxRequest{ Config: &criv1.PodSandboxConfig{ Metadata: &criv1.PodSandboxMetadata{ Name: fp.name, Uid: fp.uid, Namespace: "default", }, Labels: fp.labels, Annotations: fp.annotations, Linux: &criv1.LinuxPodSandboxConfig{ CgroupParent: cgroupPath, }, }, } fp.podCfg = req.Config cch.(*cache).Debug("*** => creating Pod: %+v\n", *req) p, err := cch.InsertPod(fp.id, req, nil) if err != nil { cch.(*cache).Debug("*** <= created Pod FAILED: %+v\n", err) return nil, err } cch.(*cache).Debug("*** <= created Pod: %+v\n", *p.(*pod)) return p, nil } func createFakeContainer(cch Cache, fc *fakeContainer) (Container, error) { if fc.labels == nil { fc.labels = make(map[string]string) } fc.id = fmt.Sprintf("container-id-%4.4d", nextFakeContainerID) nextFakeContainerID++ req := &criv1.CreateContainerRequest{ PodSandboxId: fc.fakePod.id, Config: &criv1.ContainerConfig{ Metadata: &criv1.ContainerMetadata{ Name: fc.name, }, Labels: fc.labels, Annotations: fc.annotations, Linux: &criv1.LinuxContainerConfig{ Resources: &fc.resources, }, }, SandboxConfig: fc.fakePod.podCfg, } cch.(*cache).Debug("*** => creating Container: %+v\n", *req) c, err := cch.InsertContainer(req) if err != nil { return nil, err } cch.(*cache).Debug("*** <= created Container: %+v\n", *c.(*container)) update := &criv1.CreateContainerResponse{ContainerId: 
fc.id} if _, err := cch.UpdateContainerID(c.GetCacheID(), update); err != nil { return nil, err } return c, nil } func TestLookupContainerByCgroup(t *testing.T) { fakePods := map[string]*fakePod{ "pod1": {name: "pod1"}, "pod2": {name: "pod2"}, "pod3": {name: "pod3"}, } fakePodContainers := map[string][]*fakeContainer{ "pod1": {{name: "container1"}, {name: "container2"}, {name: "err-container3"}}, "pod2": {{name: "err-container4"}, {name: "container5"}, {name: "err-container6"}}, "pod3": {{name: "container7"}, {name: "container8"}, {name: "container10"}}, } cch, dir, err := createTmpCache() if err != nil { t.Errorf("failed: %v", err) } defer removeTmpCache(dir) for _, fp := range fakePods { _, err := createFakePod(cch, fp) if err != nil { t.Errorf("failed to create fake pod: %v", err) } } for podName, fcs := range fakePodContainers { fp, ok := fakePods[podName] if !ok { t.Errorf("failed to find fake pod '%s'", podName) } for _, fc := range fcs { fc.fakePod = fp if _, err := createFakeContainer(cch, fc); err != nil { t.Errorf("failed to create fake container '%s.%s': %v", podName, fc.name, err) } } } for _, c := range cch.GetContainers() { p, ok := c.GetPod() if !ok { t.Errorf("failed to find Pod for Container %s", c.PrettyName()) } podCgroupDir := p.GetCgroupParentDir() path := podCgroupDir + "/container-" + c.GetID() + ".scope" cch.(*cache).Info("=> %s: testing lookup by cgroup path %s...", c.PrettyName(), path) chk, ok := cch.LookupContainerByCgroup(path) if !ok { t.Errorf("failed to look up container %s by cgroup path %s (pod parent cgroup: %s)", c.PrettyName(), path, podCgroupDir) } cch.(*cache).Info("<= %s", chk.PrettyName()) if strings.HasPrefix(c.GetName(), "err-") { path := podCgroupDir + "-another/container-" + c.GetID() + ".scope" cch.(*cache).Info("=> %s: testing lookup failure by cgroup path %s...", c.PrettyName(), path) chk, ok := cch.LookupContainerByCgroup(path) if ok { t.Errorf("look up of container %s by path %s should have failed, but gave %s", c.PrettyName(), path, chk.PrettyName()) } cch.(*cache).Info("<= OK (not found as expected)") } if chk.GetID() != c.GetID() { t.Errorf("found container %s is not the expected %s", chk.GetID(), c.GetID()) } } } func TestDefaultRDTAndBlockIOClasses(t *testing.T) { fakePods := map[string]*fakePod{ "pod1": { name: "pod1", qos: v1.PodQOSBestEffort, annotations: map[string]string{ "rdtclass." + kubernetes.ResmgrKeyNamespace + "/pod": "Pod1RDT", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.container1": "RDT1", "blockioclass." + kubernetes.ResmgrKeyNamespace + "/container.container1": "BLKIO1", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.container2": "RDT2", "blockioclass." + kubernetes.ResmgrKeyNamespace + "/container.container2": "BLKIO2", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.container3": "RDT3", "blockioclass." + kubernetes.ResmgrKeyNamespace + "/container.container4": "BLKIO4", }, }, "pod2": { name: "pod2", qos: v1.PodQOSBurstable, annotations: map[string]string{ "blockioclass." + kubernetes.ResmgrKeyNamespace: "Pod2BLKIO", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.3": "RDT3", "blockioclass." + kubernetes.ResmgrKeyNamespace + "/container.3": "BLKIO3", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.4": "RDT4", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.1": "RDT1", "blockioclass." 
+ kubernetes.ResmgrKeyNamespace + "/container.2": "BLKIO2", }, }, } fakePodContainers := map[string][]*fakeContainer{ "pod1": { {name: "container1"}, {name: "container2"}, {name: "container3"}, {name: "container4"}, }, } type classes struct { RDT string BlockIO string } expected := map[string]map[string]classes{ "pod1": { "container1": { RDT: "RDT1", BlockIO: "BLKIO1", }, "container2": { RDT: "RDT2", BlockIO: "BLKIO2", }, "container3": { RDT: "RDT3", BlockIO: string(fakePods["pod1"].qos), }, "container4": { RDT: "Pod1RDT", BlockIO: "BLKIO4", }, }, "pod2": { "container1": { RDT: "RDT1", BlockIO: "Pod2BLKIO", }, "container2": { RDT: string(fakePods["pod2"].qos), BlockIO: "BLKIO2", }, "container3": { RDT: "RDT3", BlockIO: "BLKIO3", }, "container4": { RDT: "RDT4", BlockIO: "Pod2BLKIO", }, }, } cch, dir, err := createTmpCache() if err != nil { t.Errorf("failed: %v", err) } defer removeTmpCache(dir) for _, fp := range fakePods { _, err := createFakePod(cch, fp) if err != nil { t.Errorf("failed to create fake pod: %v", err) } } for podName, fcs := range fakePodContainers { fp, ok := fakePods[podName] if !ok { t.Errorf("failed to find fake pod '%s'", podName) } for _, fc := range fcs { fc.fakePod = fp if _, err := createFakeContainer(cch, fc); err != nil { t.Errorf("failed to create fake container '%s.%s': %v", podName, fc.name, err) } } } for _, c := range cch.GetContainers() { pod, ok := c.GetPod() if !ok { t.Errorf("failed to find Pod for Container %s", c.PrettyName()) } exp, ok := expected[pod.GetName()][c.GetName()] if !ok { t.Errorf("failed to find expected results for Container %s", c.PrettyName()) } if c.GetRDTClass() != exp.RDT { t.Errorf("container %s: RDT class %s, expected %s", c.PrettyName(), c.GetRDTClass(), exp.RDT) } if c.GetBlockIOClass() != exp.BlockIO { t.Errorf("container %s: BlockIO class %s, expected %s", c.PrettyName(), c.GetBlockIOClass(), exp.BlockIO) } } } const ( // anything below 2 millicpus will yield 0 as an estimate minNonZeroRequest = 2 // check CPU request/limit estimate accuracy up to this many CPU cores maxCPU = (kubernetes.MaxShares / kubernetes.SharesPerCPU) * kubernetes.MilliCPUToCPU // we expect our estimates to be within 1 millicpu from the real ones expectedAccuracy = 1 ) func TestCPURequestCalculationAccuracy(t *testing.T) { for request := 0; request < maxCPU; request++ { shares := MilliCPUToShares(int64(request)) estimate := SharesToMilliCPU(int64(shares)) diff := int64(request) - estimate if diff > expectedAccuracy || diff < -expectedAccuracy { if diff < 0 { diff = -diff } if request > minNonZeroRequest { t.Errorf("CPU request %v: estimate %v, unexpected inaccuracy %v > %v", request, estimate, diff, expectedAccuracy) } else { t.Logf("CPU request %v: estimate %v, inaccuracy %v > %v (OK, this was expected)", request, estimate, diff, expectedAccuracy) } } // fail if our estimates are not accurate for full CPUs worth of millicpus if (request%1000) == 0 && diff != 0 { t.Errorf("CPU request %v != estimate %v (diff %v)", request, estimate, diff) } } } func TestCPULimitCalculationAccuracy(t *testing.T) { for limit := int64(0); limit < int64(maxCPU); limit++ { quota, period := MilliCPUToQuota(limit) estimate := QuotaToMilliCPU(quota, period) diff := limit - estimate if diff > expectedAccuracy || diff < -expectedAccuracy { if diff < 0 { diff = -diff } if quota != kubernetes.MinQuotaPeriod { t.Errorf("CPU limit %v: estimate %v, unexpected inaccuracy %v > %v", limit, estimate, diff, expectedAccuracy) } else { t.Logf("CPU limit %v: estimate %v, inaccuracy %v > %v (OK,
this was expected)", limit, estimate, diff, expectedAccuracy) } } // fail if our estimates are not accurate for full CPUs worth of millicpus if (limit%1000) == 0 && diff != 0 { t.Errorf("CPU limit %v != estimate %v (diff %v)", limit, estimate, diff) } } } ================================================ FILE: pkg/cri/resource-manager/cache/container.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "encoding/json" "regexp" "sort" "strconv" "strings" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" "github.com/intel/cri-resource-manager/pkg/topology" v1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" extapi "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" ) // Create a container for a create request. func (c *container) fromCreateRequest(req *criv1.CreateContainerRequest) error { c.PodID = req.PodSandboxId pod, ok := c.cache.Pods[c.PodID] if !ok { return cacheError("can't find cached pod %s for container to create", c.PodID) } cfg := req.Config if cfg == nil { return cacheError("container of pod %s has no config", c.PodID) } meta := cfg.Metadata if meta == nil { return cacheError("container of pod %s has no request metadata", c.PodID) } podCfg := req.SandboxConfig if podCfg == nil { return cacheError("container of pod %s has no request pod config data", c.PodID) } podMeta := podCfg.Metadata if podMeta == nil { return cacheError("container of pod %s has no request pod metadata", c.PodID) } c.Name = meta.Name c.Namespace = podMeta.Namespace c.State = ContainerStateCreating c.Image = cfg.GetImage().GetImage() c.Command = cfg.Command c.Args = cfg.Args c.Labels = cfg.Labels c.Annotations = cfg.Annotations c.Env = make(map[string]string) for _, kv := range cfg.Envs { c.Env[kv.Key] = kv.Value } genHints := true if hintSetting, ok := c.GetEffectiveAnnotation(TopologyHintsKey); ok { preference, err := strconv.ParseBool(hintSetting) if err != nil { c.cache.Error("invalid annotation %q=%q: %v", TopologyHintsKey, hintSetting, err) } else { genHints = preference } } c.cache.Info("automatic topology hint generation %s for %q", map[bool]string{false: "disabled", true: "enabled"}[genHints], c.PrettyName()) c.Mounts = make(map[string]*Mount) for _, m := range cfg.Mounts { c.Mounts[m.ContainerPath] = &Mount{ Container: m.ContainerPath, Host: m.HostPath, Readonly: m.Readonly, Relabel: m.SelinuxRelabel, Propagation: MountType(m.Propagation), } if genHints { if hints := getTopologyHints(m.HostPath, m.ContainerPath, m.Readonly); len(hints) > 0 { c.TopologyHints = topology.MergeTopologyHints(c.TopologyHints, hints) } } } c.Devices = make(map[string]*Device) for _, d := range cfg.Devices { c.Devices[d.ContainerPath] = &Device{ Container: d.ContainerPath, Host: d.HostPath, 
Permissions: d.Permissions, } if genHints { if hints := getTopologyHints(d.HostPath, d.ContainerPath, strings.IndexAny(d.Permissions, "wm") == -1); len(hints) > 0 { c.TopologyHints = topology.MergeTopologyHints(c.TopologyHints, hints) } } } c.Tags = make(map[string]string) c.LinuxReq = cfg.GetLinux().GetResources() if pod.Resources != nil { if r, ok := pod.Resources.InitContainers[c.Name]; ok { c.Resources = r } else if r, ok := pod.Resources.Containers[c.Name]; ok { c.Resources = r } } if len(c.Resources.Requests) == 0 && len(c.Resources.Limits) == 0 { c.Resources = estimateComputeResources(c.LinuxReq, pod.CgroupParent) } c.TopologyHints = topology.MergeTopologyHints(c.TopologyHints, getKubeletHint(c.GetCpusetCpus(), c.GetCpusetMems())) if err := c.setDefaults(); err != nil { return err } return nil } // Create container from a container list response. func (c *container) fromListResponse(lrc *criv1.Container) error { c.PodID = lrc.PodSandboxId pod, ok := c.cache.Pods[c.PodID] if !ok { return cacheError("can't find cached pod %s for listed container", c.PodID) } meta := lrc.Metadata if meta == nil { return cacheError("listed container of pod %s has no metadata", c.PodID) } c.ID = lrc.Id c.Name = meta.Name c.Namespace = pod.Namespace c.State = ContainerState(int32(lrc.State)) c.Image = lrc.GetImage().GetImage() c.Labels = lrc.Labels c.Annotations = lrc.Annotations c.Tags = make(map[string]string) if pod.Resources != nil { if r, ok := pod.Resources.InitContainers[c.Name]; ok { c.Resources = r } else if r, ok := pod.Resources.Containers[c.Name]; ok { c.Resources = r } } if len(c.Resources.Requests) == 0 && len(c.Resources.Limits) == 0 { c.Resources = estimateComputeResources(c.LinuxReq, pod.CgroupParent) } if err := c.setDefaults(); err != nil { return err } return nil } func (c *container) setDefaults() error { class, ok := c.GetEffectiveAnnotation(RDTClassKey) if !ok { class = RDTClassPodQoS } c.SetRDTClass(class) class, ok = c.GetEffectiveAnnotation(BlockIOClassKey) if !ok { class = string(c.GetQOSClass()) } c.SetBlockIOClass(class) limit, ok := c.GetEffectiveAnnotation(ToptierLimitKey) if !ok { c.ToptierLimit = ToptierLimitUnset } else { qty, err := resapi.ParseQuantity(limit) if err != nil { return cacheError("%q: failed to parse top tier limit annotation %q (%q): %v", c.PrettyName(), ToptierLimitKey, limit, err) } c.SetToptierLimit(qty.Value()) } return nil } func (c *container) PrettyName() string { if c.prettyName != "" { return c.prettyName } if pod, ok := c.GetPod(); !ok { c.prettyName = c.PodID + ":" + c.Name } else { c.prettyName = pod.GetName() + ":" + c.Name } return c.prettyName } func (c *container) GetPod() (Pod, bool) { pod, found := c.cache.Pods[c.PodID] return pod, found } func (c *container) GetID() string { return c.ID } func (c *container) GetPodID() string { return c.PodID } func (c *container) GetCacheID() string { return c.CacheID } func (c *container) GetName() string { return c.Name } func (c *container) GetNamespace() string { return c.Namespace } func (c *container) UpdateState(state ContainerState) { c.State = state } func (c *container) GetState() ContainerState { return c.State } func (c *container) GetQOSClass() v1.PodQOSClass { var qos v1.PodQOSClass if pod, found := c.GetPod(); found { qos = pod.GetQOSClass() } return qos } func (c *container) GetImage() string { return c.Image } func (c *container) GetCommand() []string { command := make([]string, len(c.Command)) copy(command, c.Command) return command } func (c *container) GetArgs() []string { args := 
make([]string, len(c.Args)) copy(args, c.Args) return args } func keysInNamespace(m map[string]string, namespace string) []string { keys := make([]string, 0, len(m)) for key := range m { split := strings.SplitN(key, "/", 2) if len(split) == 2 && split[0] == namespace { keys = append(keys, split[1]) } else if len(split) == 1 && len(namespace) == 0 { keys = append(keys, split[0]) } } return keys } func (c *container) GetLabelKeys() []string { keys := make([]string, len(c.Labels)) idx := 0 for key := range c.Labels { keys[idx] = key idx++ } return keys } func (c *container) GetLabel(key string) (string, bool) { value, ok := c.Labels[key] return value, ok } func (c *container) GetResmgrLabelKeys() []string { return keysInNamespace(c.Labels, kubernetes.ResmgrKeyNamespace) } func (c *container) GetResmgrLabel(key string) (string, bool) { value, ok := c.Labels[kubernetes.ResmgrKey(key)] return value, ok } func (c *container) GetLabels() map[string]string { if c.Labels == nil { return nil } labels := make(map[string]string, len(c.Labels)) for key, value := range c.Labels { labels[key] = value } return labels } func (c *container) GetAnnotationKeys() []string { keys := make([]string, len(c.Annotations)) idx := 0 for key := range c.Annotations { keys[idx] = key idx++ } return keys } func (c *container) GetAnnotation(key string, objPtr interface{}) (string, bool) { jsonStr, ok := c.Annotations[key] if !ok { return "", false } if objPtr != nil { if err := json.Unmarshal([]byte(jsonStr), objPtr); err != nil { c.cache.Error("failed to unmarshal annotation %s (%s) of pod %s into %T", key, jsonStr, c.ID, objPtr) return "", false } } return jsonStr, true } func (c *container) GetResmgrAnnotationKeys() []string { return keysInNamespace(c.Annotations, kubernetes.ResmgrKeyNamespace) } func (c *container) GetResmgrAnnotation(key string, objPtr interface{}) (string, bool) { return c.GetAnnotation(kubernetes.ResmgrKey(key), objPtr) } func (c *container) GetEffectiveAnnotation(key string) (string, bool) { pod, ok := c.GetPod() if !ok { return "", false } return pod.GetEffectiveAnnotation(key, c.Name) } func (c *container) GetAnnotations() map[string]string { if c.Annotations == nil { return nil } annotations := make(map[string]string, len(c.Annotations)) for key, value := range c.Annotations { annotations[key] = value } return annotations } func (c *container) GetEnvKeys() []string { keys := make([]string, len(c.Env)) idx := 0 for key := range c.Env { keys[idx] = key idx++ } return keys } func (c *container) GetEnv(key string) (string, bool) { value, ok := c.Env[key] return value, ok } func (c *container) GetMounts() []Mount { mounts := make([]Mount, len(c.Mounts)) idx := 0 for _, m := range c.Mounts { mounts[idx] = *m idx++ } return mounts } func (c *container) GetMountByHost(path string) *Mount { for _, m := range c.Mounts { if m.Host == path { return &(*m) } } return nil } func (c *container) GetMountByContainer(path string) *Mount { m, ok := c.Mounts[path] if !ok { return nil } return &(*m) } func (c *container) GetDevices() []Device { devices := make([]Device, len(c.Devices)) idx := 0 for _, d := range c.Devices { devices[idx] = *d idx++ } return devices } func (c *container) GetDeviceByHost(path string) *Device { for _, d := range c.Devices { if d.Host == path { return &(*d) } } return nil } func (c *container) GetDeviceByContainer(path string) *Device { d, ok := c.Devices[path] if !ok { return nil } return &(*d) } func (c *container) GetResourceRequirements() v1.ResourceRequirements { if adjust, _ := 
c.getEffectiveAdjustment(); adjust != nil { if resources, ok := adjust.GetResourceRequirements(); ok { return resources } } return c.Resources } func (c *container) GetLinuxResources() *criv1.LinuxContainerResources { if c.LinuxReq == nil { return nil } return &(*c.LinuxReq) } func (c *container) setEffectiveAdjustment(name string) string { previous := c.Adjustment c.Adjustment = name return previous } func (c *container) getEffectiveAdjustment() (*extapi.AdjustmentSpec, string) { if c.Adjustment == "" { return nil, "" } if c.cache.External != nil { return c.cache.External.Adjustments[c.Adjustment], c.Adjustment } return nil, c.Adjustment } func (c *container) SetCommand(value []string) { c.Command = value c.markPending(CRI) } func (c *container) SetArgs(value []string) { c.Args = value c.markPending(CRI) } func (c *container) SetLabel(key, value string) { if c.Labels == nil { c.Labels = make(map[string]string) } c.Labels[key] = value c.markPending(CRI) } func (c *container) DeleteLabel(key string) { if _, ok := c.Labels[key]; ok { delete(c.Labels, key) c.markPending(CRI) } } func (c *container) SetAnnotation(key, value string) { if c.Annotations == nil { c.Annotations = make(map[string]string) } c.Annotations[key] = value c.markPending(CRI) } func (c *container) DeleteAnnotation(key string) { if _, ok := c.Annotations[key]; ok { delete(c.Annotations, key) c.markPending(CRI) } } func (c *container) SetEnv(key, value string) { if c.Env == nil { c.Env = make(map[string]string) } c.Env[key] = value c.markPending(CRI) } func (c *container) UnsetEnv(key string) { if _, ok := c.Env[key]; ok { delete(c.Env, key) c.markPending(CRI) } } func (c *container) InsertMount(m *Mount) { if c.Mounts == nil { c.Mounts = make(map[string]*Mount) } c.Mounts[m.Container] = m c.markPending(CRI) } func (c *container) DeleteMount(path string) { if _, ok := c.Mounts[path]; ok { delete(c.Mounts, path) c.markPending(CRI) } } func (c *container) InsertDevice(d *Device) { if c.Devices == nil { c.Devices = make(map[string]*Device) } c.Devices[d.Container] = d c.markPending(CRI) } func (c *container) DeleteDevice(path string) { if _, ok := c.Devices[path]; ok { delete(c.Devices, path) c.markPending(CRI) } } func (c *container) GetTopologyHints() topology.Hints { return c.TopologyHints } func (c *container) GetCPUPeriod() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.CpuPeriod } func (c *container) GetCPUQuota() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.CpuQuota } func (c *container) GetCPUShares() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.CpuShares } func (c *container) GetMemoryLimit() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.MemoryLimitInBytes } func (c *container) GetOomScoreAdj() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.OomScoreAdj } func (c *container) GetCpusetCpus() string { if c.LinuxReq == nil { return "" } return c.LinuxReq.CpusetCpus } func (c *container) GetCpusetMems() string { if c.LinuxReq == nil { return "" } return c.LinuxReq.CpusetMems } func (c *container) SetLinuxResources(req *criv1.LinuxContainerResources) { c.LinuxReq = req c.markPending(CRI) } func (c *container) SetCPUPeriod(value int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpuPeriod = value c.markPending(CRI) } func (c *container) SetCPUQuota(value int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpuQuota = value c.markPending(CRI) } func (c *container) SetCPUShares(value 
int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpuShares = value c.markPending(CRI) } func (c *container) SetMemoryLimit(value int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.MemoryLimitInBytes = value c.markPending(CRI) } func (c *container) SetOomScoreAdj(value int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.OomScoreAdj = value c.markPending(CRI) } func (c *container) SetCpusetCpus(value string) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpusetCpus = value c.markPending(CRI) } func (c *container) SetCpusetMems(value string) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpusetMems = value c.markPending(CRI) } func getTopologyHints(hostPath, containerPath string, readOnly bool) topology.Hints { if readOnly { // if the device or path is read-only, assume it is unimportant for now // TODO: determine topology hint, but use it with low priority return topology.Hints{} } // ignore topology information for small files in /etc, service files in /var/lib/kubelet and host library mounts ignoredTopologyPaths := []string{"/.cri-resmgr", "/etc/", "/dev/termination-log", "/lib/", "/lib64/", "/usr/lib/", "/usr/lib32/", "/usr/lib64/"} for _, path := range ignoredTopologyPaths { if strings.HasPrefix(hostPath, path) || strings.HasPrefix(containerPath, path) { return topology.Hints{} } } // More complex rules for Kubelet secrets and config maps ignoredTopologyPathRegexps := []*regexp.Regexp{ // The Kubelet directory can differ, but we can detect it by the structure inside it. // For now, we can safely ignore exposed config maps and secrets for topology hints. regexp.MustCompile(`(kubelet)?/pods/[[:xdigit:]-]+/volumes/kubernetes\.io~(configmap|secret)/`), } for _, re := range ignoredTopologyPathRegexps { if re.MatchString(hostPath) || re.MatchString(containerPath) { return topology.Hints{} } } if devPath, err := topology.FindSysFsDevice(hostPath); err == nil { // errors are ignored if hints, err := topology.NewTopologyHints(devPath); err == nil && len(hints) > 0 { return hints } } return topology.Hints{} } func getKubeletHint(cpus, mems string) (ret topology.Hints) { if cpus != "" || mems != "" { ret = topology.Hints{ topology.ProviderKubelet: topology.Hint{ Provider: topology.ProviderKubelet, CPUs: cpus, NUMAs: mems}} } return } func (c *container) GetAffinity() ([]*Affinity, error) { pod, ok := c.GetPod() if !ok { // a missing pod would crash the affinity lookup below, so fail instead of just logging return nil, cacheError("internal error: can't find Pod for container %s", c.PrettyName()) } affinity, err := pod.GetContainerAffinity(c.GetName()) if err != nil { return nil, err } affinity = append(affinity, c.implicitAffinities(len(affinity) > 0)...)
c.cache.Debug("affinity for container %s:", c.PrettyName()) for _, a := range affinity { c.cache.Debug(" - %s", a.String()) } return affinity, nil } func (c *container) GetCgroupDir() string { if c.CgroupDir != "" { return c.CgroupDir } if pod, ok := c.GetPod(); ok { parent, _ := pod.GetCgroupParentDir(), pod.GetID() ID := c.GetID() c.CgroupDir = findContainerDir(parent, ID) } return c.CgroupDir } func (c *container) SetRDTClass(class string) { c.RDTClass = class c.markPending(RDT) } func (c *container) GetRDTClass() string { if adjust, _ := c.getEffectiveAdjustment(); adjust != nil { if class, ok := adjust.GetRDTClass(); ok { return class } } return c.RDTClass } func (c *container) SetBlockIOClass(class string) { c.BlockIOClass = class c.markPending(BlockIO) } func (c *container) GetBlockIOClass() string { if adjust, _ := c.getEffectiveAdjustment(); adjust != nil { if class, ok := adjust.GetBlockIOClass(); ok { return class } } return c.BlockIOClass } func (c *container) SetToptierLimit(limit int64) { c.ToptierLimit = limit c.markPending(Memory) } func (c *container) GetToptierLimit() int64 { if adjust, _ := c.getEffectiveAdjustment(); adjust != nil { if adjust.ToptierLimit != nil { return adjust.ToptierLimit.Value() } } return c.ToptierLimit } func (c *container) SetPageMigration(p *PageMigrate) { c.PageMigrate = p c.markPending(PageMigration) } func (c *container) GetPageMigration() *PageMigrate { return c.PageMigrate } func (c *container) GetProcesses() ([]string, error) { dir := c.GetCgroupDir() if dir == "" { return nil, cacheError("%s: unknown cgroup directory", c.PrettyName()) } return cgroups.Cpu.Group(dir).GetProcesses() } func (c *container) GetTasks() ([]string, error) { dir := c.GetCgroupDir() if dir == "" { return nil, cacheError("%s: unknown cgroup directory", c.PrettyName()) } return cgroups.Cpu.Group(dir).GetTasks() } func (c *container) SetCRIRequest(req interface{}) error { if c.req != nil { return cacheError("can't set pending container request: another pending") } c.req = &req return nil } func (c *container) GetCRIRequest() (interface{}, bool) { if c.req == nil { return nil, false } return *c.req, true } func (c *container) ClearCRIRequest() (interface{}, bool) { req, ok := c.GetCRIRequest() c.req = nil return req, ok } func (c *container) GetCRIEnvs() []*criv1.KeyValue { envs := make([]*criv1.KeyValue, len(c.Env), len(c.Env)) idx := 0 for k, v := range c.Env { envs[idx] = &criv1.KeyValue{ Key: k, Value: v, } idx++ } return envs } func (c *container) GetCRIMounts() []*criv1.Mount { if c.Mounts == nil { return nil } mounts := make([]*criv1.Mount, len(c.Mounts), len(c.Mounts)) idx := 0 for _, m := range c.Mounts { mounts[idx] = &criv1.Mount{ ContainerPath: m.Container, HostPath: m.Host, Readonly: m.Readonly, SelinuxRelabel: m.Relabel, Propagation: criv1.MountPropagation(m.Propagation), } idx++ } return mounts } func (c *container) GetCRIDevices() []*criv1.Device { if c.Devices == nil { return nil } devices := make([]*criv1.Device, len(c.Devices), len(c.Devices)) idx := 0 for _, d := range c.Devices { devices[idx] = &criv1.Device{ ContainerPath: d.Container, HostPath: d.Host, Permissions: d.Permissions, } idx++ } return devices } func (c *container) markPending(controllers ...string) { if c.pending == nil { c.pending = make(map[string]struct{}) } for _, ctrl := range controllers { c.pending[ctrl] = struct{}{} c.cache.markPending(c) } } func (c *container) ClearPending(controller string) { delete(c.pending, controller) if len(c.pending) == 0 { c.cache.clearPending(c) } } 
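// Illustrative sketch (not part of the original source): sorting containers
// with SortContainers (defined later in this file), putting a custom,
// hypothetical namespace comparison in front of the default QOS/memory/CPU
// ordering.
func exampleSortByNamespace(containers []Container) {
	byNamespace := func(ci, cj Container) int {
		switch ni, nj := ci.GetNamespace(), cj.GetNamespace(); {
		case ni < nj:
			return -1
		case ni > nj:
			return +1
		}
		return 0
	}
	SortContainers(containers, byNamespace, CompareQOS, CompareMemory, CompareCPU)
}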
func (c *container) GetPending() []string { if c.pending == nil { return nil } pending := make([]string, 0, len(c.pending)) for controller := range c.pending { pending = append(pending, controller) } sort.Strings(pending) return pending } func (c *container) HasPending(controller string) bool { if c.pending == nil { return false } _, pending := c.pending[controller] return pending } func (c *container) GetTag(key string) (string, bool) { value, ok := c.Tags[key] return value, ok } func (c *container) SetTag(key string, value string) (string, bool) { prev, ok := c.Tags[key] c.Tags[key] = value return prev, ok } func (c *container) DeleteTag(key string) (string, bool) { value, ok := c.Tags[key] delete(c.Tags, key) return value, ok } func (c *container) implicitAffinities(hasExplicit bool) []*Affinity { affinities := []*Affinity{} for name, generate := range c.cache.implicit { implicit := generate(c, hasExplicit) if implicit == nil { c.cache.Debug("no implicit affinity %s for container %s", name, c.PrettyName()) continue } c.cache.Debug("using implicit affinity %s for %s", name, c.PrettyName()) affinities = append(affinities, implicit) } return affinities } func (c *container) String() string { return c.PrettyName() } func (c *container) Eval(key string) interface{} { switch key { case resmgr.KeyPod: pod, ok := c.GetPod() if !ok { return cacheError("%s: failed to find pod %s", c.PrettyName(), c.PodID) } return pod case resmgr.KeyName: return c.Name case resmgr.KeyNamespace: return c.Namespace case resmgr.KeyQOSClass: return c.GetQOSClass() case resmgr.KeyLabels: return c.Labels case resmgr.KeyTags: return c.Tags case resmgr.KeyID: return c.ID default: return cacheError("%s: Container cannot evaluate %q", c.PrettyName(), key) } } // CompareContainersFn compares two containers by some arbitrary property. // It returns a negative integer, 0, or a positive integer, depending on // whether the first container is considered smaller, equal, or larger than // the second. type CompareContainersFn func(Container, Container) int // SortContainers sorts a slice of containers using the given comparison functions. // If the containers are otherwise equal they are sorted by pod and container name. // If the comparison functions are omitted, containers are compared by QoS class, // memory and CPU requests/limits. func SortContainers(containers []Container, compareFns ...CompareContainersFn) { if len(compareFns) == 0 { compareFns = CompareByQOSMemoryCPU } sort.Slice(containers, func(i, j int) bool { ci, cj := containers[i], containers[j] for _, cmpFn := range compareFns { switch diff := cmpFn(ci, cj); { case diff < 0: return true case diff > 0: return false } } // If two containers are otherwise equal they are sorted by pod and container name. if pi, ok := ci.GetPod(); ok { if pj, ok := cj.GetPod(); ok { ni, nj := pi.GetName(), pj.GetName() if ni != nj { return ni < nj } } } return ci.GetName() < cj.GetName() }) } // CompareByQOSMemoryCPU is a slice for comparing containers by QOS, memory, and CPU. var CompareByQOSMemoryCPU = []CompareContainersFn{CompareQOS, CompareMemory, CompareCPU} // CompareQOS compares containers by QOS class.
func CompareQOS(ci, cj Container) int { qosi, qosj := ci.GetQOSClass(), cj.GetQOSClass() switch { case qosi == v1.PodQOSGuaranteed && qosj != v1.PodQOSGuaranteed: return -1 case qosj == v1.PodQOSGuaranteed && qosi != v1.PodQOSGuaranteed: return +1 case qosi == v1.PodQOSBurstable && qosj == v1.PodQOSBestEffort: return -1 case qosj == v1.PodQOSBurstable && qosi == v1.PodQOSBestEffort: return +1 } return 0 } // CompareMemory compares containers by memory requests and limits. func CompareMemory(ci, cj Container) int { var reqi, limi, reqj, limj int64 resi := ci.GetResourceRequirements() if qty, ok := resi.Requests[v1.ResourceMemory]; ok { reqi = qty.Value() } if qty, ok := resi.Limits[v1.ResourceMemory]; ok { limi = qty.Value() } resj := cj.GetResourceRequirements() if qty, ok := resj.Requests[v1.ResourceMemory]; ok { reqj = qty.Value() } if qty, ok := resj.Limits[v1.ResourceMemory]; ok { limj = qty.Value() } switch diff := reqj - reqi; { case diff < 0: return -1 case diff > 0: return +1 } switch diff := limj - limi; { case diff < 0: return -1 case diff > 0: return +1 } return 0 } // CompareCPU compares containers by CPU requests and limits. func CompareCPU(ci, cj Container) int { var reqi, limi, reqj, limj int64 resi := ci.GetResourceRequirements() if qty, ok := resi.Requests[v1.ResourceCPU]; ok { reqi = qty.MilliValue() } if qty, ok := resi.Limits[v1.ResourceCPU]; ok { limi = qty.MilliValue() } resj := cj.GetResourceRequirements() if qty, ok := resj.Requests[v1.ResourceCPU]; ok { reqj = qty.MilliValue() } if qty, ok := resj.Limits[v1.ResourceCPU]; ok { limj = qty.MilliValue() } switch diff := reqj - reqi; { case diff < 0: return -1 case diff > 0: return +1 } switch diff := limj - limi; { case diff < 0: return -1 case diff > 0: return +1 } return 0 } ================================================ FILE: pkg/cri/resource-manager/cache/container_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cache import ( "sort" "testing" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" ) func TestGetKubeletHint(t *testing.T) { type T struct { name string cpus string mems string expectedLen int } cases := []T{ { name: "empty", cpus: "", mems: "", expectedLen: 0, }, { name: "cpus", cpus: "0-9", mems: "", expectedLen: 1, }, { name: "mems", cpus: "", mems: "0,1", expectedLen: 1, }, { name: "both", cpus: "0-9", mems: "0,1", expectedLen: 1, }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { output := getKubeletHint(tc.cpus, tc.mems) if len(output) != tc.expectedLen { t.Errorf("expected len of hints: %d, got: %d, hints: %+v", tc.expectedLen, len(output), output) } }) } } func TestGetTopologyHints(t *testing.T) { type T struct { name string hostPath string containerPath string readOnly bool expectedLen int } cases := []T{ { name: "read-only", hostPath: "/something", containerPath: "/something", readOnly: true, }, { name: "host /etc", hostPath: "/etc/something", containerPath: "/data/something", }, { name: "container /etc", hostPath: "/var/lib/kubelet/pods/0c9bcfc4-c51b-11e9-ac9a-b8aeed7c7427/etc-hosts", containerPath: "/etc/hosts", }, { name: "ConfigMap", containerPath: "/var/lib/kube-proxy", hostPath: "/var/lib/kubelet/pods/0c9bcfc4-c51b-11e9-ac9a-b8aeed7c7427/volumes/kubernetes.io~configmap/kube-proxy", }, { name: "secret", containerPath: "/var/run/secrets/kubernetes.io/serviceaccount", hostPath: "/var/lib/kubelet/pods/0c9bcfc4-c51b-11e9-ac9a-b8aeed7c7427/volumes/kubernetes.io~secret/kube-proxy-token-d9slz", }, { name: "dev null", hostPath: "/dev/null", containerPath: "/dev/null", }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { output := getTopologyHints(tc.hostPath, tc.containerPath, tc.readOnly) if len(output) != tc.expectedLen { t.Errorf("expected len of hints: %d, got: %d, hints: %+v", tc.expectedLen, len(output), output) } }) } } func TestKeysInNamespace(t *testing.T) { testMap := map[string]string{ "no-namespace": "", "my.name.space": "", "my.name.space/key-1": "", "my.name.space/key-2": "", "other.name.space/other-key": "", } tcases := []struct { name string collectionMap map[string]string namespace string expectedKeys []string }{ { name: "empty map should return nothing for empty namespace", }, { name: "empty map should return nothing", namespace: "my.name.space", }, { name: "keys with no namespace", collectionMap: testMap, expectedKeys: []string{"my.name.space", "no-namespace"}, }, { name: "keys in namespace", collectionMap: testMap, namespace: "my.name.space", expectedKeys: []string{"key-1", "key-2"}, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { keys := keysInNamespace(tc.collectionMap, tc.namespace) sort.Strings(keys) if !cmp.Equal(keys, tc.expectedKeys, cmpopts.EquateEmpty()) { t.Errorf("Expected %v, received %v", tc.expectedKeys, keys) } }) } } ================================================ FILE: pkg/cri/resource-manager/cache/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package cache import ( "fmt" ) func cacheError(format string, args ...interface{}) error { return fmt.Errorf("cache: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/cache/pod.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "encoding/json" "strconv" "strings" v1 "k8s.io/api/core/v1" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" ) const ( // KeyResourceAnnotation is the annotation key our webhook uses. KeyResourceAnnotation = "intel.com/resources" ) // Create a pod from a run request. func (p *pod) fromRunRequest(req *criv1.RunPodSandboxRequest) error { cfg := req.Config if cfg == nil { return cacheError("pod %s has no config", p.ID) } meta := cfg.Metadata if meta == nil { return cacheError("pod %s has no request metadata", p.ID) } p.containers = make(map[string]string) p.UID = meta.Uid p.Name = meta.Name p.Namespace = meta.Namespace p.State = PodState(int32(PodStateReady)) p.Labels = cfg.Labels p.Annotations = cfg.Annotations p.CgroupParent = cfg.GetLinux().GetCgroupParent() if err := p.discoverQOSClass(); err != nil { p.cache.Error("%v", err) } p.parseResourceAnnotations() return nil } // Create a pod from a list response. func (p *pod) fromListResponse(pod *criv1.PodSandbox, status *PodStatus) error { meta := pod.Metadata if meta == nil { return cacheError("pod %s has no reply metadata", p.ID) } p.containers = make(map[string]string) p.UID = meta.Uid p.Name = meta.Name p.Namespace = meta.Namespace p.State = PodState(int32(pod.State)) p.Labels = pod.Labels p.Annotations = pod.Annotations if status == nil { p.cache.Error("pod %s has no associated status query data", p.ID) } else { p.CgroupParent = status.CgroupParent } if err := p.discoverQOSClass(); err != nil { p.cache.Error("%v", err) } p.parseResourceAnnotations() return nil } // Get the init containers of a pod. func (p *pod) GetInitContainers() []Container { if p.Resources == nil { return nil } containers := []Container{} for id, c := range p.cache.Containers { if id != c.CacheID { continue } if _, ok := p.Resources.InitContainers[c.ID]; ok { containers = append(containers, c) } } return containers } // Get the normal containers of a pod. func (p *pod) GetContainers() []Container { containers := []Container{} for id, c := range p.cache.Containers { if c.PodID != p.ID || id != c.CacheID { continue } if p.Resources != nil { if _, ok := p.Resources.InitContainers[c.ID]; ok { continue } } containers = append(containers, c) } return containers } // Get container pointer by its name. 
func (p *pod) getContainer(name string) *container { var found *container if id, ok := p.containers[name]; ok { return p.cache.Containers[id] } for _, c := range p.GetContainers() { cptr := c.(*container) p.containers[cptr.Name] = cptr.ID if cptr.Name == name { found = cptr } } return found } // Get container by its name. func (p *pod) GetContainer(name string) (Container, bool) { c := p.getContainer(name) return c, c != nil } // Get the id of a pod. func (p *pod) GetID() string { return p.ID } // Get the (k8s) unique id of a pod. func (p *pod) GetUID() string { return p.UID } // Get the name of a pod. func (p *pod) GetName() string { return p.Name } // Get the namespace of a pod. func (p *pod) GetNamespace() string { return p.Namespace } // Get the PodState of a pod. func (p *pod) GetState() PodState { return p.State } // Get the keys of all labels of a pod. func (p *pod) GetLabelKeys() []string { keys := make([]string, len(p.Labels)) idx := 0 for key := range p.Labels { keys[idx] = key idx++ } return keys } // Get the label for a key of a pod. func (p *pod) GetLabel(key string) (string, bool) { value, ok := p.Labels[key] return value, ok } // Get all label keys in the cri-resource-manager namespace. func (p *pod) GetResmgrLabelKeys() []string { return keysInNamespace(p.Labels, kubernetes.ResmgrKeyNamespace) } // Get the label for the given key in the cri-resource-manager namespace. func (p *pod) GetResmgrLabel(key string) (string, bool) { value, ok := p.Labels[kubernetes.ResmgrKey(key)] return value, ok } // Get the keys of all annotations of a pod. func (p *pod) GetAnnotationKeys() []string { keys := make([]string, len(p.Annotations)) idx := 0 for key := range p.Annotations { keys[idx] = key idx++ } return keys } // Get pod annotation for the given key. func (p *pod) GetAnnotation(key string) (string, bool) { value, ok := p.Annotations[key] return value, ok } // Get and decode/unmarshal pod annotation for the given key. func (p *pod) GetAnnotationObject(key string, objPtr interface{}, decode func([]byte, interface{}) error) (bool, error) { var err error value, ok := p.GetAnnotation(key) if !ok { return false, nil } // decode with decoder function, if given if decode != nil { err = decode([]byte(value), objPtr) return true, err } // decode with type-specific default decoder switch objPtr.(type) { case *string: *objPtr.(*string) = value case *bool: *objPtr.(*bool), err = strconv.ParseBool(value) case *int: var i int64 i, err = strconv.ParseInt(value, 0, 0) *objPtr.(*int) = int(i) case *uint: var i uint64 i, err = strconv.ParseUint(value, 0, 0) *objPtr.(*uint) = uint(i) case *int64: *objPtr.(*int64), err = strconv.ParseInt(value, 0, 64) case *uint64: *objPtr.(*uint64), err = strconv.ParseUint(value, 0, 64) default: err = json.Unmarshal([]byte(value), objPtr) } if err != nil { p.cache.Error("failed to decode annotation %s (%s): %v", key, value, err) } return true, err } // Get the keys of all annotation in the cri-resource-manager namespace. func (p *pod) GetResmgrAnnotationKeys() []string { return keysInNamespace(p.Annotations, kubernetes.ResmgrKeyNamespace) } // Get the value of the given annotation in the cri-resource-manager namespace. func (p *pod) GetResmgrAnnotation(key string) (string, bool) { return p.GetAnnotation(kubernetes.ResmgrKey(key)) } // Get and decode the pod annotation for the key in the cri-resource-manager namespace.. 
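//
// A usage sketch (the annotation keys here are hypothetical): plain scalar
// values decode with the strconv-based parsers above, anything else falls
// back to JSON unmarshalling:
//
//	var enabled bool
//	ok, err := pod.GetResmgrAnnotationObject("prefer-isolated-cpus", &enabled, nil)
//
//	var classes map[string]string
//	ok, err = pod.GetResmgrAnnotationObject("class-overrides", &classes, nil)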
func (p *pod) GetResmgrAnnotationObject(key string, objPtr interface{}, decode func([]byte, interface{}) error) (bool, error) { return p.GetAnnotationObject(kubernetes.ResmgrKey(key), objPtr, decode) } // Get the effective annotation for the container. func (p *pod) GetEffectiveAnnotation(key, container string) (string, bool) { if v, ok := p.Annotations[key+"/container."+container]; ok { return v, true } if v, ok := p.Annotations[key+"/pod"]; ok { return v, true } v, ok := p.Annotations[key] return v, ok } // Get the cgroup parent directory of a pod, if known. func (p *pod) GetCgroupParentDir() string { return p.CgroupParent } // discover a pod's QoS class by parsing the cgroup parent directory. func (p *pod) discoverQOSClass() error { if p.CgroupParent == "" { p.QOSClass = v1.PodQOSBestEffort return cacheError("%s: unknown cgroup parent/QoS class", p.ID) } dirs := strings.Split(p.CgroupParent[1:], "/") if len(dirs) < 1 { return cacheError("%s: failed to parse %q for QoS class", p.ID, p.CgroupParent) } // consume any potential --cgroup-root passed to kubelet if dirs[0] != "kubepods.slice" && dirs[0] != "kubepods" { dirs = dirs[1:] } if len(dirs) < 1 { return cacheError("%s: failed to parse %q for QoS class", p.ID, p.CgroupParent) } // consume potential kubepods[.slice] if dirs[0] == "kubepods.slice" || dirs[0] == "kubepods" { dirs = dirs[1:] } if len(dirs) < 1 { return cacheError("%s: failed to parse %q for QoS class", p.ID, p.CgroupParent) } // check for besteffort, burstable, or lack thereof indicating guaranteed switch dir := dirs[0]; { case dir == "kubepods-besteffort.slice" || dir == "besteffort": p.QOSClass = v1.PodQOSBestEffort return nil case dir == "kubepods-burstable.slice" || dir == "burstable": p.QOSClass = v1.PodQOSBurstable return nil case strings.HasPrefix(dir, "kubepods-pod") || strings.HasPrefix(dir, "pod"): p.QOSClass = v1.PodQOSGuaranteed return nil } return cacheError("%s: failed to parse %q for QoS class", p.ID, p.CgroupParent) } // Get the resource requirements of a pod. func (p *pod) GetPodResourceRequirements() PodResourceRequirements { if p.Resources == nil { return PodResourceRequirements{} } return *p.Resources } // Parse per container resource requirements from webhook annotations. func (p *pod) parseResourceAnnotations() { p.Resources = &PodResourceRequirements{} p.GetAnnotationObject(KeyResourceAnnotation, p.Resources, nil) } // Determine the QoS class of the pod. func (p *pod) GetQOSClass() v1.PodQOSClass { return p.QOSClass } // GetContainerAffinity returns the annotated affinity for the named container. 
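//
// Affinity and anti-affinity are read from pod annotations in the
// cri-resource-manager namespace. In the simple form the annotation maps a
// container name to the names of containers it should be co-located with,
// roughly like this (hypothetical pod annotation, assuming the
// cri-resource-manager.intel.com annotation namespace):
//
//	cri-resource-manager.intel.com/affinity: |
//	  server: [ sidecar, logger ]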
func (p *pod) GetContainerAffinity(name string) ([]*Affinity, error) { if p.Affinity != nil { return (*p.Affinity)[name], nil } affinity := &podContainerAffinity{} value, ok := p.GetResmgrAnnotation(keyAffinity) if ok { weight := DefaultWeight if !affinity.parseSimple(p, value, weight) { if err := affinity.parseFull(p, value, weight); err != nil { p.cache.Error("%v", err) return nil, err } } } value, ok = p.GetResmgrAnnotation(keyAntiAffinity) if ok { weight := -DefaultWeight if !affinity.parseSimple(p, value, weight) { if err := affinity.parseFull(p, value, weight); err != nil { p.cache.Error("%v", err) return nil, err } } } if p.cache.DebugEnabled() { p.cache.Debug("Pod container affinity for %s:", p.GetName()) for id, ca := range *affinity { p.cache.Debug(" - container %s:", id) for _, a := range ca { p.cache.Debug(" * %s", a.String()) } } } p.Affinity = affinity return (*p.Affinity)[name], nil } // ScopeExpression returns an affinity expression for defining this pod as the scope. func (p *pod) ScopeExpression() *resmgr.Expression { return &resmgr.Expression{ // Domain: LabelsDomain, Key: kubernetes.PodNameLabel, Op: resmgr.Equals, Values: []string{p.GetName()}, } } // String returns a string representation of pod. func (p *pod) String() string { return p.Name } // Eval returns the value of a key for expression evaluation. func (p *pod) Eval(key string) interface{} { switch key { case resmgr.KeyName: return p.Name case resmgr.KeyNamespace: return p.Namespace case resmgr.KeyQOSClass: return p.GetQOSClass() case resmgr.KeyLabels: return p.Labels case resmgr.KeyID: return p.ID case resmgr.KeyUID: return p.UID default: return cacheError("Pod cannot evaluate of %q", key) } } // GetProcesses returns the pids of processes in a pod. func (p *pod) GetProcesses(recursive bool) ([]string, error) { return p.getTasks(recursive, true) } // GetTasks returns the pids of threads in a pod. func (p *pod) GetTasks(recursive bool) ([]string, error) { return p.getTasks(recursive, false) } // getTasks returns the pids of processes or threads in a pod. func (p *pod) getTasks(recursive, processes bool) ([]string, error) { var pids, childPids []string var err error dir := p.GetCgroupParentDir() if dir == "" { return nil, cacheError("%s: unknown cgroup parent directory", p.Name) } if processes { pids, err = cgroups.Cpu.Group(dir).GetProcesses() } else { pids, err = cgroups.Cpu.Group(dir).GetTasks() } if err != nil { return nil, cacheError("%s: failed to read pids: %v", p.Name, err) } if !recursive { return pids, nil } for _, c := range append(p.GetInitContainers(), p.GetContainers()...) { if c.GetState() == ContainerStateRunning { if processes { childPids, err = c.GetProcesses() } else { childPids, err = c.GetTasks() } if err == nil { pids = append(pids, childPids...) continue } p.cache.Error("%s: failed to read pids of %s: %v", p.Name, c.PrettyName(), err) } } return pids, nil } // ParsePodStatus parses a PodSandboxStatusResponse into a PodStatus. 
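//
// The cgroup parent is dug out of the runtime's verbose status info, whose
// layout differs per runtime (abridged from the parsing code below):
//
//	containerd: Info["info"] -> {"config": {"linux": {"cgroup_parent": "..."}}}
//	cri-o:      Info["info"] -> {"runtimeSpec": {"annotations":
//	                             {"io.kubernetes.cri-o.CgroupParent": "..."}}}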
func ParsePodStatus(response *criv1.PodSandboxStatusResponse) (*PodStatus, error) { var name string type infoRuntimeSpec struct { Annotations map[string]string `json:"annotations"` } type infoConfig struct { Linux *struct { CgroupParent string `json:"cgroup_parent"` } `json:"linux"` } type statusInfo struct { RuntimeSpec *infoRuntimeSpec `json:"runtimeSpec"` Config *infoConfig `json:"config"` } if response.Status.Metadata != nil { name = response.Status.Metadata.Name } else { name = response.Status.Id } blob, ok := response.Info["info"] if !ok { return nil, cacheError("%s: missing info in pod status response", name) } info := statusInfo{} if err := json.Unmarshal([]byte(blob), &info); err != nil { return nil, cacheError("%s: failed to extract pod status info: %v", name, err) } ps := &PodStatus{} if info.Config != nil { // containerd // CgroupParent: Info["config"]["linux"]["cgroup_parent"] ps.CgroupParent = info.Config.Linux.CgroupParent } else if info.RuntimeSpec != nil { // cri-o // CgroupParent: Info["info"]["runtimeSpec"]["annotations"][crioCgroupParent] const ( crioCgroupParent = "io.kubernetes.cri-o.CgroupParent" ) ps.CgroupParent = info.RuntimeSpec.Annotations[crioCgroupParent] } if ps.CgroupParent == "" { return nil, cacheError("%s: failed to extract cgroup parent from pod status", name) } return ps, nil } ================================================ FILE: pkg/cri/resource-manager/cache/utils.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "os" "path" "strconv" "strings" corev1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/sets" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" ) var ( memoryCapacity int64 SharesToMilliCPU = kubernetes.SharesToMilliCPU QuotaToMilliCPU = kubernetes.QuotaToMilliCPU MilliCPUToShares = kubernetes.MilliCPUToShares MilliCPUToQuota = kubernetes.MilliCPUToQuota ) // IsPodQOSClassName returns true if the given class is one of the Pod QOS classes. func IsPodQOSClassName(class string) bool { switch corev1.PodQOSClass(class) { case corev1.PodQOSBestEffort, corev1.PodQOSBurstable, corev1.PodQOSGuaranteed: return true } return false } // estimateComputeResources calculates resource requests/limits from a CRI request. 
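//
// The conversions use the usual kubelet arithmetic (via pkg/kubernetes),
// roughly milliCPU = shares*1000/1024 and milliCPU = quota*1000/period.
// For example (hypothetical values):
//
//	CpuShares: 2048                    -> requests.cpu = 2000m
//	CpuQuota: 50000, CpuPeriod: 100000 -> limits.cpu   = 500m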
func estimateComputeResources(lnx *criv1.LinuxContainerResources, cgroupParent string) corev1.ResourceRequirements { var qos corev1.PodQOSClass resources := corev1.ResourceRequirements{ Requests: corev1.ResourceList{}, Limits: corev1.ResourceList{}, } if lnx == nil { return resources } if cgroupParent != "" { qos = cgroupParentToQOS(cgroupParent) } // calculate CPU request if value := SharesToMilliCPU(lnx.CpuShares); value > 0 { qty := resapi.NewMilliQuantity(value, resapi.DecimalSI) resources.Requests[corev1.ResourceCPU] = *qty } // get memory limit if value := lnx.MemoryLimitInBytes; value > 0 { qty := resapi.NewQuantity(value, resapi.DecimalSI) resources.Limits[corev1.ResourceMemory] = *qty } // set or calculate CPU limit, set memory request if known if qos == corev1.PodQOSGuaranteed { resources.Limits[corev1.ResourceCPU] = resources.Requests[corev1.ResourceCPU] resources.Requests[corev1.ResourceMemory] = resources.Limits[corev1.ResourceMemory] } else { if value := QuotaToMilliCPU(lnx.CpuQuota, lnx.CpuPeriod); value > 0 { qty := resapi.NewMilliQuantity(value, resapi.DecimalSI) resources.Limits[corev1.ResourceCPU] = *qty } } return resources } // getMemoryCapacity parses memory capacity from /proc/meminfo (mimicking cAdvisor). func getMemoryCapacity() int64 { var data []byte var err error if memoryCapacity > 0 { return memoryCapacity } if data, err = os.ReadFile("/proc/meminfo"); err != nil { return -1 } for _, line := range strings.Split(string(data), "\n") { keyval := strings.Split(line, ":") if len(keyval) != 2 || keyval[0] != "MemTotal" { continue } valunit := strings.Split(strings.TrimSpace(keyval[1]), " ") if len(valunit) != 2 || valunit[1] != "kB" { return -1 } memoryCapacity, err = strconv.ParseInt(valunit[0], 10, 64) if err != nil { return -1 } memoryCapacity *= 1024 break } return memoryCapacity } // cgroupParentToQOS tries to map Pod cgroup parent to QOS class. func cgroupParentToQOS(dir string) corev1.PodQOSClass { var qos corev1.PodQOSClass // The parent directory naming scheme depends on the cgroup driver in use. // Thus, rely on substring matching split := strings.Split(strings.TrimPrefix(dir, "/"), "/") switch { case len(split) < 2: qos = corev1.PodQOSClass("") case strings.Index(split[1], strings.ToLower(string(corev1.PodQOSBurstable))) != -1: qos = corev1.PodQOSBurstable case strings.Index(split[1], strings.ToLower(string(corev1.PodQOSBestEffort))) != -1: qos = corev1.PodQOSBestEffort default: qos = corev1.PodQOSGuaranteed } return qos } // resourcesToQOS tries to map Pod container resources (from annotation) to QOS class. 
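//
// The rules mirror Kubernetes QoS classification: no CPU/memory requests or
// limits at all yields BestEffort; CPU and memory limits present for every
// container and equal to the requests yields Guaranteed; everything else is
// Burstable. For example (hypothetical per-pod totals):
//
//	requests cpu=500m,mem=1Gi; limits cpu=500m,mem=1Gi -> Guaranteed
//	requests cpu=500m;         limits mem=1Gi          -> Burstable
//	no requests, no limits                             -> BestEffort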
func resourcesToQOS(podResources *PodResourceRequirements) corev1.PodQOSClass { var qos corev1.PodQOSClass if podResources == nil { return qos } requests := corev1.ResourceList{} limits := corev1.ResourceList{} zeroQuantity := resapi.MustParse("0") isGuaranteed := true for _, resources := range podResources.Containers { // process requests for name, quantity := range resources.Requests { if !isSupportedQoSComputeResource(name) { continue } if quantity.Cmp(zeroQuantity) == 1 { delta := quantity.DeepCopy() if _, exists := requests[name]; !exists { requests[name] = delta } else { delta.Add(requests[name]) requests[name] = delta } } } // process limits qosLimitsFound := sets.NewString() for name, quantity := range resources.Limits { if !isSupportedQoSComputeResource(name) { continue } if quantity.Cmp(zeroQuantity) == 1 { qosLimitsFound.Insert(string(name)) delta := quantity.DeepCopy() if _, exists := limits[name]; !exists { limits[name] = delta } else { delta.Add(limits[name]) limits[name] = delta } } } if !qosLimitsFound.HasAll(string(corev1.ResourceMemory), string(corev1.ResourceCPU)) { isGuaranteed = false } } if len(requests) == 0 && len(limits) == 0 { return corev1.PodQOSBestEffort } // Check is requests match limits for all resources. if isGuaranteed { for name, req := range requests { if lim, exists := limits[name]; !exists || lim.Cmp(req) != 0 { isGuaranteed = false break } } } if isGuaranteed && len(requests) == len(limits) { return corev1.PodQOSGuaranteed } return corev1.PodQOSBurstable } // findContainerDir brute-force searches for a container cgroup dir. func findContainerDir(podCgroupDir, ID string) string { var dirs []string if podCgroupDir == "" { return "" } cpusetDir := cgroups.Cpuset.Path() dirs = []string{ path.Join(cpusetDir, podCgroupDir, ID), // containerd, systemd path.Join(cpusetDir, podCgroupDir, "cri-containerd-"+ID+".scope"), // containerd, cgroupfs path.Join(cpusetDir, podCgroupDir, "cri-containerd-"+ID), // crio, systemd path.Join(cpusetDir, podCgroupDir, "crio-"+ID+".scope"), // crio, cgroupfs path.Join(cpusetDir, podCgroupDir, "crio-"+ID), } for _, dir := range dirs { if info, err := os.Stat(dir); err == nil { if info.Mode().IsDir() { return strings.TrimPrefix(dir, cpusetDir) } } } return "" } func isSupportedQoSComputeResource(name corev1.ResourceName) bool { return name == corev1.ResourceCPU || name == corev1.ResourceMemory } func init() { // TODO: get rid of this eventually, use pkg/sysfs instead... getMemoryCapacity() } ================================================ FILE: pkg/cri/resource-manager/config/api/v1/api.pb.go ================================================ // //Copyright 2019 Intel Corporation // //Licensed under the Apache License, Version 2.0 (the "License"); //you may not use this file except in compliance with the License. //You may obtain a copy of the License at // //http://www.apache.org/licenses/LICENSE-2.0 // //Unless required by applicable law or agreed to in writing, software //distributed under the License is distributed on an "AS IS" BASIS, //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //See the License for the specific language governing permissions and //limitations under the License. // Code generated by protoc-gen-go. DO NOT EDIT. 
// versions: // protoc-gen-go v1.28.0 // protoc v3.20.1 // source: pkg/cri/resource-manager/config/api/v1/api.proto package v1 import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type SetConfigRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // node_name is node name used to acquire this configuration. NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` // config is the ConfigMap data. Config map[string]string `protobuf:"bytes,2,rep,name=config,proto3" json:"config,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` } func (x *SetConfigRequest) Reset() { *x = SetConfigRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SetConfigRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*SetConfigRequest) ProtoMessage() {} func (x *SetConfigRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SetConfigRequest.ProtoReflect.Descriptor instead. func (*SetConfigRequest) Descriptor() ([]byte, []int) { return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP(), []int{0} } func (x *SetConfigRequest) GetNodeName() string { if x != nil { return x.NodeName } return "" } func (x *SetConfigRequest) GetConfig() map[string]string { if x != nil { return x.Config } return nil } type SetConfigReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // If not empty, indicate an error that happened while trying to apply new configuration. Error string `protobuf:"bytes,1,opt,name=error,proto3" json:"error,omitempty"` } func (x *SetConfigReply) Reset() { *x = SetConfigReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SetConfigReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*SetConfigReply) ProtoMessage() {} func (x *SetConfigReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SetConfigReply.ProtoReflect.Descriptor instead. 
func (*SetConfigReply) Descriptor() ([]byte, []int) { return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP(), []int{1} } func (x *SetConfigReply) GetError() string { if x != nil { return x.Error } return "" } type SetAdjustmentRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // node_name is node name used to acquire this configuration. NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` // Serialized map of all adjustment CRDs, name as key, CRD as value. Adjustment string `protobuf:"bytes,2,opt,name=adjustment,proto3" json:"adjustment,omitempty"` } func (x *SetAdjustmentRequest) Reset() { *x = SetAdjustmentRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SetAdjustmentRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*SetAdjustmentRequest) ProtoMessage() {} func (x *SetAdjustmentRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SetAdjustmentRequest.ProtoReflect.Descriptor instead. func (*SetAdjustmentRequest) Descriptor() ([]byte, []int) { return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP(), []int{2} } func (x *SetAdjustmentRequest) GetNodeName() string { if x != nil { return x.NodeName } return "" } func (x *SetAdjustmentRequest) GetAdjustment() string { if x != nil { return x.Adjustment } return "" } type SetAdjustmentReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // If not empty, indicates that errors happened while trying to apply the adjustments. Errors map[string]string `protobuf:"bytes,1,rep,name=errors,proto3" json:"errors,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` } func (x *SetAdjustmentReply) Reset() { *x = SetAdjustmentReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SetAdjustmentReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*SetAdjustmentReply) ProtoMessage() {} func (x *SetAdjustmentReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[3] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SetAdjustmentReply.ProtoReflect.Descriptor instead. 
func (*SetAdjustmentReply) Descriptor() ([]byte, []int) { return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP(), []int{3} } func (x *SetAdjustmentReply) GetErrors() map[string]string { if x != nil { return x.Errors } return nil } var File_pkg_cri_resource_manager_config_api_v1_api_proto protoreflect.FileDescriptor var file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDesc = []byte{ 0x0a, 0x30, 0x70, 0x6b, 0x67, 0x2f, 0x63, 0x72, 0x69, 0x2f, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x2d, 0x6d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x72, 0x2f, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2f, 0x61, 0x70, 0x69, 0x2f, 0x76, 0x31, 0x2f, 0x61, 0x70, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x02, 0x76, 0x31, 0x22, 0xa4, 0x01, 0x0a, 0x10, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1b, 0x0a, 0x09, 0x6e, 0x6f, 0x64, 0x65, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x6e, 0x6f, 0x64, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x12, 0x38, 0x0a, 0x06, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x20, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x2e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x06, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x1a, 0x39, 0x0a, 0x0b, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x26, 0x0a, 0x0e, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x22, 0x53, 0x0a, 0x14, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1b, 0x0a, 0x09, 0x6e, 0x6f, 0x64, 0x65, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x6e, 0x6f, 0x64, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x12, 0x1e, 0x0a, 0x0a, 0x61, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x61, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x22, 0x8b, 0x01, 0x0a, 0x12, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x3a, 0x0a, 0x06, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x22, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x2e, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x06, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x73, 0x1a, 0x39, 0x0a, 0x0b, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x32, 0x86, 0x01, 0x0a, 0x06, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x37, 0x0a, 0x09, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x14, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x43, 0x6f, 
0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x12, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x43, 0x0a, 0x0d, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x18, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x16, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x42, 0x07, 0x5a, 0x05, 0x2e, 0x2e, 0x2f, 0x76, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescOnce sync.Once file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescData = file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDesc ) func file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP() []byte { file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescOnce.Do(func() { file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescData) }) return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescData } var file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes = make([]protoimpl.MessageInfo, 6) var file_pkg_cri_resource_manager_config_api_v1_api_proto_goTypes = []interface{}{ (*SetConfigRequest)(nil), // 0: v1.SetConfigRequest (*SetConfigReply)(nil), // 1: v1.SetConfigReply (*SetAdjustmentRequest)(nil), // 2: v1.SetAdjustmentRequest (*SetAdjustmentReply)(nil), // 3: v1.SetAdjustmentReply nil, // 4: v1.SetConfigRequest.ConfigEntry nil, // 5: v1.SetAdjustmentReply.ErrorsEntry } var file_pkg_cri_resource_manager_config_api_v1_api_proto_depIdxs = []int32{ 4, // 0: v1.SetConfigRequest.config:type_name -> v1.SetConfigRequest.ConfigEntry 5, // 1: v1.SetAdjustmentReply.errors:type_name -> v1.SetAdjustmentReply.ErrorsEntry 0, // 2: v1.Config.SetConfig:input_type -> v1.SetConfigRequest 2, // 3: v1.Config.SetAdjustment:input_type -> v1.SetAdjustmentRequest 1, // 4: v1.Config.SetConfig:output_type -> v1.SetConfigReply 3, // 5: v1.Config.SetAdjustment:output_type -> v1.SetAdjustmentReply 4, // [4:6] is the sub-list for method output_type 2, // [2:4] is the sub-list for method input_type 2, // [2:2] is the sub-list for extension type_name 2, // [2:2] is the sub-list for extension extendee 0, // [0:2] is the sub-list for field type_name } func init() { file_pkg_cri_resource_manager_config_api_v1_api_proto_init() } func file_pkg_cri_resource_manager_config_api_v1_api_proto_init() { if File_pkg_cri_resource_manager_config_api_v1_api_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SetConfigRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SetConfigReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SetAdjustmentRequest); i { case 0: return &v.state case 1: 
return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SetAdjustmentReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDesc, NumEnums: 0, NumMessages: 6, NumExtensions: 0, NumServices: 1, }, GoTypes: file_pkg_cri_resource_manager_config_api_v1_api_proto_goTypes, DependencyIndexes: file_pkg_cri_resource_manager_config_api_v1_api_proto_depIdxs, MessageInfos: file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes, }.Build() File_pkg_cri_resource_manager_config_api_v1_api_proto = out.File file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDesc = nil file_pkg_cri_resource_manager_config_api_v1_api_proto_goTypes = nil file_pkg_cri_resource_manager_config_api_v1_api_proto_depIdxs = nil } ================================================ FILE: pkg/cri/resource-manager/config/api/v1/api.proto ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto3"; package v1; option go_package = "../v1"; service Config{ rpc SetConfig(SetConfigRequest) returns (SetConfigReply) {} rpc SetAdjustment(SetAdjustmentRequest) returns (SetAdjustmentReply) {} } message SetConfigRequest { // node_name is node name used to acquire this configuration. string node_name = 1; // config is the ConfigMap data. map<string, string> config = 2; } message SetConfigReply { // If not empty, indicate an error that happened while trying to apply new configuration. string error = 1; } message SetAdjustmentRequest { // node_name is node name used to acquire this configuration. string node_name = 1; // Serialized map of all adjustment CRDs, name as key, CRD as value. string adjustment = 2; } message SetAdjustmentReply { // If not empty, indicates that errors happened while trying to apply the adjustments. map<string, string> errors = 1; } ================================================ FILE: pkg/cri/resource-manager/config/api/v1/api_grpc.pb.go ================================================ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.2.0 // - protoc v3.20.1 // source: pkg/cri/resource-manager/config/api/v1/api.proto package v1 import ( context "context" grpc "google.golang.org/grpc" codes "google.golang.org/grpc/codes" status "google.golang.org/grpc/status" ) // This is a compile-time assertion to ensure that this generated file // is compatible with the grpc package it is being compiled against. // Requires gRPC-Go v1.32.0 or later. const _ = grpc.SupportPackageIsVersion7 // ConfigClient is the client API for Config service.
// // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. type ConfigClient interface { SetConfig(ctx context.Context, in *SetConfigRequest, opts ...grpc.CallOption) (*SetConfigReply, error) SetAdjustment(ctx context.Context, in *SetAdjustmentRequest, opts ...grpc.CallOption) (*SetAdjustmentReply, error) } type configClient struct { cc grpc.ClientConnInterface } func NewConfigClient(cc grpc.ClientConnInterface) ConfigClient { return &configClient{cc} } func (c *configClient) SetConfig(ctx context.Context, in *SetConfigRequest, opts ...grpc.CallOption) (*SetConfigReply, error) { out := new(SetConfigReply) err := c.cc.Invoke(ctx, "/v1.Config/SetConfig", in, out, opts...) if err != nil { return nil, err } return out, nil } func (c *configClient) SetAdjustment(ctx context.Context, in *SetAdjustmentRequest, opts ...grpc.CallOption) (*SetAdjustmentReply, error) { out := new(SetAdjustmentReply) err := c.cc.Invoke(ctx, "/v1.Config/SetAdjustment", in, out, opts...) if err != nil { return nil, err } return out, nil } // ConfigServer is the server API for Config service. // All implementations must embed UnimplementedConfigServer // for forward compatibility type ConfigServer interface { SetConfig(context.Context, *SetConfigRequest) (*SetConfigReply, error) SetAdjustment(context.Context, *SetAdjustmentRequest) (*SetAdjustmentReply, error) mustEmbedUnimplementedConfigServer() } // UnimplementedConfigServer must be embedded to have forward compatible implementations. type UnimplementedConfigServer struct { } func (UnimplementedConfigServer) SetConfig(context.Context, *SetConfigRequest) (*SetConfigReply, error) { return nil, status.Errorf(codes.Unimplemented, "method SetConfig not implemented") } func (UnimplementedConfigServer) SetAdjustment(context.Context, *SetAdjustmentRequest) (*SetAdjustmentReply, error) { return nil, status.Errorf(codes.Unimplemented, "method SetAdjustment not implemented") } func (UnimplementedConfigServer) mustEmbedUnimplementedConfigServer() {} // UnsafeConfigServer may be embedded to opt out of forward compatibility for this service. // Use of this interface is not recommended, as added methods to ConfigServer will // result in compilation errors. 
type UnsafeConfigServer interface { mustEmbedUnimplementedConfigServer() } func RegisterConfigServer(s grpc.ServiceRegistrar, srv ConfigServer) { s.RegisterService(&Config_ServiceDesc, srv) } func _Config_SetConfig_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(SetConfigRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(ConfigServer).SetConfig(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Config/SetConfig", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(ConfigServer).SetConfig(ctx, req.(*SetConfigRequest)) } return interceptor(ctx, in, info, handler) } func _Config_SetAdjustment_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(SetAdjustmentRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(ConfigServer).SetAdjustment(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Config/SetAdjustment", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(ConfigServer).SetAdjustment(ctx, req.(*SetAdjustmentRequest)) } return interceptor(ctx, in, info, handler) } // Config_ServiceDesc is the grpc.ServiceDesc for Config service. // It's only intended for direct use with grpc.RegisterService, // and not to be introspected or modified (even as a copy) var Config_ServiceDesc = grpc.ServiceDesc{ ServiceName: "v1.Config", HandlerType: (*ConfigServer)(nil), Methods: []grpc.MethodDesc{ { MethodName: "SetConfig", Handler: _Config_SetConfig_Handler, }, { MethodName: "SetAdjustment", Handler: _Config_SetAdjustment_Handler, }, }, Streams: []grpc.StreamDesc{}, Metadata: "pkg/cri/resource-manager/config/api/v1/api.proto", } ================================================ FILE: pkg/cri/resource-manager/config/config.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package config import ( extapi "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" ) // RawConfig represents the resource manager config data in unparsed form, as // received from the agent. type RawConfig struct { // NodeName is the node name the agent used to acquire configuration. NodeName string // Data is the raw ConfigMap data for this node. Data map[string]string } // Adjustment represents external adjustments for this node. type Adjustment struct { // Adjustments contains all adjustment CRDs for this node. Adjustments map[string]*extapi.AdjustmentSpec } // HasIdenticalData returns true if RawConfig has identical data to the supplied one. 
func (c *RawConfig) HasIdenticalData(data map[string]string) bool { if c == nil && data == nil { return true } if c == nil || data == nil { return false } if len(c.Data) != len(data) { return false } for k, v := range c.Data { if dv, found := data[k]; !found || dv != v { return false } } for dk, dv := range data { if v, found := c.Data[dk]; !found || v != dv { return false } } return true } ================================================ FILE: pkg/cri/resource-manager/config/server.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package config import ( "context" "fmt" "net" "os" "path/filepath" "sync" "google.golang.org/grpc" v1 "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config/api/v1" "github.com/intel/cri-resource-manager/pkg/log" "encoding/json" extapi "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" ) const ( SocketDisabled = "disabled" ) // SetConfigCb is a callback function for a SetConfig request. type SetConfigCb func(*RawConfig) error // SetAdjustmentCb is a callback function for a SetAdjustment request. type SetAdjustmentCb func(*Adjustment) map[string]error // Server is the interface for our gRPC server. type Server interface { Start(string) error Stop() } // server implements Server. type server struct { v1.UnimplementedConfigServer log.Logger socket string // configured socket sync.Mutex // lock for concurrent per-request goroutines. server *grpc.Server // gRPC server instance setConfigCb SetConfigCb // configuration update notification callback setAdjustmentCb SetAdjustmentCb // external adjustment update notification callback } // NewConfigServer creates a new Server instance. func NewConfigServer(configCb SetConfigCb, adjustmentCb SetAdjustmentCb) (Server, error) { s := &server{ Logger: log.NewLogger("config-server"), setConfigCb: configCb, setAdjustmentCb: adjustmentCb, } return s, nil } // Start runs the server instance. func (s *server) Start(socket string) error { if socket == SocketDisabled || socket == "" { s.Info("config-server is disabled...") return nil } // Make sure we have a directory for the socket if err := os.MkdirAll(filepath.Dir(socket), 0700); err != nil { return serverError("failed to create directory for socket %s: %v", socket, err) } // Remove socket file if it exists if err := os.Remove(socket); err != nil && !os.IsNotExist(err) { return serverError("failed to unlink socket file: %s", err) } // Create server listening for local unix domain socket lis, err := net.Listen("unix", socket) if err != nil { return serverError("failed to listen to socket: %v", err) } serverOpts := []grpc.ServerOption{} s.server = grpc.NewServer(serverOpts...)
v1.RegisterConfigServer(s.server, s) s.Info("starting config-server at socket %s...", socket) go func() { defer lis.Close() err := s.server.Serve(lis) if err != nil { s.Fatal("config-server died: %v", err) } }() return nil } // Stop Server instance func (s *server) Stop() { if s.server != nil { s.server.Stop() s.server = nil } } // SetConfig pushes a configuration update to the server. func (s *server) SetConfig(_ context.Context, req *v1.SetConfigRequest) (*v1.SetConfigReply, error) { s.Lock() defer s.Unlock() s.Debug("SetConfig request: %+v", req) reply := &v1.SetConfigReply{} err := s.setConfigCb(&RawConfig{NodeName: req.NodeName, Data: req.Config}) if err != nil { reply.Error = fmt.Sprintf("failed to apply configuration: %v", err) } return reply, nil } // SetAdjustment pushes updated external policies to the server. func (s *server) SetAdjustment(_ context.Context, req *v1.SetAdjustmentRequest) (*v1.SetAdjustmentReply, error) { s.Lock() defer s.Unlock() s.Debug("SetAdjustment request: %+v", req) errors := map[string]error{} specs := map[string]*extapi.AdjustmentSpec{} if err := json.Unmarshal([]byte(req.Adjustment), &specs); err != nil { return nil, serverError("failed to decode SetAdjustment request: %v", err) } for name, spec := range specs { if err := spec.Verify(); err != nil { errors[name] = err } } if len(errors) == 0 { errors = s.setAdjustmentCb(&Adjustment{Adjustments: specs}) } reply := &v1.SetAdjustmentReply{Errors: make(map[string]string)} for str, err := range errors { reply.Errors[str] = err.Error() } return reply, nil } func serverError(format string, args ...interface{}) error { return fmt.Errorf(format, args...) } ================================================ FILE: pkg/cri/resource-manager/control/blockio/blockio.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package blockio import ( "errors" "fmt" "github.com/intel/cri-resource-manager/pkg/blockio" "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // BlockIOController is the name of the block I/O controller. BlockIOController = cache.BlockIO ) // blockio encapsulates the runtime state of our block I/O enforcement/controller. type blockioctl struct { cache cache.Cache // resource manager cache idle *bool // true if we run without any classes configured } // Our logger instance. var log logger.Logger = logger.NewLogger(BlockIOController) // Our singleton block I/O controller instance. var singleton *blockioctl // getBlockIOController returns our singleton block I/O controller instance. 
func getBlockIOController() *blockioctl { if singleton == nil { singleton = &blockioctl{} } return singleton } // Start initializes the controller for enforcing decisions. func (ctl *blockioctl) Start(cache cache.Cache, _ client.Client) error { ctl.cache = cache ctl.reconfigureRunningContainers() return nil } // Stop shuts down the controller. func (ctl *blockioctl) Stop() { } // PreCreateHook is the block I/O controller pre-create hook. func (ctl *blockioctl) PreCreateHook(_ cache.Container) error { return nil } // PreStartHook is the block I/O controller pre-start hook. func (ctl *blockioctl) PreStartHook(_ cache.Container) error { return nil } // PostStartHook is the block I/O controller post-start hook. func (ctl *blockioctl) PostStartHook(c cache.Container) error { if !c.HasPending(BlockIOController) { return nil } if err := ctl.assign(c); err != nil { return err } c.ClearPending(BlockIOController) return nil } // PostUpdateHook is the block I/O controller post-update hook. func (ctl *blockioctl) PostUpdateHook(c cache.Container) error { if !c.HasPending(BlockIOController) { return nil } if err := ctl.assign(c); err != nil { return err } c.ClearPending(BlockIOController) return nil } // PostStopHook is the block I/O controller post-stop hook. func (ctl *blockioctl) PostStopHook(_ cache.Container) error { return nil } // isImplicitlyDisabled checks if we run without any classes configured. func (ctl *blockioctl) isImplicitlyDisabled() bool { if ctl.idle != nil { return *ctl.idle } idle := len(blockio.GetClasses()) == 0 if idle { log.Warn("controller implicitly disabled (no configured classes)") } ctl.idle = &idle return *ctl.idle } // assign assigns the container to the given block I/O class. func (ctl *blockioctl) assign(c cache.Container) error { class := c.GetBlockIOClass() if class == "" { return nil } if ctl.isImplicitlyDisabled() && cache.IsPodQOSClassName(class) { return nil } if err := blockio.SetContainerClass(c, class); err != nil { return blockioError("%q: failed to assign to class %q: %w", c.PrettyName(), class, err) } log.Info("%q: assigned to class %q", c.PrettyName(), class) return nil } // configNotify is the configuration callback for blockio class mappings and class definitions. func (ctl *blockioctl) configNotify(event config.Event, _ config.Source) error { ignoreErrors := (event == config.RevertEvent) err := blockio.UpdateOciConfig(ignoreErrors) if err != nil { return err } // Possible errors in reconfiguring running containers are not errors in // the updated configuration and are therefore silently ignored. ctl.reconfigureRunningContainers() // We'll re-check idleness at the next operation/request. ctl.idle = nil return nil } // reconfigureRunningContainers forces the current blockio configuration onto all containers running on the node. func (ctl *blockioctl) reconfigureRunningContainers() error { errs := []error{} if ctl.cache == nil { return nil } for _, c := range ctl.cache.GetContainers() { class := c.GetBlockIOClass() log.Debug("%q: configure blockio class %q", c.PrettyName(), class) err := blockio.SetContainerClass(c, class) if err != nil { errs = append(errs, err) } } return errors.Join(errs...) } // blockioError creates a block I/O-controller-specific formatted error message. func blockioError(format string, args ...interface{}) error { return fmt.Errorf("blockio: "+format, args...) } // init registers this controller and sets configuration change handling.
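//
// Registration follows the common controller pattern: a singleton instance
// is handed to the control package under a well-known name, and a config
// callback is attached so class definitions can be updated at runtime.
// A sketch of the same shape for a hypothetical controller:
//
//	control.Register("my-controller", "My example controller", getMyController())
//	config.GetModule("my-config-module").AddNotify(getMyController().configNotify)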
func init() { control.Register(BlockIOController, "Block I/O controller", getBlockIOController()) config.GetModule(blockio.ConfigModuleName).AddNotify(getBlockIOController().configNotify) } ================================================ FILE: pkg/cri/resource-manager/control/control.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "fmt" "sort" "strings" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" logger "github.com/intel/cri-resource-manager/pkg/log" ) // Control is the interface for triggering controller-/domain-specific post-decision actions. type Control interface { // StartStopControllers starts/stops all controllers according to configuration. StartStopControllers(cache.Cache, client.Client) error // PreCreateHooks runs the pre-create hooks of all registered controllers. RunPreCreateHooks(cache.Container) error // RunPreStartHooks runs the pre-start hooks of all registered controllers. RunPreStartHooks(cache.Container) error // RunPostStartHooks runs the post-start hooks of all registered controllers. RunPostStartHooks(cache.Container) error // RunPostUpdateHooks runs the post-update hooks of all registered controllers. RunPostUpdateHooks(cache.Container) error // RunPostStopHooks runs the post-stop hooks of all registered controllers. RunPostStopHooks(cache.Container) error } // Controller is the interface all resource controllers must implement. type Controller interface { // Start prepares the controller for resource control/decision enforcement. Start(cache.Cache, client.Client) error // Stop shuts down the controller. Stop() // PreCreateHook is the controller's pre-create hook. PreCreateHook(cache.Container) error // PreStartHook is the controller's pre-start hook. PreStartHook(cache.Container) error // PostStartHook is the controller's post-start hook. PostStartHook(cache.Container) error // PostUpdateHook is the controller's post-update hook. PostUpdateHook(cache.Container) error // PostStopHook is the controller's post-stop hook. PostStopHook(cache.Container) error } // control encapsulates our controller-agnostic runtime state. type control struct { cache cache.Cache // resource manager cache client client.Client // resource manager CRI client controllers []*controller // active controllers } // controller represents a single registered controller. type controller struct { name string // controller name description string // controller description c Controller // controller interface mode mode // controller mode running bool // whether the controller is running } // our hook names const ( precreate = "pre-create" prestart = "pre-start" poststart = "post-start" postupdate = "post-update" poststop = "post-stop" ) // All registered controllers. var controllers = make(map[string]*controller) // Our logger instance. 
var log logger.Logger = logger.NewLogger("resource-control")

// NewControl creates a new controller-agnostic instance.
func NewControl() (Control, error) {
	c := &control{}
	for _, controller := range controllers {
		c.controllers = append(c.controllers, controller)
	}
	sort.Slice(c.controllers, func(i, j int) bool {
		return strings.Compare(c.controllers[i].name, c.controllers[j].name) < 0
	})
	return c, nil
}

// StartStopControllers starts/stops all controllers according to configuration.
func (c *control) StartStopControllers(cache cache.Cache, client client.Client) error {
	c.cache = cache
	c.client = client
	log.Info("syncing controllers with configuration...")
	for _, controller := range c.controllers {
		if controller.mode == Disabled {
			if controller.running {
				controller.c.Stop()
				controller.running = false
			}
			log.Info("controller %s: disabled", controller.name)
			continue
		}
		if controller.running {
			log.Info("controller %s: running", controller.name)
			continue
		}
		err := controller.c.Start(cache, client)
		if err != nil {
			log.Error("controller %s: failed to start: %v", controller.name, err)
			controller.running = false
			switch controller.mode {
			case Required:
				return controlError("%s failed to start: %v", controller.name, err)
			case Optional, Relaxed:
				log.Warn("disabling %s, failed to start: %v", controller.name, err)
				controller.mode = Disabled
			}
		} else {
			controller.running = true
			if controller.mode == Optional {
				controller.mode = Required
			}
		}
	}
	for _, controller := range c.controllers {
		state := map[bool]string{false: "inactive", true: "running"}
		log.Info("controller %s is now %s, mode %s", controller.name, state[controller.running], controller.mode)
	}
	return nil
}

// RunPreCreateHooks runs all registered controllers' PreCreate hooks.
func (c *control) RunPreCreateHooks(container cache.Container) error {
	for _, controller := range c.controllers {
		if err := c.runhook(controller, precreate, container); err != nil {
			return err
		}
	}
	return nil
}

// RunPreStartHooks runs all registered controllers' PreStart hooks.
func (c *control) RunPreStartHooks(container cache.Container) error {
	for _, controller := range c.controllers {
		if err := c.runhook(controller, prestart, container); err != nil {
			return err
		}
	}
	return nil
}

// RunPostStartHooks runs all registered controllers' PostStart hooks.
func (c *control) RunPostStartHooks(container cache.Container) error {
	for _, controller := range c.controllers {
		if err := c.runhook(controller, poststart, container); err != nil {
			return err
		}
	}
	return nil
}

// RunPostUpdateHooks runs all registered controllers' PostUpdate hooks.
func (c *control) RunPostUpdateHooks(container cache.Container) error {
	for _, controller := range c.controllers {
		if err := c.runhook(controller, postupdate, container); err != nil {
			return err
		}
	}
	return nil
}
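// As a hedged sketch of how a new controller plugs into this package: a
// minimal implementation satisfies the Controller interface and registers
// itself from an init() function in its own package. Everything below is
// illustrative only; the "noop" name is hypothetical.
//
//	type noopctl struct{}
//
//	func (n *noopctl) Start(cache.Cache, client.Client) error { return nil }
//	func (n *noopctl) Stop()                                  {}
//	func (n *noopctl) PreCreateHook(cache.Container) error    { return nil }
//	func (n *noopctl) PreStartHook(cache.Container) error     { return nil }
//	func (n *noopctl) PostStartHook(cache.Container) error    { return nil }
//	func (n *noopctl) PostUpdateHook(cache.Container) error   { return nil }
//	func (n *noopctl) PostStopHook(cache.Container) error     { return nil }
//
//	func init() {
//		control.Register("noop", "no-op demo controller", &noopctl{})
//	}

// RunPostStopHooks runs all registered controllers' PostStop hooks.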
func (c *control) RunPostStopHooks(container cache.Container) error { for _, controller := range c.controllers { if err := c.runhook(controller, poststop, container); err != nil { return err } } return nil } // runhook executes the given container hook according to the controller settings func (c *control) runhook(controller *controller, hook string, container cache.Container) error { if controller.mode == Disabled || !controller.running { return nil } var fn func(cache.Container) error switch hook { case precreate: fn = controller.c.PreCreateHook case prestart: fn = controller.c.PreStartHook case poststart: fn = controller.c.PostStartHook case postupdate: fn = controller.c.PostUpdateHook case poststop: fn = controller.c.PostStopHook } log.Debug("running %s %s hook for container %s", controller.name, hook, container.PrettyName()) if err := fn(container); err != nil { if controller.mode == Required { return controlError("%s %s hook failed: %v", controller.name, hook, err) } log.Error("%s %s hook failed: %v", controller.name, hook, err) } return nil } // Register registers a new controller. func Register(name, description string, c Controller) error { log.Info("registering controller %s...", name) if oc, ok := controllers[name]; ok { return controlError("controller %s (%s) already registered.", oc.name, oc.description) } controllers[name] = &controller{ name: name, description: description, c: c, } return nil } // controlError returns a controller-specific formatted error. func controlError(format string, args ...interface{}) error { return fmt.Errorf("control: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/control/cpu/api.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cpu import ( "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/goresctrl/pkg/utils" ) // GetClasses returns all available CPU classes. func GetClasses() map[string]Class { return getCPUController().config.getClasses() } // Assign assigns a set of cpus to a class. // // TODO: Drop this function. Don't store cpu class in policy data but implement // controller-specific data store in cache. func Assign(c cache.Cache, class string, cpus ...int) error { // NOTE: no locking implemented anywhere around -> we don't expect multiple parallel callers // Store the class assignment. Assign cpus to a class and remove them from // other classes assignments := *getClassAssignments(c) if this, ok := assignments[class]; !ok { assignments[class] = utils.NewIDSetFromIntSlice(cpus...) } else { this.Add(cpus...) } for k, v := range assignments { if k != class { v.Del(cpus...) // Don't store empty classes, serves as a garbage collector, too if v.Size() == 0 { delete(assignments, k) } } } setClassAssignments(c, &assignments) if getCPUController().started { // We don't want to try to enforce until the controller has been fully // started. 
Enforcement of all assignments happens on Start(), anyway.
		ctl := getCPUController()
		if err := ctl.enforceCpufreq(class, cpus...); err != nil {
			log.Error("cpufreq enforcement failed: %v", err)
		}
		if err := ctl.enforceUncore(assignments, cpus...); err != nil {
			log.Error("uncore frequency enforcement failed: %v", err)
		}
	}
	return nil
}

================================================
FILE: pkg/cri/resource-manager/control/cpu/cache.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cpu

import (
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/goresctrl/pkg/utils"
)

const (
	cacheKeyCPUAssignments = "CPUClassAssignments"
)

// cpuClassAssignments contains the information about how cpus are assigned to
// classes
type cpuClassAssignments map[string]utils.IDSet

// Get the state of CPU class assignments from cache
func getClassAssignments(c cache.Cache) *cpuClassAssignments {
	a := &cpuClassAssignments{}
	if !c.GetPolicyEntry(cacheKeyCPUAssignments, a) {
		log.Error("no cached state of CPU class assignments found")
	}
	return a
}

// Save the state of CPU class assignments in cache
func setClassAssignments(c cache.Cache, a *cpuClassAssignments) {
	c.SetPolicyEntry(cacheKeyCPUAssignments, cache.Cachable(a))
}

// Set the value of cached cpuClassAssignments
func (c *cpuClassAssignments) Set(value interface{}) {
	switch value.(type) {
	case cpuClassAssignments:
		*c = value.(cpuClassAssignments)
	case *cpuClassAssignments:
		cp := value.(*cpuClassAssignments)
		*c = *cp
	}
}

// Get cached cpuClassAssignments
func (c *cpuClassAssignments) Get() interface{} {
	return *c
}

================================================
FILE: pkg/cri/resource-manager/control/cpu/cpu.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cpu

import (
	"fmt"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/client"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control"
	logger "github.com/intel/cri-resource-manager/pkg/log"
	"github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	"github.com/intel/goresctrl/pkg/utils"
)
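// A hedged sketch of a cpu controller configuration fragment; the keys
// mirror the json tags of the Class type defined below, but the class
// names and frequency values (kHz) are illustrative assumptions only:
//
//	cpu:
//	  classes:
//	    low-power:
//	      minFreq: 800000
//	      maxFreq: 1600000
//	    turbo:
//	      minFreq: 2000000
//	      maxFreq: 3600000
//	      uncoreMinFreq: 1200000
//	      uncoreMaxFreq: 2000000

const (
	// ConfigModuleName is the configuration section for the CPU controller.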
	ConfigModuleName = "cpu"
	// CPUController is the name of the CPU controller.
	CPUController = cache.CPU
)

// cpuctl encapsulates the runtime state of our CPU enforcement/controller.
type cpuctl struct {
	cache   cache.Cache  // resource manager cache
	system  sysfs.System // system topology
	config  *config
	started bool
}

type config struct {
	Classes map[string]Class `json:"classes"`
	// Private field for storing info if we need to care about uncore
	uncoreEnabled bool
}

type Class struct {
	MinFreq                     uint `json:"minFreq"`
	MaxFreq                     uint `json:"maxFreq"`
	EnergyPerformancePreference uint `json:"energyPerformancePreference"`
	UncoreMinFreq               uint `json:"uncoreMinFreq"`
	UncoreMaxFreq               uint `json:"uncoreMaxFreq"`
}

var log logger.Logger = logger.NewLogger(CPUController)

// Our controller singleton instance.
var singleton *cpuctl

// getCPUController returns the (singleton) CPU controller instance.
func getCPUController() *cpuctl {
	if singleton == nil {
		singleton = &cpuctl{}
		singleton.config = singleton.defaultOptions().(*config)
	}
	return singleton
}

// Start initializes the controller for enforcing decisions.
func (ctl *cpuctl) Start(cache cache.Cache, _ client.Client) error {
	sys, err := sysfs.DiscoverSystem()
	if err != nil {
		return fmt.Errorf("failed to discover system topology: %w", err)
	}
	ctl.system = sys
	ctl.cache = cache

	// DEBUG: dump the class assignments we have stored in the cache
	log.Debug("retrieved cpu class assignments from cache:\n%s", utils.DumpJSON(getClassAssignments(ctl.cache)))

	if err := ctl.configure(); err != nil {
		// Just print an error. A config update later on may be valid.
		log.Error("failed to apply initial configuration: %v", err)
	}

	// TODO: We probably could just remove this and the hooks if they are not used
	pkgcfg.GetModule(ConfigModuleName).AddNotify(getCPUController().configNotify)

	ctl.started = true
	return nil
}

// Stop shuts down the controller.
func (ctl *cpuctl) Stop() {
}

// PreCreateHook handler for the CPU controller.
func (ctl *cpuctl) PreCreateHook(_ cache.Container) error {
	return nil
}

// PreStartHook handler for the CPU controller.
func (ctl *cpuctl) PreStartHook(_ cache.Container) error {
	return nil
}

// PostStartHook handler for the CPU controller.
func (ctl *cpuctl) PostStartHook(_ cache.Container) error {
	return nil
}

// PostUpdateHook handler for the CPU controller.
func (ctl *cpuctl) PostUpdateHook(_ cache.Container) error {
	return nil
}

// PostStopHook handler for the CPU controller.
func (ctl *cpuctl) PostStopHook(_ cache.Container) error {
	return nil
}

// enforceCpufreq enforces a class-specific cpufreq configuration to a cpuset
func (ctl *cpuctl) enforceCpufreq(class string, cpus ...int) error {
	if _, ok := ctl.config.Classes[class]; !ok {
		return fmt.Errorf("non-existent cpu class %q", class)
	}

	min := int(ctl.config.Classes[class].MinFreq)
	max := int(ctl.config.Classes[class].MaxFreq)
	log.Debug("enforcing cpu frequency limits {%d, %d} from class %q on %v", min, max, class, cpus)

	if err := utils.SetCPUsScalingMinFreq(cpus, min); err != nil {
		return fmt.Errorf("Cannot set min freq %d: %w", min, err)
	}
	if err := utils.SetCPUsScalingMaxFreq(cpus, max); err != nil {
		return fmt.Errorf("Cannot set max freq %d: %w", max, err)
	}

	return nil
}

// enforceUncore enforces uncore frequency limits
func (ctl *cpuctl) enforceUncore(assignments cpuClassAssignments, affectedCPUs ...int) error {
	if !ctl.config.uncoreEnabled {
		return nil
	}

	cpus := cpuset.New(affectedCPUs...)
for _, cpuPkgID := range ctl.system.PackageIDs() { cpuPkg := ctl.system.Package(cpuPkgID) for _, cpuDieID := range cpuPkg.DieIDs() { dieCPUs := cpuPkg.DieCPUSet(cpuDieID) // Check if this die is affected by the specified cpuset if cpus.Size() == 0 || dieCPUs.Intersection(cpus).Size() > 0 { min, max, minCls, maxCls := effectiveUncoreFreqs(utils.NewIDSet(dieCPUs.List()...), ctl.config.Classes, assignments) if min == 0 && max == 0 { log.Debug("no uncore frequency limits for cpu package/die %d/%d", cpuPkgID, cpuDieID) continue } log.Debug("enforcing uncore min freq to %d (class %q), max freq to %d (class %q) on cpu package/die %d/%d", min, minCls, max, maxCls, cpuPkgID, cpuDieID) if min > 0 { if max > 0 && min > max { log.Warn("uncore frequency limit min > max (%d > %d) on cpu package/die %d/%d", min, max, cpuPkgID, cpuDieID) } if err := utils.SetUncoreMinFreq(cpuPkgID, cpuDieID, int(min)); err != nil { return err } } if max > 0 { if err := utils.SetUncoreMaxFreq(cpuPkgID, cpuDieID, int(max)); err != nil { return err } } } } } return nil } // effectiveUncoreClasses resolves the effective classes for setting the uncore // frequency limits for a cpu package/die. It has "performance preference" so // that the highest value (for both min and max) of the cpu classes effective // on the die is selected. func effectiveUncoreFreqs(cpus utils.IDSet, classes map[string]Class, assignments cpuClassAssignments) (minFreq, maxFreq uint, minCls, maxCls string) { for className, assignedCPUs := range assignments { // Check if this class is "effective" on the specified cpuset if idSetIntersects(cpus, assignedCPUs) { class := classes[className] if class.UncoreMinFreq > minFreq { minCls = className minFreq = class.UncoreMinFreq } if class.UncoreMaxFreq > maxFreq { maxCls = className maxFreq = class.UncoreMaxFreq } } } return minFreq, maxFreq, minCls, maxCls } func idSetIntersects(a, b utils.IDSet) bool { // Try to optimize the search for unbalanced idsets if len(a) < len(b) { for id := range a { if _, ok := b[id]; ok { return true } } } else { for id := range b { if _, ok := a[id]; ok { return true } } } return false } func (ctl *cpuctl) configure() error { // Re-configure CPUs that are assigned to some known class assignments := *getClassAssignments(ctl.cache) // DEBUG: dump the class assignments we have stored in the cache log.Debug("applying cpu controller configuration:\n%s", utils.DumpJSON(ctl.config)) // Sanity check uncoreAvailable := utils.UncoreFreqAvailable() for name, conf := range ctl.config.Classes { if conf.UncoreMinFreq != 0 || conf.UncoreMaxFreq != 0 { if !uncoreAvailable { return fmt.Errorf("uncore limits set in cpu class %q but uncore driver not available in the system, make sure that the intel_uncore_frequency driver is loaded", name) } ctl.config.uncoreEnabled = true break } } // Configure the system for class, cpus := range assignments { if _, ok := ctl.config.Classes[class]; ok { // Re-configure cpus (sysfs) according to new class parameters if err := ctl.enforceCpufreq(class, cpus.SortedMembers()...); err != nil { log.Error("cpufreq enforcement on re-configure failed: %v", err) } } else { // TODO: what should we really do with classes that do not exist in // the configuration anymore? Now we remember the CPUs assigned to // them. A further config update might re-introduce the class in // which case the CPUs will be reconfigured. 
log.Warn("class %q with cpus %v missing from the configuration", class, cpus) } } if err := ctl.enforceUncore(assignments); err != nil { log.Error("uncore frequency enforcement on re-configure failed: %v", err) } log.Debug("cpu controller configured") return nil } // Callback for runtime configuration notifications. func (ctl *cpuctl) configNotify(_ pkgcfg.Event, _ pkgcfg.Source) error { if !ctl.started { // We don't want to configure until the controller has been fully // started and initialized. We will configure on Start(), anyway. return nil } log.Info("configuration update, applying new config") return ctl.configure() } func (ctl *cpuctl) defaultOptions() interface{} { return &config{} } func (c *config) getClasses() map[string]Class { ret := make(map[string]Class, len(c.Classes)) for k, v := range c.Classes { ret[k] = v } return ret } // Register us as a controller. func init() { control.Register(CPUController, "CPU controller", getCPUController()) pkgcfg.Register(ConfigModuleName, "CPU control", getCPUController().config, getCPUController().defaultOptions) } ================================================ FILE: pkg/cri/resource-manager/control/cri/cri.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cri import ( "fmt" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // CRIController is the name of this controller. CRIController = cache.CRI ) // crictl encapsulated the runtime state of our CRI enforcement/controller. type crictl struct { cache cache.Cache client client.Client } // Our logger instance. var log logger.Logger = logger.NewLogger(CRIController) // Our CRI controller singleton instance. var singleton *crictl // getCRIController returns our singleton CRI controller instance. func getCRIController() control.Controller { if singleton == nil { singleton = &crictl{} } return singleton } // Start initializes the controller for enforcing decisions. func (ctl *crictl) Start(cache cache.Cache, client client.Client) error { ctl.cache = cache ctl.client = client return nil } // Stop shuts down the controller. func (ctl *crictl) Stop() { } // PreCreateHook is the CRI controller pre-create hook. 
func (ctl *crictl) PreCreateHook(c cache.Container) error {
	if !c.HasPending(CRIController) {
		log.Debug("pre-create hook: no pending changes for %s", c.PrettyName())
		return nil
	}

	log.Debug("pre-create hook: updating %s", c.PrettyName())

	request, ok := c.GetCRIRequest()
	if !ok {
		return criError("pre-create hook: no pending CRI request")
	}
	create, ok := request.(*criv1.CreateContainerRequest)
	if !ok {
		return criError("pre-create hook: pending CRI request of wrong type (%T)", request)
	}

	create.Config.Command = c.GetCommand()
	create.Config.Args = c.GetArgs()
	create.Config.Labels = c.GetLabels()
	create.Config.Annotations = c.GetAnnotations()
	create.Config.Envs = c.GetCRIEnvs()
	create.Config.Mounts = c.GetCRIMounts()
	create.Config.Devices = c.GetCRIDevices()
	if create.Config.Linux == nil {
		create.Config.Linux = &criv1.LinuxContainerConfig{}
	}
	create.Config.Linux.Resources = c.GetLinuxResources()

	c.ClearPending(CRIController)
	return nil
}

// PreStartHook is the CRI controller pre-start hook.
func (ctl *crictl) PreStartHook(_ cache.Container) error {
	return nil
}

// PostStartHook is the CRI controller post-start hook.
func (ctl *crictl) PostStartHook(_ cache.Container) error {
	return nil
}

// PostUpdateHook is the CRI controller post-update hook.
func (ctl *crictl) PostUpdateHook(c cache.Container) error {
	var update *criv1.UpdateContainerResourcesRequest

	if !c.HasPending(CRIController) {
		log.Debug("post-update hook: no changes for %s", c.PrettyName())
		return nil
	}

	log.Debug("post-update hook: updating %s", c.PrettyName())

	resources := c.GetLinuxResources()
	if resources == nil {
		return nil
	}

	request, ok := c.GetCRIRequest()
	if !ok {
		update = &criv1.UpdateContainerResourcesRequest{
			ContainerId: c.GetID(),
		}
		c.SetCRIRequest(update)
	} else {
		if update, ok = request.(*criv1.UpdateContainerResourcesRequest); !ok {
			return criError("post-update hook: CRI request of wrong type (%T)", request)
		}
	}
	update.Linux = resources

	c.ClearPending(CRIController)
	return nil
}

// PostStopHook is the CRI controller post-stop hook.
func (ctl *crictl) PostStopHook(_ cache.Container) error {
	return nil
}

// criError creates a CRI-controller-specific formatted error message.
func criError(format string, args ...interface{}) error {
	return fmt.Errorf("cri: "+format, args...)
}

// Register us as a controller.
func init() {
	control.Register(CRIController, "CRI controller", getCRIController())
}

================================================
FILE: pkg/cri/resource-manager/control/flags.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package control

import (
	"encoding/json"
	"fmt"
	"strings"

	"github.com/intel/cri-resource-manager/pkg/config"
)

// options captures our runtime configuration.
type options struct {
	Controllers map[string]mode
}

// Our runtime configuration.
var opt = defaultOptions().(*options)
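// For illustration, a configuration fragment selecting per-controller modes
// might look like the following. This is a hedged sketch: the controller
// names are examples and the exact YAML nesting for the
// "resource-manager.control" module path is an assumption, but the mode
// strings are the ones accepted by UnmarshalJSON below:
//
//	resource-manager:
//	  control:
//	    Controllers:
//	      blockio: optional
//	      rdt: required
//	      page-migrate: disabled

// mode describes how errors for the controller should be treated.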
type mode int

const (
	// Disabled controllers are stopped, hooks are not run.
	Disabled mode = iota
	// Required controllers must start, hooks must succeed.
	Required
	// Optional controllers are Disabled if they can't start, otherwise they are Required.
	Optional
	// Relaxed controllers are Disabled if they can't start, hook failures are not errors.
	Relaxed
	// Default mode is Relaxed.
	Default = Relaxed
)

// ControllerMode returns the current mode for the given controller.
func (o *options) ControllerMode(name string) mode {
	if m, ok := o.Controllers[name]; ok {
		return m
	}
	return Default
}

// configNotify is our configuration update notification callback.
func (o *options) configNotify(_ config.Event, _ config.Source) error {
	log.Info("configuration updated")
	for name, controller := range controllers {
		controller.mode = o.ControllerMode(name)
	}
	return nil
}

// String returns the string representation of a mode.
func (m mode) String() string {
	switch m {
	case Disabled:
		return "disabled"
	case Required:
		return "required"
	case Optional:
		return "optional"
	case Relaxed:
		return "relaxed"
	default:
		return fmt.Sprintf("<invalid mode %d>", m)
	}
}

// MarshalJSON is the JSON marshaller for mode.
func (m mode) MarshalJSON() ([]byte, error) {
	return json.Marshal(m.String())
}

// UnmarshalJSON is the JSON unmarshaller for mode.
func (m *mode) UnmarshalJSON(raw []byte) error {
	var str string
	if err := json.Unmarshal(raw, &str); err != nil {
		return controlError("failed to unmarshal mode: %v", err)
	}
	switch strings.ToLower(str) {
	case "disabled", "disable":
		*m = Disabled
	case "required", "mandatory":
		*m = Required
	case "optional":
		*m = Optional
	case "relaxed":
		*m = Relaxed
	default:
		return controlError("invalid mode %s", str)
	}
	return nil
}

// defaultOptions returns a new options instance, all initialized to defaults.
func defaultOptions() interface{} {
	return &options{Controllers: make(map[string]mode)}
}

// Register us for configuration handling.
func init() {
	config.Register("resource-manager.control", "Resource control.", opt, defaultOptions, config.WithNotify(opt.configNotify))
}

================================================
FILE: pkg/cri/resource-manager/control/memory/memory.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package memory

import (
	"fmt"
	"os"
	"strconv"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
	"github.com/intel/cri-resource-manager/pkg/cri/client"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control"
	logger "github.com/intel/cri-resource-manager/pkg/log"
)

const (
	// MemoryController is the name of the memory controller.
	MemoryController = cache.Memory
	// memoryCgroupPath is the path to the root of the memory cgroup.
	memoryCgroupPath = "/sys/fs/cgroup/memory"
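	// For a container whose cgroup dir resolves to, e.g. (hypothetical path),
	// kubepods/burstable/pod<uid>/<cid>, setToptierLimit() below ends up
	// writing the limit in bytes to
	//   /sys/fs/cgroup/memory/kubepods/burstable/pod<uid>/<cid>/memory.toptier_soft_limit_in_bytes
	// This is a sketch of the effective path, assuming a cgroup v1 layout and
	// a patched kernel that provides this control file.

	// toptierSoftLimitControl is the memory cgroup entry to set top tier soft limit.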
toptierSoftLimitControl = "memory.toptier_soft_limit_in_bytes" ) // memctl encapsulates the runtime state of our memory enforcement/controller. type memctl struct { cache cache.Cache // resource manager cache disabled bool // true, if kernel lacks the necessary cgroup controls } // Our logger instance. var log logger.Logger = logger.NewLogger(MemoryController) // Our singleton memory controller instance. var singleton *memctl // getMemoryController returns our singleton memory controller instance. func getMemoryController() *memctl { if singleton == nil { singleton = &memctl{} } return singleton } // Start initializes the controller for enforcing decisions. func (ctl *memctl) Start(cache cache.Cache, _ client.Client) error { // Let's keep this off for now so we can exercise this without a patched kernel... if !ctl.checkToptierLimitSupport() { return memctlError("cgroup top tier memory limit control not available") } ctl.cache = cache return nil } // Stop shuts down the controller. func (ctl *memctl) Stop() { } // PreCreateHook is the memory controller pre-create hook. func (ctl *memctl) PreCreateHook(_ cache.Container) error { return nil } // PreStartHook is the memory controller pre-start hook. func (ctl *memctl) PreStartHook(_ cache.Container) error { return nil } // PostStartHook is the memory controller post-start hook. func (ctl *memctl) PostStartHook(c cache.Container) error { if !c.HasPending(MemoryController) { return nil } if err := ctl.setToptierLimit(c); err != nil { return err } c.ClearPending(MemoryController) return nil } // PostUpdateHook is the memory controller post-update hook. func (ctl *memctl) PostUpdateHook(c cache.Container) error { if !c.HasPending(MemoryController) { return nil } if err := ctl.setToptierLimit(c); err != nil { return err } c.ClearPending(MemoryController) return nil } // PostStop is the memory controller post-stop hook. func (ctl *memctl) PostStopHook(_ cache.Container) error { return nil } // Check if memory cgroup controller supports top tier soft limits. func (ctl *memctl) checkToptierLimitSupport() bool { _, err := os.Stat(memoryCgroupPath + "/" + toptierSoftLimitControl) if err != nil && os.IsNotExist(err) { log.Warn("cgroup top tier memory limit control not available") ctl.disabled = true } return !ctl.disabled } // setToptierLimit sets the top tier memory (soft) limit for the container. func (ctl *memctl) setToptierLimit(c cache.Container) error { dir := c.GetCgroupDir() if dir == "" { return memctlError("%q: failed to determine cgroup directory", c.PrettyName()) } limit := strconv.FormatInt(c.GetToptierLimit(), 10) group := cgroups.Memory.Group(dir) entry := toptierSoftLimitControl if err := group.Write(entry, "%s\n", limit); err != nil { return err } log.Info("%q: memory toptier soft limit set to %v", c.PrettyName(), limit) return nil } // memctlError creates a memory I/O-controller-specific formatted error message. func memctlError(format string, args ...interface{}) error { return fmt.Errorf("memory: "+format, args...) } // init registers this controller. func init() { control.Register(MemoryController, "memory toptier controller", getMemoryController()) } ================================================ FILE: pkg/cri/resource-manager/control/page-migrate/demoter.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pagemigrate

import (
	"encoding/binary"
	"fmt"
	"io"
	"math/rand"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
	"github.com/intel/cri-resource-manager/pkg/config"
	idset "github.com/intel/goresctrl/pkg/utils"
)

// Support dynamic pushing of unused pages from DRAM to PMEM.
//
// The algorithm is (roughly) this:
//
// Find out which processes belong to the container. For every process in the
// container, find out which pages the process uses. Using move_pages(), push a
// number of pages not in the working set, which are present in DRAM, from DRAM
// to PMEM. This may need to be done many times with a delay in between,
// because the process will be "stuck" when the pages are moved. Repeat this
// process.
//
// How to figure out which pages are not part of the working set:
//
// 1. Clear soft-dirty bits on the PTEs:
//    https://www.kernel.org/doc/html/latest/admin-guide/mm/soft-dirty.html
// 2. Wait for a while.
// 3. Read out the process page maps:
//    https://www.kernel.org/doc/html/latest/admin-guide/mm/pagemap.html
//    The pages which don't have the soft-dirty bit are considered to be
//    outside of the working set.

type page struct {
	pid  int
	addr uint64
}

type addrRange struct {
	addr   uint64
	length uint64
}

type demoter struct {
	migration *migration // controller backpointer

	// Finding pages
	dirtyBitReset time.Ticker      // Ticker for resetting the dirty bits.
	dirtyBitStop  chan interface{} // Channel for stopping the ticker.

	// Moving pages
	pageMover         PageMover
	containerDemoters map[string]chan interface{} // Channels for sending pagemap updates to demoters.
	pageScanInterval  config.Duration             // How often should we scan pages.
	pageMoveInterval  config.Duration             // How often should we move pages for a container.
	maxPageMoveCount  uint                        // How many pages to move at once.
}

type pagePool struct {
	pages        map[int][]page
	longestRange uint
}

type demotion struct {
	pagePool    pagePool
	targetNodes idset.IDSet
}

func copyPagePool(p pagePool) pagePool {
	c := pagePool{
		longestRange: p.longestRange,
		pages:        make(map[int][]page, 0),
	}
	for pid, pages := range p.pages {
		c.pages[pid] = make([]page, len(pages))
		copy(c.pages[pid], pages)
	}
	return c
}

func newDemoter(m *migration) *demoter {
	return &demoter{
		migration:         m,
		containerDemoters: make(map[string]chan interface{}, 0),
		pageMover:         &linuxPageMover{},
	}
}

func (d *demoter) start() {
	if d.pageScanInterval > 0 && d.pageMoveInterval > 0 && d.maxPageMoveCount > 0 {
		log.Info("scanning pages every %s, moving max. %d pages every %s",
			d.pageScanInterval.String(), d.maxPageMoveCount, d.pageMoveInterval.String())
		d.startDirtyBitResetTimer()
	} else {
		log.Info("scanning pages is disabled")
	}
}

// Stop stops page scanning and demotion.
func (d *demoter) Stop() {
	d.stopDirtyBitResetTimer()
	d.migration.Lock()
	defer d.migration.Unlock()
	d.stopDemoters()
}
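// For reference, the soft-dirty cycle the code below implements can be
// reproduced manually from a shell (a sketch; $pid is any traced process):
//
//	echo 4 > /proc/$pid/clear_refs   # clear soft-dirty bits on all PTEs
//	sleep 60                         # let the working set get re-dirtied
//	# then read /proc/$pid/pagemap: entries with bit 55 unset were not
//	# written to during the interval and are demotion candidates
//	# (bit 63 = present, bit 56 = exclusively mapped).

// Reconfigure restarts, if necessary, page scanning and demotion with new options.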
func (d *demoter) Reconfigure() {
	if d.pageScanInterval != opt.PageScanInterval ||
		d.pageMoveInterval != opt.PageMoveInterval ||
		d.maxPageMoveCount != opt.MaxPageMoveCount {
		d.Stop()
		d.pageScanInterval = opt.PageScanInterval
		d.pageMoveInterval = opt.PageMoveInterval
		d.maxPageMoveCount = opt.MaxPageMoveCount
	}
	d.start()
}

func (d *demoter) updateDemoter(cid string, p pagePool, targetNodes idset.IDSet) {
	channel, found := d.containerDemoters[cid]
	if !found {
		channel := make(chan interface{})
		go func() {
			moveTimer := time.NewTicker(time.Duration(d.pageMoveInterval))
			moveTimerChan := moveTimer.C
			pagePool := p
			nodes := targetNodes
			count := d.maxPageMoveCount
			for {
				select {
				case msg := <-channel:
					demotion, ok := msg.(demotion)
					if ok {
						pagePool = demotion.pagePool
						nodes = demotion.targetNodes
						if p.longestRange > d.maxPageMoveCount {
							// The number of pages moved needs to be at least as large as a range in the
							// numa_maps file so that we know that all pages will be moved (even if some
							// of them were already on the PMEM node).
							// TODO: adjust the timer if we have a larger-than-usual range of pages to move.
							count = p.longestRange
						} else {
							count = d.maxPageMoveCount
						}
					} else {
						// A stop request.
						if moveTimer != nil {
							moveTimer.Stop()
						}
						return
					}
				case <-moveTimerChan:
					err := d.movePages(pagePool, count, nodes)
					if err != nil {
						log.Error("Error demoting pages: %s", err)
					}
				}
			}
		}()
		d.containerDemoters[cid] = channel
		// TODO: trigger instant update when run the first time?
	} else {
		channel <- demotion{pagePool: p, targetNodes: targetNodes}
	}
}

func (d *demoter) stopDemoter(cid string) {
	channel, found := d.containerDemoters[cid]
	if found {
		channel <- "stop"
		delete(d.containerDemoters, cid)
	}
}

func (d *demoter) stopUnusedDemoters(cs map[string]*container) {
	for id := range d.containerDemoters {
		if _, found := cs[id]; !found {
			d.stopDemoter(id)
		}
	}
}

func (d *demoter) stopDemoters() {
	for cid, channel := range d.containerDemoters {
		channel <- "stop"
		delete(d.containerDemoters, cid)
	}
}

func (d *demoter) stopDirtyBitResetTimer() {
	if d.dirtyBitStop != nil {
		close(d.dirtyBitStop)
		d.dirtyBitStop = nil
	}
}

func (d *demoter) startDirtyBitResetTimer() {
	if d.dirtyBitStop != nil {
		return
	}
	stop := make(chan interface{})
	go func() {
		dirtyBitResetTimer := time.NewTicker(time.Duration(d.pageScanInterval))
		dirtyBitResetChan := dirtyBitResetTimer.C
		for {
			select {
			case <-stop:
				if dirtyBitResetTimer != nil {
					dirtyBitResetTimer.Stop()
				}
				return
			case <-dirtyBitResetChan:
				d.scanPages()
			}
		}
	}()
	d.dirtyBitStop = stop
}

func resetDirtyBit(pid string) error {
	// Write magic value "4" to the clear_refs file. This resets the soft-dirty bits.
	path := "/proc/" + pid + "/clear_refs"
	err := os.WriteFile(path, []byte("4"), 0600)
	return err
}

// resetDirtyBit unsets soft-dirty bits for all processes in a container.
func (d *demoter) resetDirtyBit(c *container) error {
	group := cgroups.Memory.Group(c.cgroupDir)
	pids, err := group.GetProcesses()
	if err != nil {
		return err
	}
	for _, pid := range pids {
		err = resetDirtyBit(pid)
		if err != nil {
			log.Error("%s: failed to reset dirty bit for process %s: %v", c.prettyName, pid, err)
			return err
		}
	}
	return nil
}
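// The per-container demoter goroutine created in updateDemoter() above
// speaks a small ad-hoc protocol over its channel: a demotion{} value
// replaces the page pool and target nodes, while any other value (the
// literal "stop" string is what the code sends) terminates the goroutine.
// A hedged usage sketch:
//
//	d.updateDemoter(cid, pool, pmemNodes) // create or update the demoter
//	d.stopDemoter(cid)                    // send "stop", drop the channel

// scanPages scans pages of tracked containers to detect idle ones.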
func (d *demoter) scanPages() { d.migration.Lock() defer d.migration.Unlock() for _, container := range d.migration.containers { pm := container.GetPageMigration() if pm == nil { continue } dramNodes := pm.SourceNodes pmemNodes := pm.TargetNodes if dramNodes.Size() == 0 || pmemNodes.Size() == 0 { continue } // Gather the known pages which need to be moved. pagePool, err := d.getPagesForContainer(container, dramNodes) if err != nil { log.Error("failed to get pages for container %v", container.prettyName) continue } count := 0 for _, pages := range pagePool.pages { count += len(pages) } log.Debug("%d pages for (maybe) demoting for %v", count, container.prettyName) // Reset the dirty bit from all pages. d.resetDirtyBit(container) // Give the pages to the page moving goroutine. Copy the page pool so that there's no race. d.updateDemoter(container.GetCacheID(), copyPagePool(pagePool), pmemNodes.Clone()) } d.stopUnusedDemoters(d.migration.containers) } func (d *demoter) getPagesForContainer(c *container, sourceNodes idset.IDSet) (pagePool, error) { pool := pagePool{ pages: make(map[int][]page, 0), longestRange: 0, } group := cgroups.Memory.Group(c.cgroupDir) pids, err := group.GetProcesses() if err != nil { return pagePool{}, err } for _, pid := range pids { addressRanges := make([]addrRange, 0) pidNumber64, err := strconv.ParseInt(pid, 10, 32) if err != nil { log.Error("Failed to parse addr to int: %v", err) continue } pidNumber := int(pidNumber64) // Read /proc/pid/numa_maps and /proc/pid/maps numaMapsPath := "/proc/" + pid + "/numa_maps" numaMapsBytes, err := os.ReadFile(numaMapsPath) if err != nil { log.Error("Could not read numa_maps: %v", err) continue } mapsPath := "/proc/" + pid + "/maps" mapsBytes, err := os.ReadFile(mapsPath) if err != nil { log.Error("Could not read maps: %v\n", err) continue } mapsLines := strings.Split(string(mapsBytes), "\n") for _, line := range strings.Split(string(numaMapsBytes), "\n") { tokens := strings.Split(line, " ") if len(tokens) < 3 { continue } attrs := strings.Join(tokens[2:], " ") // Filter out lines which don't have "anonymous", since we are not // interested in file-mapped or shared pages. Save the interesting ranges. // TODO: consider dropping the "heap" requirement. There are often ranges // in the file which don't have any attributes indicating the memory // location. if !strings.Contains(attrs, "heap") || !strings.Contains(attrs, "anon=") { continue } // We only find out if *any* pages in the range are in a DRAM node. The // more fine-grained analysis is done later by running the move_pages() // system call twice. 
locatedOnDRAMNode := false for node := range sourceNodes { number := strconv.FormatInt(int64(node), 10) str := "N" + number + "=" if strings.Contains(attrs, str) { locatedOnDRAMNode = true break } } if !locatedOnDRAMNode { continue } for _, mapLine := range mapsLines { if strings.HasPrefix(mapLine, tokens[0]+"-") { spaceIndex := strings.Index(mapLine, " ") if spaceIndex > len(tokens[0]+"-") { endAddrStr := mapLine[len(tokens[0]+"-"):spaceIndex] startAddr, err := strconv.ParseInt(tokens[0], 16, 64) if err != nil { log.Error("Failed to parse addr to int: %v\n", err) break } endAddr, err := strconv.ParseInt(endAddrStr, 16, 64) if err != nil { log.Error("Failed to parse addr to int: %v\n", err) break } rangeLength := endAddr - startAddr addressRanges = append(addressRanges, addrRange{uint64(startAddr), uint64(rangeLength / int64(os.Getpagesize()))}) // log.Debug("found interesting page range for pid %s: %v", pid, addressRanges[len(addressRanges)-1]) break } } } } // Read /proc/pid/pagemap and process only interesting page ranges. For // every read-only page and for every page with the soft-dirty bit on, mark // them as candidates to be moved by adding them to pagePool. if len(addressRanges) > 0 { // log.Debug("Getting pages for PID %s for ranges %v", pid, addressRanges) pages := make([]page, 0) path := "/proc/" + pid + "/pagemap" pageMap, err := os.OpenFile(path, os.O_RDONLY, 0) if err != nil { // Probably the process just died? fmt.Printf("Could not read pagemaps: %v\n", err) break } for _, addressRange := range addressRanges { idx := int64(addressRange.addr / uint64(os.Getpagesize()) * 8) offset, err := pageMap.Seek(idx, io.SeekStart) if err != nil { // Maybe there was a race condition and the maps changed? log.Error("Failed to seek: %v\n", err) continue } for i := uint64(0); i < addressRange.length; i++ { bytes := make([]byte, 8) // Read exactly 8 bytes (because the file interface breaks otherwise). _, err = io.ReadAtLeast(pageMap, bytes, 8) if err != nil { // Possibly the maps changed. log.Error("Could not read data from pagemaps(%v)(page size: %d, seek offset: %d): %v\n", idx, os.Getpagesize(), offset, err) break } data := binary.LittleEndian.Uint64(bytes) // Check that the page is present (not swapped), exclusively // mapped (not used by any other process), and it has the // soft-dirty bit off. // Note: there appears to be no way to see from the pagemap entry what the NUMA node is. // We could map this back to the physical address ranges if needed. Currently this is handled // in movePages() by calling move_pages() first with an empty node array. softDirtyBit := uint64(0x1) << 55 exclusiveBit := uint64(0x1) << 56 presentBit := uint64(0x1) << 63 present := (data&presentBit == presentBit) exclusive := (data&exclusiveBit == exclusiveBit) softDirty := (data&softDirtyBit == softDirtyBit) if present && exclusive && !softDirty { // log.Debug("page a candidate for moving: 0x%08x", addressRange.addr+i*uint64(os.Getpagesize())) pages = append(pages, page{addr: addressRange.addr + i*uint64(os.Getpagesize()), pid: pidNumber}) } } } if _, found := pool.pages[pidNumber]; found { pool.pages[pidNumber] = append(pool.pages[pidNumber], pages...) } else { pool.pages[pidNumber] = pages } if uint(len(addressRanges)) > pool.longestRange { pool.longestRange = uint(len(addressRanges)) } } } return pool, nil } func pickClosestPMEMNode(targetNodes idset.IDSet) idset.ID { // TODO: analyze the topology information (and possibly the amount of free memory) and choose the "best" // PMEM node to demote the page to. 
The array targetNodes already contains only the subset of PMEM nodes
	// available in this topology subtree. Right now just pick a random node.
	nodes := targetNodes.Members()
	return nodes[rand.Intn(len(nodes))]
}

func (d *demoter) movePagesForPid(p []page, count uint, pid int, targetNodes idset.IDSet) (uint, error) {
	// We move at max count pages, but there might not be that much.
	nPages := count
	if uint(len(p)) < count {
		nPages = uint(len(p))
	}

	// Gather memory page pointers.
	pages := make([]uintptr, nPages)
	var i uint
	for i = 0; i < nPages; i++ {
		pages[i] = uintptr(p[i].addr)
	}

	// MPOL_MF_MOVE - only move pages exclusive to this process. There will be
	// permission denied errors for pages which couldn't be moved. FIXME: find
	// out if the whole move_pages() syscall failed or if just the non-exclusive
	// pages were not moved.
	flags := 1 << 1

	// Call move_pages() first with nil nodes array to find out the current nodes.
	_, currentStatus, err := d.pageMover.MovePagesSyscall(pid, nPages, pages, nil, flags)
	if err != nil {
		log.Error("Failed to find out the current status of the pages: %v.", err)
		return 0, err
	}

	dramPages := make([]uintptr, 0)
	nodes := make([]int, 0)

	// Choose a target node for every page. Drop the pages which already are on the right node from the list.
	for i, pageStatus := range currentStatus {
		if pageStatus < 0 {
			// There was an error regarding this page.
			continue
		}
		// log.Debug("page 0x%08X: old status %d", pages[i], pageStatus)
		if !targetNodes.Has(idset.ID(pageStatus)) {
			// In case of many PMEM nodes choose the one that is the closest.
			dramPages = append(dramPages, pages[i])
			nodes = append(nodes, int(pickClosestPMEMNode(targetNodes)))
		}
		// else no need to move.
	}

	// Call move_pages() to actually move the pages.
	_, _, err = d.pageMover.MovePagesSyscall(pid, uint(len(dramPages)), dramPages, nodes, flags)

	// We processed (moved or ignored) at least nPages.
	return nPages, err
}

func (d *demoter) movePages(p pagePool, count uint, targetNodes idset.IDSet) error {
	// Select pid for moving the pages so that the process with the largest number
	// of non-dirty pages gets the pages moved first.
	processedPids := make(map[int]bool, 0)
	for count > 0 {
		mostPagesPid := 0
		nPagesForPid := uint(0)
		for pid, pages := range p.pages {
			_, alreadyProcessed := processedPids[pid]
			if alreadyProcessed {
				continue
			}
			if uint(len(pages)) > nPagesForPid {
				mostPagesPid = pid
				nPagesForPid = uint(len(pages))
			}
		}
		if nPagesForPid == 0 {
			return nil
		}
		processedPids[mostPagesPid] = true
		nMovePages := nPagesForPid
		if count < nPagesForPid {
			nMovePages = count
			count = 0
		} else {
			count -= nPagesForPid
		}
		log.Debug("moving %d pages for pid %d", nMovePages, mostPagesPid)
		nPages, err := d.movePagesForPid(p.pages[mostPagesPid], nMovePages, mostPagesPid, targetNodes)
		if err != nil {
			log.Error("Failed to move pages: %v", err)
			return err
		}
		// Remove processed pages from the pagemap.
		p.pages[mostPagesPid] = p.pages[mostPagesPid][nPages:]
	}
	return nil
}
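// In essence, movePagesForPid() above drives move_pages(2) twice; the flow
// can be sketched as (illustrative pseudo-Go, error handling omitted):
//
//	// 1st call: nil nodes => only query the current node of each page
//	_, status, _ := mover.MovePagesSyscall(pid, n, pages, nil, flags)
//	// 2nd call: pages still on DRAM, each tagged with a chosen PMEM node
//	_, _, _ = mover.MovePagesSyscall(pid, uint(len(toMove)), toMove, nodes, flags)
//
// The first call is how the kernel exposes "where is this page now";
// entries with a negative status are per-page errors and are skipped.

================================================
FILE: pkg/cri/resource-manager/control/page-migrate/demoter_test.go
================================================
// Copyright 2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.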
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagemigrate import ( "fmt" idset "github.com/intel/goresctrl/pkg/utils" "testing" ) type mockPageMover struct { firstSuccess bool secondSuccess bool expectedPagesForSecondCall uint firstStatus []int } func (m *mockPageMover) MovePagesSyscall(pid int, count uint, pages []uintptr, nodes []int, flags int) (uint, []int, error) { status := make([]int, len(pages)) fmt.Printf("move_pages(): pid %d, count %d, pages %v, nodes %v, flags %d\n", pid, count, pages, nodes, flags) if nodes == nil { // First call is made without nodes if m.firstSuccess == false { return 0, m.firstStatus, fmt.Errorf("Fake error") } return 0, m.firstStatus, nil } // Second call if m.secondSuccess == false { return 0, status, fmt.Errorf("Fake error") } if uint(len(pages)) != m.expectedPagesForSecondCall { return 0, status, fmt.Errorf("Real error") } return 0, status, nil } func TestMovePages(t *testing.T) { tcases := []struct { name string pool pagePool targetNodes idset.IDSet pageCount uint expectedRemainingPageCount uint expectedError bool pageMover PageMover pid int }{ { name: "move pages (both)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: true, firstStatus: []int{0, 0}, expectedPagesForSecondCall: 2, }, targetNodes: idset.NewIDSet(1, 2), expectedError: false, expectedRemainingPageCount: 0, }, { name: "move pages (only one)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: true, firstStatus: []int{0, 2}, expectedPagesForSecondCall: 1, }, targetNodes: idset.NewIDSet(1, 2), expectedError: false, expectedRemainingPageCount: 0, }, { name: "move pages (none)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: true, firstStatus: []int{2, 1}, expectedPagesForSecondCall: 0, }, targetNodes: idset.NewIDSet(1, 2), expectedError: false, expectedRemainingPageCount: 0, }, { name: "move pages (count 1)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 1, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: true, firstStatus: []int{0}, expectedPagesForSecondCall: 1, }, targetNodes: idset.NewIDSet(1, 2), expectedError: false, expectedRemainingPageCount: 1, }, { name: "move pages (first call error)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: false, secondSuccess: true, firstStatus: []int{0, 0}, expectedPagesForSecondCall: 0, }, targetNodes: idset.NewIDSet(1, 2), expectedError: true, expectedRemainingPageCount: 2, }, { name: "move pages (second call error)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 
0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: false, firstStatus: []int{0, 0}, expectedPagesForSecondCall: 0, }, targetNodes: idset.NewIDSet(1, 2), expectedError: true, expectedRemainingPageCount: 2, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { dynamicDemoter := &demoter{ maxPageMoveCount: tc.pageCount, pageMover: tc.pageMover, } err := dynamicDemoter.movePages(tc.pool, tc.pageCount, tc.targetNodes) if err != nil { if err.Error() != "Fake error" { t.Errorf("Non-fake error: %v", err) } } if (err != nil) != tc.expectedError { t.Errorf("Unexpected error value") } if uint(len(tc.pool.pages[tc.pid])) != tc.expectedRemainingPageCount { t.Errorf("Wrong number of remaining pages: %d", len(tc.pool.pages[tc.pid])) } }) } } ================================================ FILE: pkg/cri/resource-manager/control/page-migrate/flags.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagemigrate import ( "github.com/intel/cri-resource-manager/pkg/config" ) // options captures our configurable controller parameters. type options struct { // PageScanInterval controls how much time we give containers to touch non-idle pages. PageScanInterval config.Duration // PageMoveInterval controls how often we trigger moving pages. PageMoveInterval config.Duration // MaxPageMoveCount controls how many pages we can move in a single go. MaxPageMoveCount uint } // Our runtime configuration. var opt = defaultOptions().(*options) // defaultOptions returns a new options instance, all initialized to defaults. func defaultOptions() interface{} { return &options{} } // Register us for configuration handling. func init() { config.Register(PageMigrationConfigPath, PageMigrationDescription, opt, defaultOptions) } ================================================ FILE: pkg/cri/resource-manager/control/page-migrate/page-migrate.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package pagemigrate import ( "fmt" "sync" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // PageMigrationController is the name/domain of the page migration controller. PageMigrationController = cache.PageMigration // PageMigrationConfigPath is the configuration path for the page migration controller. PageMigrationConfigPath = "resource-manager.control." + PageMigrationController // PageMigrationDescription is the description for the page migration controller. PageMigrationDescription = "page migration controller" ) // migration implements the controller for memory page migration. type migration struct { cache cache.Cache // resource manager cache sync.Mutex // protect access from multiple goroutines containers map[string]*container // containers we migrate demoter *demoter // demoter adopted from topology-aware policy } // // The resource manager serializes access to the cache during request // processing, event processing, and configuration updates by locking // the resource-manager for each of these. Since controller hooks are // invoked either as part of processing a request or an event, access // to the cache from hooks is properly serialized. // // Page scanning or migration on the other hand happen asynchronously // from dedicated goroutines. In order to avoid having to serialize // access to the cache for these, we track and cache locally just enough // data about containers that we can perform these actions completely on // our own, without the need to access the resource manager cache at all. // // An alternative would have been to duplicate what we had originally in // the policy: // - introduce controller events akin to policy events // - have the resource-manager call controller event handlers with the // lock held // - periodically inject a controller event when we want to scan pages // - perform page scanning or demotion from the event handler with the // resource-manager lock held // // However that would have destroyed one of the goals of splitting page // scanning and migration out to a controller of its own, which was to // perform these potentially time consuming actions without blocking // concurrent processing of requests or events. // // container is the per container data we track locally. type container struct { cacheID string id string prettyName string cgroupDir string pm *cache.PageMigrate } // Our logger instance. var log = logger.NewLogger(PageMigrationController) // Our singleton page migration controller. var singleton *migration // getMigrationController returns our singleton controller instance. func getMigrationController() *migration { if singleton == nil { singleton = &migration{ containers: make(map[string]*container), } singleton.demoter = newDemoter(singleton) } return singleton } // Start prepares the controller for resource control/decision enforcement. func (m *migration) Start(cache cache.Cache, _ client.Client) error { m.cache = cache m.syncWithCache() m.demoter.Reconfigure() return nil } // Stop shuts down the controller. func (m *migration) Stop() { m.demoter.Stop() } // PreCreateHook is the controller's pre-create hook. func (m *migration) PreCreateHook(cache.Container) error { return nil } // PreStartHook is the controller's pre-start hook. 
func (m *migration) PreStartHook(cache.Container) error { return nil } // PostStartHook is the controller's post-start hook. func (m *migration) PostStartHook(cc cache.Container) error { m.Lock() defer m.Unlock() err := m.insertContainer(cc) cc.ClearPending(PageMigrationController) return err } // PostUpdateHook is the controller's post-update hook. func (m *migration) PostUpdateHook(cc cache.Container) error { m.Lock() defer m.Unlock() m.updateContainer(cc) cc.ClearPending(PageMigrationController) return nil } // PostStopHook is the controller's post-stop hook. func (m *migration) PostStopHook(cc cache.Container) error { m.Lock() defer m.Unlock() m.deleteContainer(cc) return nil } // syncWithCache synchronizes tracked containers with the cache. func (m *migration) syncWithCache() { m.Lock() defer m.Unlock() m.containers = make(map[string]*container) for _, cc := range m.cache.GetContainers() { m.insertContainer(cc) } } // insertContainer creates a local copy of the container. func (m *migration) insertContainer(cc cache.Container) error { pm := cc.GetPageMigration() if pm == nil { return nil } c := &container{ cacheID: cc.GetCacheID(), id: cc.GetID(), prettyName: cc.PrettyName(), cgroupDir: cc.GetCgroupDir(), pm: pm.Clone(), } if c.cgroupDir == "" { return migrationError("can't find cgroup dir for container %s", c.prettyName) } m.containers[c.cacheID] = c return nil } // updateContainer updates the local copy of the container. func (m *migration) updateContainer(cc cache.Container) error { pm := cc.GetPageMigration() if pm == nil { delete(m.containers, cc.GetCacheID()) return nil } c, ok := m.containers[cc.GetCacheID()] if !ok { return m.insertContainer(cc) } c.pm = pm.Clone() return nil } // deleteContainer deletes the local copy of the container. func (m *migration) deleteContainer(cc cache.Container) error { delete(m.containers, cc.GetCacheID()) return nil } // GetCacheID replicates the respective cache.Container function. func (c *container) GetCacheID() string { return c.cacheID } // GetID replicates the respective cache.Container function. func (c *container) GetID() string { return c.id } // GetCgroupDir replicates the respective cache.Container function. func (c *container) GetCgroupDir() string { return c.cgroupDir } // GetPageMigration replicates the respective cache.Container function. func (c *container) GetPageMigration() *cache.PageMigrate { return c.pm } // PrettyName replicates the respective cache.Container function. func (c *container) PrettyName() string { return c.prettyName } // init registers this controller. func init() { control.Register(PageMigrationController, "page migration controller", getMigrationController()) } // migrationError creates a controller-specific formatted error message. func migrationError(format string, args ...interface{}) error { return fmt.Errorf("page-migrate: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/control/page-migrate/page-mover.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagemigrate import "C" import ( "fmt" "unsafe" "golang.org/x/sys/unix" ) type linuxPageMover struct{} // PageMover abstracts how pages are moved on a given HW/SW platform. type PageMover interface { MovePagesSyscall(pid int, count uint, pages []uintptr, nodes []int, flags int) (uint, []int, error) } func (m *linuxPageMover) MovePagesSyscall(pid int, count uint, pages []uintptr, nodes []int, flags int) (uint, []int, error) { // syscall: // long move_pages(int pid, unsigned long count, void **pages, // const int *nodes, int *status, int flags); var err error if count == 0 { return 0, []int{}, nil } // Go int is 64 bits on a 64-bit system, but C int is only guaranteed to be at least 16 bits, typically 32. cNodes := make([]C.int, len(nodes)) for i := 0; i < len(nodes); i++ { if nodes[i] < 0 || nodes[i] > 32767 { return 0, []int{}, fmt.Errorf("int value error: %d", nodes[i]) } cNodes[i] = C.int(nodes[i]) // safe downcast } cStatus := make([]C.int, len(pages)) nodesPtr := unsafe.Pointer(nil) if nodes != nil { nodesPtr = unsafe.Pointer(&cNodes[0]) } ret, _, en := unix.Syscall6(unix.SYS_MOVE_PAGES, uintptr(pid), uintptr(count), uintptr(unsafe.Pointer(&pages[0])), uintptr(nodesPtr), uintptr(unsafe.Pointer(&cStatus[0])), uintptr(flags)) if en != 0 { err = unix.Errno(en) } // log.Debug("move_pages(): pid %d, count %d, pages %v, nodes %v, flags %d: return value %d, status %d, errno %v", // pid, count, pages, nodes, flags, uint(ret), cStatus, err) status := make([]int, count) for i := uint(0); i < count; i++ { status[i] = int(cStatus[i]) } return uint(ret), status, err } ================================================ FILE: pkg/cri/resource-manager/control/rdt/rdt.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package rdt import ( "fmt" corev1 "k8s.io/api/core/v1" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/metrics" "github.com/intel/goresctrl/pkg/rdt" ) const ( // ConfigModuleName is the configuration section for RDT ConfigModuleName = "rdt" // RDTController is the name of the RDT controller. RDTController = cache.RDT resctrlGroupPrefix = "cri-resmgr." ) // rdtctl encapsulates the runtime state of our RDT enforcement/controller.
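// ---------------------------------------------------------------------------
// Example (illustrative, Linux-only sketch; not part of this repository):
// move_pages(2), wrapped by MovePagesSyscall above, doubles as a query API.
// With a NULL nodes array the kernel moves nothing and only fills in the
// status array with the node each page currently resides on; pid 0 means
// the calling process.
package main

import (
	"fmt"
	"unsafe"

	"golang.org/x/sys/unix"
)

func main() {
	// Map one anonymous page and fault it in so it has a backing node.
	buf, err := unix.Mmap(-1, 0, 4096, unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
	if err != nil {
		panic(err)
	}
	defer unix.Munmap(buf)
	buf[0] = 1

	pages := []uintptr{uintptr(unsafe.Pointer(&buf[0]))}
	status := make([]int32, len(pages)) // status array of C ints

	_, _, errno := unix.Syscall6(unix.SYS_MOVE_PAGES,
		0, uintptr(len(pages)),
		uintptr(unsafe.Pointer(&pages[0])),
		0, // NULL nodes array: query-only mode, move nothing
		uintptr(unsafe.Pointer(&status[0])),
		0)
	if errno != 0 {
		fmt.Println("move_pages failed:", errno)
		return
	}
	fmt.Println("page resides on NUMA node", status[0])
}
// ---------------------------------------------------------------------------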
type rdtctl struct { cache cache.Cache // resource manager cache noQoSClasses bool // true if mapping pod qos class to rdt class is disabled mode OperatingMode // track the mode here to capture mode changes opt *config } type config struct { rdt.Config Options struct { rdt.Options Mode OperatingMode `json:"mode"` MonitoringDisabled bool `json:"monitoringDisabled"` } `json:"options"` } type OperatingMode string const ( OperatingModeDisabled OperatingMode = "Disabled" OperatingModeDiscovery OperatingMode = "Discovery" OperatingModeFull OperatingMode = "Full" ) // Our logger instance. var log logger.Logger = logger.NewLogger(RDTController) // our RDT controller singleton instance. var singleton *rdtctl // getRDTController returns our singleton RDT controller instance. func getRDTController() *rdtctl { if singleton == nil { singleton = &rdtctl{} singleton.opt = singleton.defaultOptions().(*config) } return singleton } // Start initializes the controller for enforcing decisions. func (ctl *rdtctl) Start(cache cache.Cache, _ client.Client) error { if err := rdt.Initialize(resctrlGroupPrefix); err != nil { return rdtError("failed to initialize RDT controls: %v", err) } ctl.cache = cache if err := ctl.configure(); err != nil { // Just print an error. A config update later on may be valid. log.Error("failed to apply initial configuration: %v", err) } rdt.RegisterCustomPrometheusLabels("pod_name", "container_name") err := metrics.RegisterCollector("rdt", rdt.NewCollector) if err != nil { log.Error("failed to register rdt collector: %v", err) } pkgcfg.GetModule(ConfigModuleName).AddNotify(getRDTController().configNotify) return nil } // Stop shuts down the controller. func (ctl *rdtctl) Stop() { } // PreCreateHook is the RDT controller pre-create hook. func (ctl *rdtctl) PreCreateHook(_ cache.Container) error { return nil } // PreStartHook is the RDT controller pre-start hook. func (ctl *rdtctl) PreStartHook(_ cache.Container) error { return nil } // PostStartHook is the RDT controller post-start hook. func (ctl *rdtctl) PostStartHook(c cache.Container) error { if !c.HasPending(RDTController) { return nil } if err := ctl.assign(c); err != nil { return err } c.ClearPending(RDTController) return nil } // PostUpdateHook is the RDT controller post-update hook. func (ctl *rdtctl) PostUpdateHook(c cache.Container) error { if !c.HasPending(RDTController) { return nil } if err := ctl.assign(c); err != nil { return err } c.ClearPending(RDTController) return nil } // PostStopHook is the RDT controller post-stop hook.
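// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): the config
// struct above decodes the "rdt" configuration section. The trimmed
// stand-in type below shows only the extended options and how a fragment
// such as
//
//	rdt:
//	  options:
//	    mode: Discovery
//	    monitoringDisabled: true
//
// would populate them; the real struct additionally embeds the goresctrl
// rdt.Config and rdt.Options types.
package main

import (
	"encoding/json"
	"fmt"
)

type rdtConfig struct {
	Options struct {
		Mode               string `json:"mode"`
		MonitoringDisabled bool   `json:"monitoringDisabled"`
	} `json:"options"`
}

func main() {
	raw := []byte(`{"options":{"mode":"Discovery","monitoringDisabled":true}}`)
	var cfg rdtConfig
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("mode=%s, monitoring disabled=%v\n",
		cfg.Options.Mode, cfg.Options.MonitoringDisabled)
}
// ---------------------------------------------------------------------------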
func (ctl *rdtctl) PostStopHook(c cache.Container) error { if err := ctl.stopMonitor(c); err != nil { return rdtError("%q: failed to remove monitoring group: %v", c.PrettyName(), err) } return nil } // assign assigns all processes/threads in a container to the correct class func (ctl *rdtctl) assign(c cache.Container) error { if ctl.opt.Options.Mode == OperatingModeDisabled { return nil } class := c.GetRDTClass() switch class { case "": class = rdt.RootClassName case cache.RDTClassPodQoS: if ctl.noQoSClasses { class = rdt.RootClassName } else { class = string(c.GetQOSClass()) } } err := ctl.assignClass(c, class) if err != nil && class != rdt.RootClassName { log.Warn("%v; falling back to system root class", err) return ctl.assignClass(c, rdt.RootClassName) } return err } // assignClass assigns all processes/threads in a container to the specified class func (ctl *rdtctl) assignClass(c cache.Container, class string) error { cls, ok := rdt.GetClass(class) if !ok { return rdtError("%q: unknown RDT class %q", c.PrettyName(), class) } pod, ok := c.GetPod() if !ok { return rdtError("%q: failed to get pod", c.PrettyName()) } pids, err := c.GetProcesses() if err != nil { return rdtError("%q: failed to get process list: %v", c.PrettyName(), err) } if err := cls.AddPids(pids...); err != nil { return rdtError("%q: failed to assign to class %q: %v", c.PrettyName(), class, err) } pretty := c.PrettyName() if _, ok := cls.GetMonGroup(pretty); !ok || ctl.monitoringDisabled() { ctl.stopMonitor(c) } if !ctl.monitoringDisabled() { pname, name, id := pod.GetName(), c.GetName(), c.GetID() if err := ctl.monitor(cls, pname, name, id, pretty, pids); err != nil { return err } } log.Info("%q: assigned to class %q", pretty, class) return nil } // monitor starts monitoring a container. func (ctl *rdtctl) monitor(cls rdt.CtrlGroup, pod, name, id, pretty string, pids []string) error { if !rdt.MonSupported() { return nil } annotations := map[string]string{"pod_name": pod, "container_name": name} if mg, err := cls.CreateMonGroup(id, annotations); err != nil { log.Warn("%q: failed to create monitoring group: %v", pretty, err) } else { if err := mg.AddPids(pids...); err != nil { return rdtError("%q: failed to assign to monitoring group %q: %v", pretty, cls.Name()+"/"+mg.Name(), err) } log.Info("%q: assigned to monitoring group %q", pretty, cls.Name()+"/"+mg.Name()) } return nil } // stopMonitor stops monitoring a container. 
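// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): the class
// resolution in assign() above, restated as a pure function for clarity.
// podQoSSentinel stands in for cache.RDTClassPodQoS and rootClass for
// rdt.RootClassName; both names below are placeholders.
package example

// resolveClass maps a container's requested RDT class to the effective one.
func resolveClass(requested, qosClass, rootClass, podQoSSentinel string, noQoSClasses bool) string {
	switch requested {
	case "":
		// No class requested: use the system root class.
		return rootClass
	case podQoSSentinel:
		// Map the pod QoS class to an RDT class of the same name,
		// unless QoS-class mapping is disabled.
		if noQoSClasses {
			return rootClass
		}
		return qosClass
	default:
		// An explicitly requested class is used as-is; assign() above
		// additionally falls back to the root class if the assignment fails.
		return requested
	}
}
// ---------------------------------------------------------------------------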
func (ctl *rdtctl) stopMonitor(c cache.Container) error { name := c.PrettyName() for _, cls := range rdt.GetClasses() { if mg, ok := cls.GetMonGroup(name); ok { if err := cls.DeleteMonGroup(name); err != nil { return err } log.Info("%q: removed monitoring group %q", c.PrettyName(), cls.Name()+"/"+mg.Name()) } } return nil } // stopMonitorAll removes all monitoring groups func (ctl *rdtctl) stopMonitorAll() error { for _, cls := range rdt.GetClasses() { if err := cls.DeleteMonGroups(); err != nil { return err } } return nil } func (ctl *rdtctl) assignAll(forceClass string) { // Assign all containers for _, c := range ctl.cache.GetContainers() { var err error if forceClass != "" { err = ctl.assignClass(c, forceClass) } else { err = ctl.assign(c) } if err != nil { log.Warn("failed to assign rdt class of %q: %v", c.PrettyName(), err) } } } func (ctl *rdtctl) monitoringDisabled() bool { return ctl.mode == OperatingModeDisabled || ctl.opt.Options.MonitoringDisabled } func (ctl *rdtctl) configure() error { // Apply RDT configuration, depending on the operating mode switch ctl.opt.Options.Mode { case OperatingModeDisabled: if ctl.mode != ctl.opt.Options.Mode { ctl.stopMonitorAll() // Drop all cri-resmgr specific resctrl groups by applying an empty config if err := rdt.SetConfig(&rdt.Config{}, true); err != nil { return rdtError("failed to apply empty rdt config: %v", err) } ctl.noQoSClasses = true ctl.mode = ctl.opt.Options.Mode ctl.assignAll(rdt.RootClassName) } case OperatingModeDiscovery: if ctl.mode != ctl.opt.Options.Mode { ctl.stopMonitorAll() // Drop all cri-resmgr specific resctrl groups by applying an empty config if err := rdt.SetConfig(&rdt.Config{}, true); err != nil { return rdtError("failed to apply empty rdt config: %v", err) } } // Discover existing resctrl groups, using an empty prefix if err := rdt.DiscoverClasses(""); err != nil { return rdtError("failed to discover classes from fs: %v", err) } // Disable mapping from Pod QoS to RDT class if no Pod QoS class equivalents exist ctl.noQoSClasses = true cs := []corev1.PodQOSClass{corev1.PodQOSBestEffort, corev1.PodQOSBurstable, corev1.PodQOSGuaranteed} for _, c := range cs { if _, ok := rdt.GetClass(string(c)); ok { ctl.noQoSClasses = false break } } ctl.mode = ctl.opt.Options.Mode ctl.assignAll("") case OperatingModeFull: if ctl.mode != ctl.opt.Options.Mode { ctl.stopMonitorAll() } // Copy goresctrl specific part from our extended options ctl.opt.Config.Options = ctl.opt.Options.Options if err := rdt.SetConfig(&ctl.opt.Config, true); err != nil { return err } // Disable mapping from Pod QoS to RDT class if no classes have been defined ctl.noQoSClasses = len(rdt.GetClasses()) <= 1 ctl.mode = ctl.opt.Options.Mode ctl.assignAll("") default: return rdtError("invalid mode %q", ctl.opt.Options.Mode) } log.Debug("rdt controller operating mode set to %q", ctl.mode) if ctl.opt.Options.Mode != OperatingModeDisabled { log.Debug("rdt monitoring %s", map[bool]string{true: "disabled", false: "enabled"}[ctl.monitoringDisabled()]) } return nil } // configNotify is our runtime configuration notification callback. func (ctl *rdtctl) configNotify(_ pkgcfg.Event, _ pkgcfg.Source) error { log.Info("configuration update, applying new config") return ctl.configure() } func (ctl *rdtctl) defaultOptions() interface{} { c := &config{} c.Options.Mode = OperatingModeFull return c } // GetClasses returns all available RDT classes func GetClasses() []rdt.CtrlGroup { return rdt.GetClasses() } // rdtError creates an RDT-controller-specific formatted error message.
func rdtError(format string, args ...interface{}) error { return fmt.Errorf("rdt: "+format, args...) } // Register us as a controller. func init() { control.Register(RDTController, "RDT controller", getRDTController()) pkgcfg.Register(ConfigModuleName, "RDT control", getRDTController().opt, getRDTController().defaultOptions) } ================================================ FILE: pkg/cri/resource-manager/controllers.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( // List of controllers to pull in. _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/blockio" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/cpu" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/cri" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/memory" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/page-migrate" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/rdt" ) ================================================ FILE: pkg/cri/resource-manager/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( "fmt" ) // resmgrError creates a resource manager-specific formatted error. func resmgrError(format string, args ...interface{}) error { return fmt.Errorf("resource-manager: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/events/events.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package events // Metrics is a set of metrics-related events we might need to act upon. type Metrics struct { // Avx describes changes in container AVX512 instruction usage. 
Avx *Avx } // Avx contains data related to container AVX512 instruction usage. type Avx struct { // Updates contains containers with a change in their AVX512 instruction usage. Updates map[string]bool } // Policy is a policy-specific event to be handled by the active policy. type Policy struct { // Type is the policy-specific type of this event. Type string // Source describes where this event originated from. Source string // Data is any optional arbitrary data associated with this event. Data interface{} } const ( // ContainerStarted is delivered to policies when a StartContainer request succeeds. ContainerStarted = "container-started" ) ================================================ FILE: pkg/cri/resource-manager/events.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( "time" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/metrics" logger "github.com/intel/cri-resource-manager/pkg/log" ) // Our logger instance for events. var evtlog = logger.NewLogger("events") // setupEventProcessing sets up event and metrics processing. func (m *resmgr) setupEventProcessing() error { var err error m.events = make(chan interface{}, 8) m.stop = make(chan interface{}) options := metrics.Options{ PollInterval: opt.MetricsTimer, Events: m.events, } if m.metrics, err = metrics.NewMetrics(options); err != nil { return resmgrError("failed to create metrics (pre)processor: %v", err) } return nil } // startEventProcessing starts event and metrics processing. func (m *resmgr) startEventProcessing() error { if err := m.metrics.Start(); err != nil { return resmgrError("failed to start metrics (pre)processor: %v", err) } stop := m.stop go func() { var rebalanceTimer *time.Ticker var rebalanceChan <-chan time.Time if opt.RebalanceTimer > 0 { rebalanceTimer = time.NewTicker(opt.RebalanceTimer) rebalanceChan = rebalanceTimer.C } else { m.Info("periodic rebalancing is disabled") } for { select { case <-stop: if rebalanceTimer != nil { rebalanceTimer.Stop() } return case event := <-m.events: m.processEvent(event) case <-rebalanceChan: if err := m.RebalanceContainers(); err != nil { evtlog.Error("rebalancing failed: %v", err) } } logger.Flush() } }() return nil } // stopEventProcessing stops event and metrics processing. func (m *resmgr) stopEventProcessing() { if m.stop != nil { close(m.stop) m.metrics.Stop() m.stop = nil } } // SendEvent injects the given event to the resource manager's event processing loop. func (m *resmgr) SendEvent(event interface{}) error { if m.events == nil { return resmgrError("can't send event, no event channel") } select { case m.events <- event: return nil default: return resmgrError("can't send event of type %T, event channel full", event) } } // processEvent processes the given event.
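// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): SendEvent
// above never blocks the caller; a select with a default branch turns a
// full channel into an error instead of a stall. The same idiom in
// isolation:
package example

import "fmt"

// trySend attempts a non-blocking send on a buffered event channel.
func trySend(events chan<- interface{}, e interface{}) error {
	select {
	case events <- e:
		return nil
	default:
		return fmt.Errorf("event channel full, dropping event of type %T", e)
	}
}
// ---------------------------------------------------------------------------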
func (m *resmgr) processEvent(e interface{}) { evtlog.Debug("received event of type %T...", e) switch event := e.(type) { case string: evtlog.Debug("'%s'...", event) case *events.Metrics: m.processAvx(event.Avx) case *events.Policy: m.DeliverPolicyEvent(event) default: evtlog.Warn("event of unexpected type %T...", e) } } // processAvx processes AVX512 events. func (m *resmgr) processAvx(e *events.Avx) bool { if e == nil { return false } m.Lock() defer m.Unlock() changes := false for cgroup, active := range e.Updates { c, ok := m.resolveCgroupPath(cgroup) if !ok { continue } // XXX This is just for testing, we should effectively drive state transitions // through a low-pass filter. if active { if _, wasTagged := c.SetTag(cache.TagAVX512, "true"); !wasTagged { evtlog.Info("container %s STARTED using AVX512 instructions", c.PrettyName()) } } else { if _, wasTagged := c.DeleteTag(cache.TagAVX512); wasTagged { evtlog.Info("container %s STOPPED using AVX512 instructions", c.PrettyName()) } } } return changes } // resolveCgroupPath resolves a cgroup path to a container. func (m *resmgr) resolveCgroupPath(path string) (cache.Container, bool) { return m.cache.LookupContainerByCgroup(path) } ================================================ FILE: pkg/cri/resource-manager/flags.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( "flag" "time" "github.com/intel/cri-resource-manager/pkg/cri/relay" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" "github.com/intel/cri-resource-manager/pkg/pidfile" ) // Options captures our command line parameters. type options struct { HostRoot string ImageSocket string RuntimeSocket string RelaySocket string RelayDir string AllowUntestedRuntimes bool AgentSocket string ConfigSocket string PidFile string ResctrlPath string FallbackConfig string ForceConfig string ForceConfigSignal string DisablePolicySwitch bool ResetPolicy bool ResetConfig bool MetricsTimer time.Duration RebalanceTimer time.Duration DisableUI bool } // Relay command line options. var opt = options{} const ( allowUntestedRuntimesFlag = "allow-untested-runtimes" ) // Register us for command line option processing. func init() { flag.StringVar(&opt.HostRoot, "host-root", "", "Directory prefix under which the host's sysfs, etc. 
are mounted.") flag.StringVar(&opt.RuntimeSocket, "runtime-socket", sockets.Containerd, "Unix domain socket path where CRI runtime service requests should be relayed to.") flag.StringVar(&opt.ImageSocket, "image-socket", relay.DefaultImageSocket, "CRI image service socket, defaults to the value used for --runtime-socket.") flag.StringVar(&opt.RelaySocket, "relay-socket", sockets.ResourceManagerRelay, "Unix domain socket path where the resource manager should serve requests on.") flag.StringVar(&opt.RelayDir, "relay-dir", "/var/lib/cri-resmgr", "Permanent storage directory path for the resource manager to store its state in.") flag.BoolVar(&opt.AllowUntestedRuntimes, allowUntestedRuntimesFlag, false, "Allow proxying for untested CRI runtimes. Usually this is not a good idea.") flag.StringVar(&opt.AgentSocket, "agent-socket", sockets.ResourceManagerAgent, "Local socket of the cri-resmgr agent to connect to.") flag.StringVar(&opt.ConfigSocket, "config-socket", sockets.ResourceManagerConfig, "Unix domain socket path where the resource manager listens for the cri-resmgr agent.") flag.StringVar(&opt.PidFile, "pid-file", pidfile.GetPath(), "PID file to write the daemon PID to.") flag.StringVar(&opt.FallbackConfig, "fallback-config", "", "Fallback configuration to use unless/until one is available from the cache or agent.") flag.StringVar(&opt.ForceConfig, "force-config", "", "Configuration used to override the one stored in the cache. Disables the agent.") flag.StringVar(&opt.ForceConfigSignal, "force-config-signal", "SIGHUP", "Signal used to reload forced configuration.") flag.BoolVar(&opt.ResetConfig, "reset-config", false, "Remove configuration (from the agent) stored in the cache, then exit.") flag.BoolVar(&opt.ResetPolicy, "reset-policy", false, "Reset policy data stored in the cache, then exit.") flag.BoolVar(&opt.DisablePolicySwitch, "disable-policy-switch", false, "Disable switching policies during startup.") flag.DurationVar(&opt.MetricsTimer, "metrics-interval", 0, "Interval for polling/gathering runtime metrics data. Use 0 to disable.") flag.DurationVar(&opt.RebalanceTimer, "rebalance-interval", 0, "Minimum interval between two container rebalancing attempts. Use 0 to disable.") flag.BoolVar(&opt.DisableUI, "disable-ui", false, "Disable serving container placement visualization UIs.") } ================================================ FILE: pkg/cri/resource-manager/introspect/introspect.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package introspect import ( "encoding/json" "fmt" "net/http" "sync" xhttp "github.com/intel/cri-resource-manager/pkg/instrumentation/http" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/topology" ) // Pod describes a single pod and its containers.
type Pod struct { ID string // pod CRI ID UID string // pod kubernetes ID Name string // pod name Containers map[string]*Container // containers of this pod } // Container describes a single container. type Container struct { ID string // container CRI ID Name string // container name Command []string // command Args []string // and its arguments CPURequest int64 // CPU requested in milli-CPU (guaranteed amount) CPULimit int64 // CPU limit in milli-CPU (maximum allowed CPU) MemoryRequest int64 // memory requested in bytes MemoryLimit int64 // memory limit in bytes (maximum allowed memory) Hints TopologyHints // topology/allocation hints } // TopologyHints contain a set of allocation hints for a container. type TopologyHints topology.Hints // Assignment describes resource assignments for a single container. type Assignment struct { ContainerID string // ID of container for this assignment SharedCPUs string // shared CPUs CPUShare int // CPU share/weight for SharedCPUs ExclusiveCPUs string // exclusive CPUs Memory string // memory controllers Pool string // pool container is assigned to } // Pool describes a single (resource) pool. type Pool struct { Name string // pool name CPUs string // CPUs in this pool Memory string // memory controllers (NUMA nodes) for this pool Parent string // parent pool Children []string // child pools } // Socket describes a single physical CPU socket in the system. type Socket struct { ID int // socket ID CPUs string // CPUs in this socket } // Node describes a single NUMA node in the system. type Node struct { ID int // node ID CPUs string // CPUs with locality for this NUMA node. } // System describes the underlying HW/system. type System struct { Sockets map[int]*Socket // physical sockets in the system Nodes map[int]*Node // NUMA nodes in the system Isolated string // kernel-isolated CPUs Offlined string // CPUs offline RDTClasses []string // list of RDT classes BlockIOClasses []string // list of block I/O classes Policy string // active policy } // State is the current introspected state of the resource manager. type State struct { Pools map[string]*Pool // pools Pods map[string]*Pod // pods and containers Assignments map[string]*Assignment // resource assignments System *System // info about hardware/system Error string } // our logger instance var log = logger.NewLogger("introspect") // Server is our server for external introspection. type Server struct { sync.RWMutex // need to protect against concurrent introspection/update mux *xhttp.ServeMux // our HTTP request multiplexer state *State // introspection data data string // state as a JSON string ready bool } // Setup prepares the given HTTP request multiplexer for serving introspection. func Setup(mux *xhttp.ServeMux, state *State) (*Server, error) { s := &Server{mux: mux} if err := s.set(state); err != nil { return nil, err } mux.HandleFunc("/introspect", s.serve) return s, nil } // Set sets the current state for introspection. func (s *Server) Set(state *State) error { s.Lock() defer s.Unlock() return s.set(state) } // Start enables serving HTTP requests. func (s *Server) Start() { log.Info("starting introspection server...") s.ready = true } // Stop stops serving further HTTP requests. func (s *Server) Stop() { log.Info("stopping introspection server...") s.ready = false } // set sets the given state and encodes it as a JSON string.
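// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): the server
// above serves the cached JSON encoding of State at the /introspect path.
// A client can decode just the fields it cares about; the field names come
// from the State and Pool types above, while the address is an assumption
// that depends on how the instrumentation HTTP server is configured.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

type state struct {
	Pools map[string]*struct {
		Name string
		CPUs string
	}
	Error string
}

func main() {
	// The host/port here is a placeholder for the instrumentation endpoint.
	resp, err := http.Get("http://localhost:8891/introspect")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	var s state
	if err := json.NewDecoder(resp.Body).Decode(&s); err != nil {
		panic(err)
	}
	for name, pool := range s.Pools {
		fmt.Printf("pool %s: cpus %s\n", name, pool.CPUs)
	}
}
// ---------------------------------------------------------------------------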
func (s *Server) set(state *State) error { log.Debug("updating introspection data...") s.state = state data, err := json.Marshal(s.state) if err != nil { err = introspectError("failed to marshal state for introspection: %v", err) s.state = &State{Error: fmt.Sprintf("%v", err)} data, _ = json.Marshal(s.state) } s.data = string(data) return err } // serve serves a single HTTP request. func (s *Server) serve(w http.ResponseWriter, _ *http.Request) { if !s.ready { return } log.Debug("serving introspection data...") s.RLock() fmt.Fprintf(w, "%s\r\n", s.data) s.RUnlock() } // introspectError creates an introspection-specific error. func introspectError(format string, args ...interface{}) error { return fmt.Errorf("introspection: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/kubernetes/kubernetes.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package kubernetes const ( // ResmgrKeyNamespace is a CRI Resource Manager namespace ResmgrKeyNamespace = "cri-resource-manager.intel.com" // NamespaceSystem is the kubernetes system namespace. NamespaceSystem = "kube-system" // PodNameLabel is the key for the kubernetes pod name label. PodNameLabel = "io.kubernetes.pod.name" // PodUIDLabel is the key for the kubernetes pod UID label. PodUIDLabel = "io.kubernetes.pod.uid" // ContainerNameLabel is the key for the kubernetes container name label. ContainerNameLabel = "io.kubernetes.container.name" ) // ResmgrKey returns a full namespaced name of a resource manager specific key func ResmgrKey(name string) string { return ResmgrKeyNamespace + "/" + name } ================================================ FILE: pkg/cri/resource-manager/kubernetes/resources.go ================================================ // Copyright The NRI Plugins Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kubernetes const ( // Constants for converting back and forth between CPU requirements in // terms of milli-CPUs and kernel cgroup/scheduling parameters. // MinShares is the minimum cpu.shares accepted by cgroups. MinShares = 2 // MaxShares is the maximum cpu.shares accepted by cgroups. MaxShares = 262144 // SharesPerCPU is cpu.shares worth one full CPU. SharesPerCPU = 1024 // MilliCPUToCPU is milli-CPUs worth a full CPU.
MilliCPUToCPU = 1000 // QuotaPeriod is 100000 microseconds, or 100ms QuotaPeriod = 100000 // MinQuotaPeriod is 1000 microseconds, or 1ms MinQuotaPeriod = 1000 ) // MilliCPUToQuota converts milliCPU to CFS quota and period values. // For example, 2500 mCPU maps to quota=250000 with period=100000. // (Almost) identical to the same function in kubelet. func MilliCPUToQuota(milliCPU int64) (quota, period int64) { if milliCPU == 0 { return 0, 0 } // TODO(klihub): this is behind the CustomCPUCFSQuotaPeriod feature gate in kubelet period = int64(QuotaPeriod) quota = (milliCPU * period) / MilliCPUToCPU if quota < MinQuotaPeriod { quota = MinQuotaPeriod } return quota, period } // MilliCPUToShares converts the milliCPU to CFS shares. // For example, 500 mCPU maps to 512 shares. // Identical to the same function in kubelet. func MilliCPUToShares(milliCPU int64) uint64 { if milliCPU == 0 { return MinShares } shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU if shares < MinShares { return MinShares } if shares > MaxShares { return MaxShares } return uint64(shares) } // SharesToMilliCPU converts CFS CPU shares to milli-CPUs. func SharesToMilliCPU(shares int64) int64 { if shares == MinShares { return 0 } return int64(float64(shares*MilliCPUToCPU)/float64(SharesPerCPU) + 0.5) } // QuotaToMilliCPU converts CFS quota and period to milli-CPUs. func QuotaToMilliCPU(quota, period int64) int64 { if quota == 0 || period == 0 { return 0 } return int64(float64(quota*MilliCPUToCPU)/float64(period) + 0.5) } ================================================ FILE: pkg/cri/resource-manager/metrics/avx.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metrics import ( model "github.com/prometheus/client_model/go" "path/filepath" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" ) func (m *Metrics) collectAvxEvents(raw map[string]*model.MetricFamily) *events.Avx { all, ok := raw["all_switch_count_per_cgroup"] if !ok { return nil } dump("all context switches", all) avx, ok := raw["avx_switch_count_per_cgroup"] if !ok { return nil } dump("AVX context switches", avx) ratio := map[string]float64{} for _, v := range avx.Metric { cgroup, err := filepath.Rel(cgroups.GetV2Dir(), v.Label[0].GetValue()) if err != nil { continue } ratio[cgroup] = v.Gauge.GetValue() } for _, v := range all.Metric { cgroup, err := filepath.Rel(cgroups.GetV2Dir(), v.Label[0].GetValue()) if err != nil { continue } ratio[cgroup] /= v.Gauge.GetValue() } usage := map[string]bool{} for cgroup, use := range ratio { active := use >= m.opts.AvxThreshold log.Debug(" %s AVX ratio = %f, active?: %v", cgroup, use, active) usage["/"+cgroup] = active } return &events.Avx{Updates: usage} } ================================================ FILE: pkg/cri/resource-manager/metrics/metrics.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metrics import ( "bytes" "fmt" "strings" "sync" "time" "github.com/prometheus/client_golang/prometheus" model "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/instrumentation" "github.com/intel/cri-resource-manager/pkg/metrics" // pull in all metrics collectors _ "github.com/intel/cri-resource-manager/pkg/metrics/register" ) const ( // DefaultAvxThreshold is the cutoff below which a cgroup/container is not an AVX user. DefaultAvxThreshold = float64(0.1) ) // Options describes options for metrics collection and processing. type Options struct { // PollInterval is the interval for polling raw metrics. PollInterval time.Duration // Events is the channel for delivering metrics events. Events chan interface{} // AvxThreshold is the threshold (0 - 1) for a cgroup to be considered AVX512-active AvxThreshold float64 } // Metrics implements collecting, caching and processing of raw metrics. type Metrics struct { sync.RWMutex opts Options // metrics collecting options g prometheus.Gatherer // prometheus/raw metrics gatherer stop chan interface{} // channel to stop polling goroutine raw []*model.MetricFamily // latest set of raw metrics pend []*model.MetricFamily // pending metrics for forwarding } // Our logger instance. var log = logger.NewLogger("metrics") // NewMetrics creates a new instance for metrics collecting and processing. func NewMetrics(opts Options) (*Metrics, error) { if opts.Events == nil { return nil, metricsError("invalid options, nil Event channel") } if opts.AvxThreshold == 0.0 { opts.AvxThreshold = DefaultAvxThreshold } g, err := metrics.NewMetricGatherer() if err != nil { return nil, metricsError("failed to create raw metrics gatherer: %v", err) } m := &Metrics{ opts: opts, raw: make([]*model.MetricFamily, 0), g: g, } m.poll() instrumentation.RegisterGatherer(m) return m, nil } // Start starts metrics collection and processing. func (m *Metrics) Start() error { if m.stop != nil { return nil } stop := make(chan interface{}) go func() { var pollTimer *time.Ticker var pollChan <-chan time.Time if m.opts.PollInterval > 0 { pollTimer = time.NewTicker(m.opts.PollInterval) pollChan = pollTimer.C } else { log.Info("periodic collection of metrics is disabled") } for { select { case _ = <-stop: if pollTimer != nil { pollTimer.Stop() } return case _ = <-pollChan: if err := m.poll(); err != nil { log.Error("failed to poll raw metrics: %v", err) continue } if err := m.process(); err != nil { log.Error("failed to deliver metrics event: %v", err) } } } }() m.stop = stop return nil } // Stop stops metrics collection and processing. func (m *Metrics) Stop() { if m.stop != nil { close(m.stop) m.stop = nil } } // poll does a single round of raw metrics collection. 
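// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): Start above
// uses the common ticker-plus-stop-channel shape for a poller that can be
// disabled. The skeleton in isolation; note that a nil ticker channel
// blocks forever, which is exactly what disables periodic polling:
package example

import "time"

// poll runs fn every interval until stop is closed; interval <= 0 disables
// periodic runs entirely.
func poll(interval time.Duration, fn func(), stop <-chan struct{}) {
	go func() {
		var tick <-chan time.Time
		if interval > 0 {
			t := time.NewTicker(interval)
			defer t.Stop()
			tick = t.C
		}
		for {
			select {
			case <-stop:
				return
			case <-tick:
				fn()
			}
		}
	}()
}
// ---------------------------------------------------------------------------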
func (m *Metrics) poll() error { m.Lock() defer m.Unlock() f, err := m.g.Gather() if err != nil { return metricsError("failed to poll raw metrics: %v", err) } m.raw = f m.pend = f return nil } // process processes the collected raw metrics. func (m *Metrics) process() error { raw := map[string]*model.MetricFamily{} for _, f := range m.raw { dump(" ", f) raw[*f.Name] = f } event := &events.Metrics{ Avx: m.collectAvxEvents(raw), } return m.sendEvent(event) } // sendEvent sends a metrics-based event for processing. func (m *Metrics) sendEvent(e *events.Metrics) error { select { case m.opts.Events <- e: return nil default: return metricsError("failed to deliver event %v (channel full?)", *e) } } // dump debug-dumps the given MetricFamily data func dump(prefix string, f *model.MetricFamily) { if !log.DebugEnabled() { return } buf := &bytes.Buffer{} if _, err := expfmt.MetricFamilyToText(buf, f); err != nil { return } log.DebugBlock(" <"+prefix+"> ", "%s", strings.TrimSpace(buf.String())) } // metricsError returns a new formatted error specific to metrics-processing. func metricsError(format string, args ...interface{}) error { return fmt.Errorf("metrics: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/metrics/prometheus.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metrics import ( model "github.com/prometheus/client_model/go" ) // Gather is our prometheus.Gatherer interface for proxying metrics. func (m *Metrics) Gather() ([]*model.MetricFamily, error) { m.Lock() pend := m.pend m.Unlock() if pend == nil { log.Debug("no data to proxy to prometheus...") } else { log.Debug("proxying data to prometheus...") } return pend, nil } ================================================ FILE: pkg/cri/resource-manager/no-test-api.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !test // +build !test package resmgr // ResourceManagerTestAPI is dummy if we're compiling without test build flag. type ResourceManagerTestAPI interface { } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/balloons-policy.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package balloons import ( "fmt" "path/filepath" corev1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" cpucontrol "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/cpu" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) const ( // PolicyName is the name used to activate this policy. PolicyName = "balloons" // PolicyDescription is a short description of this policy. PolicyDescription = "Flexible pools with per-pool CPU parameters" // PolicyPath is the path of this policy in the configuration hierarchy. PolicyPath = "policy." + PolicyName // balloonKey is a pod annotation key, the value is a pod balloon name. balloonKey = "balloon." + PolicyName + "." + kubernetes.ResmgrKeyNamespace // reservedBalloonDefName is the name in the reserved balloon definition. reservedBalloonDefName = "reserved" // defaultBalloonDefName is the name in the default balloon definition. defaultBalloonDefName = "default" // NoLimit value denotes no limit being set. NoLimit = 0 ) // balloons contains configuration and runtime attributes of the balloons policy type balloons struct { options *policyapi.BackendOptions // configuration common to all policies bpoptions BalloonsOptions // balloons-specific configuration cch cache.Cache // cri-resmgr cache allowed cpuset.CPUSet // bounding set of CPUs we're allowed to use reserved cpuset.CPUSet // system-/kube-reserved CPUs freeCpus cpuset.CPUSet // CPUs to be included in growing or new ballons cpuTree *cpuTreeNode // system CPU topology cpuTreeAllocator *cpuTreeAllocator // CPU allocator from system CPU topology reservedBalloonDef *BalloonDef // built-in definition of the reserved balloon defaultBalloonDef *BalloonDef // built-in definition of the default balloon balloons []*Balloon // balloon instances: reserved, default and user-defined cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy } // Balloon contains attributes of a balloon instance type Balloon struct { // Def is the definition from which this balloon instance is created. Def *BalloonDef // Instance is the index of this balloon instance, starting from // zero for every balloon definition. 
Instance int // Cpus is the set of CPUs exclusive to this balloon instance only. Cpus cpuset.CPUSet // Mems is the set of memory nodes with minimal access delay // from CPUs. Mems idset.IDSet // SharedIdleCpus is the set of idle CPUs that workloads in a // balloon are allowed to use with workloads in other balloons // that shareIdleCpus. SharedIdleCpus cpuset.CPUSet // PodIDs maps pod ID to list of container IDs. // - len(PodIDs) is the number of pods in the balloon. // - len(PodIDs[podID]) is the number of containers of podID // currently assigned to the balloon. PodIDs map[string][]string cpuTreeAllocator *cpuTreeAllocator } var log logger.Logger = logger.NewLogger("policy") // String is a stringer for a balloon. func (bln Balloon) String() string { return fmt.Sprintf("%s{Cpus:%s, Mems:%s}", bln.PrettyName(), bln.Cpus, bln.Mems) } // PrettyName returns a unique name for a balloon. func (bln Balloon) PrettyName() string { return fmt.Sprintf("%s[%d]", bln.Def.Name, bln.Instance) } // ContainerIDs returns IDs of containers assigned in a balloon. // (Using cache.Container.GetCacheID()'s) func (bln Balloon) ContainerIDs() []string { cIDs := []string{} for _, ctrIDs := range bln.PodIDs { cIDs = append(cIDs, ctrIDs...) } return cIDs } // ContainerCount returns the number of containers in a balloon. func (bln Balloon) ContainerCount() int { count := 0 for _, ctrIDs := range bln.PodIDs { count += len(ctrIDs) } return count } func (bln Balloon) AvailMilliCpus() int { return bln.Cpus.Size() * 1000 } func (bln Balloon) MaxAvailMilliCpus(freeCpus cpuset.CPUSet) int { if bln.Def.MaxCpus == NoLimit { return (bln.Cpus.Size() + freeCpus.Size()) * 1000 } return bln.Def.MaxCpus * 1000 } // CreateBalloonsPolicy creates a new policy instance. func CreateBalloonsPolicy(policyOptions *policy.BackendOptions) policy.Backend { var err error p := &balloons{ options: policyOptions, cch: policyOptions.Cache, cpuAllocator: cpuallocator.NewCPUAllocator(policyOptions.System), } log.Info("creating %s policy...", PolicyName) if p.cpuTree, err = NewCpuTreeFromSystem(); err != nil { log.Errorf("creating CPU topology tree failed: %s", err) } log.Debug("CPU topology: %s", p.cpuTree) // Handle common policy options: AvailableResources and ReservedResources. // p.allowed: CPUs available for the policy if allowed, ok := policyOptions.Available[policyapi.DomainCPU]; ok { p.allowed = allowed.(cpuset.CPUSet) } else { // Available CPUs not specified, default to all on-line CPUs. p.allowed = policyOptions.System.CPUSet().Difference(policyOptions.System.Offlined()) } // p.reserved: CPUs reserved for kube-system pods, subset of p.allowed. 
p.reserved = cpuset.New() if reserved, ok := p.options.Reserved[policyapi.DomainCPU]; ok { switch v := reserved.(type) { case cpuset.CPUSet: p.reserved = p.allowed.Intersection(v) case resapi.Quantity: reserveCnt := (int(v.MilliValue()) + 999) / 1000 cpus, err := p.cpuAllocator.AllocateCpus(&p.allowed, reserveCnt, cpuallocator.PriorityNone) if err != nil { log.Fatal("failed to allocate reserved CPUs: %s", err) } p.reserved = cpus p.allowed = p.allowed.Union(cpus) } } if p.reserved.IsEmpty() { log.Fatal("%s cannot run without reserved CPUs that are also AvailableResources", PolicyName) } // Handle policy-specific options log.Debug("creating %s configuration", PolicyName) if err := p.setConfig(balloonsOptions); err != nil { log.Fatal("failed to create %s policy: %v", PolicyName, err) } log.Debug("first effective configuration:\n%s\n", utils.DumpJSON(p.bpoptions)) pkgcfg.GetModule(PolicyPath).AddNotify(p.configNotify) return p } // Name returns the name of this policy. func (p *balloons) Name() string { return PolicyName } // Description returns the description for this policy. func (p *balloons) Description() string { return PolicyDescription } // Start prepares this policy for accepting allocation/release requests. func (p *balloons) Start(add []cache.Container, del []cache.Container) error { log.Info("%s policy started", PolicyName) // reassign all containers return p.Sync(p.cch.GetContainers(), del) } // Sync synchronizes the active policy state. func (p *balloons) Sync(add []cache.Container, del []cache.Container) error { log.Debug("synchronizing state...") for _, c := range del { p.ReleaseResources(c) } for _, c := range add { p.AllocateResources(c) } return nil } // AllocateResources is a resource allocation request for this policy. func (p *balloons) AllocateResources(c cache.Container) error { log.Debug("allocating resources for container %s (request %d mCPU, limit %d mCPU)...", c.PrettyName(), p.containerRequestedMilliCpus(c.GetCacheID()), p.containerLimitedMilliCpus(c.GetCacheID())) bln, err := p.allocateBalloon(c) if err != nil { return balloonsError("balloon allocation for container %s failed: %w", c.PrettyName(), err) } if bln == nil { return balloonsError("no suitable balloons found for container %s", c.PrettyName()) } // Resize selected balloon to fit the new container, unless it // uses the ReservedResources CPUs, which is a fixed set. reqMilliCpus := p.containerRequestedMilliCpus(c.GetCacheID()) + p.requestedMilliCpus(bln) // Even if all containers in a balloon request 0 mCPU in total // (all are BestEffort, for example), force the size of the balloon // to be enough for at least a 1 mCPU request. Otherwise the balloon's // cpuset becomes empty, which in turn would mean no CPU pinning, and // the balloon's containers would run on any CPUs. if bln.AvailMilliCpus() < max(1, reqMilliCpus) { p.resizeBalloon(bln, max(1, reqMilliCpus)) } p.assignContainer(c, bln) if log.DebugEnabled() { log.Debug(p.dumpBalloon(bln)) } return nil } // ReleaseResources is a resource release request for this policy. func (p *balloons) ReleaseResources(c cache.Container) error { log.Debug("releasing container %s...", c.PrettyName()) if bln := p.balloonByContainer(c); bln != nil { p.dismissContainer(c, bln) if log.DebugEnabled() { log.Debug(p.dumpBalloon(bln)) } if bln.ContainerCount() == 0 { // Deflate the balloon completely before // freeing it.
p.resizeBalloon(bln, 0) log.Debug("all containers removed, free balloon allocation %s", bln.PrettyName()) p.freeBalloon(bln) } else { // Make sure that the balloon will have at // least 1 CPU to run remaining containers. p.resizeBalloon(bln, max(1, p.requestedMilliCpus(bln))) } } else { log.Debug("ReleaseResources: balloon-less container %s, nothing to release", c.PrettyName()) } return nil } // UpdateResources is a resource allocation update request for this policy. func (p *balloons) UpdateResources(c cache.Container) error { log.Debug("(not) updating container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (p *balloons) Rebalance() (bool, error) { log.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (p *balloons) HandleEvent(*events.Policy) (bool, error) { log.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. func (p *balloons) ExportResourceData(c cache.Container) map[string]string { return nil } // Introspect provides data for external introspection. func (p *balloons) Introspect(*introspect.State) { return } // balloonByContainer returns a balloon that contains a container. func (p *balloons) balloonByContainer(c cache.Container) *Balloon { podID := c.GetPodID() cID := c.GetCacheID() for _, bln := range p.balloons { for _, ctrID := range bln.PodIDs[podID] { if ctrID == cID { return bln } } } return nil } // balloonsByNamespace returns balloons that contain containers in a // namespace. func (p *balloons) balloonsByNamespace(namespace string) []*Balloon { blns := []*Balloon{} for _, bln := range p.balloons { for podID, ctrIDs := range bln.PodIDs { if len(ctrIDs) == 0 { continue } pod, ok := p.cch.LookupPod(podID) if !ok { continue } if pod.GetNamespace() == namespace { blns = append(blns, bln) break } } } return blns } // balloonsByPod returns balloons that contain any container of a pod. func (p *balloons) balloonsByPod(pod cache.Pod) []*Balloon { podID := pod.GetID() blns := []*Balloon{} for _, bln := range p.balloons { if _, ok := bln.PodIDs[podID]; ok { blns = append(blns, bln) } } return blns } // balloonsByDef returns list of balloons instantiated from a balloon // definition. func (p *balloons) balloonsByDef(blnDef *BalloonDef) []*Balloon { balloons := []*Balloon{} for _, balloon := range p.balloons { if balloon.Def == blnDef { balloons = append(balloons, balloon) } } return balloons } // balloonDefByName returns a balloon definition with a name. func (p *balloons) balloonDefByName(defName string) *BalloonDef { if defName == "reserved" { return p.reservedBalloonDef } if defName == "default" { return p.defaultBalloonDef } for _, blnDef := range p.bpoptions.BalloonDefs { if blnDef.Name == defName { return blnDef } } return nil } func (p *balloons) chooseBalloonDef(c cache.Container) (*BalloonDef, error) { var blnDef *BalloonDef // BalloonDef is defined by annotation? if blnDefName, ok := c.GetEffectiveAnnotation(balloonKey); ok { blnDef = p.balloonDefByName(blnDefName) if blnDef == nil { return nil, balloonsError("no balloon for annotation %q", blnDefName) } return blnDef, nil } // BalloonDef is defined by a special namespace (kube-system + // ReservedPoolNamespaces)? if namespaceMatches(c.GetNamespace(), append(p.bpoptions.ReservedPoolNamespaces, metav1.NamespaceSystem)) { return p.balloons[0].Def, nil } // BalloonDef is defined by the namespace. 
	for _, blnDef := range append([]*BalloonDef{p.reservedBalloonDef, p.defaultBalloonDef}, p.bpoptions.BalloonDefs...) {
		if namespaceMatches(c.GetNamespace(), blnDef.Namespaces) {
			return blnDef, nil
		}
	}
	// Fallback to the default balloon.
	return p.defaultBalloonDef, nil
}

func (p *balloons) containerRequestedMilliCpus(contID string) int {
	cont, ok := p.cch.LookupContainer(contID)
	if !ok {
		return 0
	}
	reqCpu, ok := cont.GetResourceRequirements().Requests[corev1.ResourceCPU]
	if !ok {
		return 0
	}
	return int(reqCpu.MilliValue())
}

func (p *balloons) containerLimitedMilliCpus(contID string) int {
	cont, ok := p.cch.LookupContainer(contID)
	if !ok {
		return 0
	}
	limitCpu, ok := cont.GetResourceRequirements().Limits[corev1.ResourceCPU]
	if !ok {
		return 0
	}
	return int(limitCpu.MilliValue())
}

// requestedMilliCpus sums up and returns CPU requests of all
// containers assigned to a balloon.
func (p *balloons) requestedMilliCpus(bln *Balloon) int {
	cpuRequested := 0
	for _, cID := range bln.ContainerIDs() {
		cpuRequested += p.containerRequestedMilliCpus(cID)
	}
	return cpuRequested
}

// freeMilliCpus returns free CPU resources in a balloon without
// inflating the balloon.
func (p *balloons) freeMilliCpus(bln *Balloon) int {
	return bln.AvailMilliCpus() - p.requestedMilliCpus(bln)
}

// maxFreeMilliCpus returns free CPU resources in a balloon when it is
// inflated as large as possible.
func (p *balloons) maxFreeMilliCpus(bln *Balloon) int {
	return bln.MaxAvailMilliCpus(p.freeCpus) - p.requestedMilliCpus(bln)
}

// largest helps find the largest element and value in a slice.
// Input the length of the slice and a function that returns the
// magnitude of a given element in the slice as an int.
func largest(sliceLen int, valueOf func(i int) int) (int, int) {
	largestIndex := -1
	largestValue := 0
	for index := 0; index < sliceLen; index++ {
		value := valueOf(index)
		if largestIndex == -1 || value > largestValue {
			largestIndex = index
			largestValue = value
		}
	}
	return largestIndex, largestValue
}

// resetCpuClass resets CPU configurations globally. Balloons can be
// ignored here; their CPU configurations will be applied later.
func (p *balloons) resetCpuClass() error {
	// Usual inputs:
	// - p.allowed (cpuset.CPUSet): all CPUs available for this
	//   policy.
	// - p.IdleCpuClass (string): CPU class for allowed CPUs.
	//
	// Other inputs, if needed:
	// - p.reserved (cpuset.CPUSet): CPUs of ReservedResources
	//   (typically for kube-system containers).
	//
	// Note: p.useCpuClass(balloon) will be called before assigning
	// containers on the balloon, including the reserved balloon.
	//
	// TODO: don't depend on cpu controller directly
	cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, p.allowed.UnsortedList()...)
	log.Debugf("resetCpuClass available: %s; reserved: %s", p.allowed, p.reserved)
	return nil
}

// useCpuClass configures CPUs of a balloon.
func (p *balloons) useCpuClass(bln *Balloon) error {
	// Usual inputs:
	// - CPUs that cpuallocator has reserved for this balloon:
	//   bln.Cpus (cpuset.CPUSet).
	// - User-defined CPU configuration for CPUs of balloons of this type:
	//   bln.Def.CpuClass (string).
	// - Current configuration(?): feel free to add a data
	//   structure for this. For instance policy-global p.cpuConfs,
	//   or balloon-local bln.cpuConfs.
	//
	// Other input examples, if needed:
	// - Requested CPU resources by all containers in the balloon:
	//   p.requestedMilliCpus(bln).
	// - Free CPU resources in the balloon: p.freeMilliCpus(bln).
	// - Number of assigned containers: bln.ContainerCount().
	// - Container details: access p.cch with bln.ContainerIDs().
	// - User-defined CPU AllocatorPriority: bln.Def.AllocatorPriority.
	// - All existing balloon instances: p.balloons.
	// - CPU configurations by user: bln.Def.CpuClass (for bln in p.balloons)
	cpucontrol.Assign(p.cch, bln.Def.CpuClass, bln.Cpus.UnsortedList()...)
	log.Debugf("useCpuClass Cpus: %s; CpuClass: %s", bln.Cpus, bln.Def.CpuClass)
	return nil
}

// forgetCpuClass is called when CPUs of a balloon are released from duty.
func (p *balloons) forgetCpuClass(bln *Balloon) {
	// Use p.IdleCpuClass for bln.Cpus.
	// Usual inputs: see useCpuClass
	cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, bln.Cpus.UnsortedList()...)
	log.Debugf("forgetCpuClass Cpus: %s; CpuClass: %s", bln.Cpus, bln.Def.CpuClass)
}

func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, error) {
	var cpus cpuset.CPUSet
	var err error
	blnsOfDef := p.balloonsByDef(blnDef)
	// Allowed to create a new balloon instance from blnDef?
	if blnDef.MaxBalloons > NoLimit && blnDef.MaxBalloons <= len(blnsOfDef) {
		return nil, balloonsError("cannot create new %q balloon, MaxBalloons limit (%d) reached",
			blnDef.Name, blnDef.MaxBalloons)
	}
	// Find the first unused balloon instance index.
	freeInstance := 0
	for freeInstance = 0; freeInstance < len(blnsOfDef); freeInstance++ {
		isFree := true
		for _, bln := range blnsOfDef {
			if bln.Instance == freeInstance {
				isFree = false
				break
			}
		}
		if isFree {
			break
		}
	}
	// Configure a new cpuTreeAllocator for this balloon if there
	// are type-specific allocator options, otherwise use the policy
	// default allocator.
	cpuTreeAllocator := p.cpuTreeAllocator
	if blnDef.AllocatorTopologyBalancing != nil || blnDef.PreferSpreadOnPhysicalCores != nil {
		allocatorOptions := cpuTreeAllocatorOptions{
			topologyBalancing:           p.bpoptions.AllocatorTopologyBalancing,
			preferSpreadOnPhysicalCores: p.bpoptions.PreferSpreadOnPhysicalCores,
		}
		if blnDef.AllocatorTopologyBalancing != nil {
			allocatorOptions.topologyBalancing = *blnDef.AllocatorTopologyBalancing
		}
		if blnDef.PreferSpreadOnPhysicalCores != nil {
			allocatorOptions.preferSpreadOnPhysicalCores = *blnDef.PreferSpreadOnPhysicalCores
		}
		cpuTreeAllocator = p.cpuTree.NewAllocator(allocatorOptions)
	}
	// Allocate CPUs
	if blnDef == p.reservedBalloonDef ||
		(blnDef == p.defaultBalloonDef && blnDef.MinCpus == 0 && blnDef.MaxCpus == 0) {
		// The reserved balloon uses ReservedResources CPUs.
		// So does the default balloon unless its CPU counts are tweaked.
		cpus = p.reserved
	} else {
		addFromCpus, _, err := cpuTreeAllocator.ResizeCpus(cpuset.New(), p.freeCpus, blnDef.MinCpus)
		if err != nil {
			return nil, balloonsError("failed to choose a cpuset for allocating first %d CPUs from %#s: %w",
				blnDef.MinCpus, p.freeCpus, err)
		}
		cpus, err = p.cpuAllocator.AllocateCpus(&addFromCpus, blnDef.MinCpus, blnDef.AllocatorPriority)
		if err != nil {
			return nil, balloonsError("could not allocate %d MinCpus for balloon %s[%d]: %w",
				blnDef.MinCpus, blnDef.Name, freeInstance, err)
		}
		p.freeCpus = p.freeCpus.Difference(cpus)
	}
	bln := &Balloon{
		Def:              blnDef,
		Instance:         freeInstance,
		PodIDs:           make(map[string][]string),
		Cpus:             cpus,
		SharedIdleCpus:   cpuset.New(),
		Mems:             p.closestMems(cpus),
		cpuTreeAllocator: cpuTreeAllocator,
	}
	if confCpus {
		if err = p.useCpuClass(bln); err != nil {
			log.Errorf("failed to apply CPU configuration to new balloon %s[%d] (cpus: %s): %v",
				blnDef.Name, freeInstance, cpus, err)
			return nil, err
		}
	}
	return bln, nil
}

// deleteBalloon removes an empty balloon.
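//
// The balloon's CPUs are retagged with the policy's idle CPU class
// (forgetCpuClass) and returned to freeCpus before the cpuAllocator
// bookkeeping is released.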
func (p *balloons) deleteBalloon(bln *Balloon) {
	log.Debugf("deleting balloon %s", bln)
	remainingBalloons := []*Balloon{}
	for _, b := range p.balloons {
		if b != bln {
			remainingBalloons = append(remainingBalloons, b)
		}
	}
	p.balloons = remainingBalloons
	p.forgetCpuClass(bln)
	p.freeCpus = p.freeCpus.Union(bln.Cpus)
	p.cpuAllocator.ReleaseCpus(&bln.Cpus, bln.Cpus.Size(), bln.Def.AllocatorPriority)
}

// freeBalloon clears a balloon and deletes it if allowed.
func (p *balloons) freeBalloon(bln *Balloon) {
	bln.PodIDs = make(map[string][]string)
	blnsSameDef := p.balloonsByDef(bln.Def)
	if len(blnsSameDef) > bln.Def.MinBalloons {
		p.deleteBalloon(bln)
	}
}

func (p *balloons) chooseBalloonInstance(blnDef *BalloonDef, fm FillMethod, c cache.Container) (*Balloon, error) {
	// If assigning to the reserved or the default balloon, the fill
	// method is ignored: always fill the chosen balloon.
	if blnDef == p.balloons[0].Def {
		return p.balloons[0], nil
	}
	if blnDef == p.balloons[1].Def {
		return p.balloons[1], nil
	}
	reqMilliCpus := p.containerRequestedMilliCpus(c.GetCacheID())
	// Handle fill methods that do not use existing instances of
	// balloonDef.
	switch fm {
	case FillReservedBalloon:
		return p.balloons[0], nil
	case FillDefaultBalloon:
		return p.balloons[1], nil
	case FillNewBalloon, FillNewBalloonMust:
		// Choosing an existing balloon without containers is
		// preferred over instantiating a new balloon.
		for _, bln := range p.balloonsByDef(blnDef) {
			if len(bln.PodIDs) == 0 {
				return bln, nil
			}
		}
		newBln, err := p.newBalloon(blnDef, false)
		if err != nil {
			if fm == FillNewBalloonMust {
				return nil, err
			}
			return nil, nil
		}
		// newBln may already have CPUs allocated for it. If
		// we notice that the new balloon fill method cannot
		// be used after all, collect steps to undo() new
		// balloon creation.
		undoFuncs := []func(){}
		undo := func() {
			for _, undoFunc := range undoFuncs {
				undoFunc()
			}
		}
		undoFuncs = append(undoFuncs, func() {
			p.freeCpus = p.freeCpus.Union(newBln.Cpus)
		})
		if newBln.MaxAvailMilliCpus(p.freeCpus) < reqMilliCpus {
			// The new balloon cannot be inflated to fit the
			// new container. Release its CPUs if already
			// allocated (MinCPUs > 0), and never add it
			// to the list of balloons.
			undo()
			if fm == FillNewBalloonMust {
				return nil, balloonsError("not enough CPUs to run container %s requesting %d mCPU. %s.MaxCPUs: %d mCPU, free CPUs: %d mCPU",
					c.PrettyName(), reqMilliCpus, blnDef.Name, blnDef.MaxCpus*1000, p.freeCpus.Size()*1000)
			} else {
				return nil, nil
			}
		}
		// Make the existence of the new balloon official by
		// adding it to the balloons slice.
		p.balloons = append(p.balloons, newBln)
		undoFuncs = append(undoFuncs, func() {
			p.balloons = p.balloons[:len(p.balloons)-1]
		})
		// If the new balloon already has CPUs, there is some
		// housekeeping to do.
		if newBln.Cpus.Size() > 0 {
			// Make sure CPUs in the balloon use the correct
			// CPU class.
			if err = p.useCpuClass(newBln); err != nil {
				log.Errorf("failed to apply CPU configuration to new balloon %s (cpus: %s): %s",
					newBln.PrettyName(), newBln.Cpus, err)
				undo()
				return nil, err
			}
			// Reshare idle CPUs because freeCpus have
			// changed and CPUs of the new balloon are no
			// longer idle.
			p.updatePinning(p.shareIdleCpus(p.freeCpus, newBln.Cpus)...)
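			// At this point the new balloon is fully
			// accounted for: its CPUs are out of freeCpus,
			// their CPU class is applied, and idle-CPU
			// sharing is refreshed for balloons that lost
			// formerly idle CPUs to this balloon.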
		}
		return newBln, nil
	case FillSameNamespace:
		for _, bln := range p.balloonsByNamespace(c.GetNamespace()) {
			if bln.Def == blnDef && p.maxFreeMilliCpus(bln) >= reqMilliCpus {
				return bln, nil
			}
		}
		return nil, nil
	case FillSamePod:
		if pod, ok := c.GetPod(); ok {
			for _, bln := range p.balloonsByPod(pod) {
				if p.maxFreeMilliCpus(bln) >= reqMilliCpus {
					return bln, nil
				}
			}
			return nil, nil
		} else {
			return nil, balloonsError("fill method %s failed: cannot find pod for container %s", fm, c.PrettyName())
		}
	}
	// Handle fill methods that need existing instances of
	// balloonDef, and fail if there are no instances.
	balloons := p.balloonsByDef(blnDef)
	if len(balloons) == 0 {
		return nil, nil
	}
	switch fm {
	case FillBalanced:
		// Are there balloons where the container would fit
		// without inflating the balloon?
		blnIdx, freeMilliCpus := largest(len(balloons), func(i int) int {
			return p.freeMilliCpus(balloons[i])
		})
		if freeMilliCpus >= reqMilliCpus {
			return balloons[blnIdx], nil
		}
	case FillBalancedInflate:
		// Are there balloons where the container would fit
		// after inflating the balloon?
		blnIdx, maxFreeMilliCpus := largest(len(balloons), func(i int) int {
			return p.maxFreeMilliCpus(balloons[i])
		})
		if maxFreeMilliCpus >= reqMilliCpus {
			return balloons[blnIdx], nil
		}
	default:
		return nil, balloonsError("balloon type fill method not implemented: %s", fm)
	}
	// No error, but the balloon instance remains undecided in this assign method.
	return nil, nil
}

func namespaceMatches(namespace string, patterns []string) bool {
	for _, pattern := range patterns {
		ret, err := filepath.Match(pattern, namespace)
		if err == nil && ret {
			return true
		}
	}
	return false
}

// allocateBalloon returns a balloon allocated for a container.
func (p *balloons) allocateBalloon(c cache.Container) (*Balloon, error) {
	blnDef, err := p.chooseBalloonDef(c)
	if err != nil {
		return nil, err
	}
	if blnDef == nil {
		return nil, balloonsError("no applicable balloon type found")
	}
	bln, err := p.allocateBalloonOfDef(blnDef, c)
	if err != nil {
		return nil, err
	}
	if bln == nil {
		return nil, balloonsError("no suitable balloon instance available")
	}
	return bln, nil
}

// allocateBalloonOfDef returns a balloon instantiated from a
// definition for a container.
func (p *balloons) allocateBalloonOfDef(blnDef *BalloonDef, c cache.Container) (*Balloon, error) {
	if blnDef == p.reservedBalloonDef {
		return p.balloons[0], nil
	}
	if blnDef == p.defaultBalloonDef {
		return p.balloons[1], nil
	}

	fillChain := []FillMethod{}
	if !blnDef.PreferSpreadingPods {
		fillChain = append(fillChain, FillSamePod)
	}
	if blnDef.PreferPerNamespaceBalloon {
		fillChain = append(fillChain, FillSameNamespace, FillNewBalloon)
	}
	if blnDef.PreferNewBalloons {
		fillChain = append(fillChain, FillNewBalloon, FillBalanced, FillBalancedInflate)
	} else {
		fillChain = append(fillChain, FillBalanced, FillBalancedInflate, FillNewBalloon)
	}
	for _, fillMethod := range fillChain {
		bln, err := p.chooseBalloonInstance(blnDef, fillMethod, c)
		if err != nil {
			log.Debugf("fill method %q prevents allocation: %v", fillMethod, err)
			return nil, err
		}
		if bln == nil {
			log.Debugf("fill method %q not applicable", fillMethod)
			continue
		}
		log.Debugf("fill method %q suggests balloon instance %v", fillMethod, bln)
		return bln, nil
	}
	return nil, nil
}

// dumpBalloon dumps balloon contents in detail.
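//
// Example output shape (names and values hypothetical):
//
//	Balloon fast[0]{Cpus: 2-3; Mems: 0; mCPU used: 1500; capacity: 2000; max. capacity: 6000; pods: [pod0]; conts: [pod0:cont0]}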
func (p *balloons) dumpBalloon(bln *Balloon) string { conts := []string{} pods := []string{} for podID, contIDs := range bln.PodIDs { podName := podID if pod, ok := p.cch.LookupPod(podID); ok { podName = pod.GetName() } pods = append(pods, podName) for _, contID := range contIDs { if cont, ok := p.cch.LookupContainer(contID); ok { conts = append(conts, cont.PrettyName()) } else { conts = append(conts, podName+"."+contID) } } } s := fmt.Sprintf("Balloon %s{Cpus: %s; Mems: %s; mCPU used: %d; capacity: %d; max. capacity: %d; pods: %s; conts: %s}", bln.PrettyName(), bln.Cpus, bln.Mems, p.requestedMilliCpus(bln), bln.AvailMilliCpus(), bln.MaxAvailMilliCpus(p.freeCpus), pods, conts) return s } // getPodMilliCPU returns mCPUs requested by podID. func (p *balloons) getPodMilliCPU(podID string) int64 { cpuRequested := int64(0) for _, c := range p.cch.GetContainers() { if c.GetPodID() == podID { if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok { cpuRequested += reqCpu.MilliValue() } } } return cpuRequested } // changesBalloons returns true if two balloons policy configurations // may lead into different balloon instances or workload assignment. func changesBalloons(opts0, opts1 *BalloonsOptions) bool { if opts0 == nil && opts1 == nil { return false } if opts0 == nil || opts1 == nil { return true } if len(opts0.BalloonDefs) != len(opts1.BalloonDefs) { return true } o0 := opts0.DeepCopy() o1 := opts1.DeepCopy() // Ignore differences in CPU class names. Every other change // potentially changes balloons or workloads. o0.IdleCpuClass = "" o1.IdleCpuClass = "" for i := range o0.BalloonDefs { o0.BalloonDefs[i].CpuClass = "" o1.BalloonDefs[i].CpuClass = "" } return utils.DumpJSON(o0) != utils.DumpJSON(o1) } // changesCpuClasses returns true if two balloons policy // configurations can lead to using different CPU classes on // corresponding balloon instances. Calling changesCpuClasses(o0, o1) // makes sense only if changesBalloons(o0, o1) has returned false. func changesCpuClasses(opts0, opts1 *BalloonsOptions) bool { if opts0 == nil && opts1 == nil { return false } if opts0 == nil || opts1 == nil { return true } if opts0.IdleCpuClass != opts1.IdleCpuClass { return true } if len(opts0.BalloonDefs) != len(opts1.BalloonDefs) { return true } for i := range opts0.BalloonDefs { if opts0.BalloonDefs[i].CpuClass != opts1.BalloonDefs[i].CpuClass { return true } } return false } // configNotify applies new configuration. func (p *balloons) configNotify(event pkgcfg.Event, source pkgcfg.Source) error { log.Info("configuration %s", event) defer log.Debug("effective configuration:\n%s\n", utils.DumpJSON(p.bpoptions)) newBalloonsOptions := balloonsOptions.DeepCopy() if !changesBalloons(&p.bpoptions, newBalloonsOptions) { if !changesCpuClasses(&p.bpoptions, newBalloonsOptions) { log.Info("no configuration changes") } else { log.Info("configuration changes only on CPU classes") // Update new CPU classes to existing balloon // definitions. The same BalloonDef instances // must be kept in use, because each Balloon // instance holds a direct reference to its // BalloonDef. for i := range p.bpoptions.BalloonDefs { p.bpoptions.BalloonDefs[i].CpuClass = newBalloonsOptions.BalloonDefs[i].CpuClass } // (Re)configures all CPUs in balloons. 
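			// First every CPU in p.allowed is retagged with
			// the (possibly new) IdleCpuClass, then each
			// balloon reapplies its own class on its CPUs.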
p.resetCpuClass() for _, bln := range p.balloons { p.useCpuClass(bln) } } return nil } if err := p.setConfig(newBalloonsOptions); err != nil { log.Error("config update failed: %v", err) return err } log.Info("config updated successfully") p.Sync(p.cch.GetContainers(), p.cch.GetContainers()) return nil } // applyBalloonDef creates user-defined balloons or reconfigures built-in // balloons according to the blnDef. Does not initialize balloon CPUs. func (p *balloons) applyBalloonDef(balloons *[]*Balloon, blnDef *BalloonDef, freeCpus *cpuset.CPUSet) error { if len(*balloons) < 2 { return balloonsError("internal error: reserved and default balloons missing, cannot apply balloon definitions") } reservedBalloon := (*balloons)[0] defaultBalloon := (*balloons)[1] // Every BalloonDef does one of the following: // 1. reconfigures the "reserved" balloon (most restricted) // 2. reconfigures the "default" balloon (somewhat restricted) // 3. defines new user-defined balloons. switch blnDef.Name { case "": // Case 0: bad name return balloonsError("undefined or empty balloon name") case reservedBalloon.Def.Name: // Case 1: reconfigure the "reserved" balloon. if blnDef.MinCpus != 0 { return balloonsError("cannot reconfigure the reserved balloon MinCpus, specified in ReservedResources CPUs") } if blnDef.MaxCpus != 0 { return balloonsError("cannot reconfigure the reserved balloon MaxCpus, specified in ReservedResources CPUs") } if blnDef.MinBalloons != 0 { return balloonsError("cannot reconfigure the reserved balloon MinBalloons") } p.reservedBalloonDef.AllocatorPriority = blnDef.AllocatorPriority p.reservedBalloonDef.CpuClass = blnDef.CpuClass p.reservedBalloonDef.Namespaces = blnDef.Namespaces case defaultBalloon.Def.Name: // Case 2: reconfigure the "default" balloon. defaultUsesReservedCpus := true if blnDef.MinCpus != 0 || blnDef.MaxCpus != 0 { defaultUsesReservedCpus = false } if blnDef.MinBalloons != 0 { return balloonsError("cannot reconfigure the default balloon MinBalloons") } p.defaultBalloonDef.MinCpus = blnDef.MinCpus p.defaultBalloonDef.MaxCpus = blnDef.MaxCpus p.defaultBalloonDef.AllocatorPriority = blnDef.AllocatorPriority p.defaultBalloonDef.CpuClass = blnDef.CpuClass p.defaultBalloonDef.Namespaces = blnDef.Namespaces if !defaultUsesReservedCpus { // Overwrite existing default balloon instance // that uses reserved CPUs with a balloon that // uses its own CPUs. newDefaultBln, err := p.newBalloon(p.defaultBalloonDef, false) if err != nil { return balloonsError("cannot create new default balloon: %w", err) } newDefaultBln.Instance = 0 (*balloons)[1] = newDefaultBln } default: // Case 3: create minimum amount (MinBalloons) of each user-defined balloons. 
	for allocPrio := cpuallocator.CPUPriority(0); allocPrio < cpuallocator.NumCPUPriorities; allocPrio++ {
		if blnDef.AllocatorPriority != allocPrio {
			continue
		}
		for blnIdx := 0; blnIdx < blnDef.MinBalloons; blnIdx++ {
			newBln, err := p.newBalloon(blnDef, false)
			if err != nil {
				return err
			}
			if newBln == nil {
				return balloonsError("failed to create balloon '%s[%d]' as required by MinBalloons=%d",
					blnDef.Name, blnIdx, blnDef.MinBalloons)
			}
			*balloons = append(*balloons, newBln)
		}
	}
	return nil
}

func (p *balloons) validateConfig(bpoptions *BalloonsOptions) error {
	for _, blnDef := range bpoptions.BalloonDefs {
		if blnDef.MaxCpus != NoLimit && blnDef.MinCpus > blnDef.MaxCpus {
			return balloonsError("MinCpus (%d) > MaxCpus (%d) in balloon type %q",
				blnDef.MinCpus, blnDef.MaxCpus, blnDef.Name)
		}
		if blnDef.MaxBalloons != NoLimit && blnDef.MinBalloons > blnDef.MaxBalloons {
			return balloonsError("MinBalloons (%d) > MaxBalloons (%d) in balloon type %q",
				blnDef.MinBalloons, blnDef.MaxBalloons, blnDef.Name)
		}
	}
	return nil
}

// setConfig takes a new balloon configuration into use.
func (p *balloons) setConfig(bpoptions *BalloonsOptions) error {
	// TODO: revert allocations (p.freeCpus) to old ones if the
	// configuration is invalid. Currently a bad configuration
	// leaves a mess in bookkeeping.
	if err := p.validateConfig(bpoptions); err != nil {
		return balloonsError("invalid configuration: %w", err)
	}

	// Create the default reserved and default balloon
	// definitions. Some properties of these definitions may be
	// altered by user configuration.
	p.reservedBalloonDef = &BalloonDef{
		Name:              reservedBalloonDefName,
		MinBalloons:       1,
		AllocatorPriority: 3,
	}
	p.defaultBalloonDef = &BalloonDef{
		Name:              defaultBalloonDefName,
		MinBalloons:       1,
		AllocatorPriority: 3,
	}
	p.balloons = []*Balloon{}
	p.freeCpus = p.allowed.Clone()
	p.freeCpus = p.freeCpus.Difference(p.reserved)
	p.cpuTreeAllocator = p.cpuTree.NewAllocator(cpuTreeAllocatorOptions{
		topologyBalancing:           bpoptions.AllocatorTopologyBalancing,
		preferSpreadOnPhysicalCores: bpoptions.PreferSpreadOnPhysicalCores,
	})
	// We can't delay taking the new configuration into use beyond this
	// point, because p.newBalloon() dereferences our options via
	// p.bpoptions, so it would end up using the old configuration.
	p.bpoptions = *bpoptions
	// Instantiate built-in reserved and default balloons.
	reservedBalloon, err := p.newBalloon(p.reservedBalloonDef, false)
	if err != nil {
		return err
	}
	p.balloons = append(p.balloons, reservedBalloon)
	defaultBalloon, err := p.newBalloon(p.defaultBalloonDef, false)
	if err != nil {
		return err
	}
	p.balloons = append(p.balloons, defaultBalloon)
	// First apply customizations to built-in balloons: "reserved"
	// and "default".
	for _, blnDef := range bpoptions.BalloonDefs {
		if blnDef.Name != reservedBalloonDefName && blnDef.Name != defaultBalloonDefName {
			continue
		}
		if err := p.applyBalloonDef(&p.balloons, blnDef, &p.freeCpus); err != nil {
			return err
		}
	}
	// Apply all user balloon definitions, skip the already customized
	// "reserved" and "default" balloons.
	for _, blnDef := range bpoptions.BalloonDefs {
		if blnDef.Name == reservedBalloonDefName || blnDef.Name == defaultBalloonDefName {
			continue
		}
		if err := p.applyBalloonDef(&p.balloons, blnDef, &p.freeCpus); err != nil {
			return err
		}
	}
	// Finish balloon instance initialization.
	log.Info("%s policy balloons:", PolicyName)
	for blnIdx, bln := range p.balloons {
		log.Info("- balloon %d: %s", blnIdx, bln)
	}
	p.updatePinning(p.shareIdleCpus(p.freeCpus, cpuset.New())...)
	// (Re)configures all CPUs in balloons.
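	// First every allowed CPU gets the policy-level IdleCpuClass,
	// then each balloon overrides the class on its own CPUs, so
	// only CPUs left in freeCpus keep the idle class.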
p.resetCpuClass() for _, bln := range p.balloons { p.useCpuClass(bln) } return nil } // closestMems returns memory node IDs good for pinning containers // that run on given CPUs func (p *balloons) closestMems(cpus cpuset.CPUSet) idset.IDSet { mems := idset.NewIDSet() sys := p.options.System for _, nodeID := range sys.NodeIDs() { if !cpus.Intersection(sys.Node(nodeID).CPUSet()).IsEmpty() { mems.Add(nodeID) } } return mems } // filterBalloons returns balloons for which the test function returns true func filterBalloons(balloons []*Balloon, test func(*Balloon) bool) (ret []*Balloon) { for _, bln := range balloons { if test(bln) { ret = append(ret, bln) } } return } // availableMilliCPU returns mCPUs available in a balloon. func (p *balloons) availableMilliCpus(balloon *Balloon) int64 { cpuAvail := int64(balloon.Cpus.Size() * 1000) cpuRequested := int64(0) for podID := range balloon.PodIDs { cpuRequested += p.getPodMilliCPU(podID) } return cpuAvail - cpuRequested } // resizeBalloon changes the CPUs allocated for a balloon, if allowed. func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error { if bln.Cpus.Equals(p.reserved) { log.Debugf("not resizing %s to %d mCPU, using fixed CPUs", bln, newMilliCpus) return nil } oldCpuCount := bln.Cpus.Size() newCpuCount := (newMilliCpus + 999) / 1000 if bln.Def.MaxCpus > NoLimit && newCpuCount > bln.Def.MaxCpus { newCpuCount = bln.Def.MaxCpus } if bln.Def.MinCpus > 0 && newCpuCount < bln.Def.MinCpus { newCpuCount = bln.Def.MinCpus } log.Debugf("resize %s to fit %d mCPU", bln, newMilliCpus) log.Debugf("- change full CPUs from %d to %d", oldCpuCount, newCpuCount) log.Debugf("- freecpus: %#s", p.freeCpus) if oldCpuCount == newCpuCount { return nil } cpuCountDelta := newCpuCount - oldCpuCount p.forgetCpuClass(bln) defer p.useCpuClass(bln) if cpuCountDelta > 0 { // Inflate the balloon. addFromCpus, _, err := bln.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) if err != nil { return balloonsError("resize/inflate: failed to choose a cpuset for allocating additional %d CPUs: %w", cpuCountDelta, err) } log.Debugf("- allocate CPUs %d from %#s", cpuCountDelta, addFromCpus) newCpus, err := p.cpuAllocator.AllocateCpus(&addFromCpus, newCpuCount-oldCpuCount, bln.Def.AllocatorPriority) if err != nil { return balloonsError("resize/inflate: allocating %d CPUs for %s failed: %w", cpuCountDelta, bln, err) } p.freeCpus = p.freeCpus.Difference(newCpus) bln.Cpus = bln.Cpus.Union(newCpus) p.updatePinning(p.shareIdleCpus(p.freeCpus, newCpus)...) } else { // Deflate the balloon. _, removeFromCpus, err := bln.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) if err != nil { return balloonsError("resize/deflate: failed to choose a cpuset for releasing %d CPUs: %w", -cpuCountDelta, err) } log.Debugf("- releasing %d CPUs from cpuset %#s", -cpuCountDelta, removeFromCpus) _, err = p.cpuAllocator.ReleaseCpus(&removeFromCpus, -cpuCountDelta, bln.Def.AllocatorPriority) if err != nil { return balloonsError("resize/deflate: releasing %d CPUs from %s failed: %w", -cpuCountDelta, bln, err) } log.Debugf("- old freeCpus: %#s, old bln.Cpus: %#s, releasing: %#s", p.freeCpus, bln.Cpus, removeFromCpus) p.freeCpus = p.freeCpus.Union(removeFromCpus) bln.Cpus = bln.Cpus.Difference(removeFromCpus) p.updatePinning(p.shareIdleCpus(removeFromCpus, cpuset.New())...) 
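		// The CPUs released above become idle: shareIdleCpus
		// offers them to balloons that share idle CPUs on the
		// same topology level, and those balloons are re-pinned
		// so their containers see the wider cpuset immediately.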
} log.Debugf("- resize successful: %s, freecpus: %#s", bln, p.freeCpus) p.updatePinning(bln) return nil } func (p *balloons) updatePinning(blns ...*Balloon) { for _, bln := range blns { cpus := bln.Cpus.Union(bln.SharedIdleCpus) bln.Mems = p.closestMems(cpus) for _, cID := range bln.ContainerIDs() { if c, ok := p.cch.LookupContainer(cID); ok { p.pinCpuMem(c, cpus, bln.Mems) } } } } // shareIdleCpus adds addCpus and removes removeCpus to those balloons // that whose containers are allowed to use shared idle CPUs. Returns // balloons that will need re-pinning. func (p *balloons) shareIdleCpus(addCpus, removeCpus cpuset.CPUSet) []*Balloon { updateBalloons := map[int]struct{}{} if removeCpus.Size() > 0 { for blnIdx, bln := range p.balloons { if bln.SharedIdleCpus.Intersection(removeCpus).Size() > 0 { bln.SharedIdleCpus = bln.SharedIdleCpus.Difference(removeCpus) updateBalloons[blnIdx] = struct{}{} } } } if addCpus.Size() > 0 { for blnIdx, bln := range p.balloons { topoLevel := bln.Def.ShareIdleCpusInSame if topoLevel == CPUTopologyLevelUndefined { continue } idleCpusInTopoLevel := cpuset.New() p.cpuTree.DepthFirstWalk(func(t *cpuTreeNode) error { // Dive in correct topology level. if t.level != topoLevel { return nil } // Does the balloon include CPUs in the correct topology level? if t.cpus.Intersection(bln.Cpus).Size() > 0 { // Share idle CPUs on this level to this balloon. idleCpusInTopoLevel = idleCpusInTopoLevel.Union(t.cpus.Intersection(addCpus)) } // Do not walk deeper than the correct level. return WalkSkipChildren }) if idleCpusInTopoLevel.Size() == 0 { continue } sharedBefore := bln.SharedIdleCpus.Size() bln.SharedIdleCpus = bln.SharedIdleCpus.Union(idleCpusInTopoLevel) sharedNow := bln.SharedIdleCpus.Size() if sharedBefore != sharedNow { log.Debugf("balloon %d shares %d new idle CPU(s) in %s(s), %d in total (%s)", bln.PrettyName(), sharedNow-sharedBefore, topoLevel, bln.SharedIdleCpus.Size(), bln.SharedIdleCpus) updateBalloons[blnIdx] = struct{}{} } } } updatedBalloons := make([]*Balloon, 0, len(updateBalloons)) for blnIdx := range updateBalloons { updatedBalloons = append(updatedBalloons, p.balloons[blnIdx]) } return updatedBalloons } // assignContainer adds a container to a balloon func (p *balloons) assignContainer(c cache.Container, bln *Balloon) { log.Info("assigning container %s to balloon %s", c.PrettyName(), bln) // TODO: inflate the balloon (add CPUs / reconfigure balloons) // if necessary podID := c.GetPodID() bln.PodIDs[podID] = append(bln.PodIDs[podID], c.GetCacheID()) p.updatePinning(bln) } // dismissContainer removes a container from a balloon func (p *balloons) dismissContainer(c cache.Container, bln *Balloon) { podID := c.GetPodID() bln.PodIDs[podID] = removeString(bln.PodIDs[podID], c.GetCacheID()) if len(bln.PodIDs[podID]) == 0 { delete(bln.PodIDs, podID) } } // pinCpuMem pins container to CPUs and memory nodes if flagged func (p *balloons) pinCpuMem(c cache.Container, cpus cpuset.CPUSet, mems idset.IDSet) { if p.bpoptions.PinCPU == nil || *p.bpoptions.PinCPU { log.Debug(" - pinning %s to cpuset: %s", c.PrettyName(), cpus) c.SetCpusetCpus(cpus.String()) if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok { mCpu := int(reqCpu.MilliValue()) c.SetCPUShares(int64(cache.MilliCPUToShares(int64(mCpu)))) } } if p.bpoptions.PinMemory == nil || *p.bpoptions.PinMemory { log.Debug(" - pinning %s to memory %s", c.PrettyName(), mems) c.SetCpusetMems(mems.String()) } } // balloonsError formats an error from this policy. 
func balloonsError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } // removeString returns the first occurrence of a string from string slice. func removeString(strings []string, element string) []string { for index, s := range strings { if s == element { strings[index] = strings[len(strings)-1] return strings[:len(strings)-1] } } return strings } func max(a, b int) int { if a > b { return a } return b } // Register us as a policy implementation. func init() { policy.Register(PolicyName, PolicyDescription, CreateBalloonsPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/balloons-policy_test.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package balloons import ( "testing" ) func TestChangesBalloons(t *testing.T) { tcases := []struct { name string opts1 *BalloonsOptions opts2 *BalloonsOptions expectedValue bool }{ { name: "both options are nil", expectedValue: false, }, { name: "one option is nil", opts2: &BalloonsOptions{}, expectedValue: true, }, { name: "reserved pool namespaces differ by len", opts1: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns0"}, }, opts2: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{}, }, expectedValue: true, }, { name: "reserved pool namespaces differ by content", opts1: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns0"}, }, opts2: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns1"}, }, expectedValue: true, }, { name: "idle cpu classes differ", opts1: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns0"}, }, opts2: &BalloonsOptions{ IdleCpuClass: "icc1", ReservedPoolNamespaces: []string{"ns0"}, }, expectedValue: false, }, { name: "balloon defs differ", opts1: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns0"}, BalloonDefs: []*BalloonDef{}, }, opts2: &BalloonsOptions{ IdleCpuClass: "icc1", ReservedPoolNamespaces: []string{"ns0"}, BalloonDefs: []*BalloonDef{}, }, expectedValue: false, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { value := changesBalloons(tc.opts1, tc.opts2) if value != tc.expectedValue { t.Errorf("Expected return value %v but got %v", tc.expectedValue, value) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/cputree.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package balloons

import (
	"encoding/json"
	"errors"
	"fmt"
	"sort"
	"strings"

	system "github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
)

type CPUTopologyLevel int

const (
	CPUTopologyLevelUndefined CPUTopologyLevel = iota
	CPUTopologyLevelSystem
	CPUTopologyLevelPackage
	CPUTopologyLevelDie
	CPUTopologyLevelNuma
	CPUTopologyLevelCore
	CPUTopologyLevelThread
	CPUTopologyLevelCount
)

// cpuTreeNode is a node in the CPU tree.
type cpuTreeNode struct {
	name     string
	level    CPUTopologyLevel
	parent   *cpuTreeNode
	children []*cpuTreeNode
	cpus     cpuset.CPUSet // union of CPUs of child nodes
}

// cpuTreeNodeAttributes contains various attributes of a CPU tree
// node. When allocating or releasing CPUs, all CPU tree nodes in
// which allocating/releasing could be possible are stored to the same
// slice with these attributes. The attributes contain all necessary
// information for comparing which nodes are the best choices for
// allocating/releasing, thus traversing the tree is not needed in the
// comparison phase.
type cpuTreeNodeAttributes struct {
	t                *cpuTreeNode
	depth            int
	currentCpus      cpuset.CPUSet
	freeCpus         cpuset.CPUSet
	currentCpuCount  int
	currentCpuCounts []int
	freeCpuCount     int
	freeCpuCounts    []int
}

// cpuTreeAllocator allocates CPUs from the branch of a CPU tree
// where the "root" node is the topmost CPU of the branch.
type cpuTreeAllocator struct {
	options cpuTreeAllocatorOptions
	root    *cpuTreeNode
}

// cpuTreeAllocatorOptions contains parameters for the CPU allocator
// that selects CPUs from a CPU tree.
type cpuTreeAllocatorOptions struct {
	// topologyBalancing true prefers allocating from branches
	// with most free CPUs (spread allocations), while false is
	// the opposite (packed allocations).
	topologyBalancing           bool
	preferSpreadOnPhysicalCores bool
}

// String returns the topology level as a string.
func (ctl CPUTopologyLevel) String() string {
	s, ok := cpuTopologyLevelToString[ctl]
	if ok {
		return s
	}
	return fmt.Sprintf("CPUTopologyLevelUnknown(%d)", ctl)
}

// cpuTopologyLevelToString defines names for all CPU topology levels.
var cpuTopologyLevelToString = map[CPUTopologyLevel]string{
	CPUTopologyLevelUndefined: "",
	CPUTopologyLevelSystem:    "system",
	CPUTopologyLevelPackage:   "package",
	CPUTopologyLevelDie:       "die",
	CPUTopologyLevelNuma:      "numa",
	CPUTopologyLevelCore:      "core",
	CPUTopologyLevelThread:    "thread",
}

// MarshalJSON marshals a CPUTopologyLevel as a JSON string.
func (ctl CPUTopologyLevel) MarshalJSON() ([]byte, error) {
	return json.Marshal(ctl.String())
}

// UnmarshalJSON unmarshals a JSON string to a CPUTopologyLevel.
func (ctl *CPUTopologyLevel) UnmarshalJSON(data []byte) error {
	var dataString string
	if err := json.Unmarshal(data, &dataString); err != nil {
		return err
	}
	name := strings.ToLower(dataString)
	for ctlConst, ctlString := range cpuTopologyLevelToString {
		if ctlString == name {
			*ctl = ctlConst
			return nil
		}
	}
	return fmt.Errorf("invalid CPU topology level %q", name)
}

// String returns a string representation of a CPU tree node.
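//
// Leafless nodes print as their name; nodes with children print their
// children recursively, so a hypothetical two-package tree renders as
// "system[p0[...] p1[...]]".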
func (t *cpuTreeNode) String() string { if len(t.children) == 0 { return t.name } return fmt.Sprintf("%s%v", t.name, t.children) } func (t *cpuTreeNode) PrettyPrint() string { origDepth := t.Depth() lines := []string{} t.DepthFirstWalk(func(tn *cpuTreeNode) error { lines = append(lines, fmt.Sprintf("%s%s: %q cpus: %s", strings.Repeat(" ", (tn.Depth()-origDepth)*4), tn.level, tn.name, tn.cpus)) return nil }) return strings.Join(lines, "\n") } // String returns cpuTreeNodeAttributes as a string. func (tna cpuTreeNodeAttributes) String() string { return fmt.Sprintf("%s{%d,%v,%d,%d}", tna.t.name, tna.depth, tna.currentCpuCounts, tna.freeCpuCount, tna.freeCpuCounts) } // NewCpuTree returns a named CPU tree node. func NewCpuTree(name string) *cpuTreeNode { return &cpuTreeNode{ name: name, cpus: cpuset.New(), } } func (t *cpuTreeNode) CopyTree() *cpuTreeNode { newNode := t.CopyNode() newNode.children = make([]*cpuTreeNode, 0, len(t.children)) for _, child := range t.children { newNode.AddChild(child.CopyTree()) } return newNode } func (t *cpuTreeNode) CopyNode() *cpuTreeNode { newNode := cpuTreeNode{ name: t.name, level: t.level, parent: t.parent, children: t.children, cpus: t.cpus, } return &newNode } // Depth returns the distance from the root node. func (t *cpuTreeNode) Depth() int { if t.parent == nil { return 0 } return t.parent.Depth() + 1 } // AddChild adds new child node to a CPU tree node. func (t *cpuTreeNode) AddChild(child *cpuTreeNode) { child.parent = t t.children = append(t.children, child) } // AddCpus adds CPUs to a CPU tree node and all its parents. func (t *cpuTreeNode) AddCpus(cpus cpuset.CPUSet) { t.cpus = t.cpus.Union(cpus) if t.parent != nil { t.parent.AddCpus(cpus) } } // Cpus returns CPUs of a CPU tree node. func (t *cpuTreeNode) Cpus() cpuset.CPUSet { return t.cpus } // SiblingIndex returns the index of this node among its parents // children. Returns -1 for the root node, -2 if this node is not // listed among the children of its parent. func (t *cpuTreeNode) SiblingIndex() int { if t.parent == nil { return -1 } for idx, child := range t.parent.children { if child == t { return idx } } return -2 } func (t *cpuTreeNode) FindLeafWithCpu(cpu int) *cpuTreeNode { var found *cpuTreeNode t.DepthFirstWalk(func(tn *cpuTreeNode) error { if len(tn.children) > 0 { return nil } for _, cpuHere := range tn.cpus.List() { if cpu == cpuHere { found = tn return WalkStop } } return nil // not found here, no more children to search }) return found } // WalkSkipChildren error returned from a DepthFirstWalk handler // prevents walking deeper in the tree. The caller of the // DepthFirstWalk will get no error. var WalkSkipChildren error = errors.New("skip children") // WalkStop error returned from a DepthFirstWalk handler stops the // walk altogether. The caller of the DepthFirstWalk will get the // WalkStop error. var WalkStop error = errors.New("stop") // DepthFirstWalk walks through nodes in a CPU tree. Every node is // passed to the handler callback that controls next step by // returning: // - nil: continue walking to the next node // - WalkSkipChildren: continue to the next node but skip children of this node // - WalkStop: stop walking. 
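//
// A minimal usage sketch (assuming root was built with
// NewCpuTreeFromSystem; it counts NUMA nodes without descending into
// cores or threads):
//
//	numaNodes := 0
//	_ = root.DepthFirstWalk(func(tn *cpuTreeNode) error {
//		if tn.level == CPUTopologyLevelNuma {
//			numaNodes++
//			return WalkSkipChildren
//		}
//		return nil
//	})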
func (t *cpuTreeNode) DepthFirstWalk(handler func(*cpuTreeNode) error) error { if err := handler(t); err != nil { if err == WalkSkipChildren { return nil } return err } for _, child := range t.children { if err := child.DepthFirstWalk(handler); err != nil { return err } } return nil } // CpuLocations returns a slice where each element contains names of // topology elements over which a set of CPUs spans. Example: // systemNode.CpuLocations(cpuset:0,99) = [["system"],["p0", "p1"], ["p0d0", "p1d0"], ...] func (t *cpuTreeNode) CpuLocations(cpus cpuset.CPUSet) [][]string { names := make([][]string, int(CPUTopologyLevelCount)-int(t.level)) t.DepthFirstWalk(func(tn *cpuTreeNode) error { if tn.cpus.Intersection(cpus).Size() == 0 { return WalkSkipChildren } levelIndex := int(tn.level) - int(t.level) names[levelIndex] = append(names[levelIndex], tn.name) return nil }) return names } // NewCpuTreeFromSystem returns the root node of the topology tree // constructed from the underlying system. func NewCpuTreeFromSystem() (*cpuTreeNode, error) { sys, err := system.DiscoverSystem() if err != nil { return nil, err } // TODO: split deep nested loops into functions sysTree := NewCpuTree("system") sysTree.level = CPUTopologyLevelSystem for _, packageID := range sys.PackageIDs() { packageTree := NewCpuTree(fmt.Sprintf("p%d", packageID)) packageTree.level = CPUTopologyLevelPackage cpuPackage := sys.Package(packageID) sysTree.AddChild(packageTree) for _, dieID := range cpuPackage.DieIDs() { dieTree := NewCpuTree(fmt.Sprintf("p%dd%d", packageID, dieID)) dieTree.level = CPUTopologyLevelDie packageTree.AddChild(dieTree) for _, nodeID := range cpuPackage.DieNodeIDs(dieID) { nodeTree := NewCpuTree(fmt.Sprintf("p%dd%dn%d", packageID, dieID, nodeID)) nodeTree.level = CPUTopologyLevelNuma dieTree.AddChild(nodeTree) node := sys.Node(nodeID) threadsSeen := map[int]struct{}{} for _, cpuID := range node.CPUSet().List() { if _, alreadySeen := threadsSeen[cpuID]; alreadySeen { continue } cpuTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dcpu%d", packageID, dieID, nodeID, cpuID)) cpuTree.level = CPUTopologyLevelCore nodeTree.AddChild(cpuTree) cpu := sys.CPU(cpuID) for _, threadID := range cpu.ThreadCPUSet().List() { threadsSeen[threadID] = struct{}{} threadTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dcpu%dt%d", packageID, dieID, nodeID, cpuID, threadID)) threadTree.level = CPUTopologyLevelThread cpuTree.AddChild(threadTree) threadTree.AddCpus(cpuset.New(threadID)) } } } } } return sysTree, nil } // ToAttributedSlice returns a CPU tree node and recursively all its // child nodes in a slice that contains nodes with their attributes // for allocation/releasing comparison. 
// - currentCpus is the set of CPUs that can be freed in coming operation // - freeCpus is the set of CPUs that can be allocated in coming operation // - filter(tna) returns false if the node can be ignored func (t *cpuTreeNode) ToAttributedSlice( currentCpus, freeCpus cpuset.CPUSet, filter func(*cpuTreeNodeAttributes) bool) []cpuTreeNodeAttributes { tnas := []cpuTreeNodeAttributes{} currentCpuCounts := []int{} freeCpuCounts := []int{} t.toAttributedSlice(currentCpus, freeCpus, filter, &tnas, 0, currentCpuCounts, freeCpuCounts) return tnas } func (t *cpuTreeNode) toAttributedSlice( currentCpus, freeCpus cpuset.CPUSet, filter func(*cpuTreeNodeAttributes) bool, tnas *[]cpuTreeNodeAttributes, depth int, currentCpuCounts []int, freeCpuCounts []int) { currentCpusHere := t.cpus.Intersection(currentCpus) freeCpusHere := t.cpus.Intersection(freeCpus) currentCpuCountHere := currentCpusHere.Size() currentCpuCountsHere := make([]int, len(currentCpuCounts)+1, len(currentCpuCounts)+1) copy(currentCpuCountsHere, currentCpuCounts) currentCpuCountsHere[depth] = currentCpuCountHere freeCpuCountHere := freeCpusHere.Size() freeCpuCountsHere := make([]int, len(freeCpuCounts)+1, len(freeCpuCounts)+1) copy(freeCpuCountsHere, freeCpuCounts) freeCpuCountsHere[depth] = freeCpuCountHere tna := cpuTreeNodeAttributes{ t: t, depth: depth, currentCpus: currentCpusHere, freeCpus: freeCpusHere, currentCpuCount: currentCpuCountHere, currentCpuCounts: currentCpuCountsHere, freeCpuCount: freeCpuCountHere, freeCpuCounts: freeCpuCountsHere, } if filter != nil && !filter(&tna) { return } *tnas = append(*tnas, tna) for _, child := range t.children { child.toAttributedSlice(currentCpus, freeCpus, filter, tnas, depth+1, currentCpuCountsHere, freeCpuCountsHere) } } // SplitLevel returns the root node of a new CPU tree where all // branches of a topology level have been split into new classes. func (t *cpuTreeNode) SplitLevel(splitLevel CPUTopologyLevel, cpuClassifier func(int) int) *cpuTreeNode { newRoot := t.CopyTree() newRoot.DepthFirstWalk(func(tn *cpuTreeNode) error { // Dive into the level that will be split. if tn.level != splitLevel { return nil } // Classify CPUs to the map: class -> list of cpus classCpus := map[int][]int{} for _, cpu := range t.cpus.List() { class := cpuClassifier(cpu) classCpus[class] = append(classCpus[class], cpu) } // Clear existing children of this node. New children // will be classes whose children are masked versions // of original children of this node. origChildren := tn.children tn.children = make([]*cpuTreeNode, 0, len(classCpus)) // Add new child corresponding each class. for class, cpus := range classCpus { cpuMask := cpuset.New(cpus...) newNode := NewCpuTree(fmt.Sprintf("%sclass%d", tn.name, class)) tn.AddChild(newNode) newNode.cpus = tn.cpus.Intersection(cpuMask) newNode.level = tn.level newNode.parent = tn for _, child := range origChildren { newChild := child.CopyTree() newChild.DepthFirstWalk(func(cn *cpuTreeNode) error { cn.cpus = cn.cpus.Intersection(cpuMask) if cn.cpus.Size() == 0 && cn.parent != nil { // all cpus masked // out: cut out this // branch newSiblings := []*cpuTreeNode{} for _, child := range cn.parent.children { if child != cn { newSiblings = append(newSiblings, child) } } cn.parent.children = newSiblings return WalkSkipChildren } return nil }) newNode.AddChild(newChild) } } return WalkSkipChildren }) return newRoot } // NewAllocator returns new CPU allocator for allocating CPUs from a // CPU tree branch. 
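//
// When preferSpreadOnPhysicalCores is set, the allocator operates on
// a tree where every NUMA node has been split (SplitLevel above) into
// classes by hyperthread sibling index: class 0 holds the first
// thread of each physical core, class 1 the second, and so on, so
// allocations fill distinct physical cores before doubling up on
// hyperthreads.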
func (t *cpuTreeNode) NewAllocator(options cpuTreeAllocatorOptions) *cpuTreeAllocator { ta := &cpuTreeAllocator{ root: t, options: options, } if options.preferSpreadOnPhysicalCores { newTree := t.SplitLevel(CPUTopologyLevelNuma, // CPU classifier: class of the CPU equals to // the index in the child list of its parent // node in the tree. Expect leaf node is a // hyperthread, parent a physical core. func(cpu int) int { leaf := t.FindLeafWithCpu(cpu) if leaf == nil { log.Fatalf("SplitLevel CPU classifier: cpu %d not in tree:\n%s\n\n", cpu, t.PrettyPrint()) } return leaf.SiblingIndex() }) ta.root = newTree } return ta } // sorterAllocate implements an "is-less-than" callback that helps // sorting a slice of cpuTreeNodeAttributes. The first item in the // sorted list contains an optimal CPU tree node for allocating new // CPUs. func (ta *cpuTreeAllocator) sorterAllocate(tnas []cpuTreeNodeAttributes) func(int, int) bool { return func(i, j int) bool { if tnas[i].depth != tnas[j].depth { return tnas[i].depth > tnas[j].depth } for tdepth := 0; tdepth < len(tnas[i].currentCpuCounts); tdepth += 1 { // After this currentCpus will increase. // Maximize the maximal amount of currentCpus // as high level in the topology as possible. if tnas[i].currentCpuCounts[tdepth] != tnas[j].currentCpuCounts[tdepth] { return tnas[i].currentCpuCounts[tdepth] > tnas[j].currentCpuCounts[tdepth] } } for tdepth := 0; tdepth < len(tnas[i].freeCpuCounts); tdepth += 1 { // After this freeCpus will decrease. if tnas[i].freeCpuCounts[tdepth] != tnas[j].freeCpuCounts[tdepth] { if ta.options.topologyBalancing { // Goal: minimize maximal freeCpus in topology. return tnas[i].freeCpuCounts[tdepth] > tnas[j].freeCpuCounts[tdepth] } else { // Goal: maximize maximal freeCpus in topology. return tnas[i].freeCpuCounts[tdepth] < tnas[j].freeCpuCounts[tdepth] } } } return tnas[i].t.name < tnas[j].t.name } } // sorterRelease implements an "is-less-than" callback that helps // sorting a slice of cpuTreeNodeAttributes. The first item in the // list contains an optimal CPU tree node for releasing new CPUs. func (ta *cpuTreeAllocator) sorterRelease(tnas []cpuTreeNodeAttributes) func(int, int) bool { return func(i, j int) bool { if tnas[i].depth != tnas[j].depth { return tnas[i].depth > tnas[j].depth } for tdepth := 0; tdepth < len(tnas[i].currentCpuCounts); tdepth += 1 { // After this currentCpus will decrease. Aim // to minimize the minimal amount of // currentCpus in order to decrease // fragmentation as high level in the topology // as possible. if tnas[i].currentCpuCounts[tdepth] != tnas[j].currentCpuCounts[tdepth] { return tnas[i].currentCpuCounts[tdepth] < tnas[j].currentCpuCounts[tdepth] } } for tdepth := 0; tdepth < len(tnas[i].freeCpuCounts); tdepth += 1 { // After this freeCpus will increase. Try to // maximize minimal free CPUs for better // isolation as high level in the topology as // possible. if tnas[i].freeCpuCounts[tdepth] != tnas[j].freeCpuCounts[tdepth] { if ta.options.topologyBalancing { return tnas[i].freeCpuCounts[tdepth] < tnas[j].freeCpuCounts[tdepth] } else { return tnas[i].freeCpuCounts[tdepth] < tnas[j].freeCpuCounts[tdepth] } } } return tnas[i].t.name > tnas[j].t.name } } // ResizeCpus implements topology awareness to both adding CPUs to and // removing them from a set of CPUs. It returns CPUs from which actual // allocation or releasing of CPUs can be done. ResizeCpus does not // allocate or release CPUs. // // Parameters: // - currentCpus: a set of CPUs to/from which CPUs would be added/removed. 
// - freeCpus: a set of available CPUs.
// - delta: number of CPUs to add (if positive) or remove (if negative).
//
// Return values:
// - addFromCpus contains free CPUs from which delta CPUs can be
//   allocated. Note that the size of the set may be larger than
//   delta: there is room for other allocation logic to select from
//   these CPUs.
// - removeFromCpus contains CPUs in the currentCpus set from which
//   abs(delta) CPUs can be freed.
func (ta *cpuTreeAllocator) ResizeCpus(currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
	if delta > 0 {
		addFromSuperset, removeFromSuperset, err := ta.resizeCpus(currentCpus, freeCpus, delta)
		if !ta.options.preferSpreadOnPhysicalCores || addFromSuperset.Size() == delta {
			return addFromSuperset, removeFromSuperset, err
		}
		// addFromSuperset contains more CPUs (equally good
		// choices) than actually needed. In case of
		// preferSpreadOnPhysicalCores, however, selecting any
		// of these does not give an equally good
		// outcome. Therefore, in this case, construct the
		// addFrom set by adding one CPU at a time.
		addFrom := cpuset.New()
		for n := 0; n < delta; n++ {
			addSingleFrom, _, err := ta.resizeCpus(currentCpus, freeCpus, 1)
			if err != nil {
				return addFromSuperset, removeFromSuperset, err
			}
			if addSingleFrom.Size() != 1 {
				return addFromSuperset, removeFromSuperset,
					fmt.Errorf("internal error: failed to find single CPU to allocate, "+
						"currentCpus=%s freeCpus=%s expectedSingle=%s",
						currentCpus, freeCpus, addSingleFrom)
			}
			addFrom = addFrom.Union(addSingleFrom)
			if addFrom.Size() != n+1 {
				return addFromSuperset, removeFromSuperset,
					fmt.Errorf("internal error: double add the same CPU (%s) to cpuset %s on round %d",
						addSingleFrom, addFrom, n+1)
			}
			currentCpus = currentCpus.Union(addSingleFrom)
			freeCpus = freeCpus.Difference(addSingleFrom)
		}
		return addFrom, removeFromSuperset, nil
	}
	// In multi-CPU removal, remove CPUs one by one instead of
	// trying to find a single topology element from which all of
	// them could be removed.
	removeFrom := cpuset.New()
	addFrom := cpuset.New()
	for n := 0; n < -delta; n++ {
		_, removeSingleFrom, err := ta.resizeCpus(currentCpus, freeCpus, -1)
		if err != nil {
			return addFrom, removeFrom, err
		}
		// Make cheap internal error checks in order to capture
		// issues in alternative algorithms.
if removeSingleFrom.Size() != 1 { return addFrom, removeFrom, fmt.Errorf("internal error: failed to find single cpu to free, "+ "currentCpus=%s freeCpus=%s expectedSingle=%s", currentCpus, freeCpus, removeSingleFrom) } if removeFrom.Union(removeSingleFrom).Size() != n+1 { return addFrom, removeFrom, fmt.Errorf("internal error: double release of a cpu, "+ "currentCpus=%s freeCpus=%s alreadyRemoved=%s removedNow=%s", currentCpus, freeCpus, removeFrom, removeSingleFrom) } removeFrom = removeFrom.Union(removeSingleFrom) currentCpus = currentCpus.Difference(removeSingleFrom) freeCpus = freeCpus.Union(removeSingleFrom) } return addFrom, removeFrom, nil } func (ta *cpuTreeAllocator) resizeCpus(currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) { tnas := ta.root.ToAttributedSlice(currentCpus, freeCpus, func(tna *cpuTreeNodeAttributes) bool { // filter out branches with insufficient cpus if delta > 0 && tna.freeCpuCount-delta < 0 { // cannot allocate delta cpus return false } if delta < 0 && tna.currentCpuCount+delta < 0 { // cannot release delta cpus return false } return true }) // Sort based on attributes if delta > 0 { sort.Slice(tnas, ta.sorterAllocate(tnas)) } else { sort.Slice(tnas, ta.sorterRelease(tnas)) } if len(tnas) == 0 { return freeCpus, currentCpus, fmt.Errorf("not enough free CPUs") } return tnas[0].freeCpus, tnas[0].currentCpus, nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/cputree_test.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package balloons import ( "fmt" "sort" "strings" "testing" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) type cpuInTopology struct { packageID, dieID, numaID, coreID, threadID, cpuID int packageName, dieName, numaName, coreName, threadName, cpuName string } type cpusInTopology map[int]cpuInTopology func (cit cpuInTopology) TopoName(topoLevel string) string { switch topoLevel { case "thread": return cit.threadName case "core": return cit.coreName case "numa": return cit.numaName case "die": return cit.dieName case "package": return cit.packageName } panic("invalid topoLevel") } func (csit cpusInTopology) dumps(nameCpus map[string]cpuset.CPUSet) string { lines := []string{} names := make([]string, 0, len(nameCpus)) for name := range nameCpus { names = append(names, name) } sort.Strings(names) for cpuID := 0; cpuID < len(csit); cpuID++ { line := fmt.Sprintf("cpu%02d %s", cpuID, csit[cpuID].threadName) for _, name := range names { if nameCpus[name].Contains(cpuID) { line = fmt.Sprintf("%s %s", line, name) } } lines = append(lines, line) } return strings.Join(lines, "\n") } func newCpuTreeFromInt5(pdnct [5]int) (*cpuTreeNode, cpusInTopology) { pkgs := pdnct[0] dies := pdnct[1] numas := pdnct[2] cores := pdnct[3] threads := pdnct[4] cpuID := 0 sysTree := NewCpuTree("system") sysTree.level = CPUTopologyLevelSystem csit := cpusInTopology{} for packageID := 0; packageID < pkgs; packageID++ { packageTree := NewCpuTree(fmt.Sprintf("p%d", packageID)) packageTree.level = CPUTopologyLevelPackage sysTree.AddChild(packageTree) for dieID := 0; dieID < dies; dieID++ { dieTree := NewCpuTree(fmt.Sprintf("p%dd%d", packageID, dieID)) dieTree.level = CPUTopologyLevelDie packageTree.AddChild(dieTree) for numaID := 0; numaID < numas; numaID++ { numaTree := NewCpuTree(fmt.Sprintf("p%dd%dn%d", packageID, dieID, numaID)) numaTree.level = CPUTopologyLevelNuma dieTree.AddChild(numaTree) for coreID := 0; coreID < cores; coreID++ { coreTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dc%02d", packageID, dieID, numaID, coreID)) coreTree.level = CPUTopologyLevelCore numaTree.AddChild(coreTree) for threadID := 0; threadID < threads; threadID++ { threadTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dc%02dt%d", packageID, dieID, numaID, coreID, threadID)) threadTree.level = CPUTopologyLevelThread coreTree.AddChild(threadTree) threadTree.AddCpus(cpuset.New(cpuID)) csit[cpuID] = cpuInTopology{ packageID, dieID, numaID, coreID, threadID, cpuID, packageTree.name, dieTree.name, numaTree.name, coreTree.name, threadTree.name, fmt.Sprintf("cpu%d", cpuID), } cpuID += 1 } } } } } return sysTree, csit } func verifyNotOn(t *testing.T, nameContents string, cpus cpuset.CPUSet, csit cpusInTopology) { for _, cpuID := range cpus.List() { name := csit[cpuID].threadName if strings.Contains(name, nameContents) { t.Errorf("cpu%d (%s) in unexpected region %s", cpuID, name, nameContents) } } } func doVerifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology, inversed bool) { seenName := "" seenCpuID := -1 for _, cpuID := range cpus.List() { cit := csit[cpuID] thisName := cit.TopoName(topoLevel) thisCpuID := cit.cpuID if thisName == "" { t.Errorf("unexpected (invalid) topology level %q", topoLevel) } if seenName == "" { seenName = thisName seenCpuID = cit.cpuID continue } if (seenName != thisName && !inversed) || (seenName == thisName && inversed) { msg := "the same" if inversed { msg = "not the same" } t.Errorf("expected %s %s, got: cpu%d in %s, cpu%d in %s", msg, topoLevel, seenCpuID, seenName, thisCpuID, thisName) } } } func 
verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) { doVerifySame(t, topoLevel, cpus, csit, false) } func verifyNotSame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) { doVerifySame(t, topoLevel, cpus, csit, true) } func (csit cpusInTopology) getElements(topoLevel string, cpus cpuset.CPUSet) []string { elts := []string{} for _, cpuID := range cpus.List() { elts = append(elts, csit[cpuID].TopoName(topoLevel)) } return elts } func (csit cpusInTopology) verifyDisjoint(t *testing.T, topoLevel string, cpusA cpuset.CPUSet, cpusB cpuset.CPUSet) { eltsA := csit.getElements(topoLevel, cpusA) eltsB := csit.getElements(topoLevel, cpusB) for _, eltA := range eltsA { for _, eltB := range eltsB { if eltA == eltB { t.Errorf("expected disjoint %ss, got %s on both cpusets %s and %s", topoLevel, eltA, cpusA, cpusB) return } } } } /* CPU ids and locations in the 2-2-2-2-2-topology for verifying current and developing future unit tests. The location in topology is in format: p/d/n/c/t topology: [5]int{2, 2, 2, 2, 2}, allocations: []int{ 0, // cpu on p0/d0/n0/c0/t0 1, // cpu on p0/d0/n0/c0/t1 2, // cpu on p0/d0/n0/c1/t0 3, // cpu on p0/d0/n0/c1/t1 4, // cpu on p0/d0/n1/c0/t0 5, // cpu on p0/d0/n1/c0/t1 6, // cpu on p0/d0/n1/c1/t0 7, // cpu on p0/d0/n1/c1/t1 8, // cpu on p0/d1/n0/c0/t0 9, // cpu on p0/d1/n0/c0/t1 10, // cpu on p0/d1/n0/c1/t0 11, // cpu on p0/d1/n0/c1/t1 12, // cpu on p0/d1/n1/c0/t0 13, // cpu on p0/d1/n1/c0/t1 14, // cpu on p0/d1/n1/c1/t0 15, // cpu on p0/d1/n1/c1/t1 16, // cpu on p1/d0/n0/c0/t0 17, // cpu on p1/d0/n0/c0/t1 18, // cpu on p1/d0/n0/c1/t0 19, // cpu on p1/d0/n0/c1/t1 20, // cpu on p1/d0/n1/c0/t0 21, // cpu on p1/d0/n1/c0/t1 22, // cpu on p1/d0/n1/c1/t0 23, // cpu on p1/d0/n1/c1/t1 24, // cpu on p1/d1/n0/c0/t0 25, // cpu on p1/d1/n0/c0/t1 26, // cpu on p1/d1/n0/c1/t0 27, // cpu on p1/d1/n0/c1/t1 28, // cpu on p1/d1/n1/c0/t0 29, // cpu on p1/d1/n1/c0/t1 30, // cpu on p1/d1/n1/c1/t0 31, // cpu on p1/d1/n1/c1/t1 }, */ func TestResizeCpus(t *testing.T) { type TopoCcids struct { topo string ccids []int } tcases := []struct { name string topology [5]int // package, die, numa, core, thread count allocatorTB bool // allocator topologyBalancing allocatorPSoPC bool // allocator preferSpreadOnPhysicalCores allocations []int deltas []int allocate bool operateOnCcid []int // which ccid (currentCpus id) will be used on call expectCurrentOnSame []string expectCurrentNotOnSame []string expectAllOnSame []string expectCurrentNotOn []string expectAddSizes []int expectDisjoint []TopoCcids // which ccids should be disjoint expectErrors []string }{ { name: "first allocations", topology: [5]int{2, 2, 2, 2, 2}, deltas: []int{0, 1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32}, expectAddSizes: []int{0, 1, 2, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32}, }, { name: "too large an allocation", topology: [5]int{2, 2, 2, 2, 2}, deltas: []int{33}, expectErrors: []string{"not enough free CPUs"}, }, { name: "spread allocations", topology: [5]int{2, 2, 2, 2, 2}, allocatorTB: true, deltas: []int{1, 1, 1, 1, 1, 1, 1, 1}, allocate: true, operateOnCcid: []int{1, 2, 3, 4, 5, 6, 7, 8}, expectDisjoint: []TopoCcids{ {}, {"package", []int{1, 2}}, {"die", []int{1, 2, 3}}, {"die", []int{1, 2, 3, 4}}, {"numa", []int{1, 2, 3, 4, 5}}, {"numa", []int{1, 2, 3, 4, 5, 6}}, {"numa", []int{1, 2, 3, 4, 5, 6, 7}}, {"numa", []int{1, 2, 3, 4, 5, 6, 7, 8}}, }, }, { name: "spread allocations2", topology: [5]int{4, 1, 4, 8, 2}, allocatorTB: true, deltas: []int{1, 3, 2, 4, 1, 4, 2, 4}, allocate: 
true, operateOnCcid: []int{1, 2, 3, 4, 5, 6, 7, 8}, expectDisjoint: []TopoCcids{ {}, {"package", []int{1, 2}}, {"package", []int{1, 2, 3}}, {"package", []int{1, 2, 3, 4}}, {"numa", []int{1, 2, 3, 4, 5}}, {"numa", []int{1, 2, 3, 4, 5, 6}}, {"numa", []int{1, 2, 3, 4, 5, 6, 7}}, {"numa", []int{1, 2, 3, 4, 5, 6, 7, 8}}, }, }, { name: "pack allocations", topology: [5]int{2, 2, 2, 2, 2}, allocatorTB: false, deltas: []int{1, 1, 1, 1}, allocate: true, operateOnCcid: []int{1, 2, 3, 4, 5}, expectAllOnSame: []string{ "", "core", "numa", "numa", "die", "die", }, }, { name: "inflate", topology: [5]int{2, 2, 2, 2, 2}, allocate: true, deltas: []int{ 1, 1, 1, 1, // cpu0..cpu3 on numaN0, dieD0 1, 3, // cpu4..cpu7 on numaN1, still dieD0 6, 1, 1, // cpu8..15 on dieD1, still packageP0 }, operateOnCcid: []int{ 1, 1, 1, 1, 1, 1, 1, 1, 1}, expectCurrentOnSame: []string{ "core", "core", "numa", "numa", "die", "die", "package", "package", "package"}, expectAddSizes: []int{ 1, 1, 1, 1, 1, 3, 8, 1, 1}, }, { name: "defragmenting single removals", topology: [5]int{2, 2, 2, 2, 2}, allocations: []int{ 0, // cpu on p0/d0/n0/c0/t0 2, // cpu on p0/d0/n0/c1/t0 3, // cpu on p0/d0/n0/c1/t1 7, // cpu on p0/d0/n1/c1/t1 10, // cpu on p0/d1/n0/c1/t0 17, // cpu on p1/d0/n0/c0/t1 18, // cpu on p1/d0/n0/c1/t0 }, allocate: true, deltas: []int{ -1, // release cpu17 or cpu18 -1, // release cpu17 or cpu18 => all on same package -1, // release cpu10 => all on same die -1, // release cpu7 => all on same numa -1, // release cpu0 => all on same core -1, // release cpu2 or cpu3 -1, // release cpu2 or cpu3 }, operateOnCcid: []int{1, 1, 1, 1, 1, 1, 1}, expectCurrentOnSame: []string{ "", "package", "die", "numa", "core", "core", "core", }, expectCurrentNotOn: []string{ "", "p1", "p0d1", "p0d0n1", "p0d0n0c00", }, }, { name: "defragmenting multi-removals", topology: [5]int{2, 2, 2, 2, 2}, allocations: []int{ 0, // cpu on p0/d0/n0/c0/t0 2, // cpu on p0/d0/n0/c1/t0 4, // cpu on p0/d0/n1/c0/t0 6, // cpu on p0/d0/n1/c1/t0 8, // cpu on p0/d1/n0/c0/t0 9, // cpu on p0/d1/n0/c0/t1 10, // cpu on p0/d1/n0/c1/t0 24, // cpu on p1/d1/n0/c0/t0 25, // cpu on p1/d1/n0/c0/t1 26, // cpu on p1/d1/n0/c1/t0 27, // cpu on p1/d1/n0/c1/t1 28, // cpu on p1/d1/n1/c0/t0 29, // cpu on p1/d1/n1/c0/t1 30, // cpu on p1/d1/n1/c1/t0 31, // cpu on p1/d1/n1/c1/t1 }, allocate: true, deltas: []int{ -2, // release from p0d1n0 -1, // release completely p0d1 -5, // release completely p0, one from p1d1nX -3, // release completely p1d1nX => all on same numa }, operateOnCcid: []int{1, 1, 1, 1}, expectCurrentOnSame: []string{ "", "", "die", "numa", }, expectCurrentNotOn: []string{ "", "p0d1", "p0", "", }, }, { name: "gentle rebalancing", topology: [5]int{2, 1, 1, 16, 2}, // 2 packages, 16 hyperthreaded cores per package => 64 cpus in total deltas: []int{ 4, 4, 14, 7, 7, 4, 4, 14, // allocate 8 sets of cpus, the last 14cpus fills package0, spills over to package1 -2, -2, -2, -2, // free a little room to package0 -1, 1, -1, 1, -1, 1, -1, 1}, // deflate/inflate the last 14cpus, see that it gradually travels to package0 operateOnCcid: []int{ 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 8, 8, 8, 8, 8, 8, 8, 8, }, allocate: true, expectCurrentOnSame: []string{ "package", "package", "package", "package", "package", "package", "package", "", "", "", "", "", "", "", "", "", "", "", "package", "package", }, }, { name: "prefer spread on physical cores", topology: [5]int{4, 1, 4, 8, 2}, allocatorTB: true, allocatorPSoPC: true, deltas: []int{ 2, 1, 4, 1, // allocate one thread from each core from the same NUMA 
3, 9, 16, // allocate three other cpusets, each should be from separate package (due to topology balancing) 3, 4, 3, // increase the size of the // original, 3+4 fits to the same // NUMA, in the last 3: first cpu // should fill the NUMA and the rest 2 // go to another NUMA on the same package. -2, 2, // release two CPUs that went to another NUMA on the same package, and put them back -10, // release 2+8 CPUs, the rest should be single threads each on their own core }, allocate: true, operateOnCcid: []int{ 1, 1, 1, 1, // allocate one thread from each core from the same NUMA by inflating all the time the same cpuset 2, 3, 4, // three new cpusets 1, 1, 1, // increase size over one NUMA 1, 1, 1, }, expectCurrentOnSame: []string{ "numa", "numa", "numa", "numa", "numa", "numa", "numa", "numa", "numa", "package", "numa", "package", "numa", }, expectCurrentNotOnSame: []string{ "core", "core", "core", "core", "core", "", "", "", "", "", "", "", "core", }, expectDisjoint: []TopoCcids{ {}, {}, {}, {}, {"package", []int{1, 2}}, {"package", []int{1, 2, 3}}, {"package", []int{1, 2, 3, 4}}, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { tree, csit := newCpuTreeFromInt5(tc.topology) treeA := tree.NewAllocator(cpuTreeAllocatorOptions{ topologyBalancing: tc.allocatorTB, preferSpreadOnPhysicalCores: tc.allocatorPSoPC, }) currentCpus := cpuset.New() freeCpus := tree.Cpus() if len(tc.allocations) > 0 { currentCpus = currentCpus.Union(cpuset.New(tc.allocations...)) freeCpus = freeCpus.Difference(cpuset.New(tc.allocations...)) } ccidCurrentCpus := map[int]cpuset.CPUSet{0: currentCpus} allocs := map[string]cpuset.CPUSet{"--:allo": currentCpus} for i, delta := range tc.deltas { if i < len(tc.operateOnCcid) && tc.operateOnCcid[i] > 0 { currentCpus = ccidCurrentCpus[tc.operateOnCcid[i]] } t.Logf("ResizeCpus(current=%s; free=%s; delta=%d)", currentCpus, freeCpus, delta) addFrom, removeFrom, err := treeA.ResizeCpus(currentCpus, freeCpus, delta) t.Logf("== addFrom=%s; removeFrom=%s, err=%v", addFrom, removeFrom, err) if i < len(tc.expectAddSizes) { if tc.expectAddSizes[i] != addFrom.Size() { t.Errorf("expected add size: %d, got %d", tc.expectAddSizes[i], addFrom.Size()) } } if i < len(tc.expectErrors) { if tc.expectErrors[i] == "" && err != nil { t.Errorf("expected nil error, but got %v", err) } if tc.expectErrors[i] != "" { if err == nil { t.Errorf("expected error containing %q, got nil", tc.expectErrors[i]) } else if !strings.Contains(fmt.Sprintf("%s", err), tc.expectErrors[i]) { t.Errorf("expected error containing %q, got %q", tc.expectErrors[i], err) } } } if tc.allocate { allocName := fmt.Sprintf("%02d:allo", i+1) allocs[allocName] = cpuset.New() for n, cpuID := range addFrom.List() { if n >= delta { break } freeCpus = freeCpus.Difference(cpuset.New(cpuID)) currentCpus = currentCpus.Union(cpuset.New(cpuID)) allocs[allocName] = allocs[allocName].Union(cpuset.New(cpuID)) } allocName = fmt.Sprintf("%02d:free", i+1) for n, cpuID := range removeFrom.List() { if n >= -delta { break } freeCpus = freeCpus.Union(cpuset.New(cpuID)) if i < len(tc.operateOnCcid) && tc.operateOnCcid[i] > 0 { currentCpus = currentCpus.Difference(cpuset.New(cpuID)) } allocs[allocName] = allocs[allocName].Union(cpuset.New(cpuID)) } if i < len(tc.operateOnCcid) && tc.operateOnCcid[i] > 0 { ccidCurrentCpus[tc.operateOnCcid[i]] = currentCpus } allocs["free"] = freeCpus t.Logf("=> current=%s; free=%s", currentCpus, freeCpus) if i < len(tc.expectCurrentOnSame) && tc.expectCurrentOnSame[i] != "" { verifySame(t, 
tc.expectCurrentOnSame[i], currentCpus, csit)
                }
                if i < len(tc.expectCurrentNotOnSame) && tc.expectCurrentNotOnSame[i] != "" {
                    verifyNotSame(t, tc.expectCurrentNotOnSame[i], currentCpus, csit)
                }
                if i < len(tc.expectCurrentNotOn) && tc.expectCurrentNotOn[i] != "" {
                    verifyNotOn(t, tc.expectCurrentNotOn[i], currentCpus, csit)
                }
                if i < len(tc.expectAllOnSame) && tc.expectAllOnSame[i] != "" {
                    allCpus := cpuset.New()
                    for _, cpus := range ccidCurrentCpus {
                        allCpus = allCpus.Union(cpus)
                    }
                    verifySame(t, tc.expectAllOnSame[i], allCpus, csit)
                }
                if i < len(tc.expectDisjoint) && len(tc.expectDisjoint) > 1 {
                    for first := 0; first < len(tc.expectDisjoint[i].ccids); first++ {
                        for second := first + 1; second < len(tc.expectDisjoint[i].ccids); second++ {
                            csit.verifyDisjoint(t, tc.expectDisjoint[i].topo,
                                ccidCurrentCpus[tc.expectDisjoint[i].ccids[first]],
                                ccidCurrentCpus[tc.expectDisjoint[i].ccids[second]])
                        }
                    }
                }
            }
            if t.Failed() {
                t.Logf("current and free cpus:\n%s\n", csit.dumps(allocs))
                break
            }
        }
    })
    }
}

func TestWalk(t *testing.T) {
    t.Run("single-node tree", func(t *testing.T) {
        tree := NewCpuTree("system")
        tree.level = CPUTopologyLevelSystem
        foundName := "unfound"
        foundLevel := CPUTopologyLevelUndefined
        rv := tree.DepthFirstWalk(func(tn *cpuTreeNode) error {
            foundName = tn.name
            foundLevel = tn.level
            return nil
        })
        if rv != nil {
            t.Errorf("expected no error, got %s", rv)
        }
        if foundLevel != CPUTopologyLevelSystem {
            t.Errorf("expected to find level %q, got %q", CPUTopologyLevelSystem, foundLevel)
        }
        if foundName != "system" {
            t.Errorf("expected to find name %q, got %q", "system", foundName)
        }
    })
    t.Run("fetch first core", func(t *testing.T) {
        tree, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 2, 2})
        foundCount := 0
        foundName := ""
        rv := tree.DepthFirstWalk(func(tn *cpuTreeNode) error {
            foundCount += 1
            if tn.level == CPUTopologyLevelCore {
                foundName = tn.name
                return WalkStop
            }
            return nil
        })
        if rv != WalkStop {
            t.Errorf("expected WalkStop error, got %s", rv)
        }
        if foundCount != 5 {
            t.Errorf("expected to find 5 nodes, got %d", foundCount)
        }
        if foundName != "p0d0n0c00" {
            t.Errorf("expected to find p0d0n0c00, got %q", foundName)
        }
    })
    t.Run("skip children", func(t *testing.T) {
        tree, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 2, 2})
        foundCount := 0
        rv := tree.DepthFirstWalk(func(tn *cpuTreeNode) error {
            foundCount += 1
            if tn.level == CPUTopologyLevelDie {
                return WalkSkipChildren
            }
            return nil
        })
        if rv != nil {
            t.Errorf("expected no error, got %s", rv)
        }
        if foundCount != 7 {
            t.Errorf("expected to find 7 nodes, got %d", foundCount)
        }
    })
}

func TestCpuLocations(t *testing.T) {
    tree, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 4, 2})
    cpus := cpuset.New(0, 1, 3, 4, 16)
    systemlocations := tree.CpuLocations(cpus)
    package1locations := tree.children[1].CpuLocations(cpus)
    if len(package1locations) != 5 {
        t.Errorf("expected package1locations length 5, got %d", len(package1locations))
        return
    }
    if len(systemlocations) != 6 {
        t.Errorf("expected systemlocations length 6, got %d", len(systemlocations))
        return
    }
    if systemlocations[0][0] != "system" {
        t.Errorf("expected 'system' location, got %q", systemlocations[0][0])
        return
    }
    if systemlocations[1][0] != "p0" {
        t.Errorf("expected 'p0' location, got %q", systemlocations[1][0])
        return
    }
    if len(systemlocations[4]) != 4 {
        t.Errorf("expected len(systemlocations[4]) 4, got %d", len(systemlocations[4]))
        return
    }
}

func TestCPUTopologyLevel(t *testing.T) {
    var lvl CPUTopologyLevel
    if lvl != CPUTopologyLevelUndefined {
        t.Errorf("unexpected default initial value for lvl: %s, expected undefined", lvl)
    }
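    // A shorthand sketch of the round trip the checks below exercise (an
    // added illustration; the exact marshaled bytes are an assumption
    // based on the level names accepted below):
    //
    //     b, _ := CPUTopologyLevelThread.MarshalJSON() // presumably []byte(`"thread"`)
    //     var l CPUTopologyLevel
    //     _ = l.UnmarshalJSON(b)                       // l == CPUTopologyLevelThread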
    if err := lvl.UnmarshalJSON([]byte("\"\"")); err != nil || lvl != CPUTopologyLevelUndefined {
        t.Errorf("unexpected outcome unmarshalling topology level: \"\", error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("\"system\"")); err != nil || lvl != CPUTopologyLevelSystem {
        t.Errorf("unexpected outcome unmarshalling topology level: system, error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("\"NUMA\"")); err != nil || lvl != CPUTopologyLevelNuma {
        t.Errorf("unexpected outcome unmarshalling topology level: \"NUMA\", error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("\"undefined\"")); err == nil {
        t.Errorf("unexpected non-error outcome unmarshalling topology level: \"undefined\", error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("system")); err == nil {
        t.Errorf("unexpected non-error outcome unmarshalling topology level: system, error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("0")); err == nil {
        t.Errorf("unexpected non-error outcome unmarshalling topology level: 0, error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("\"4\"")); err == nil {
        t.Errorf("unexpected non-error outcome unmarshalling topology level: \"4\", error: %s, result: %s", err, lvl)
    }
    if undefBytes, err := CPUTopologyLevelUndefined.MarshalJSON(); err != nil {
        t.Errorf("unexpected error marshaling undefined: %s", err)
    } else {
        if err = lvl.UnmarshalJSON(undefBytes); err != nil || lvl != CPUTopologyLevelUndefined {
            t.Errorf("unexpected outcome unmarshaling marshaled undefined: error: %s, result: %s", err, lvl)
        }
    }
    if threadBytes, err := CPUTopologyLevelThread.MarshalJSON(); err != nil {
        t.Errorf("unexpected error marshaling thread: %s", err)
    } else {
        if err = lvl.UnmarshalJSON(threadBytes); err != nil || lvl != CPUTopologyLevelThread {
            t.Errorf("unexpected outcome unmarshaling marshaled thread: error: %s, result: %s", err, lvl)
        }
    }
}

func TestSplitLevel(t *testing.T) {
    root, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 4, 2})
    newRoot := root.SplitLevel(CPUTopologyLevelNuma,
        func(cpu int) int {
            leaf := root.FindLeafWithCpu(cpu)
            if leaf == nil {
                t.Fatalf("cpu %d not in tree:\n%s\n\n", cpu, root.PrettyPrint())
            }
            return leaf.SiblingIndex()
        })
    oldc62 := root.FindLeafWithCpu(62)
    oldc63 := root.FindLeafWithCpu(63)
    if oldc62.parent != oldc63.parent {
        t.Errorf("expected: 62 and 63 are hyperthreads of the same physical core in the original tree, observed parents %s and %s", oldc62.parent, oldc63.parent)
    }
    newc62 := newRoot.FindLeafWithCpu(62)
    newc63 := newRoot.FindLeafWithCpu(63)
    if newc62.parent == newc63.parent {
        t.Errorf("expected: 62 and 63 have different parents (physical cores), but they have the same %s", newc62.parent)
    }
    if newc62.parent.parent == newc63.parent.parent {
        t.Errorf("expected: 62 and 63 have different grandparents (numa subclasses), but they have the same: %s", newc62.parent.parent)
    }
    if newc62.parent.parent.parent != newc63.parent.parent.parent {
        t.Errorf("expected: 62 and 63 have the same great-grandparents (numa), but they differ: %s and %s", newc62.parent.parent.parent, newc63.parent.parent.parent)
    }
    if t.Failed() {
        t.Logf("newRoot:\n%s\n", newRoot.PrettyPrint())
    }
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/balloons/fillmethod.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package balloons

import (
    "bytes"
    "encoding/json"
    "fmt"
)

// FillMethod specifies the order in which balloon instances should be filled.
type FillMethod int

const (
    FillUnspecified FillMethod = iota
    // FillBalanced: put a container into the balloon with most
    // free CPU without changing the size of the balloon.
    FillBalanced
    // FillBalancedInflate: put a container into the balloon with
    // most free CPU when the balloon is inflated to the maximum
    // size.
    FillBalancedInflate
    // FillPacked: put a container into a balloon so that it
    // minimizes the amount of currently unused CPUs in the
    // balloon.
    FillPacked
    // FillPackedInflate: put a container into a balloon so that
    // it minimizes the amount of unused CPUs if the balloon is
    // inflated to the maximum size.
    FillPackedInflate
    // FillSameNamespace: put a container into a balloon that already
    // includes another container from the same namespace.
    FillSameNamespace
    // FillSamePod: put a container into a balloon that already
    // includes another container from the same pod.
    FillSamePod
    // FillNewBalloon: create a new balloon, if possible, and put
    // a container into it.
    FillNewBalloon
    // FillNewBalloonMust: create a new balloon for a container,
    // but refuse to run the container if the balloon cannot be
    // created.
    FillNewBalloonMust
    // FillReservedBalloon: put a container into the reserved
    // balloon.
    FillReservedBalloon
    // FillDefaultBalloon: put a container into the default
    // balloon.
    FillDefaultBalloon
)

var fillMethodNames = map[FillMethod]string{
    FillUnspecified:     "unspecified",
    FillBalanced:        "balanced",
    FillBalancedInflate: "balanced-inflate",
    FillPacked:          "packed",
    FillPackedInflate:   "packed-inflate",
    FillSameNamespace:   "same-namespace",
    FillSamePod:         "same-pod",
    FillNewBalloon:      "new-balloon",
    FillNewBalloonMust:  "new-balloon-must",
    FillDefaultBalloon:  "default-balloon",
    FillReservedBalloon: "reserved-balloon",
}

// String stringifies a FillMethod.
func (fm FillMethod) String() string {
    if fmn, ok := fillMethodNames[fm]; ok {
        return fmn
    }
    return fmt.Sprintf("#UNNAMED-FILLMETHOD(%d)", int(fm))
}

// MarshalJSON marshals a FillMethod as a quoted json string.
func (fm FillMethod) MarshalJSON() ([]byte, error) {
    buffer := bytes.NewBufferString(fmt.Sprintf("%q", fm))
    return buffer.Bytes(), nil
}

// UnmarshalJSON unmarshals a FillMethod quoted json string to the enum value.
func (fm *FillMethod) UnmarshalJSON(b []byte) error {
    var fillMethodName string
    err := json.Unmarshal(b, &fillMethodName)
    if err != nil {
        return err
    }
    for fmID, fmName := range fillMethodNames {
        if fmName == fillMethodName {
            *fm = fmID
            return nil
        }
    }
    return balloonsError("invalid fill method %q", fillMethodName)
}
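// A minimal round-trip sketch (an added illustration, not part of the
// original file; it assumes nothing beyond encoding/json and the
// FillMethod type above):
//
//     b, _ := json.Marshal(FillPacked) // b == []byte(`"packed"`)
//     var fm FillMethod
//     _ = json.Unmarshal(b, &fm)       // fm == FillPacked
//
// Unknown names fail with the "invalid fill method" error above.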
================================================
FILE: pkg/cri/resource-manager/policy/builtin/balloons/flags.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package balloons

import (
    "encoding/json"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
    "github.com/intel/cri-resource-manager/pkg/cpuallocator"
)

type BalloonsOptions balloonsOptionsWrapped

// BalloonsOptions contains configuration options specific to this policy.
type balloonsOptionsWrapped struct {
    // PinCPU controls pinning containers to CPUs.
    PinCPU *bool `json:"PinCPU,omitempty"`
    // PinMemory controls pinning containers to memory nodes.
    PinMemory *bool `json:"PinMemory,omitempty"`
    // IdleCpuClass controls how unused CPUs outside any balloon
    // are (re)configured.
    IdleCpuClass string `json:"IdleCPUClass,omitempty"`
    // ReservedPoolNamespaces is a list of namespace globs that
    // will be allocated to reserved CPUs.
    ReservedPoolNamespaces []string `json:"ReservedPoolNamespaces,omitempty"`
    // If AllocatorTopologyBalancing is true, balloons are
    // allocated and resized so that all topology elements
    // (packages, dies, numa nodes, cores) have roughly the same
    // amount of allocations. The default is false: balloons are
    // packed tightly to optimize power efficiency. The value set
    // here can be overridden with the balloon type specific
    // setting with the same name.
    AllocatorTopologyBalancing bool
    // PreferSpreadOnPhysicalCores prefers allocating logical CPUs
    // (possibly hyperthreads) for a balloon from separate physical CPU
    // cores. This prevents workloads in the balloon from interfering with
    // themselves as they do not compete on the resources of the same CPU
    // cores. On the other hand, it allows more interference between
    // workloads in different balloons. The default is false: balloons
    // are packed tightly to a minimum number of physical CPU cores. The
    // value set here is the default for all balloon types, but it can be
    // overridden with the balloon type specific setting with the same
    // name.
    PreferSpreadOnPhysicalCores bool `json:"PreferSpreadOnPhysicalCores,omitempty"`
    // BalloonDefs contains balloon type definitions.
    BalloonDefs []*BalloonDef `json:"BalloonTypes,omitempty"`
}

// BalloonDef contains a balloon definition.
type BalloonDef struct {
    // Name of the balloon definition.
    Name string `json:"Name"`
    // Namespaces control which namespaces are assigned into
    // balloon instances from this definition. This is used by
    // namespace assign methods.
    Namespaces []string `json:"Namespaces,omitempty"`
    // MaxCpus specifies the maximum number of CPUs exclusively
    // usable by containers in a balloon. Balloon size will not be
    // inflated larger than MaxCpus.
    MaxCpus int `json:"MaxCPUs"`
    // MinCpus specifies the minimum number of CPUs exclusively
    // usable by containers in a balloon. When a new balloon is created,
    // this will be the number of CPUs reserved for it even if a container
    // would request less.
    MinCpus int `json:"MinCPUs"`
    // AllocatorPriority (0: High, 1: Normal, 2: Low, 3: None).
    // This parameter is passed to the CPU allocator when creating or
    // resizing a balloon. At init, balloons with the highest priority
    // are allocated CPUs first.
    AllocatorPriority cpuallocator.CPUPriority `json:"AllocatorPriority"`
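    // An illustrative balloon type entry as it could appear in the policy
    // configuration (an added sketch; hypothetical name and values, the
    // keys follow the JSON tags of this struct):
    //
    //     BalloonTypes:
    //       - Name: example-type
    //         MinCPUs: 2
    //         MaxCPUs: 4
    //         AllocatorPriority: 2
    //         PreferSpreadOnPhysicalCores: true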
    // PreferSpreadOnPhysicalCores is the balloon type specific
    // variant of the policy level parameter with the same name.
    PreferSpreadOnPhysicalCores *bool `json:"PreferSpreadOnPhysicalCores,omitempty"`
    // AllocatorTopologyBalancing is the balloon type specific
    // variant of the policy level parameter with the same name.
    AllocatorTopologyBalancing *bool `json:"AllocatorTopologyBalancing,omitempty"`
    // CpuClass controls how CPUs of a balloon are (re)configured
    // whenever a balloon is created, inflated or deflated.
    CpuClass string `json:"CpuClass"`
    // MinBalloons is the number of balloon instances that always
    // exist even if they would become empty. At init this number
    // of instances will be created before assigning any
    // containers.
    MinBalloons int `json:"MinBalloons"`
    // MaxBalloons is the maximum number of balloon instances that
    // is allowed to co-exist. If reached, new balloons cannot be
    // created anymore.
    MaxBalloons int `json:"MaxBalloons"`
    // PreferSpreadingPods: containers of the same pod may be
    // placed on separate balloons. The default is false: prefer
    // placing containers of a pod to the same balloon(s).
    PreferSpreadingPods bool
    // PreferPerNamespaceBalloon: if true, containers in different
    // namespaces are preferably placed in separate balloons,
    // even if the balloon type is the same for all of them. On
    // the other hand, containers in the same namespace will be
    // placed in the same balloon instances. The default is false:
    // namespaces have no effect on placement.
    PreferPerNamespaceBalloon bool
    // PreferNewBalloons: prefer creating new balloons over adding
    // containers to existing balloons. The default is false:
    // prefer filling free capacity and possibly inflating
    // existing balloons before creating new ones.
    PreferNewBalloons bool
    // ShareIdleCpusInSame: if there are idle CPUs, that is CPUs
    // not in any balloon, in the same topology element (for
    // instance the same numa node, die or package) as any CPU in
    // the balloon, then allow workloads to run on those (shared)
    // CPUs in addition to the (dedicated) CPUs of the balloon.
    ShareIdleCpusInSame CPUTopologyLevel `json:"ShareIdleCPUsInSame,omitempty"`
}

var defaultPinCPU bool = true
var defaultPinMemory bool = true

// DeepCopy creates a deep copy of a BalloonsOptions.
func (bo *BalloonsOptions) DeepCopy() *BalloonsOptions {
    outBo := *bo
    outBo.ReservedPoolNamespaces = make([]string, len(bo.ReservedPoolNamespaces))
    copy(outBo.ReservedPoolNamespaces, bo.ReservedPoolNamespaces)
    outBo.BalloonDefs = make([]*BalloonDef, len(bo.BalloonDefs))
    for i := range bo.BalloonDefs {
        outBo.BalloonDefs[i] = bo.BalloonDefs[i].DeepCopy()
    }
    return &outBo
}

// String stringifies a BalloonDef.
func (bdef BalloonDef) String() string {
    return bdef.Name
}

// DeepCopy creates a deep copy of a BalloonDef.
func (bdef *BalloonDef) DeepCopy() *BalloonDef {
    outBdef := *bdef
    outBdef.Namespaces = make([]string, len(bdef.Namespaces))
    copy(outBdef.Namespaces, bdef.Namespaces)
    return &outBdef
}

// defaultBalloonsOptions returns a new BalloonsOptions instance, all initialized to defaults.
func defaultBalloonsOptions() interface{} {
    return &BalloonsOptions{
        ReservedPoolNamespaces: []string{metav1.NamespaceSystem},
        PinCPU:                 &defaultPinCPU,
        PinMemory:              &defaultPinMemory,
    }
}

// Our runtime configuration.
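// It is initialized to the defaults returned by defaultBalloonsOptions()
// above and registered for configuration handling in init() below.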
var balloonsOptions = defaultBalloonsOptions().(*BalloonsOptions) // UnmarshalJSON makes sure all options from previous unmarshals get // cleared before unmarshaling new data to the same address. func (bo *BalloonsOptions) UnmarshalJSON(data []byte) error { bow := balloonsOptionsWrapped{} if err := json.Unmarshal(data, &bow); err != nil { return err } *bo = BalloonsOptions(bow) return nil } // Register us for configuration handling. func init() { pkgcfg.Register(PolicyPath, PolicyDescription, balloonsOptions, defaultBalloonsOptions) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/metrics.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package balloons import ( "sort" "strconv" "strings" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" "github.com/prometheus/client_golang/prometheus" ) // Prometheus Metric descriptor indices and descriptor table const ( balloonsDesc = iota ) var descriptors = []*prometheus.Desc{ balloonsDesc: prometheus.NewDesc( "balloons", "CPUs", []string{ "balloon_type", "cpu_class", "cpus_min", "cpus_max", "balloon", "cpus", "cpus_count", "numas", "numas_count", "dies", "dies_count", "packages", "packages_count", "sharedidlecpus", "sharedidlecpus_count", "cpus_allowed", "cpus_allowed_count", "mems", "containers", "tot_req_millicpu", }, nil, ), } // Metrics defines the balloons-specific metrics from policy level. type Metrics struct { Balloons []*BalloonMetrics } // BalloonMetrics define metrics of a balloon instance. type BalloonMetrics struct { // Balloon type metrics DefName string CpuClass string MinCpus int MaxCpus int // Balloon instance metrics PrettyName string Cpus cpuset.CPUSet CpusCount int Numas []string NumasCount int Dies []string DiesCount int Packages []string PackagesCount int SharedIdleCpus cpuset.CPUSet SharedIdleCpusCount int CpusAllowed cpuset.CPUSet CpusAllowedCount int Mems string ContainerNames string ContainerReqMilliCpus int } // DescribeMetrics generates policy-specific prometheus metrics data // descriptors. func (p *balloons) DescribeMetrics() []*prometheus.Desc { return descriptors } // PollMetrics provides policy metrics for monitoring. 
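// The snapshot gathered here is converted into Prometheus metrics by
// CollectMetrics below: one "balloons" gauge per balloon instance, whose
// value is the balloon's CPU count, with all remaining details exported
// as labels.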
func (p *balloons) PollMetrics() policy.Metrics { policyMetrics := &Metrics{} policyMetrics.Balloons = make([]*BalloonMetrics, len(p.balloons)) for index, bln := range p.balloons { cpuLoc := p.cpuTree.CpuLocations(bln.Cpus) bm := &BalloonMetrics{} policyMetrics.Balloons[index] = bm bm.DefName = bln.Def.Name bm.CpuClass = bln.Def.CpuClass bm.MinCpus = bln.Def.MinCpus bm.MaxCpus = bln.Def.MaxCpus bm.PrettyName = bln.PrettyName() bm.Cpus = bln.Cpus bm.CpusCount = bm.Cpus.Size() if len(cpuLoc) > 3 { bm.Numas = cpuLoc[3] bm.NumasCount = len(bm.Numas) bm.Dies = cpuLoc[2] bm.DiesCount = len(bm.Dies) bm.Packages = cpuLoc[1] bm.PackagesCount = len(bm.Packages) } bm.SharedIdleCpus = bln.SharedIdleCpus bm.SharedIdleCpusCount = bm.SharedIdleCpus.Size() bm.CpusAllowed = bm.Cpus.Union(bm.SharedIdleCpus) bm.CpusAllowedCount = bm.CpusAllowed.Size() bm.Mems = bln.Mems.String() cNames := []string{} // Get container names and total requested milliCPUs. for _, containerIDs := range bln.PodIDs { for _, containerID := range containerIDs { if c, ok := p.cch.LookupContainer(containerID); ok { cNames = append(cNames, c.PrettyName()) bm.ContainerReqMilliCpus += p.containerRequestedMilliCpus(containerID) } } } sort.Strings(cNames) bm.ContainerNames = strings.Join(cNames, ",") } return policyMetrics } // CollectMetrics generates prometheus metrics from cached/polled // policy-specific metrics data. func (p *balloons) CollectMetrics(m policy.Metrics) ([]prometheus.Metric, error) { metrics, ok := m.(*Metrics) if !ok { return nil, balloonsError("type mismatch in balloons metrics") } promMetrics := make([]prometheus.Metric, len(metrics.Balloons)) for index, bm := range metrics.Balloons { promMetrics[index] = prometheus.MustNewConstMetric( descriptors[balloonsDesc], prometheus.GaugeValue, float64(bm.Cpus.Size()), bm.DefName, bm.CpuClass, strconv.Itoa(bm.MinCpus), strconv.Itoa(bm.MaxCpus), bm.PrettyName, bm.Cpus.String(), strconv.Itoa(bm.CpusCount), strings.Join(bm.Numas, ","), strconv.Itoa(bm.NumasCount), strings.Join(bm.Dies, ","), strconv.Itoa(bm.DiesCount), strings.Join(bm.Packages, ","), strconv.Itoa(bm.PackagesCount), bm.SharedIdleCpus.String(), strconv.Itoa(bm.SharedIdleCpusCount), bm.CpusAllowed.String(), strconv.Itoa(bm.CpusAllowedCount), bm.Mems, bm.ContainerNames, strconv.Itoa(bm.ContainerReqMilliCpus)) } return promMetrics, nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/cpu.go ================================================ package dyp import ( "bufio" "context" "io" "math" "os" "path/filepath" "runtime" "strconv" "strings" "time" "github.com/intel/cri-resource-manager/pkg/sysfs" ) type cpuTimesStat struct { cpu string `json:"cpu"` user float64 `json:"user"` system float64 `json:"system"` idle float64 `json:"idle"` nice float64 `json:"nice"` ioWait float64 `json:"iowait"` irq float64 `json:"irq"` softirq float64 `json:"softirq"` steal float64 `json:"steal"` guest float64 `json:"guest"` guestNice float64 `json:"guestNice"` } // getCpuUtilization returns the utilization of each cpu in an interval func getCpuUtilization(interval time.Duration) ([]float64, error) { ctx := context.Background() cpuTimesStat1, err := getCpuTimesStat(ctx) if err != nil { return nil, err } if err := wait(ctx, interval); err != nil { return nil, err } cpuTimesStat2, err := getCpuTimesStat(ctx) if err != nil { return nil, err } return calculateAllCpusUtilization(cpuTimesStat1, cpuTimesStat2) } func getCpuTimesStat(ctx context.Context) ([]cpuTimesStat, error) { filename 
:= filepath.Join("/", sysfs.SysRoot(), "proc", "stat")
    cpuLines, err := readCpuLines(filename)
    if err != nil || len(cpuLines) == 0 {
        return []cpuTimesStat{}, err
    }
    stat := make([]cpuTimesStat, 0, len(cpuLines))
    for _, l := range cpuLines {
        oneStat, err := parseStatLine(l)
        if err != nil {
            continue
        }
        stat = append(stat, *oneStat)
    }
    return stat, nil
}

func wait(ctx context.Context, interval time.Duration) error {
    timer := time.NewTimer(interval)
    select {
    case <-ctx.Done():
        return ctx.Err()
    case <-timer.C:
        return nil
    }
}

func calculateAllCpusUtilization(cts1, cts2 []cpuTimesStat) ([]float64, error) {
    if len(cts1) != len(cts2) {
        return nil, dynamicPoolsError("received two CPU counts: %d != %d", len(cts1), len(cts2))
    }
    allCpusUtilization := make([]float64, len(cts1))
    for i := 0; i < len(cts1); i++ {
        allCpusUtilization[i] = calculateOneCpuUtilization(cts1[i], cts2[i])
    }
    return allCpusUtilization, nil
}

// readCpuLines skips the first line indicating the total CPU utilization.
func readCpuLines(filename string) ([]string, error) {
    f, err := os.Open(filename)
    if err != nil {
        return nil, err
    }
    defer f.Close()
    var statLines []string
    reader := bufio.NewReader(f)
    for {
        line, _, err := reader.ReadLine()
        if err == io.EOF {
            break
        }
        statLines = append(statLines, string(line))
    }
    var cpuLines []string
    if len(statLines) < 2 {
        return nil, nil
    }
    for _, line := range statLines[1:] {
        if !strings.HasPrefix(line, "cpu") {
            break
        }
        cpuLines = append(cpuLines, line)
    }
    return cpuLines, nil
}

// parseStatLine parses a cpuLine into a cpuTimesStat.
func parseStatLine(cpuLine string) (*cpuTimesStat, error) {
    values := strings.Fields(cpuLine)
    if len(values) < 8 {
        return nil, dynamicPoolsError("stat line does not contain cpu info")
    }
    cpu := values[0]
    user, err := strconv.ParseFloat(values[1], 64)
    if err != nil {
        return nil, err
    }
    nice, err := strconv.ParseFloat(values[2], 64)
    if err != nil {
        return nil, err
    }
    system, err := strconv.ParseFloat(values[3], 64)
    if err != nil {
        return nil, err
    }
    idle, err := strconv.ParseFloat(values[4], 64)
    if err != nil {
        return nil, err
    }
    ioWait, err := strconv.ParseFloat(values[5], 64)
    if err != nil {
        return nil, err
    }
    irq, err := strconv.ParseFloat(values[6], 64)
    if err != nil {
        return nil, err
    }
    softirq, err := strconv.ParseFloat(values[7], 64)
    if err != nil {
        return nil, err
    }
    cts := &cpuTimesStat{
        cpu:     cpu,
        user:    user,
        nice:    nice,
        system:  system,
        idle:    idle,
        ioWait:  ioWait,
        irq:     irq,
        softirq: softirq,
    }
    if len(values) > 8 { // Linux >= 2.6.11
        steal, err := strconv.ParseFloat(values[8], 64)
        if err != nil {
            return nil, err
        }
        cts.steal = steal
    }
    if len(values) > 9 { // Linux >= 2.6.24
        guest, err := strconv.ParseFloat(values[9], 64)
        if err != nil {
            return nil, err
        }
        cts.guest = guest
    }
    if len(values) > 10 { // Linux >= 3.2.0
        guestNice, err := strconv.ParseFloat(values[10], 64)
        if err != nil {
            return nil, err
        }
        cts.guestNice = guestNice
    }
    return cts, nil
}

// calculateOneCpuUtilization returns the utilization of one cpu in an interval.
func calculateOneCpuUtilization(cts1, cts2 cpuTimesStat) float64 {
    cts1Total, cts1Busy := getBusyTime(cts1)
    cts2Total, cts2Busy := getBusyTime(cts2)
    if cts2Busy <= cts1Busy {
        return 0
    }
    if cts2Total <= cts1Total {
        return 100
    }
    return math.Min(100, math.Max(0, (cts2Busy-cts1Busy)/(cts2Total-cts1Total)*100))
}
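// Worked example for calculateOneCpuUtilization above (illustrative
// numbers only): if busy time grew by 50 ticks while total time grew by
// 200 ticks between the two samples, the utilization is
// min(100, max(0, 50/200*100)) = 25%.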
func getBusyTime(cts cpuTimesStat) (float64, float64) {
    total := cts.user + cts.system + cts.idle + cts.nice + cts.ioWait + cts.irq + cts.softirq + cts.steal + cts.guest + cts.guestNice
    if runtime.GOOS == "linux" {
        total -= cts.guest     // Linux 2.6.24+
        total -= cts.guestNice // Linux 3.2.0+
    }
    busy := total - cts.idle - cts.ioWait
    return total, busy
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/dyp.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dyp

import (
    "fmt"
    "path/filepath"
    "time"

    corev1 "k8s.io/api/core/v1"
    resapi "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
    "github.com/intel/cri-resource-manager/pkg/cpuallocator"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
    cpucontrol "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/cpu"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
    policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
    logger "github.com/intel/cri-resource-manager/pkg/log"
    "github.com/intel/cri-resource-manager/pkg/utils"
    "github.com/intel/cri-resource-manager/pkg/utils/cpuset"
    idset "github.com/intel/goresctrl/pkg/utils"
)

const (
    // PolicyName is the name used to activate this policy.
    PolicyName = "dynamic-pools"
    // PolicyDescription is a short description of this policy.
    PolicyDescription = "The cpuset of the dynamic pools can be dynamically changed based on workload."
    // PolicyPath is the path of this policy in the configuration hierarchy.
    PolicyPath = "policy." + PolicyName
    // dynamicPoolKey is a pod annotation key, the value is a pod dynamicPool name.
    dynamicPoolKey = "dynamic-pool." + PolicyName + "." + kubernetes.ResmgrKeyNamespace
    // reservedDynamicPoolDefName is the name of the reserved dynamicPool definition.
    reservedDynamicPoolDefName = "reserved"
    // sharedDynamicPoolDefName is the name of the shared dynamicPool definition.
    sharedDynamicPoolDefName = "shared"
)

// dynamicPools contains configuration and runtime attributes of the dynamic-pools policy.
type dynamicPools struct {
    options                *policyapi.BackendOptions // configuration common to all policies
    dpoptions              DynamicPoolsOptions       // dynamicPool-specific configuration
    cch                    cache.Cache               // cri-resmgr cache
    allowed                cpuset.CPUSet             // bounding set of CPUs we're allowed to use
    reserved               cpuset.CPUSet             // system-/kube-reserved CPUs
    freeCpus               cpuset.CPUSet             // CPUs to be included in growing dynamicPools
    reservedDynamicPoolDef *DynamicPoolDef           // built-in definition of the reserved dynamicPool
    sharedDynamicPoolDef   *DynamicPoolDef           // built-in definition of the shared dynamicPool
    dynamicPools           []*DynamicPool            // dynamicPool instances: reserved, shared and user-defined
    cpuAllocator           cpuallocator.CPUAllocator // CPU allocator used by the policy
}

// DynamicPool contains attributes of a dynamicPool.
type DynamicPool struct {
    // Def is the definition from which this dynamicPool is created.
    Def *DynamicPoolDef
    // Cpus is the set of CPUs exclusive to this dynamicPool only.
    Cpus cpuset.CPUSet
    // Mems is the set of memory nodes with minimal access delay from CPUs.
    Mems idset.IDSet
    // PodIDs maps pod ID to list of container IDs.
    // - len(PodIDs) is the number of pods in the dynamicPool.
    // - len(PodIDs[podID]) is the number of containers of podID currently assigned to the dynamicPool.
    PodIDs map[string][]string
}

var log logger.Logger = logger.NewLogger("policy")

// String is a stringer for a dynamicPool.
func (dp DynamicPool) String() string {
    return fmt.Sprintf("%s{Cpus:%s, Mems:%s}", dp.PrettyName(), dp.Cpus, dp.Mems)
}

// PrettyName returns a unique name for a dynamicPool.
func (dp DynamicPool) PrettyName() string {
    return dp.Def.Name
}

// ContainerIDs returns IDs of containers assigned in a dynamicPool.
// (Using cache.Container.GetCacheID()'s)
func (dp DynamicPool) ContainerIDs() []string {
    cIDs := []string{}
    for _, ctrIDs := range dp.PodIDs {
        cIDs = append(cIDs, ctrIDs...)
    }
    return cIDs
}

// ContainerCount returns the number of containers in a dynamicPool.
func (dp DynamicPool) ContainerCount() int {
    count := 0
    for _, ctrIDs := range dp.PodIDs {
        count += len(ctrIDs)
    }
    return count
}

// AvailMilliCpus returns the CPU capacity of a dynamicPool in milli-CPUs.
func (dp DynamicPool) AvailMilliCpus() int {
    return dp.Cpus.Size() * 1000
}

// updateRealCpuUsed returns cpu utilization of a dynamicPool.
func (dp *DynamicPool) updateRealCpuUsed(cpuInfo []float64) (float64, error) {
    if dp.Cpus.Size() == 0 {
        log.Debug("dynamic pool %s cpuset is empty", dp.Def.Name)
        return 0, nil
    }
    cpus := dp.Cpus.UnsortedList()
    var sum float64
    for i := 0; i < len(cpus); i++ {
        sum += cpuInfo[cpus[i]]
    }
    log.Debug("dynamic pool %s cpuset: %s, cpu utilization: %v", dp.Def.Name, dp.Cpus, sum)
    return sum, nil
}

// calculateAllPoolWeights returns weights of all dynamicPools and the sum of weights.
// Use dynamicPool's cpu utilization as its weight.
func (p *dynamicPools) calculateAllPoolWeights() (map[*DynamicPool]float64, float64, error) {
    cpuInfo, _ := getCpuUtilization(time.Second)
    weight := make(map[*DynamicPool]float64)
    sumWeight := 0.0
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        // If there is no container in a dynamic pool, there is no need to
        // calculate its weight, that is, there is no need to allocate CPUs to it.
        if dp.ContainerCount() == 0 {
            weight[dp] = 0.0
        } else {
            realCpuUsed, err := dp.updateRealCpuUsed(cpuInfo)
            if err != nil {
                return weight, sumWeight, dynamicPoolsError("the actual cpu usage of the dynamic pool %s cannot be obtained: %w", dp.PrettyName(), err)
            }
            weight[dp] = realCpuUsed
            sumWeight += weight[dp]
        }
        log.Debug("dynamic pool: %s, weight: %v", dp, weight[dp])
    }
    log.Debug("sum weight: %v", sumWeight)
    return weight, sumWeight, nil
}

// calculateAllPoolRequests returns the sum of the requests of containers in each dynamicPool and the remaining free CPUs.
// remainFree = allowed cpu - reserved cpu - sum(requests of containers in each dynamicPool)
func (p *dynamicPools) calculateAllPoolRequests() (map[*DynamicPool]int, int) {
    requestCpu := make(map[*DynamicPool]int)
    remainFree := p.allowed.Difference(p.reserved).Size()
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        requestCpu[dp] = (p.requestedMinMilliCpus(dp) + 999) / 1000
        remainFree -= requestCpu[dp]
        log.Debug("dynamic pool %s request cpu %d", dp, requestCpu[dp])
    }
    log.Debug("sum of remaining free CPUs: %d", remainFree)
    return requestCpu, remainFree
}

func (p *dynamicPools) containerPinPool(dp *DynamicPool) {
    dp.Mems = p.closestMems(dp.Cpus)
    for _, cID := range dp.ContainerIDs() {
        if c, ok := p.cch.LookupContainer(cID); ok {
            p.pinCpuMem(c, dp.Cpus, dp.Mems)
        }
    }
}

// calculatePoolCpuset returns the cpus that dynamic pools need to allocate.
func (p *dynamicPools) calculatePoolCpuset(requestCpu map[*DynamicPool]int, remainFree int, weight map[*DynamicPool]float64, sumWeight float64) map[*DynamicPool]int {
    usedCpu := 0
    // If there are containers in the shared dynamic pool, allocate at least one CPU to it,
    // otherwise there is no need to allocate a CPU to it.
    // Ensure that there is at least one cpu in the shared dynamicPool.
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == sharedDynamicPoolDefName && dp.ContainerCount() > 0 && sumWeight != 0 {
            addCpu := int(float64(remainFree) * weight[dp] / sumWeight)
            if requestCpu[dp]+addCpu < 1 {
                requestCpu[dp] = 1
                remainFree -= 1
            }
        }
    }
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            requestCpu[dp] = dp.Cpus.Size()
        }
        if sumWeight != 0 {
            addCpu := int(float64(remainFree) * weight[dp] / sumWeight)
            requestCpu[dp] += addCpu
            usedCpu += addCpu
        }
        log.Info("dynamic pool %s needs %d CPUs, remaining free CPUs: %d", dp, requestCpu[dp], remainFree-usedCpu)
    }
    if usedCpu < remainFree {
        // If there are still free CPUs left, give them to the dynamicPool
        // with the highest CPU utilization.
        tmp := p.dynamicPools[1] // start from the shared dynamicPool
        for _, dp := range p.dynamicPools {
            if dp.Def.Name == reservedDynamicPoolDefName {
                continue
            }
            if weight[dp] > weight[tmp] {
                tmp = dp
            }
        }
        requestCpu[tmp] += (remainFree - usedCpu)
        log.Info("dynamic pool %s needs %d CPUs, remaining free CPUs: %d", tmp, requestCpu[tmp], 0)
    }
    return requestCpu
}

// isNeedReallocate returns whether the cpus need to be reallocated.
func (p *dynamicPools) isNeedReallocate(newPoolCpu map[*DynamicPool]int) bool {
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        if dp.Cpus.Size() != newPoolCpu[dp] {
            return true
        }
    }
    return false
}

// updatePoolCpuset updates the cpuset of the dynamicPools.
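// The flow below: recompute per-pool CPU requests and utilization-based
// weights, derive new pool sizes, and only if some non-reserved pool's
// size would actually change, release those pools' CPUs into freeCpus and
// re-allocate them, re-pinning the affected containers.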
func (p *dynamicPools) updatePoolCpuset() error {
    requestCpu, remainFree := p.calculateAllPoolRequests()
    weight, sumWeight, err := p.calculateAllPoolWeights()
    if err != nil {
        return err
    }
    if remainFree >= 1 {
        requestCpu = p.calculatePoolCpuset(requestCpu, remainFree, weight, sumWeight)
    }
    // If the number of newly allocated CPUs is the same as the number of
    // existing CPUs in the pool, there is no need to re-allocate.
    if !p.isNeedReallocate(requestCpu) {
        log.Info("The number of CPUs required by the pools is the same as the number of CPUs already in the pools, so there is no need to reallocate.")
        for _, dp := range p.dynamicPools {
            p.containerPinPool(dp)
        }
        return nil
    }
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        if dp.Cpus.Size() == 0 {
            continue
        }
        oldCpus := dp.Cpus.Clone()
        keptCpus, err := p.cpuAllocator.ReleaseCpus(&oldCpus, dp.Cpus.Size(), dp.Def.AllocatorPriority)
        if err != nil || keptCpus.Size() != 0 {
            return dynamicPoolsError("releasing %d CPUs from %s failed: %w (kept: %s)", dp.Cpus.Size(), dp, err, keptCpus)
        }
        p.freeCpus = p.freeCpus.Union(dp.Cpus)
    }
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        newCpus, err := p.cpuAllocator.AllocateCpus(&p.freeCpus, requestCpu[dp], dp.Def.AllocatorPriority)
        if err != nil {
            return dynamicPoolsError("allocating %d CPUs for %s failed: %w", requestCpu[dp], dp, err)
        }
        dp.Cpus = newCpus
        log.Debugf("resize successful for dynamic pool %s, new Cpus: %s", dp.PrettyName(), dp.Cpus)
        p.containerPinPool(dp)
        p.useCpuClass(dp)
    }
    return nil
}

// CreateDynamicPoolsPolicy creates a new policy instance.
func CreateDynamicPoolsPolicy(policyOptions *policy.BackendOptions) policy.Backend {
    p := &dynamicPools{
        options:      policyOptions,
        cch:          policyOptions.Cache,
        cpuAllocator: cpuallocator.NewCPUAllocator(policyOptions.System),
    }
    log.Info("creating %s policy...", PolicyName)
    // Handle common policy options: AvailableResources and ReservedResources.
    // p.allowed: CPUs available for the policy.
    if allowed, ok := policyOptions.Available[policyapi.DomainCPU]; ok {
        p.allowed = allowed.(cpuset.CPUSet)
    } else {
        // Available CPUs not specified, default to all on-line CPUs.
        p.allowed = policyOptions.System.CPUSet().Difference(policyOptions.System.Offlined())
    }
    // p.reserved: CPUs reserved for kube-system pods, subset of p.allowed.
    p.reserved = cpuset.New()
    if reserved, ok := p.options.Reserved[policyapi.DomainCPU]; ok {
        switch v := reserved.(type) {
        case cpuset.CPUSet:
            p.reserved = p.allowed.Intersection(v)
        case resapi.Quantity:
            reserveCnt := (int(v.MilliValue()) + 999) / 1000
            cpus, err := p.cpuAllocator.AllocateCpus(&p.allowed, reserveCnt, cpuallocator.PriorityNone)
            if err != nil {
                log.Fatal("failed to allocate reserved CPUs: %s", err)
            }
            p.reserved = cpus
            p.allowed = p.allowed.Union(cpus)
        }
    }
    if p.reserved.IsEmpty() {
        log.Fatal("%s cannot run without reserved CPUs that are also AvailableResources", PolicyName)
    }
    // Handle policy-specific options.
    log.Debug("creating %s configuration", PolicyName)
    if err := p.setConfig(dynamicPoolsOptions); err != nil {
        log.Fatal("failed to create %s policy: %v", PolicyName, err)
    }
    pkgcfg.GetModule(PolicyPath).AddNotify(p.configNotify)
    return p
}

// Name returns the name of this policy.
func (p *dynamicPools) Name() string {
    return PolicyName
}

// Description returns the description for this policy.
func (p *dynamicPools) Description() string {
    return PolicyDescription
}

// Start prepares this policy for accepting allocation/release requests.
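// In practice it simply delegates to Sync with all currently cached
// containers.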
func (p *dynamicPools) Start(add []cache.Container, del []cache.Container) error {
    log.Info("%s policy started", PolicyName)
    return p.Sync(p.cch.GetContainers(), nil)
}

// Sync synchronizes the active policy state.
func (p *dynamicPools) Sync(add []cache.Container, del []cache.Container) error {
    log.Debug("synchronizing state...")
    for _, c := range del {
        p.ReleaseResources(c)
    }
    for _, c := range add {
        p.AllocateResources(c)
    }
    return nil
}

// AllocateResources is a resource allocation request for this policy.
func (p *dynamicPools) AllocateResources(c cache.Container) error {
    log.Debug("allocating resources for container %s...", c.PrettyName())
    dp, err := p.allocateDynamicPool(c)
    if err != nil {
        return dynamicPoolsError("dynamicPool allocation for container %s failed: %w", c.PrettyName(), err)
    }
    if dp == nil {
        return dynamicPoolsError("no suitable dynamicPools found for container %s", c.PrettyName())
    }
    log.Info("assigning container %s to dynamicPool %s", c.PrettyName(), dp)
    podID := c.GetPodID()
    dp.PodIDs[podID] = append(dp.PodIDs[podID], c.GetCacheID())
    if dp.Cpus.Equals(p.reserved) {
        p.assignContainer(c, dp)
        log.Debugf("dynamic pool is the reserved pool, skipping updatePoolCpuset")
    } else {
        p.updatePoolCpuset()
    }
    if log.DebugEnabled() {
        log.Debug(p.dumpDynamicPool(dp))
    }
    return nil
}

// ReleaseResources is a resource release request for this policy.
func (p *dynamicPools) ReleaseResources(c cache.Container) error {
    log.Debug("releasing container %s...", c.PrettyName())
    dp := p.dynamicPoolByContainer(c)
    if dp == nil {
        log.Debug("ReleaseResources: dynamicPool-less container %s, nothing to release", c.PrettyName())
        return nil
    }
    p.dismissContainer(c, dp)
    if dp.Cpus.Equals(p.reserved) {
        log.Debugf("dynamic pool is the reserved pool, skipping updatePoolCpuset")
    } else {
        p.updatePoolCpuset()
    }
    if log.DebugEnabled() {
        log.Debug(p.dumpDynamicPool(dp))
    }
    return nil
}

// UpdateResources is a resource allocation update request for this policy.
func (p *dynamicPools) UpdateResources(c cache.Container) error {
    log.Debug("(not) updating container %s...", c.PrettyName())
    return nil
}

// Rebalance tries to find an optimal allocation of resources for the current containers.
func (p *dynamicPools) Rebalance() (bool, error) {
    log.Debug("rebalancing containers...")
    err := p.updatePoolCpuset()
    return true, err
}

// HandleEvent handles policy-specific events.
func (p *dynamicPools) HandleEvent(*events.Policy) (bool, error) {
    log.Debug("(not) handling event...")
    return false, nil
}

// ExportResourceData provides resource data to export for the container.
func (p *dynamicPools) ExportResourceData(c cache.Container) map[string]string {
    return nil
}

// Introspect provides data for external introspection.
func (p *dynamicPools) Introspect(*introspect.State) {
    return
}

// dynamicPoolByContainer returns the dynamicPool that contains a container.
func (p *dynamicPools) dynamicPoolByContainer(c cache.Container) *DynamicPool {
    podID := c.GetPodID()
    cID := c.GetCacheID()
    for _, dp := range p.dynamicPools {
        for _, ctrID := range dp.PodIDs[podID] {
            if ctrID == cID {
                return dp
            }
        }
    }
    return nil
}

// dynamicPoolByDef returns the dynamicPool instantiated from a dynamicPool definition.
func (p *dynamicPools) dynamicPoolByDef(dpDef *DynamicPoolDef) *DynamicPool {
    for _, dp := range p.dynamicPools {
        if dp.Def == dpDef {
            return dp
        }
    }
    return nil
}

// dynamicPoolDefByName returns a dynamicPool definition with a name.
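// The built-in "reserved" and "shared" definitions take precedence over
// user-defined definitions with the same name.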
func (p *dynamicPools) dynamicPoolDefByName(defName string) *DynamicPoolDef { if defName == reservedDynamicPoolDefName { return p.reservedDynamicPoolDef } if defName == sharedDynamicPoolDefName { return p.sharedDynamicPoolDef } for _, dpDef := range p.dpoptions.DynamicPoolDefs { if dpDef.Name == defName { return dpDef } } return nil } // chooseDynamicPoolDef returns the dynamicPoolDef selected by the container func (p *dynamicPools) chooseDynamicPoolDef(c cache.Container) (*DynamicPoolDef, error) { var dpDef *DynamicPoolDef // If the requests and limits of container are 0, they are assigned to the shared dynamicPool. if !namespaceMatches(c.GetNamespace(), append(p.dpoptions.ReservedPoolNamespaces, metav1.NamespaceSystem)) && p.containerRequestedMilliCpus(c.GetCacheID()) == 0 && p.containerLimitedMilliCpus(c.GetCacheID()) == 0 { return p.sharedDynamicPoolDef, nil } // DynamicPoolDef is defined by annotation? if dpDefName, ok := c.GetEffectiveAnnotation(dynamicPoolKey); ok { dpDef = p.dynamicPoolDefByName(dpDefName) if dpDef == nil { return nil, dynamicPoolsError("no dynamicPool for annotation %q", dpDefName) } return dpDef, nil } // DynamicPoolDef is defined by a special namespace (kube-system + // ReservedPoolNamespaces)? if namespaceMatches(c.GetNamespace(), append(p.dpoptions.ReservedPoolNamespaces, metav1.NamespaceSystem)) { return p.dynamicPools[0].Def, nil } // DynamicPoolDef is defined by the namespace? for _, dpDef := range append([]*DynamicPoolDef{p.reservedDynamicPoolDef, p.sharedDynamicPoolDef}, p.dpoptions.DynamicPoolDefs...) { if namespaceMatches(c.GetNamespace(), dpDef.Namespaces) { return dpDef, nil } } // Fallback to the shared dynamicPool. return p.sharedDynamicPoolDef, nil } func (p *dynamicPools) containerRequestedMilliCpus(contID string) int { cont, ok := p.cch.LookupContainer(contID) if !ok { return 0 } reqCpu, ok := cont.GetResourceRequirements().Requests[corev1.ResourceCPU] if !ok { return 0 } return int(reqCpu.MilliValue()) } func (p *dynamicPools) containerLimitedMilliCpus(contID string) int { cont, ok := p.cch.LookupContainer(contID) if !ok { return 0 } limitCpu, ok := cont.GetResourceRequirements().Limits[corev1.ResourceCPU] if !ok { return 0 } return int(limitCpu.MilliValue()) } // requestedMaxMilliCpus sums up and returns CPU limits of all // containers assigned to a dynamicPool. func (p *dynamicPools) requestedMaxMilliCpus(dp *DynamicPool) int { cpuRequested := 0 for _, cID := range dp.ContainerIDs() { cpuRequested += p.containerLimitedMilliCpus(cID) } return cpuRequested } // requestedMinMilliCpus sums up and returns CPU requests of all // containers assigned to a dynamicPool. func (p *dynamicPools) requestedMinMilliCpus(dp *DynamicPool) int { cpuRequested := 0 for _, cID := range dp.ContainerIDs() { cpuRequested += p.containerRequestedMilliCpus(cID) } return cpuRequested } // useCpuClass configures CPUs of a dynamicPool. func (p *dynamicPools) useCpuClass(dp *DynamicPool) error { // Usual inputs: // - CPUs that cpuallocator has reserved for this dynamicPool: // dp.Cpus (cpuset.CPUSet). // - User-defined CPU configuration for CPUs of dynamicPool of this type: // dp.Def.CpuClass (string). // - Current configuration(?): feel free to add data // structure for this. For instance policy-global p.cpuConfs, // or dynamicPool-local dp.cpuConfs. // // Other input examples, if needed: // - Requested CPU resources by all containers in the dynamicPool: // p.requestedMilliCpus(dp). // - Free CPU resources in the dynamicPool: p.freeMilliCpus(dp). 
// useCpuClass configures the CPUs of a dynamicPool.
func (p *dynamicPools) useCpuClass(dp *DynamicPool) error {
	// Usual inputs:
	// - CPUs that cpuallocator has reserved for this dynamicPool:
	//   dp.Cpus (cpuset.CPUSet).
	// - User-defined CPU configuration for CPUs of dynamicPools of this type:
	//   dp.Def.CpuClass (string).
	// - Current configuration(?): feel free to add a data
	//   structure for this. For instance policy-global p.cpuConfs,
	//   or dynamicPool-local dp.cpuConfs.
	//
	// Other input examples, if needed:
	// - Requested CPU resources by all containers in the dynamicPool:
	//   p.requestedMinMilliCpus(dp) / p.requestedMaxMilliCpus(dp).
	// - Free CPU resources in the dynamicPool: p.freeMilliCpus(dp).
	// - Number of assigned containers: dp.ContainerCount().
	// - Container details: access p.cch with dp.ContainerIDs().
	// - User-defined CPU AllocatorPriority: dp.Def.AllocatorPriority.
	// - All existing dynamicPool instances: p.dynamicPools.
	// - CPU configurations by user: dp.Def.CpuClass (for dp in p.dynamicPools).
	cpucontrol.Assign(p.cch, dp.Def.CpuClass, dp.Cpus.UnsortedList()...)
	log.Debugf("useCpuClass Cpus: %s; CpuClass: %s", dp.Cpus, dp.Def.CpuClass)
	return nil
}

func (p *dynamicPools) newDynamicPool(dpDef *DynamicPoolDef, confCpus bool) (*DynamicPool, error) {
	var cpus cpuset.CPUSet
	var err error
	if dpDef == p.reservedDynamicPoolDef {
		cpus = p.reserved
	} else {
		cpus, err = p.cpuAllocator.AllocateCpus(&p.freeCpus, 0, dpDef.AllocatorPriority)
		if err != nil {
			return nil, dynamicPoolsError("could not allocate Cpus for dynamicPool %s: %w", dpDef.Name, err)
		}
	}
	dp := &DynamicPool{
		Def:    dpDef,
		PodIDs: make(map[string][]string),
		Cpus:   cpus,
		Mems:   p.closestMems(cpus),
	}
	if confCpus {
		if err = p.useCpuClass(dp); err != nil {
			log.Errorf("failed to apply CPU configuration to new dynamicPool %s (cpus: %s): %v", dpDef.Name, cpus, err)
			return nil, err
		}
	}
	return dp, nil
}

func namespaceMatches(namespace string, patterns []string) bool {
	for _, pattern := range patterns {
		ret, err := filepath.Match(pattern, namespace)
		if err == nil && ret {
			return true
		}
	}
	return false
}

// allocateDynamicPool returns the dynamicPool allocated for a container.
func (p *dynamicPools) allocateDynamicPool(c cache.Container) (*DynamicPool, error) {
	dpDef, err := p.chooseDynamicPoolDef(c)
	if err != nil {
		return nil, err
	}
	if dpDef == nil {
		return nil, dynamicPoolsError("no applicable dynamicPool type found")
	}
	dynamicPool := p.dynamicPoolByDef(dpDef)
	if dynamicPool == nil {
		return nil, dynamicPoolsError("no suitable dynamicPool instance available")
	}
	return dynamicPool, nil
}

// dumpDynamicPool dumps dynamicPool contents in detail.
func (p *dynamicPools) dumpDynamicPool(dp *DynamicPool) string {
	conts := []string{}
	pods := []string{}
	for podID, contIDs := range dp.PodIDs {
		podName := podID
		if pod, ok := p.cch.LookupPod(podID); ok {
			podName = pod.GetName()
		}
		pods = append(pods, podName)
		for _, contID := range contIDs {
			if cont, ok := p.cch.LookupContainer(contID); ok {
				conts = append(conts, cont.PrettyName())
			} else {
				conts = append(conts, podName+"."+contID)
			}
		}
	}
	s := fmt.Sprintf("DynamicPool %s{Cpus: %s; Mems: %s; mCPU requests: %d; mCPU limits: %d; capacity: %d; pods: %s; conts: %s}",
		dp.PrettyName(), dp.Cpus, dp.Mems,
		p.requestedMinMilliCpus(dp), p.requestedMaxMilliCpus(dp), dp.AvailMilliCpus(),
		pods, conts)
	return s
}

// changesDynamicPools returns true if two dynamicPools policy configurations
// may lead to different dynamicPools or workload assignments.
func changesDynamicPools(opts0, opts1 *DynamicPoolsOptions) bool {
	if opts0 == nil && opts1 == nil {
		return false
	}
	if opts0 == nil || opts1 == nil {
		return true
	}
	if len(opts0.DynamicPoolDefs) != len(opts1.DynamicPoolDefs) {
		return true
	}
	o0 := opts0.DeepCopy()
	o1 := opts1.DeepCopy()
	// Ignore differences in CPU class names. Every other change
	// potentially changes dynamicPools or workloads.
	for i := range o0.DynamicPoolDefs {
		o0.DynamicPoolDefs[i].CpuClass = ""
		o1.DynamicPoolDefs[i].CpuClass = ""
	}
	return utils.DumpJSON(o0) != utils.DumpJSON(o1)
}
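// Example of the intended split between the two change detectors (a
// reading of the functions above and below, not from the source): if two
// configurations differ only in a definition's CpuClass, then
// changesDynamicPools returns false (CpuClass is masked out before the
// JSON comparison) while changesCpuClasses returns true, so configNotify
// only reapplies CPU classes; adding, removing or otherwise editing a
// DynamicPoolDef makes changesDynamicPools return true and forces a full
// setConfig.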
// changesCpuClasses returns true if two dynamicPools policy
// configurations can lead to using different CPU classes on
// corresponding dynamicPool instances. Calling changesCpuClasses(o0, o1)
// makes sense only if changesDynamicPools(o0, o1) has returned false.
func changesCpuClasses(opts0, opts1 *DynamicPoolsOptions) bool {
	if opts0 == nil && opts1 == nil {
		return false
	}
	if opts0 == nil || opts1 == nil {
		return true
	}
	if len(opts0.DynamicPoolDefs) != len(opts1.DynamicPoolDefs) {
		return true
	}
	for i := range opts0.DynamicPoolDefs {
		if opts0.DynamicPoolDefs[i].CpuClass != opts1.DynamicPoolDefs[i].CpuClass {
			return true
		}
	}
	return false
}

// configNotify applies a new configuration.
func (p *dynamicPools) configNotify(event pkgcfg.Event, source pkgcfg.Source) error {
	log.Info("configuration %s", event)
	defer log.Debug("effective configuration:\n%s\n", utils.DumpJSON(p.dpoptions))
	newDynamicPoolsOptions := dynamicPoolsOptions.DeepCopy()
	if !changesDynamicPools(&p.dpoptions, newDynamicPoolsOptions) {
		if !changesCpuClasses(&p.dpoptions, newDynamicPoolsOptions) {
			log.Info("no configuration changes")
		} else {
			log.Info("configuration changes only CPU classes")
			// Update the new CPU classes in the existing DynamicPool
			// definitions. The same DynamicPoolDef instances
			// must be kept in use, because each dynamicPool
			// instance holds a direct reference to its
			// DynamicPoolDef.
			for i := range p.dpoptions.DynamicPoolDefs {
				p.dpoptions.DynamicPoolDefs[i].CpuClass = newDynamicPoolsOptions.DynamicPoolDefs[i].CpuClass
			}
			// (Re)configure all CPUs in the dynamicPools.
			for _, dp := range p.dynamicPools {
				p.useCpuClass(dp)
			}
		}
		return nil
	}
	if err := p.setConfig(newDynamicPoolsOptions); err != nil {
		log.Error("config update failed: %v", err)
		return err
	}
	log.Info("config updated successfully")
	p.Sync(p.cch.GetContainers(), p.cch.GetContainers())
	return nil
}

// applyDynamicPoolDef creates user-defined dynamicPools or reconfigures built-in
// dynamicPools according to dpDef. It does not initialize dynamicPool CPUs.
func (p *dynamicPools) applyDynamicPoolDef(dynamicPools *[]*DynamicPool, dpDef *DynamicPoolDef) error {
	if len(*dynamicPools) < 2 {
		return dynamicPoolsError("internal error: reserved and shared dynamicPools missing, cannot apply dynamicPool definitions")
	}
	reservedDynamicPool := (*dynamicPools)[0]
	sharedDynamicPool := (*dynamicPools)[1]
	// Every dynamicPoolDef does one of the following:
	// 1. reconfigures the "reserved" dynamicPool (most restricted)
	// 2. reconfigures the "shared" dynamicPool (somewhat restricted)
	// 3. defines a new user-defined dynamicPool.
	switch dpDef.Name {
	case "":
		// Case 0: bad name.
		return dynamicPoolsError("undefined or empty dynamicPool name")
	case reservedDynamicPool.Def.Name:
		// Case 1: reconfigure the "reserved" dynamicPool.
		p.reservedDynamicPoolDef.AllocatorPriority = dpDef.AllocatorPriority
		p.reservedDynamicPoolDef.CpuClass = dpDef.CpuClass
		p.reservedDynamicPoolDef.Namespaces = dpDef.Namespaces
	case sharedDynamicPool.Def.Name:
		// Case 2: reconfigure the "shared" dynamicPool.
		p.sharedDynamicPoolDef.AllocatorPriority = dpDef.AllocatorPriority
		p.sharedDynamicPoolDef.CpuClass = dpDef.CpuClass
		p.sharedDynamicPoolDef.Namespaces = dpDef.Namespaces
	default:
		// Case 3: create a user-defined dynamicPool without CPUs.
		newdp, err := p.newDynamicPool(dpDef, false)
		if err != nil {
			return err
		}
		*dynamicPools = append(*dynamicPools, newdp)
	}
	return nil
}
// setConfig takes a new dynamicPools configuration into use.
func (p *dynamicPools) setConfig(dpoptions *DynamicPoolsOptions) error {
	// Create the default reserved and shared dynamicPool
	// definitions. Some properties of these definitions may be
	// altered by user configuration.
	p.reservedDynamicPoolDef = &DynamicPoolDef{
		Name:              reservedDynamicPoolDefName,
		AllocatorPriority: 3,
	}
	p.sharedDynamicPoolDef = &DynamicPoolDef{
		Name:              sharedDynamicPoolDefName,
		AllocatorPriority: 3,
	}
	p.dynamicPools = []*DynamicPool{}
	p.freeCpus = p.allowed.Clone()
	p.freeCpus = p.freeCpus.Difference(p.reserved)
	// Instantiate the built-in reserved and shared dynamicPools.
	reservedDynamicPool, err := p.newDynamicPool(p.reservedDynamicPoolDef, false)
	if err != nil {
		return err
	}
	p.dynamicPools = append(p.dynamicPools, reservedDynamicPool)
	sharedDynamicPool, err := p.newDynamicPool(p.sharedDynamicPoolDef, false)
	if err != nil {
		return err
	}
	p.dynamicPools = append(p.dynamicPools, sharedDynamicPool)
	// First apply customizations to the built-in dynamicPools:
	// "reserved" and "shared".
	for _, dpDef := range dpoptions.DynamicPoolDefs {
		if dpDef.Name != reservedDynamicPoolDefName && dpDef.Name != sharedDynamicPoolDefName {
			continue
		}
		if err := p.applyDynamicPoolDef(&p.dynamicPools, dpDef); err != nil {
			return err
		}
	}
	// Then apply all user dynamicPool definitions, skipping the already
	// customized "reserved" and "shared" dynamicPools.
	for _, dpDef := range dpoptions.DynamicPoolDefs {
		if dpDef.Name == reservedDynamicPoolDefName || dpDef.Name == sharedDynamicPoolDefName {
			continue
		}
		if err := p.applyDynamicPoolDef(&p.dynamicPools, dpDef); err != nil {
			return err
		}
	}
	// Finish dynamicPool initialization.
	log.Info("%s policy dynamicPools:", PolicyName)
	for dpIdx, dp := range p.dynamicPools {
		log.Info("- dynamicPool %d: %s", dpIdx, dp)
	}
	// No errors in dynamicPool creation, take the new configuration into use.
	p.dpoptions = *dpoptions
	// (Re)configure all CPUs in the dynamicPools.
	for _, dp := range p.dynamicPools {
		p.useCpuClass(dp)
	}
	return nil
}

// closestMems returns memory node IDs good for pinning containers
// that run on the given CPUs.
func (p *dynamicPools) closestMems(cpus cpuset.CPUSet) idset.IDSet {
	mems := idset.NewIDSet()
	sys := p.options.System
	for _, nodeID := range sys.NodeIDs() {
		if !cpus.Intersection(sys.Node(nodeID).CPUSet()).IsEmpty() {
			mems.Add(nodeID)
		}
	}
	return mems
}
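// Worked example of closestMems (the two-node topology is assumed, not
// from the source): if NUMA node 0 has CPUs 0-3 and node 1 has CPUs 4-7,
// then closestMems(cpuset.New(2, 5)) returns {0, 1} because the pool's
// CPUs intersect both nodes, while closestMems(cpuset.New(0, 1))
// returns {0}.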
// assignContainer adds a container to a dynamicPool.
func (p *dynamicPools) assignContainer(c cache.Container, dp *DynamicPool) {
	log.Info("assigning container %s to dynamicPool %s", c.PrettyName(), dp)
	podID := c.GetPodID()
	dp.PodIDs[podID] = append(dp.PodIDs[podID], c.GetCacheID())
	p.pinCpuMem(c, dp.Cpus, dp.Mems)
}

// dismissContainer removes a container from a dynamicPool.
func (p *dynamicPools) dismissContainer(c cache.Container, dp *DynamicPool) {
	podID := c.GetPodID()
	dp.PodIDs[podID] = removeString(dp.PodIDs[podID], c.GetCacheID())
	if len(dp.PodIDs[podID]) == 0 {
		delete(dp.PodIDs, podID)
	}
}

// pinCpuMem pins a container to CPUs and memory nodes, if so configured.
func (p *dynamicPools) pinCpuMem(c cache.Container, cpus cpuset.CPUSet, mems idset.IDSet) {
	if p.dpoptions.PinCPU == nil || *p.dpoptions.PinCPU {
		log.Debug(" - pinning %s to cpuset: %s", c.PrettyName(), cpus)
		c.SetCpusetCpus(cpus.String())
		if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok {
			mCpu := int(reqCpu.MilliValue())
			c.SetCPUShares(int64(cache.MilliCPUToShares(int64(mCpu))))
		}
	}
	if p.dpoptions.PinMemory == nil || *p.dpoptions.PinMemory {
		log.Debug(" - pinning %s to memory %s", c.PrettyName(), mems)
		c.SetCpusetMems(mems.String())
	}
}

// dynamicPoolsError formats an error from this policy.
func dynamicPoolsError(format string, args ...interface{}) error {
	return fmt.Errorf(PolicyName+": "+format, args...)
}

// removeString removes the first occurrence of a string from a string slice.
func removeString(strings []string, element string) []string {
	for index, s := range strings {
		if s == element {
			strings[index] = strings[len(strings)-1]
			return strings[:len(strings)-1]
		}
	}
	return strings
}

// Register us as a policy implementation.
func init() {
	policy.Register(PolicyName, PolicyDescription, CreateDynamicPoolsPolicy)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/dyp_test.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dyp

import (
	"testing"

	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
)

func TestChangesDynamicPools(t *testing.T) {
	tcases := []struct {
		name          string
		opts1         *DynamicPoolsOptions
		opts2         *DynamicPoolsOptions
		expectedValue bool
	}{
		{
			name:          "both options are nil",
			expectedValue: false,
		},
		{
			name:          "one option is nil",
			opts2:         &DynamicPoolsOptions{},
			expectedValue: true,
		},
		{
			name: "reserved pool namespaces differ by len",
			opts1: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns0"},
			},
			opts2: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{},
			},
			expectedValue: true,
		},
		{
			name: "reserved pool namespaces differ by content",
			opts1: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns0"},
			},
			opts2: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns1"},
			},
			expectedValue: true,
		},
		{
			name: "dynamic-pool defs differ",
			opts1: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns0"},
				DynamicPoolDefs:        []*DynamicPoolDef{},
			},
			opts2: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns1"},
				DynamicPoolDefs:        []*DynamicPoolDef{},
			},
			expectedValue: true,
		},
	}
	for _, tc := range tcases {
		t.Run(tc.name, func(t *testing.T) {
			value := changesDynamicPools(tc.opts1, tc.opts2)
			if value != tc.expectedValue {
				t.Errorf("Expected return value %v but got %v", tc.expectedValue, value)
			}
		})
	}
}
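// TestRemoveString is an illustrative sketch added here (not part of the
// original test suite): removeString drops the first occurrence of an
// element by swapping the last element into its place, so ordering is
// not preserved.
func TestRemoveString(t *testing.T) {
	got := removeString([]string{"a", "b", "c", "d"}, "b")
	want := []string{"a", "d", "c"}
	if len(got) != len(want) {
		t.Fatalf("expected %v but got %v", want, got)
	}
	for i := range want {
		if got[i] != want[i] {
			t.Errorf("expected %v but got %v", want, got)
			break
		}
	}
}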
t.Errorf("Expected return value %v but got %v", tc.expectedValue, value) } }) } } func TestCalculatePoolCpuset(t *testing.T) { p := &dynamicPools{ allowed: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), reserved: cpuset.New(1, 2), dynamicPools: []*DynamicPool{ { Def: &DynamicPoolDef{ Name: reservedDynamicPoolDefName, }, Cpus: cpuset.New(1, 2), }, { Def: &DynamicPoolDef{ Name: sharedDynamicPoolDefName, }, Cpus: cpuset.New(3, 4, 5, 6), }, { Def: &DynamicPoolDef{ Name: "poo1", }, Cpus: cpuset.New(7, 8, 9, 10, 11, 12, 13), }, { Def: &DynamicPoolDef{ Name: "poo2", }, Cpus: cpuset.New(0), }, }, } tcases := []struct { name string requestCpu map[*DynamicPool]int remainFree int weight map[*DynamicPool]float64 sumWeight float64 expectedValue map[*DynamicPool]int }{ { name: "The requests and weight of the dynamic pools are both nil", requestCpu: map[*DynamicPool]int{}, remainFree: 12, weight: map[*DynamicPool]float64{}, sumWeight: 0.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 12, p.dynamicPools[2]: 0, p.dynamicPools[3]: 0, }, }, { name: "The requests of the dynamic pools is not nil, and the requests of the shared dynamic pools is 0", requestCpu: map[*DynamicPool]int{ p.dynamicPools[0]: 1, p.dynamicPools[1]: 0, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, remainFree: 8, weight: map[*DynamicPool]float64{}, sumWeight: 0.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 8, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, }, { name: "The requests of the dynamic pools is not nil, and the requests of the shared dynamic pools is not 0", requestCpu: map[*DynamicPool]int{ p.dynamicPools[0]: 1, p.dynamicPools[1]: 2, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, remainFree: 6, weight: map[*DynamicPool]float64{}, sumWeight: 0.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 8, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, }, { name: "The weight of the dynamic pools is not nil, and the weight of the shared dynamic pools is not 0", requestCpu: map[*DynamicPool]int{}, remainFree: 12, weight: map[*DynamicPool]float64{ p.dynamicPools[0]: 10.0, p.dynamicPools[1]: 100.0, p.dynamicPools[2]: 200.0, p.dynamicPools[3]: 100.0, }, sumWeight: 400.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 3, p.dynamicPools[2]: 6, p.dynamicPools[3]: 3, }, }, { name: "The weight of the dynamic pools is not nil, and the weight of the shared dynamic pools is 0", requestCpu: map[*DynamicPool]int{}, remainFree: 12, weight: map[*DynamicPool]float64{ p.dynamicPools[0]: 10.0, p.dynamicPools[1]: 0.0, p.dynamicPools[2]: 200.0, p.dynamicPools[3]: 100.0, }, sumWeight: 300.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 0, p.dynamicPools[2]: 8, p.dynamicPools[3]: 4, }, }, { name: "The requests and weight of the dynamic pools are not nil, and the requests of the shared dynamic pools is 0", requestCpu: map[*DynamicPool]int{ p.dynamicPools[0]: 1, p.dynamicPools[1]: 0, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, remainFree: 8, weight: map[*DynamicPool]float64{ p.dynamicPools[0]: 10.0, p.dynamicPools[1]: 100.0, p.dynamicPools[2]: 200.0, p.dynamicPools[3]: 100.0, }, sumWeight: 400.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 2, p.dynamicPools[2]: 6, p.dynamicPools[3]: 4, }, }, { name: "The requests and weight of the dynamic pools are not nil, and the weight of the shared dynamic pools is 0", requestCpu: map[*DynamicPool]int{ p.dynamicPools[0]: 
================================================
FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/flags.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dyp

import (
	"encoding/json"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cpuallocator"
)

type DynamicPoolsOptions dynamicPoolsOptionsWrapped

// dynamicPoolsOptionsWrapped contains configuration options specific to this policy.
type dynamicPoolsOptionsWrapped struct {
	// PinCPU controls pinning containers to CPUs.
	PinCPU *bool `json:"PinCPU,omitempty"`
	// PinMemory controls pinning containers to memory nodes.
	PinMemory *bool `json:"PinMemory,omitempty"`
	// ReservedPoolNamespaces is a list of namespace globs that
	// will be allocated to reserved CPUs.
	ReservedPoolNamespaces []string `json:"ReservedPoolNamespaces,omitempty"`
	// DynamicPoolDefs contains dynamicPool type definitions.
	DynamicPoolDefs []*DynamicPoolDef `json:"DynamicPoolTypes,omitempty"`
}
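// A sample configuration sketch in terms of the JSON/YAML tags above and
// in DynamicPoolDef below; the pool names, namespace globs and the exact
// policy section key are illustrative assumptions, not from the source.
// Namespace entries are globs matched with filepath.Match (see
// namespaceMatches in dynamic-pools-policy.go):
//
//   policy:
//     dynamic-pools:
//       PinCPU: true
//       PinMemory: true
//       ReservedPoolNamespaces: ["monitoring-*"]
//       DynamicPoolTypes:
//         - Name: fast
//           Namespaces: ["team-a-*"]
//           CpuClass: turbo
//           AllocatorPriority: 0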
// DynamicPoolDef contains a dynamicPool definition.
type DynamicPoolDef struct {
	// Name of the dynamicPool definition.
	Name       string   `json:"Name"`
	Namespaces []string `json:"Namespaces,omitempty"`
	CpuClass   string   `json:"CpuClass"`
	// AllocatorPriority (0: High, 1: Normal, 2: Low, 3: None).
	// This parameter is passed to the CPU allocator when creating or
	// resizing a dynamicPool. At init, dynamicPools with the highest
	// priority get their CPUs allocated first.
	AllocatorPriority cpuallocator.CPUPriority `json:"AllocatorPriority"`
}

var defaultPinCPU bool = true
var defaultPinMemory bool = true

// DeepCopy creates a deep copy of a DynamicPoolsOptions.
func (dpo *DynamicPoolsOptions) DeepCopy() *DynamicPoolsOptions {
	outDpo := *dpo
	outDpo.ReservedPoolNamespaces = make([]string, len(dpo.ReservedPoolNamespaces))
	copy(outDpo.ReservedPoolNamespaces, dpo.ReservedPoolNamespaces)
	outDpo.DynamicPoolDefs = make([]*DynamicPoolDef, len(dpo.DynamicPoolDefs))
	for i := range dpo.DynamicPoolDefs {
		outDpo.DynamicPoolDefs[i] = dpo.DynamicPoolDefs[i].DeepCopy()
	}
	return &outDpo
}

// String stringifies a DynamicPoolDef.
func (dpDef DynamicPoolDef) String() string {
	return dpDef.Name
}

// DeepCopy creates a deep copy of a DynamicPoolDef.
func (bdef *DynamicPoolDef) DeepCopy() *DynamicPoolDef {
	outBdef := *bdef
	outBdef.Namespaces = make([]string, len(bdef.Namespaces))
	copy(outBdef.Namespaces, bdef.Namespaces)
	return &outBdef
}

// defaultDynamicPoolsOptions returns a new DynamicPoolsOptions instance, all initialized to defaults.
func defaultDynamicPoolsOptions() interface{} {
	return &DynamicPoolsOptions{
		ReservedPoolNamespaces: []string{metav1.NamespaceSystem},
		PinCPU:                 &defaultPinCPU,
		PinMemory:              &defaultPinMemory,
	}
}

// Our runtime configuration.
var dynamicPoolsOptions = defaultDynamicPoolsOptions().(*DynamicPoolsOptions)

// UnmarshalJSON makes sure all options from previous unmarshals get
// cleared before unmarshaling new data to the same address.
func (bo *DynamicPoolsOptions) UnmarshalJSON(data []byte) error {
	bow := dynamicPoolsOptionsWrapped{}
	if err := json.Unmarshal(data, &bow); err != nil {
		return err
	}
	*bo = DynamicPoolsOptions(bow)
	return nil
}

// Register us for configuration handling.
func init() {
	pkgcfg.Register(PolicyPath, PolicyDescription, dynamicPoolsOptions, defaultDynamicPoolsOptions)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/metrics.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dyp

import (
	"sort"
	"strconv"
	"strings"

	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	"github.com/prometheus/client_golang/prometheus"
)

// Prometheus metric descriptor indices and the descriptor table.
const (
	dynamicPoolsDesc = iota
)

var descriptors = []*prometheus.Desc{
	dynamicPoolsDesc: prometheus.NewDesc(
		"DynamicPools",
		"CPUs",
		[]string{
			"dynamicPool_type",
			"cpu_class",
			"dynamicPool",
			"cpus",
			"mems",
			"containers",
			"tot_req_millicpu",
			"tot_limit_millicpu",
		},
		nil,
	),
}
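// With the descriptor above, CollectMetrics (later in this file) emits
// one gauge per dynamicPool whose value is the number of CPUs in the
// pool. A scraped sample would look roughly like the following; all
// label values here are illustrative, not from the source:
//
//   DynamicPools{dynamicPool_type="shared",cpu_class="",dynamicPool="shared",
//     cpus="3-6",mems="0",containers="default/pod0:cont0",
//     tot_req_millicpu="500",tot_limit_millicpu="1000"} 4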
// Metrics defines the dynamicPools-specific metrics from the policy level.
type Metrics struct {
	DynamicPools []*DynamicPoolMetrics
}

// DynamicPoolMetrics defines the metrics of a dynamicPool instance.
type DynamicPoolMetrics struct {
	// DynamicPool type metrics.
	DefName  string
	CpuClass string
	// DynamicPool instance metrics.
	PrettyName              string
	Cpus                    cpuset.CPUSet
	Mems                    string
	ContainerNames          string
	ContainerReqMilliCpus   int
	ContainerLimitMilliCpus int
}

// DescribeMetrics generates policy-specific prometheus metrics data
// descriptors.
func (p *dynamicPools) DescribeMetrics() []*prometheus.Desc {
	return descriptors
}

// PollMetrics provides policy metrics for monitoring.
func (p *dynamicPools) PollMetrics() policy.Metrics {
	policyMetrics := &Metrics{}
	policyMetrics.DynamicPools = make([]*DynamicPoolMetrics, len(p.dynamicPools))
	for index, dp := range p.dynamicPools {
		dm := &DynamicPoolMetrics{}
		policyMetrics.DynamicPools[index] = dm
		dm.DefName = dp.Def.Name
		dm.CpuClass = dp.Def.CpuClass
		dm.PrettyName = dp.PrettyName()
		dm.Cpus = dp.Cpus
		dm.Mems = dp.Mems.String()
		cNames := []string{}
		// Collect container names, total requested milliCPUs and total limit milliCPUs.
		for _, containerIDs := range dp.PodIDs {
			for _, containerID := range containerIDs {
				if c, ok := p.cch.LookupContainer(containerID); ok {
					cNames = append(cNames, c.PrettyName())
					dm.ContainerReqMilliCpus += p.containerRequestedMilliCpus(containerID)
					dm.ContainerLimitMilliCpus += p.containerLimitedMilliCpus(containerID)
				}
			}
		}
		sort.Strings(cNames)
		dm.ContainerNames = strings.Join(cNames, ",")
	}
	return policyMetrics
}

// CollectMetrics generates prometheus metrics from cached/polled
// policy-specific metrics data.
func (p *dynamicPools) CollectMetrics(m policy.Metrics) ([]prometheus.Metric, error) {
	metrics, ok := m.(*Metrics)
	if !ok {
		return nil, dynamicPoolsError("type mismatch in dynamicPools metrics")
	}
	promMetrics := make([]prometheus.Metric, len(metrics.DynamicPools))
	for index, dm := range metrics.DynamicPools {
		promMetrics[index] = prometheus.MustNewConstMetric(
			descriptors[dynamicPoolsDesc],
			prometheus.GaugeValue,
			float64(dm.Cpus.Size()),
			dm.DefName,
			dm.CpuClass,
			dm.PrettyName,
			dm.Cpus.String(),
			dm.Mems,
			dm.ContainerNames,
			strconv.Itoa(dm.ContainerReqMilliCpus),
			strconv.Itoa(dm.ContainerLimitMilliCpus))
	}
	return promMetrics, nil
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/none/none-policy.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package none

import (
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	logger "github.com/intel/cri-resource-manager/pkg/log"
	"github.com/prometheus/client_golang/prometheus"
)

const (
	// PolicyName is the name used to activate this policy implementation.
	PolicyName = policy.NonePolicy
	// PolicyDescription is a short description of this policy.
	PolicyDescription = "A no-op policy, doing pretty much nothing."
)
type none struct {
	logger.Logger
	cch cache.Cache
}

var _ policy.Backend = &none{}

// CreateNonePolicy creates a new policy instance.
func CreateNonePolicy(opts *policy.BackendOptions) policy.Backend {
	n := &none{Logger: logger.NewLogger(PolicyName)}
	n.Info("creating policy...")
	return n
}

// Name returns the name of this policy.
func (n *none) Name() string {
	return PolicyName
}

// Description returns the description for this policy.
func (n *none) Description() string {
	return PolicyDescription
}

// Start prepares this policy for accepting allocation/release requests.
func (n *none) Start(add []cache.Container, del []cache.Container) error {
	n.Debug("got started...")
	return nil
}

// Sync synchronizes the active policy state.
func (n *none) Sync(add []cache.Container, del []cache.Container) error {
	n.Debug("(not) synchronizing policy state")
	return nil
}

// AllocateResources is a resource allocation request for this policy.
func (n *none) AllocateResources(c cache.Container) error {
	n.Debug("(not) allocating container %s...", c.PrettyName())
	return nil
}

// ReleaseResources is a resource release request for this policy.
func (n *none) ReleaseResources(c cache.Container) error {
	n.Debug("(not) releasing container %s...", c.PrettyName())
	return nil
}

// UpdateResources is a resource allocation update request for this policy.
func (n *none) UpdateResources(c cache.Container) error {
	n.Debug("(not) updating container %s...", c.PrettyName())
	return nil
}

// Rebalance tries to find an optimal allocation of resources for the current containers.
func (n *none) Rebalance() (bool, error) {
	n.Debug("(not) rebalancing containers...")
	return false, nil
}

// HandleEvent handles policy-specific events.
func (n *none) HandleEvent(*events.Policy) (bool, error) {
	n.Debug("(not) handling event...")
	return false, nil
}

// ExportResourceData provides resource data to export for the container.
func (n *none) ExportResourceData(c cache.Container) map[string]string {
	return nil
}

// Introspect provides data for external introspection.
func (n *none) Introspect(*introspect.State) {
}

// PollMetrics provides policy metrics for monitoring.
func (n *none) PollMetrics() policy.Metrics {
	return nil
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (n *none) DescribeMetrics() []*prometheus.Desc {
	return nil
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (n *none) CollectMetrics(policy.Metrics) ([]prometheus.Metric, error) {
	return nil, nil
}

// Register us as a policy implementation.
func init() {
	policy.Register(PolicyName, PolicyDescription, CreateNonePolicy)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/podpools/flags.go
================================================
// Copyright 2020-2021 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package podpools

import (
	"bytes"
	"encoding/json"
	"fmt"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
)

// PodpoolsOptions contains configuration options specific to this policy.
type PodpoolsOptions struct {
	// PinCPU controls pinning containers to CPUs.
	PinCPU bool `json:"PinCPU,omitempty"`
	// PinMemory controls pinning containers to memory nodes.
	PinMemory bool `json:"PinMemory,omitempty"`
	// PoolDefs contains pool definitions.
	PoolDefs []*PoolDef `json:"Pools,omitempty"`
}

// PoolDef contains a pool definition.
type PoolDef struct {
	// Name is the name of the pool, or the name prefix of
	// multi-instance pools.
	Name string `json:"Name"`
	// CPU specifies the number of CPUs exclusively usable by
	// pods in the pool.
	CPU string `json:"CPU"`
	// MaxPods specifies the maximum number of pods assigned to
	// the pool. 0 (the default) means unlimited. -1 means no
	// pods.
	MaxPods int `json:"MaxPods"`
	// Instances specifies the number of multi-instance pools,
	// either directly or as CPU (count/percentage) reserved for
	// instances. The default is 1.
	Instances string `json:"Instances,omitempty"`
	// FillOrder specifies how multi-instance pools are filled.
	FillOrder FillOrder `json:"FillOrder"`
	// For the future: when enabling dynamic (on-demand) pool
	// instantiation, consider different ways of handling the case
	// of MaxPods>1, FillOrder==Balanced. Creating underloaded
	// pool instances will consume CPUs from other pool instances,
	// in a bad case causing workload migrations between memory
	// controllers when rearranging pool load is needed for
	// creation of new pools.
}

// FillOrder specifies the order in which pool instances should be filled.
type FillOrder int

const (
	FillBalanced FillOrder = iota
	FillPacked
	FillFirstFree
)

var fillOrderNames = map[FillOrder]string{
	FillBalanced:  "Balanced",
	FillPacked:    "Packed",
	FillFirstFree: "FirstFree",
}

// String stringifies a FillOrder.
func (fo FillOrder) String() string {
	if fon, ok := fillOrderNames[fo]; ok {
		return fon
	}
	return fmt.Sprintf("#UNNAMED-FILLORDER(%d)", int(fo))
}

// MarshalJSON marshals a FillOrder as a quoted json string.
func (fo FillOrder) MarshalJSON() ([]byte, error) {
	buffer := bytes.NewBufferString(fmt.Sprintf("%q", fo))
	return buffer.Bytes(), nil
}

// UnmarshalJSON unmarshals a FillOrder quoted json string to the enum value.
func (fo *FillOrder) UnmarshalJSON(b []byte) error {
	var fillOrderName string
	err := json.Unmarshal(b, &fillOrderName)
	if err != nil {
		return err
	}
	for foID, foName := range fillOrderNames {
		if foName == fillOrderName {
			*fo = foID
			return nil
		}
	}
	return podpoolsError("invalid fill order %q", fillOrderName)
}
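// A sample pool configuration sketch in terms of the fields above; the
// pool name and values are illustrative assumptions, not from the source:
//
//   policy:
//     podpools:
//       Pools:
//         - Name: dualcpu
//           CPU: "2"
//           MaxPods: 1
//           Instances: 50%
//           FillOrder: Packed
//
// With 16 non-reserved CPUs, "Instances: 50%" would yield
// floor(16*50/100/2) = 4 pool instances of 2 CPUs each (see
// parseInstancesCPUs in podpools-policy.go for the exact parsing rules).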
// defaultPodpoolsOptions returns a new PodpoolsOptions instance, all initialized to defaults.
func defaultPodpoolsOptions() interface{} {
	return &PodpoolsOptions{
		PinCPU:    true,
		PinMemory: true,
	}
}

// Our runtime configuration.
var podpoolsOptions = defaultPodpoolsOptions().(*PodpoolsOptions)

// Register us for configuration handling.
func init() {
	pkgcfg.Register(PolicyPath, PolicyDescription, podpoolsOptions, defaultPodpoolsOptions)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/podpools/metrics.go
================================================
// Copyright 2020-2021 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package podpools

import (
	"fmt"
	"sort"
	"strconv"

	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	"github.com/intel/cri-resource-manager/pkg/procstats"
	"github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	"github.com/prometheus/client_golang/prometheus"
)

// Metrics defines the podpools-specific metrics from the policy level.
type Metrics struct {
	PoolMetrics map[string]*PoolMetrics
}

// PoolMetrics defines the podpools-specific metrics from the pool level.
type PoolMetrics struct {
	DefName        string
	PrettyName     string
	CPUs           cpuset.CPUSet
	CPUIds         []int
	MilliCPUs      string
	Memory         string
	ContainerNames string
	PodNames       string
}

// Prometheus metric descriptor indices and the descriptor table.
const (
	cpuUsageDesc = iota
	poolCPUUsageDesc
)

var descriptors = []*prometheus.Desc{
	cpuUsageDesc: prometheus.NewDesc(
		"cpu_usage",
		"CPU usage per logical processor",
		[]string{
			"cpu",
		},
		nil,
	),
	poolCPUUsageDesc: prometheus.NewDesc(
		"pool_cpu_usage",
		"CPU usage for a given pool",
		[]string{
			"policy",
			"pretty_name",
			"def_name",
			"CPUs",
			"memory",
			"pool_size",
			"pod_name",
			"container_name",
		},
		nil,
	),
}

var cpuTimeStat *procstats.CPUTimeStat

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *podpools) DescribeMetrics() []*prometheus.Desc {
	return descriptors
}
// PollMetrics provides policy metrics for monitoring.
func (p *podpools) PollMetrics() policy.Metrics {
	if len(p.pools) == 0 {
		log.Error("failed to poll metrics: no pools")
		return nil
	}
	policyMetrics := &Metrics{}
	policyMetrics.PoolMetrics = make(map[string]*PoolMetrics, len(p.pools))
	for _, pool := range p.pools {
		pm := &PoolMetrics{
			DefName:    pool.Def.Name,
			PrettyName: pool.PrettyName(),
			CPUs:       pool.CPUs,
			CPUIds:     pool.CPUs.List(),
			MilliCPUs:  strconv.Itoa(pool.CPUs.Size() * 1000),
			Memory:     pool.Mems.String(),
		}
		policyMetrics.PoolMetrics[pool.PrettyName()] = pm
		if len(pool.PodIDs) > 0 {
			podIds := make([]string, 0, len(pool.PodIDs))
			for podId := range pool.PodIDs {
				podIds = append(podIds, podId)
			}
			sort.Strings(podIds)
			for _, podId := range podIds {
				for _, containerId := range pool.PodIDs[podId] {
					if container, ok := p.cch.LookupContainer(containerId); ok {
						containerName := container.PrettyName()
						if pm.ContainerNames == "" {
							pm.ContainerNames = containerName
						} else {
							pm.ContainerNames = fmt.Sprintf("%s,%s", pm.ContainerNames, containerName)
						}
					}
				}
				if pod, ok := p.cch.LookupPod(podId); ok {
					podName := pod.GetName()
					if pm.PodNames == "" {
						pm.PodNames = podName
					} else {
						pm.PodNames = fmt.Sprintf("%s,%s", pm.PodNames, podName)
					}
				}
			}
		}
	}
	return policyMetrics
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *podpools) CollectMetrics(m policy.Metrics) ([]prometheus.Metric, error) {
	metrics, ok := m.(*Metrics)
	if !ok {
		return nil, fmt.Errorf("type mismatch in podpools metrics")
	}
	if cpuTimeStat == nil {
		initSys, err := sysfs.DiscoverSystem()
		if err != nil {
			return nil, err
		}
		cpuCount := len(initSys.CPUIDs())
		cpuTimeStat = &procstats.CPUTimeStat{
			PrevIdleTime:       make([]uint64, cpuCount),
			PrevTotalTime:      make([]uint64, cpuCount),
			CurIdleTime:        make([]uint64, cpuCount),
			CurTotalTime:       make([]uint64, cpuCount),
			DeltaIdleTime:      make([]uint64, cpuCount),
			DeltaTotalTime:     make([]uint64, cpuCount),
			CPUUsage:           make([]float64, cpuCount),
			IsGetCPUUsageBegin: false,
		}
	}
	err := cpuTimeStat.GetCPUTimeStat()
	if err != nil {
		return nil, err
	}
	cpuMetrics, err := updateCPUUsageMetrics()
	if err != nil {
		return nil, err
	}
	poolCPUMetrics, err := updatePoolCPUUsageMetrics(metrics)
	if err != nil {
		return nil, err
	}
	return append(cpuMetrics, poolCPUMetrics...), nil
}
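// Worked example of the pool usage computed below in
// updatePoolCPUUsageMetrics (the numbers are illustrative): for a 4-CPU
// pool with all CPUs online, a summed delta idle time of 2000 and a
// summed delta total time of 4000 give (1 - 2000/4000) * 100 * 4 = 200,
// i.e. the pool is consuming two CPUs' worth of time.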
// updateCPUUsageMetrics collects the CPU usage per logical processor.
func updateCPUUsageMetrics() ([]prometheus.Metric, error) {
	cpuTimeStat.RLock()
	defer cpuTimeStat.RUnlock()
	sys, err := sysfs.DiscoverSystem()
	if err != nil {
		return nil, err
	}
	onlined := sys.CPUSet().Difference(sys.Offlined())
	onlinedUsage := make([]prometheus.Metric, onlined.Size())
	for i, j := range onlined.List() {
		onlinedUsage[i] = prometheus.MustNewConstMetric(
			descriptors[cpuUsageDesc],
			prometheus.GaugeValue,
			cpuTimeStat.CPUUsage[j],
			strconv.Itoa(j),
		)
	}
	return onlinedUsage, nil
}

// updatePoolCPUUsageMetrics collects the CPU usage of pools defined by the podpools policy.
func updatePoolCPUUsageMetrics(ppm *Metrics) ([]prometheus.Metric, error) {
	if ppm == nil {
		return nil, fmt.Errorf("podpools metrics needed to compute pool CPU usage are missing")
	}
	// Sort the pool metrics.
	poolNames := make([]string, 0, len(ppm.PoolMetrics))
	for poolName := range ppm.PoolMetrics {
		poolNames = append(poolNames, poolName)
	}
	sort.Strings(poolNames)
	// Calculate the CPU usage of each pool and send it to prometheus.
	poolCPUUsageMetrics := make([]prometheus.Metric, len(poolNames))
	poolCPUUsageList := make(map[string]float64, len(poolNames))
	cpuTimeStat.RLock()
	defer cpuTimeStat.RUnlock()
	for index, poolName := range poolNames {
		poolDeltaIdleTime := uint64(0)
		poolDeltaTotalTime := uint64(0)
		for _, cpuId := range ppm.PoolMetrics[poolName].CPUIds {
			poolDeltaIdleTime += cpuTimeStat.DeltaIdleTime[cpuId]
			poolDeltaTotalTime += cpuTimeStat.DeltaTotalTime[cpuId]
		}
		poolCPUUsageList[poolName] = 0.0
		if poolDeltaTotalTime != 0 {
			sys, err := sysfs.DiscoverSystem()
			if err != nil {
				return nil, err
			}
			poolCPUOnlined := ppm.PoolMetrics[poolName].CPUs.Difference(sys.Offlined())
			poolCPUUsageList[poolName] = (1.0 - float64(poolDeltaIdleTime)/float64(poolDeltaTotalTime)) * 100.0 * float64(len(poolCPUOnlined.List()))
		}
		poolCPUUsageMetrics[index] = prometheus.MustNewConstMetric(
			descriptors[poolCPUUsageDesc],
			prometheus.GaugeValue,
			poolCPUUsageList[poolName],
			PolicyName,
			poolName,
			ppm.PoolMetrics[poolName].DefName,
			ppm.PoolMetrics[poolName].CPUs.String(),
			ppm.PoolMetrics[poolName].Memory,
			ppm.PoolMetrics[poolName].MilliCPUs,
			ppm.PoolMetrics[poolName].PodNames,
			ppm.PoolMetrics[poolName].ContainerNames,
		)
	}
	return poolCPUUsageMetrics, nil
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/podpools/podpools-policy.go
================================================
// Copyright 2020-2021 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package podpools

import (
	"fmt"
	"sort"
	"strconv"
	"strings"

	corev1 "k8s.io/api/core/v1"
	resapi "k8s.io/apimachinery/pkg/api/resource"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cpuallocator"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	logger "github.com/intel/cri-resource-manager/pkg/log"
	"github.com/intel/cri-resource-manager/pkg/utils"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	idset "github.com/intel/goresctrl/pkg/utils"
)

const (
	// PolicyName is the name used to activate this policy.
	PolicyName = "podpools"
	// PolicyDescription is a short description of this policy.
	PolicyDescription = "Pod-granularity workload placement"
	// PolicyPath is the path of this policy in the configuration hierarchy.
	PolicyPath = "policy." + PolicyName
	// podpoolKey is a pod annotation key, the value is a pod pool name.
	podpoolKey = "pool." + PolicyName + "." + kubernetes.ResmgrKeyNamespace
	// reservedPoolDefName is the name of the reserved pool definition.
	reservedPoolDefName = "reserved"
	// defaultPoolDefName is the name of the default pool definition.
	defaultPoolDefName = "default"
	// podMilliCPUErrorMargin is the maximum error in requested vs
	// allocated mCPUs per pod. For instance, a 10 mCPU error margin
	// allows an error of magnitude +-0.5 mCPU/container for up to 20
	// containers/pod.
	podMilliCPUErrorMargin = int64(10)
)

// podpools contains configuration and runtime attributes of the podpools policy.
type podpools struct {
	options         *policyapi.BackendOptions // configuration common to all policies
	ppoptions       PodpoolsOptions           // podpools-specific configuration
	cch             cache.Cache               // cri-resmgr cache
	allowed         cpuset.CPUSet             // bounding set of CPUs we're allowed to use
	reserved        cpuset.CPUSet             // system-/kube-reserved CPUs
	reservedPoolDef *PoolDef                  // built-in definition of the reserved pool
	defaultPoolDef  *PoolDef                  // built-in definition of the default pool
	pools           []*Pool                   // pools for pods: reserved, default and user-defined
	podMaxMilliCPU  map[string]int64          // maximum total MilliCPUs requested by containers of pods in pools
	cpuAllocator    cpuallocator.CPUAllocator // CPU allocator used by the policy
}

// Pool contains the attributes of a pool instance.
type Pool struct {
	// Def is the definition from which this pool instance is created.
	Def *PoolDef
	// Instance is the index of this pool instance, starting from
	// zero for every pool definition.
	Instance int
	// CPUs is the set of CPUs exclusive to this pool instance only.
	CPUs cpuset.CPUSet
	// Mems is the set of memory nodes with minimal access delay
	// from CPUs.
	Mems idset.IDSet
	// PodIDs maps pod ID to a list of container IDs.
	// - len(PodIDs) is the number of pods in the pool.
	// - len(PodIDs[podID]) is the number of containers of podID
	//   currently assigned to the pool.
	// - Def.MaxPods - len(PodIDs) is the free pod capacity.
	PodIDs map[string][]string
}

var log logger.Logger = logger.NewLogger("policy")
// String is a stringer for a pool.
func (pool Pool) String() string {
	podCount := len(pool.PodIDs)
	contCount := 0
	for _, contIDs := range pool.PodIDs {
		contCount += len(contIDs)
	}
	s := fmt.Sprintf("%s{cpus:%s, mems:%s, pods:%d/%d, containers:%d}",
		pool.PrettyName(), pool.CPUs, pool.Mems,
		podCount, pool.Def.MaxPods, contCount)
	return s
}

// PrettyName returns a unique name for a pool.
func (pool Pool) PrettyName() string {
	return fmt.Sprintf("%s[%d]", pool.Def.Name, pool.Instance)
}

// CreatePodpoolsPolicy creates a new policy instance.
func CreatePodpoolsPolicy(policyOptions *policy.BackendOptions) policy.Backend {
	p := &podpools{
		options: policyOptions,
		cch:     policyOptions.Cache,
		reservedPoolDef: &PoolDef{
			Name:    reservedPoolDefName,
			MaxPods: 0,
		},
		defaultPoolDef: &PoolDef{
			Name:    defaultPoolDefName,
			MaxPods: 0,
		},
		podMaxMilliCPU: make(map[string]int64),
		cpuAllocator:   cpuallocator.NewCPUAllocator(policyOptions.System),
	}
	log.Info("creating %s policy...", PolicyName)
	// Handle common policy options: AvailableResources and ReservedResources.
	// p.allowed: CPUs available to the policy.
	if allowed, ok := policyOptions.Available[policyapi.DomainCPU]; ok {
		p.allowed = allowed.(cpuset.CPUSet)
	} else {
		// Available CPUs not specified, default to all on-line CPUs.
		p.allowed = policyOptions.System.CPUSet().Difference(policyOptions.System.Offlined())
	}
	// p.reserved: CPUs reserved for kube-system pods, a subset of p.allowed.
	p.reserved = cpuset.New()
	if reserved, ok := p.options.Reserved[policyapi.DomainCPU]; ok {
		switch v := reserved.(type) {
		case cpuset.CPUSet:
			p.reserved = p.allowed.Intersection(v)
		case resapi.Quantity:
			reserveCnt := (int(v.MilliValue()) + 999) / 1000
			cpus, err := p.cpuAllocator.AllocateCpus(&p.allowed, reserveCnt, cpuallocator.PriorityNone)
			if err != nil {
				log.Fatal("failed to allocate reserved CPUs: %s", err)
			}
			p.reserved = cpus
			p.allowed = p.allowed.Union(cpus)
		}
	}
	if p.reserved.IsEmpty() {
		log.Fatal("%s cannot run without reserved CPUs that are also AvailableResources", PolicyName)
	}
	// Handle policy-specific options.
	log.Debug("creating %s configuration", PolicyName)
	if err := p.setConfig(podpoolsOptions); err != nil {
		log.Fatal("failed to create %s policy: %v", PolicyName, err)
	}
	pkgcfg.GetModule(PolicyPath).AddNotify(p.configNotify)
	return p
}
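// Note on the ReservedResources CPU quantity handled above (an
// illustration, the quantity is made up): a milli-CPU quantity is
// rounded up to full CPUs, so "cpu: 750m" reserves
// (750+999)/1000 = 1 CPU, allocated out of the allowed set.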
return podpoolsError("cannot find CPUs to run container %s - no default or reserved CPUs available", c.PrettyName()) } return nil } // ReleaseResources is a resource release request for this policy. func (p *podpools) ReleaseResources(c cache.Container) error { log.Debug("releasing container %s...", c.PrettyName()) pod, ok := c.GetPod() if !ok { return podpoolsError("cannot find pod of container %s from the cache", c.PrettyName()) } if pool := p.allocatedPool(pod); pool != nil { p.dismissContainer(c, pool) if log.DebugEnabled() { log.Debug(p.dumpPool(pool)) } if p.containersInPool(pod, pool) == 0 { log.Debug("all containers removed, free pool allocation %s for pod %q", pool.PrettyName(), pod.GetName()) p.validatePodCPU(pod, pool) p.freePool(pod, pool) } } else { log.Debug("ReleaseResources: pool-less container %s, nothing to release", c.PrettyName()) } return nil } // UpdateResources is a resource allocation update request for this policy. func (p *podpools) UpdateResources(c cache.Container) error { log.Debug("(not) updating container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (p *podpools) Rebalance() (bool, error) { log.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (p *podpools) HandleEvent(*events.Policy) (bool, error) { log.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. func (p *podpools) ExportResourceData(c cache.Container) map[string]string { return nil } // Introspect provides data for external introspection. func (p *podpools) Introspect(*introspect.State) { return } // allocatedPool returns a pool already allocated for a pod. func (p *podpools) allocatedPool(pod cache.Pod) *Pool { podID := pod.GetID() pools := filterPools(p.pools, func(pl *Pool) bool { _, ok := pl.PodIDs[podID]; return ok }) if len(pools) == 0 { return nil } return pools[0] } // allocatePool returns a pool allocated for a pod. func (p *podpools) allocatePool(pod cache.Pod) *Pool { if pool := p.allocatedPool(pod); pool != nil { return pool } poolDef := p.getPoolDef(pod) if poolDef == nil { return nil } // Try to find a suitable pool and allocate it for the pod. pools := filterPools(p.pools, func(pl *Pool) bool { return poolDef.Name == pl.Def.Name && (pl.Def.MaxPods > len(pl.PodIDs) || pl.Def.MaxPods == 0) }) // Sort pools according to pool type fill order so that the // first pool in the list is the preferred one. switch poolDef.FillOrder { case FillBalanced: sort.Slice(pools, func(i, j int) bool { return len(pools[i].PodIDs) < len(pools[j].PodIDs) }) case FillPacked: sort.Slice(pools, func(i, j int) bool { return len(pools[i].PodIDs) > len(pools[j].PodIDs) }) case FillFirstFree: // FirstFree is already the first of the pools list. } if len(pools) == 0 { log.Error("cannot find free %q pool for pod %q, falling back to %q", poolDef.Name, pod.GetName(), defaultPoolDefName) pools = []*Pool{p.pools[1]} } // Found a suitable pool. Allocate it for the pod. podID := pod.GetID() pool := pools[0] pool.PodIDs[podID] = []string{} log.Debug("allocated pool %s[%d] for pod %q", pool.Def.Name, pool.Instance, pod.GetName()) return pool } // containersInPool returns the number of containers of a pod in a pool. func (p *podpools) containersInPool(pod cache.Pod, pool *Pool) int { if cnts, ok := pool.PodIDs[pod.GetID()]; ok { return len(cnts) } return 0 } // dumpPool dumps pool contents in detail. 
// containersInPool returns the number of containers of a pod in a pool.
func (p *podpools) containersInPool(pod cache.Pod, pool *Pool) int {
	if cnts, ok := pool.PodIDs[pod.GetID()]; ok {
		return len(cnts)
	}
	return 0
}

// dumpPool dumps pool contents in detail.
func (p *podpools) dumpPool(pool *Pool) string {
	conts := []string{}
	pods := []string{}
	for podID, contIDs := range pool.PodIDs {
		podName := podID
		if pod, ok := p.cch.LookupPod(podID); ok {
			podName = pod.GetName()
		}
		pods = append(pods, fmt.Sprintf("%s (mCPU: %d, max=%d)", podName, p.getPodMilliCPU(podID), p.podMaxMilliCPU[podID]))
		for _, contID := range contIDs {
			if cont, ok := p.cch.LookupContainer(contID); ok {
				conts = append(conts, cont.PrettyName())
			} else {
				conts = append(conts, podName+":"+contID)
			}
		}
	}
	s := fmt.Sprintf("Pool{Def.Name: %q, Instance: %d, CPUs: %s, Mems: %s, Def.MaxPods: %d, pods: %v, containers:%v}",
		pool.Def.Name, pool.Instance, pool.CPUs, pool.Mems, pool.Def.MaxPods, pods, conts)
	return s
}

// freePool removes an empty pod from a pool.
func (p *podpools) freePool(pod cache.Pod, pool *Pool) {
	podID := pod.GetID()
	delete(pool.PodIDs, podID)
	delete(p.podMaxMilliCPU, podID)
}

// trackPodCPU keeps track of a pod's CPU requests.
func (p *podpools) trackPodCPU(pod cache.Pod, pool *Pool) {
	// As we do not have direct information on the total CPU resources
	// requested by a pod, we gather the information indirectly by
	// tracking the sum of the requested CPUs of its running
	// containers. This enables reacting to misalignment between
	// CPU resources per pod in a pool and the CPU resource requests
	// visible to the kube-scheduler.
	podID := pod.GetID()
	current := p.getPodMilliCPU(podID)
	if max, ok := p.podMaxMilliCPU[podID]; ok {
		if max < current {
			p.podMaxMilliCPU[podID] = current
		}
	} else {
		p.podMaxMilliCPU[podID] = current
	}
	// Check for overbooking.
	if cpuAvail := p.availableMilliCPUs(pool); cpuAvail < 0 {
		log.Error("overbooked pool %q, cpuset:%s: %dm / %dm CPUs used, %d mCPU available",
			pool.PrettyName(), pool.CPUs,
			pool.CPUs.Size()*1000-int(cpuAvail), pool.CPUs.Size()*1000, cpuAvail)
	}
}

// validatePodCPU compares max CPU requests against pool CPU capacity per pod.
func (p *podpools) validatePodCPU(pod cache.Pod, pool *Pool) {
	// Log a pod configuration error if a pool has a fixed amount of
	// CPUs per pod but the pod failed to request the correct
	// amount.
	podID := pod.GetID()
	if podmCPU, ok := p.podMaxMilliCPU[podID]; ok {
		if pool.Def.MaxPods > 0 {
			poolmCPUperPod := int64(pool.CPUs.Size() * 1000 / pool.Def.MaxPods)
			mCPUerr := podmCPU - poolmCPUperPod
			// Allow rounding errors (up and down) when
			// comparing the sum of containers' CPU usages
			// against milli-CPUs allocated per pod in its
			// pool.
			if mCPUerr < -podMilliCPUErrorMargin || mCPUerr > podMilliCPUErrorMargin {
				podName := ""
				if pod, ok := p.cch.LookupPod(podID); ok {
					podName = pod.GetName()
				}
				log.Error("bad CPU requests: pod %q requested %d mCPUs, but in pool %q pods must request %d mCPUs.",
					podName, podmCPU, pool.Def.Name, poolmCPUperPod)
			}
		}
	}
}
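// Worked example of the margin check above (the numbers are
// illustrative): in a pool with 4 CPUs and MaxPods: 2, each pod should
// request 4*1000/2 = 2000 mCPU. A pod whose containers sum to 1995 mCPU
// is within the +-10 mCPU margin, while 1900 mCPU triggers the error
// log.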
// getPodMilliCPU returns the mCPUs requested by podID.
func (p *podpools) getPodMilliCPU(podID string) int64 {
	cpuRequested := int64(0)
	for _, c := range p.cch.GetContainers() {
		if c.GetPodID() == podID {
			if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok {
				cpuRequested += reqCpu.MilliValue()
			}
		}
	}
	return cpuRequested
}

// configNotify applies a new configuration.
func (p *podpools) configNotify(event pkgcfg.Event, source pkgcfg.Source) error {
	log.Info("configuration %s", event)
	if err := p.setConfig(podpoolsOptions); err != nil {
		log.Error("config update failed: %v", err)
		return err
	}
	log.Info("config updated successfully")
	p.Sync(p.cch.GetContainers(), nil)
	return nil
}

// getPoolDefName returns the name of the pool definition of a pod.
func (p *podpools) getPoolDefName(pod cache.Pod) string {
	if poolDefName, ok := pod.GetEffectiveAnnotation(podpoolKey, ""); ok {
		return poolDefName
	}
	if pod.GetNamespace() == "kube-system" {
		return reservedPoolDefName
	}
	return defaultPoolDefName
}

// getPoolDef returns the pool definition of a pod.
func (p *podpools) getPoolDef(pod cache.Pod) *PoolDef {
	poolDefName := p.getPoolDefName(pod)
	if poolDefName == reservedPoolDefName {
		return p.reservedPoolDef
	}
	if poolDefName == defaultPoolDefName {
		return p.defaultPoolDef
	}
	for _, poolDef := range p.ppoptions.PoolDefs {
		if poolDef.Name == poolDefName {
			return poolDef
		}
	}
	log.Error("pod %q pool %q does not match any pool definition, falling back to %q",
		pod.GetName(), poolDefName, p.defaultPoolDef.Name)
	return p.defaultPoolDef
}

// applyPoolDef creates user-defined pools or reconfigures built-in
// pools according to the poolDef.
func (p *podpools) applyPoolDef(pools *[]*Pool, poolDef *PoolDef, freeCpus *cpuset.CPUSet, nonReservedCpuCount int) error {
	if len(*pools) < 2 {
		return podpoolsError("internal error: reserved and default pools missing, cannot apply pool definitions")
	}
	reservedPool := (*pools)[0]
	defaultPool := (*pools)[1]
	// Every PoolDef does one of the following:
	// 1. reconfigures the "reserved" pool (most restricted)
	// 2. reconfigures the "default" pool (somewhat restricted)
	// 3. defines new user-defined pools.
	switch poolDef.Name {
	case "":
		// Case 0: bad name.
		return podpoolsError("undefined or empty pool name")
	case reservedPool.Def.Name:
		// Case 1: reconfigure the "reserved" pool.
		// Forbid redefinition of CPU and Instances.
		if poolDef.CPU != "" || poolDef.Instances != "" {
			poolCount, cpusPerPool, err := parseInstancesCPUs(poolDef.Instances, poolDef.CPU, nonReservedCpuCount)
			if err != nil {
				return podpoolsError("pool %q: %w", poolDef.Name, err)
			}
			if poolCount != 1 {
				return podpoolsError("pool %q: cannot change the number of instances", poolDef.Name)
			}
			if cpusPerPool != reservedPool.CPUs.Size() {
				return podpoolsError("pool %q: number of CPUs conflicts with ReservedResources CPUs", poolDef.Name)
			}
		}
		reservedPool.Def.MaxPods = poolDef.MaxPods
	case defaultPool.Def.Name:
		// Case 2: reconfigure the "default" pool.
		// Allow redefinition of CPU but not Instances.
		if poolDef.CPU != "" || poolDef.Instances != "" {
			poolCount, cpusPerPool, err := parseInstancesCPUs(poolDef.Instances, poolDef.CPU, nonReservedCpuCount)
			if err != nil {
				return podpoolsError("pool %q: %w", poolDef.Name, err)
			}
			if poolCount != 1 {
				return podpoolsError("pool %q: cannot change the number of instances", poolDef.Name)
			}
			cpus, err := p.cpuAllocator.AllocateCpus(freeCpus, cpusPerPool, cpuallocator.PriorityNormal)
			if err != nil {
				return podpoolsError("could not allocate %d CPUs for pool %q: %w", cpusPerPool, poolDef.Name, err)
			}
			defaultPool.CPUs = cpus
		}
		defaultPool.Def.MaxPods = poolDef.MaxPods
	default:
		// Case 3: create new user-defined pool(s).
poolCount, cpusPerPool, err := parseInstancesCPUs(poolDef.Instances, poolDef.CPU, nonReservedCpuCount) if err != nil { return podpoolsError("pool %q: %w", poolDef.Name, err) } if poolCount == 0 { return podpoolsError("pool %q: insufficient CPUs to create any instances", poolDef.Name) } if poolCount > 1 && poolDef.FillOrder == FillPacked && poolDef.MaxPods == 0 { return podpoolsError("pool %q: %d pool(s) unreachable due to unlimited pod capacity and FillOrder: %s", poolDef.Name, poolCount-1, poolDef.FillOrder) } log.Debug("allocating %d out of %d non-reserved CPUs for %d %q pools", poolCount*cpusPerPool, nonReservedCpuCount, poolCount, poolDef.Name) for poolIndex := 0; poolIndex < poolCount; poolIndex++ { if cpusPerPool > freeCpus.Size() { return podpoolsError("insufficient CPUs when trying to allocate %d CPUs for pool %s[%d]", cpusPerPool, poolDef.Name, poolIndex) } cpus, err := p.cpuAllocator.AllocateCpus(freeCpus, cpusPerPool, cpuallocator.PriorityNormal) if err != nil { return podpoolsError("could not allocate %d CPUs for instance %d of pool %q: %w", cpusPerPool, poolIndex, poolDef.Name, err) } pool := Pool{ Def: poolDef, Instance: poolIndex, CPUs: cpus, } *pools = append(*pools, &pool) } } return nil } // setConfig takes new pool configuration into use. func (p *podpools) setConfig(ppoptions *PodpoolsOptions) error { // Instantiate pools for pods. pools := []*Pool{} // Built-in reserved pool. reservedPool := Pool{ Def: p.reservedPoolDef, CPUs: p.reserved, } pools = append(pools, &reservedPool) // Built-in default pool. // The default pool will use reserved CPUs by default. If CPUs // are left over after constructing user-defined pools, those // will be used as the default pool instead. defaultPool := Pool{ Def: p.defaultPoolDef, CPUs: reservedPool.CPUs, } pools = append(pools, &defaultPool) // Apply pool definitions from configuration. freeCpus := p.allowed.Clone() freeCpus = freeCpus.Difference(p.reserved) nonReservedCpuCount := freeCpus.Size() userPoolDefs := 0 // First apply customizations to built-in pools: "reserved" // and "default". for _, poolDef := range ppoptions.PoolDefs { if poolDef.Name != reservedPoolDefName && poolDef.Name != defaultPoolDefName { continue } if err := p.applyPoolDef(&pools, poolDef, &freeCpus, nonReservedCpuCount); err != nil { return err } } // Update nonReservedCpuCount: if the default pool is customized // with its own CPUs, do not count those CPUs in the // "Instances: 100%" syntax of user-defined pools. nonReservedCpuCount = freeCpus.Size() // Apply all user pool definitions, skip "reserved" and "default". for _, poolDef := range ppoptions.PoolDefs { if poolDef.Name == reservedPoolDefName || poolDef.Name == defaultPoolDefName { continue } if err := p.applyPoolDef(&pools, poolDef, &freeCpus, nonReservedCpuCount); err != nil { return err } userPoolDefs += 1 } // Check if there are unallocated CPUs. if freeCpus.Size() > 0 { if defaultPool.CPUs.Intersection(reservedPool.CPUs).IsEmpty() { // User has reallocated "default" pool CPUs log.Debug("%d unused CPUs are added to the default pool.", freeCpus.Size()) defaultPool.CPUs = defaultPool.CPUs.Union(freeCpus) } else { log.Debug("%d unused CPUs are used as the default pool.", freeCpus.Size()) defaultPool.CPUs = freeCpus } } // Finish pool instance initialization.
log.Info("%s policy pools:", PolicyName) for index, pool := range pools { pool.Mems = p.closestMems(pool.CPUs) pool.PodIDs = make(map[string][]string) log.Info("- pool %d: %s", index, pool) } // No errors in pool creation, take new configuration into use. log.Debug("new %s configuration:\n%s", PolicyName, utils.DumpJSON(ppoptions)) p.pools = pools p.ppoptions = *ppoptions // Warning on multiple user-defined pools. if userPoolDefs > 1 { log.Warn("Multiple (%d) user-defined pool definitions on the node. kube-scheduler does not know which of the pools has CPUs left for new workloads, and may overbook pools on the node.", userPoolDefs) } return nil } // closestMems returns memory node IDs good for pinning containers // that run on given CPUs func (p *podpools) closestMems(cpus cpuset.CPUSet) idset.IDSet { mems := idset.NewIDSet() sys := p.options.System for _, nodeID := range sys.NodeIDs() { if !cpus.Intersection(sys.Node(nodeID).CPUSet()).IsEmpty() { mems.Add(nodeID) } } return mems } // filterPools returns pools for which the test function returns true func filterPools(pools []*Pool, test func(*Pool) bool) (ret []*Pool) { for _, pool := range pools { if test(pool) { ret = append(ret, pool) } } return } // parseInstancesCPUs parses the number of pool instances and the // number of CPUs per pool instance from PoolDef Instances and CPUs // fields. func parseInstancesCPUs(is string, cs string, freeCpus int) (int, int, error) { if cs == "" { return 0, 0, podpoolsError("missing CPUs") } c64, err := strconv.ParseInt(cs, 0, 32) if err != nil || c64 <= 0 { return 0, 0, podpoolsError("invalid CPUs per pool: %q, integer > 1 expected", cs) } cpusPerPool := int(c64) // Supported Instances specifications: // 0. Instances is an empty string. // Create 1 instance. // 1. Instances: N % // Use at most N % of freeCpus for all PoolDef instances. // The number of instances is floor(freeCpus * N/100 / cpusPerPool). // 2. Instances: N CPUs // Use at most N CPUs for all PoolDef instances. // The number of instances is floor(N / cpusPerPool). // 3. Instances: N // Create N instances from PoolDef. var instances int switch { case is == "": instances = 1 case strings.HasSuffix(is, "%"): tis := strings.TrimSpace(strings.TrimSuffix(is, "%")) i64, err := strconv.ParseInt(tis, 0, 32) if err != nil || i64 < 0 { return 0, 0, podpoolsError("invalid Instances: %q", is) } instances = freeCpus * int(i64) / 100 / cpusPerPool case strings.HasSuffix(strings.ToLower(is), "cpu"): // All these are equivalent: N(cpu|cpus|CPU|CPUs|CPUS) for any N > 0. // Handling "CPU" suffix is an alias for "CPUs". is = strings.TrimSpace(strings.TrimSuffix(strings.ToLower(is), "cpu")) + "cpus" fallthrough case strings.HasSuffix(strings.ToLower(is), "cpus"): tis := strings.TrimSpace(strings.TrimSuffix(strings.ToLower(is), "cpus")) i64, err := strconv.ParseInt(tis, 0, 32) if err != nil || i64 < 0 { return 0, 0, podpoolsError("invalid Instances: %q", is) } if i64 > int64(freeCpus) { return 0, 0, podpoolsError("insufficient CPUs: %d required for instances but %d is available", i64, freeCpus) } instances = int(i64) / cpusPerPool default: i64, err := strconv.ParseInt(is, 0, 32) if err != nil || i64 < 0 { return 0, 0, podpoolsError("invalid Instances: %q", is) } instances = int(i64) } return instances, cpusPerPool, nil } // availableMilliCPU returns mCPUs available in a pool. 
func (p *podpools) availableMilliCPUs(pool *Pool) int64 { cpuAvail := int64(pool.CPUs.Size() * 1000) cpuRequested := int64(0) for podID := range pool.PodIDs { cpuRequested += p.getPodMilliCPU(podID) } return cpuAvail - cpuRequested } // assignContainer adds a container to a pool. func (p *podpools) assignContainer(c cache.Container, pool *Pool) { log.Info("assigning container %s to pool %s", c.PrettyName(), pool) podID := c.GetPodID() pool.PodIDs[podID] = append(pool.PodIDs[podID], c.GetCacheID()) p.pinCpuMem(c, pool.CPUs, pool.Mems) } // dismissContainer removes a container from a pool. func (p *podpools) dismissContainer(c cache.Container, pool *Pool) { podID := c.GetPodID() pool.PodIDs[podID] = removeString(pool.PodIDs[podID], c.GetCacheID()) } // pinCpuMem pins a container to CPUs and memory nodes, if so configured. func (p *podpools) pinCpuMem(c cache.Container, cpus cpuset.CPUSet, mems idset.IDSet) { if p.ppoptions.PinCPU { log.Debug(" - pinning to cpuset: %s", cpus) c.SetCpusetCpus(cpus.String()) if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok { mCpu := int(reqCpu.MilliValue()) c.SetCPUShares(int64(cache.MilliCPUToShares(int64(mCpu)))) } } if p.ppoptions.PinMemory { log.Debug(" - pinning to memory %s", mems) c.SetCpusetMems(mems.String()) } } // podpoolsError formats an error from this policy. func podpoolsError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } // removeString removes the first occurrence of a string from a string slice. func removeString(strings []string, element string) []string { for index, s := range strings { if s == element { strings[index] = strings[len(strings)-1] return strings[:len(strings)-1] } } return strings } // Register us as a policy implementation. func init() { policy.Register(PolicyName, PolicyDescription, CreatePodpoolsPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/podpools/podpools-policy_test.go ================================================ // Copyright 2020-2021 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package podpools import ( "fmt" "strings" "testing" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) func validateError(t *testing.T, expectedError string, err error) bool { if expectedError != "" { if err == nil { t.Errorf("Expected error containing %q, did not get any error", expectedError) return false } else if !strings.Contains(err.Error(), expectedError) { t.Errorf("Expected error containing %q, but got %q", expectedError, err.Error()) return false } } else { if err != nil { t.Errorf("Unexpected error %s", err) return false } } return true } func assertEqualPools(t *testing.T, expectedPool, gotPool Pool) bool { if expectedPool.String() != gotPool.String() { // Compares Def.Name, .Instance, .CPUs, .Mems, Def.MaxPods // and assigned pods/containers.
t.Errorf("expected pool %s, got %s", expectedPool, gotPool) return false } if expectedPool.Def.Instances != gotPool.Def.Instances { t.Errorf("pools %s: PoolDef.Instances differ: expected %q, got %q", expectedPool, expectedPool.Def.Instances, gotPool.Def.Instances) return false } if expectedPool.Def.FillOrder != gotPool.Def.FillOrder { t.Errorf("pools %s: PoolDef.FillOrder differ: expected %s, got %s", expectedPool, expectedPool.Def.FillOrder, gotPool.Def.FillOrder) return false } return true } type mockCpuAllocator struct{} func (mca *mockCpuAllocator) AllocateCpus(from *cpuset.CPUSet, cnt int, dontcare cpuallocator.CPUPriority) (cpuset.CPUSet, error) { switch { case from.Size() < cnt: return cpuset.New(), fmt.Errorf("cpuset %s does not have %d CPUs", from, cnt) case from.Size() == cnt: result := from.Clone() *from = cpuset.New() return result, nil default: result := cpuset.New() for _, cpu := range from.List() { if result.Size() >= cnt { break } result = result.Union(cpuset.New(cpu)) } *from = from.Difference(result) return result, nil } } func (mca *mockCpuAllocator) ReleaseCpus(*cpuset.CPUSet, int, cpuallocator.CPUPriority) (cpuset.CPUSet, error) { return cpuset.New(), nil } func TestApplyPoolDef(t *testing.T) { reservedCpus1 := cpuset.CPUSet{} reservedPoolDef := PoolDef{ Name: reservedPoolDefName, } defaultPoolDef := PoolDef{ Name: defaultPoolDefName, } reservedPool := Pool{ Def: &reservedPoolDef, CPUs: reservedCpus1, } defaultPool := Pool{ Def: &defaultPoolDef, CPUs: reservedCpus1, } normalPoolsAtStart := []Pool{reservedPool, defaultPool} singlecpuSingleInstance := PoolDef{ Name: "singlecpu", CPU: "1", } quadcpuDualInstance := PoolDef{ Name: "quadcpu", CPU: "4", Instances: "8 CPUs", } quadcpuMultiInstance := PoolDef{ Name: "quadcpu", CPU: "4", Instances: "100%", } tcases := []struct { name string pools *[]Pool poolDef PoolDef freeCpus string // example: "0-2" expectedFreeCpus string // "": no check, "-": assert empty expectedError string // "": error is not allowed, otherwise expected error substring expectedPools *[]Pool }{ // negative tests { name: "call apply without built-in pools", pools: &([]Pool{}), poolDef: singlecpuSingleInstance, freeCpus: "0-3", expectedError: "pools missing", }, { name: "bad reserved CPUs", poolDef: PoolDef{ Name: "reserved", CPU: "two", }, expectedError: "invalid CPUs", }, { name: "bad reserved Instances", poolDef: PoolDef{ Name: "reserved", CPU: "1", Instances: "0x", }, expectedError: "invalid Instances", }, { name: "bad default CPUs", poolDef: PoolDef{ Name: "default", CPU: "2500m", }, freeCpus: "0-8", expectedError: "invalid CPUs", }, { name: "bad default Instances", poolDef: PoolDef{ Name: "default", CPU: "0xf", Instances: "100 % CPUs", }, freeCpus: "0-95", expectedError: "invalid Instances", }, { name: "bad user-defined CPUs", poolDef: PoolDef{ Name: "mypool", }, freeCpus: "0-8", expectedError: "missing CPUs", }, { name: "too many CPUs on user-defined Instances", poolDef: PoolDef{ Name: "user pool", CPU: "1", Instances: "100 CPUs", }, freeCpus: "0-95", expectedError: "insufficient CPUs", }, { name: "unnamed pool", poolDef: PoolDef{ CPU: "1", MaxPods: 1, }, freeCpus: "0-3", expectedError: "undefined or empty pool name", }, { name: "unreachable pools", poolDef: PoolDef{ Name: "unlimited capacity", CPU: "3", MaxPods: 0, FillOrder: FillPacked, Instances: "3", }, freeCpus: "0-95", expectedError: "2 pool(s) unreachable", }, // redefine the reserved pool { name: "redefine reserved CPUs", poolDef: PoolDef{ Name: "reserved", CPU: "2", }, freeCpus: "0-3", 
expectedError: "conflicting ReservedResources CPUs", }, { name: "redefine reserved instances", poolDef: PoolDef{ Name: "reserved", CPU: "1", Instances: "2", }, freeCpus: "0-3", expectedError: "cannot change the number of instances", }, { name: "redefine reserved MaxPods", poolDef: PoolDef{ Name: "reserved", MaxPods: 42, }, freeCpus: "0-3", expectedPools: &[]Pool{ { Def: &PoolDef{ Name: reservedPoolDefName, MaxPods: 42, }, CPUs: reservedPool.CPUs, }, defaultPool, }, }, // redefine the default pool { name: "redefine default CPUs", poolDef: PoolDef{ Name: "default", CPU: "2", }, freeCpus: "0-3", expectedFreeCpus: "2-3", expectedPools: &[]Pool{ reservedPool, { Def: &PoolDef{ Name: defaultPoolDefName, }, CPUs: cpuset.MustParse("0-1"), }, }, }, { name: "redefine default instances", poolDef: PoolDef{ Name: "default", CPU: "1", Instances: "2", }, freeCpus: "0-3", expectedError: "cannot change the number of instances", }, { name: "redefine default MaxPods", poolDef: PoolDef{ Name: "default", MaxPods: 52, }, freeCpus: "0-3", expectedPools: &[]Pool{ reservedPool, { Def: &PoolDef{ Name: defaultPoolDefName, MaxPods: 52, }, CPUs: defaultPool.CPUs, }, }, }, // user-defined pools { name: "use one CPUs - insufficient", poolDef: singlecpuSingleInstance, expectedError: "insufficient CPUs", }, { name: "use one CPU", freeCpus: "0-3", poolDef: singlecpuSingleInstance, expectedFreeCpus: "1-3", expectedPools: &[]Pool{ reservedPool, defaultPool, { Def: &singlecpuSingleInstance, Instance: 0, CPUs: cpuset.MustParse("0"), }, }, }, { name: "use the only CPU", freeCpus: "0", poolDef: singlecpuSingleInstance, expectedFreeCpus: "-", }, { name: "use 2x4 CPUs - insufficient", freeCpus: "0-6", poolDef: quadcpuDualInstance, expectedError: "insufficient CPUs", }, { name: "use 2x4 CPUs - consume all", freeCpus: "0-7", poolDef: quadcpuDualInstance, expectedFreeCpus: "-", }, { name: "use 2x4 CPUs - CPUs left", freeCpus: "0-8", poolDef: quadcpuDualInstance, expectedFreeCpus: "8", }, { name: "use all cpus - but insufficient", freeCpus: "0-2", poolDef: quadcpuMultiInstance, expectedError: "insufficient CPUs", }, { name: "use all cpus - partial", freeCpus: "0-6", poolDef: quadcpuMultiInstance, expectedFreeCpus: "4-6", expectedPools: &[]Pool{ reservedPool, defaultPool, { Def: &quadcpuMultiInstance, Instance: 0, CPUs: cpuset.MustParse("0-3"), }, }, }, { name: "use all cpus - every single one", freeCpus: "0-7", poolDef: quadcpuMultiInstance, expectedFreeCpus: "-", expectedPools: &[]Pool{ reservedPool, defaultPool, { Def: &quadcpuMultiInstance, Instance: 0, CPUs: cpuset.MustParse("0-3"), }, { Def: &quadcpuMultiInstance, Instance: 1, CPUs: cpuset.MustParse("4-7"), }, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { // Tests should not change original pools/pooldefs/freeCpus // Create copies before calling the function. 
pools := []*Pool{} if tc.pools == nil { tc.pools = &normalPoolsAtStart } for i := range *tc.pools { copyOfPool := (*tc.pools)[i] pools = append(pools, &copyOfPool) } freeCpus := cpuset.New() if tc.freeCpus != "" { freeCpus = cpuset.MustParse(tc.freeCpus) } p := &podpools{ cpuAllocator: &mockCpuAllocator{}, } err := p.applyPoolDef(&pools, &tc.poolDef, &freeCpus, freeCpus.Size()) if ok := validateError(t, tc.expectedError, err); ok { // check freeCpus modified by applyPoolDef if tc.expectedFreeCpus != "" { expectedFreeCpus := cpuset.New() if tc.expectedFreeCpus != "-" { expectedFreeCpus = cpuset.MustParse(tc.expectedFreeCpus) } if expectedFreeCpus.Size() != freeCpus.Size() { t.Errorf("unexpected number of free CPUs left, expected %d, got %d", expectedFreeCpus.Size(), freeCpus.Size()) } } // check pools modified by applyPoolDef if tc.expectedPools != nil { if len(pools) != len(*tc.expectedPools) { t.Errorf("unexpected number of new pools, expected %d, got %d", len(*tc.expectedPools), len(pools)) return } for i := 0; i < len(pools); i++ { if !assertEqualPools(t, (*tc.expectedPools)[i], *pools[i]) { return } } } } }) } } func TestParseInstancesCPUs(t *testing.T) { tcases := []struct { name string instances string cpus string freeCpus int expectedInstances int expectedCPUs int expectedError string }{ { name: "empty CPUs", expectedError: "missing CPUs", }, { name: "bad CPUs", cpus: "55%", expectedError: ">= 1 expected", }, { name: "zero CPUs", cpus: "0", expectedError: ">= 1 expected", }, { name: "negative CPUs", cpus: "-1", expectedError: ">= 1 expected", }, { name: "42 CPUs, empty instances defaults to 1", cpus: "42", expectedCPUs: 42, expectedInstances: 1, }, { name: "instances: 0", instances: "0", cpus: "2", freeCpus: 100, expectedInstances: 0, expectedCPUs: 2, }, { name: "instances: N", instances: "10", cpus: "2", freeCpus: 100, expectedInstances: 10, expectedCPUs: 2, }, { name: "instances: N CPUs", instances: "10 CPUs", cpus: "2", freeCpus: 100, expectedInstances: 10 / 2, expectedCPUs: 2, }, { name: "instances: 1 CPUS", instances: "1 CPUS", cpus: "1", freeCpus: 1, expectedInstances: 1, expectedCPUs: 1, }, { name: "instances: 1 cpu", instances: "1 cpu", cpus: "1", freeCpus: 2, expectedInstances: 1, expectedCPUs: 1, }, { name: "instances: 8cpu", instances: "8cpu", cpus: "2", freeCpus: 9, expectedInstances: 4, expectedCPUs: 2, }, { name: "instances: N %", instances: "90 %", cpus: "2", freeCpus: 10, expectedInstances: 4, // 10 * (90/100) / 2 expectedCPUs: 2, }, { name: "instances: N%", instances: "90%", cpus: "90", freeCpus: 100, expectedInstances: 1, expectedCPUs: 90, }, { name: "instances: N %, not enough for any pools", instances: "10 %", cpus: "2", freeCpus: 10, expectedInstances: 0, // 10 * (10/100) / 2 expectedCPUs: 2, }, { name: "instances: -N", instances: "-10", cpus: "2", expectedError: "invalid Instances", }, { name: "instances: -N CPUs", instances: "-10 CPUs", cpus: "2", expectedError: "invalid Instances", }, { name: "instances: N CPUs CPU", instances: "2 CPUs CPU", cpus: "2", expectedError: "invalid Instances", }, { name: "instances: -N %", instances: "-10 %", cpus: "2", expectedError: "invalid Instances", }, { name: "instances: N CPUs, N < cpus", instances: "3 CPUs", cpus: "4", expectedError: "insufficient CPUs", }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { instances, cpus, err := parseInstancesCPUs(tc.instances, tc.cpus, tc.freeCpus) if ok := validateError(t, tc.expectedError, err); ok { if instances != tc.expectedInstances || cpus != tc.expectedCPUs {
t.Errorf("Expected (instances, cpus) (%v, %v), but got (%v, %v)", tc.expectedInstances, tc.expectedCPUs, instances, cpus) } } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static/flags.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package static import ( "github.com/intel/cri-resource-manager/pkg/config" "sigs.k8s.io/yaml" ) // Options captures our configurable policy parameters. type options struct { // Relax exclusive isolated CPU allocation criteria RelaxedIsolation bool `json:"RelaxedIsolation"` // Control whether containers are assigned to RDT classes by this policy. Rdt Tristate `json:"Rdt"` } // Tristate is boolean-like value with 3 states: on, off, automatically-determined. type Tristate int const ( // TristateOff is unconditional boolean false TristateOff = iota // TristateOn is unconditional boolean true TristateOn // TristateAuto indicates boolean value should be inferred using other data. TristateAuto ) // Our runtime configuration. var opt = defaultOptions().(*options) // UnmarshalJSON implements the unmarshaller function for "encoding/json" func (t *Tristate) UnmarshalJSON(data []byte) error { var value interface{} if err := yaml.Unmarshal(data, &value); err != nil { return policyError("invalid Tristate value '%s': %v", string(data), err) } switch value.(type) { case bool: *t = map[bool]Tristate{false: TristateOff, true: TristateOn}[value.(bool)] return nil case string: if value.(string) == "auto" { *t = TristateAuto return nil } } return policyError("invalid Tristate value %v of type %T", value, value) } // MarshalJSON implements the marshaller function for "encoding/json" func (t Tristate) MarshalJSON() ([]byte, error) { switch t { case TristateOff: return []byte("false"), nil case TristateOn: return []byte("true"), nil case TristateAuto: return []byte("\"auto\""), nil } return nil, policyError("invalid tristate value %v", t) } // String returns the value of Tristate as a string func (t *Tristate) String() string { switch *t { case TristateOff: return "false" case TristateOn: return "true" } return "auto" } // defaultOptions returns a new options instance, all initialized to defaults. func defaultOptions() interface{} { return &options{Rdt: TristateAuto} } // Register us for configuration handling. func init() { config.Register(PolicyPath, PolicyDescription, opt, defaultOptions) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static/static-policy.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package static import ( "fmt" "strconv" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "github.com/intel/cri-resource-manager/pkg/config" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/prometheus/client_golang/prometheus" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) const ( // PolicyName is the name used to activate this policy implementation. PolicyName = "static" // PolicyDescription is a short description of this policy. PolicyDescription = "A reimplementation of the static CPU Manager policy." // PolicyPath is the path of this policy in the configuration hierarchy. PolicyPath = "policy." + PolicyName ) type static struct { logger.Logger available policy.ConstraintSet // resource availability constraints reserved policy.ConstraintSet // system/kube-reservation constraints reservedCpus cpuset.CPUSet // CPUs reserved for system- and kube-tasks availableCpus cpuset.CPUSet // CPUs free usable by this policy isolatedCpus cpuset.CPUSet // available CPUs isolated from normal scheduling sys sysfs.System // system/topology information numHT int // number of hyperthreads per core state cache.Cache // policy/state cache cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy } // Make sure static implements the policy backend interface. var _ policy.Backend = &static{} const ( // keyPreferIsolated is the annotation used to mark pods preferring isolated CPUs. keyPreferIsolated = "prefer-isolated-cpus" ) // NewStaticPolicy creates a new policy instance. func NewStaticPolicy(opts *policy.BackendOptions) policy.Backend { s := &static{ Logger: logger.NewLogger(PolicyName), state: opts.Cache, sys: opts.System, available: opts.Available, reserved: opts.Reserved, cpuAllocator: cpuallocator.NewCPUAllocator(opts.System), } s.Info("creating policy...") s.numHT = s.sys.CPU(idset.ID(0)).ThreadCPUSet().Size() if err := s.checkConstraints(); err != nil { s.Fatal("cannot start with given constraints: %v", err) } config.GetModule(PolicyPath).AddNotify(s.configNotify) return s } // Name returns the name of this policy. func (s *static) Name() string { return PolicyName } // Description returns the description for this policy. func (s *static) Description() string { return PolicyDescription } // Start prepares this policy for accepting allocation/release requests. 
func (s *static) Start(add []cache.Container, del []cache.Container) error { s.Debug("starting up...") if err := s.allocateReserved(); err != nil { return policyError("failed to allocate reserved CPUs: %v", err) } s.Info("using reserved CPUs: %s", s.reservedCpus.String()) s.Info("using available CPUs: %s", s.availableCpus.String()) if err := s.validateState(s.state); err != nil { return policyError("failed to start with given cache/state: %v", err) } s.validateAssignments() return s.Sync(add, del) } // Sync synchronizes the active policy state. func (s *static) Sync(add []cache.Container, del []cache.Container) error { s.Debug("synchronizing state...") for _, c := range del { s.ReleaseResources(c) } for _, c := range add { s.AllocateResources(c) } return nil } // AllocateResources is a resource allocation request for this policy. func (s *static) AllocateResources(c cache.Container) error { s.Info("allocating resources for container %s...", c.PrettyName()) container := c containerID := c.GetCacheID() pod, found := c.GetPod() if !found { return policyError("can't find pod for container %s", containerID) } err := s.AddContainer(pod, container, containerID) return err } // ReleaseResources is a resource release request for this policy. func (s *static) ReleaseResources(c cache.Container) error { s.Info("releasing resources of container %s...", c.PrettyName()) containerID := c.GetCacheID() err := s.RemoveContainer(containerID) return err } // UpdateResources is a resource allocation update request for this policy. func (s *static) UpdateResources(c cache.Container) error { s.Debug("(not) updating container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (s *static) Rebalance() (bool, error) { s.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (s *static) HandleEvent(*events.Policy) (bool, error) { s.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. func (s *static) ExportResourceData(c cache.Container) map[string]string { data := map[string]string{} if cset, ok := s.GetCPUSet(c.GetCacheID()); !ok { cset = s.GetDefaultCPUSet() data[policy.ExportSharedCPUs] = cset.String() } else { isolated := cset.Intersection(s.sys.Isolated()).String() if isolated != "" { data[policy.ExportIsolatedCPUs] = isolated } exclusive := cset.Difference(s.sys.Isolated()).String() if exclusive != "" { data[policy.ExportExclusiveCPUs] = exclusive } } return data } // Introspect provides data for external introspection. func (s *static) Introspect(*introspect.State) { return } // DescribeMetrics generates policy-specific prometheus metrics data descriptors. func (p *static) DescribeMetrics() []*prometheus.Desc { return nil } // PollMetrics provides policy metrics for monitoring. func (p *static) PollMetrics() policy.Metrics { return nil } // CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *static) CollectMetrics(policy.Metrics) ([]prometheus.Metric, error) { return nil, nil } func (s *static) configNotify(event config.Event, source config.Source) error { s.Info("configuration %s", event) if opt.RelaxedIsolation { s.Info("isolated exclusive CPUs: globally preferred (all pods)") } else { s.Info("isolated exclusive CPUs: per-pod (by annotation '%s')", kubernetes.ResmgrKey(keyPreferIsolated)) } s.Info("rdt support set to %v", opt.Rdt) return nil } // assignableCPUs returns the set of unassigned CPUs minus the reserved set. func (s *static) assignableCPUs(numCPUs int) cpuset.CPUSet { cset := s.GetDefaultCPUSet().Difference(s.reservedCpus) if cset.Size() < numCPUs && s.isolatedCpus.Size() > 0 { s.Warn("not enough non-isolated CPUs (%d) left for request (%d)", cset.Size(), numCPUs) cset = cset.Union(s.isolatedCpus) } return cset } // AddContainer is the CPU Manager static policy AddContainer function. func (s *static) AddContainer(pod cache.Pod, container cache.Container, containerID string) error { if numCPUs := s.guaranteedCPUs(pod, container); numCPUs != 0 { s.Info("[cpumanager] static policy: AddContainer (pod: %s, container: %s, container id: %s)", pod.GetName(), container.GetName(), containerID) // container belongs in an exclusively allocated pool if _, ok := s.GetCPUSet(containerID); ok { s.Info("[cpumanager] static policy: container already present in state, skipping (container: %s, container id: %s)", container.GetName(), containerID) return nil } cpuset, err := s.allocateCPUs(numCPUs, containerID) if err != nil { s.Error("[cpumanager] unable to allocate %d CPUs (container id: %s, error: %v)", numCPUs, containerID, err) return err } s.Debug("setting cpuset of %s to allocated %s", containerID, cpuset) s.SetCPUSet(containerID, cpuset) } // container belongs in the shared pool (nothing to do; use default cpuset) return nil } // RemoveContainer is the CPU Manager static policy RemoveContainer function. func (s *static) RemoveContainer(containerID string) error { s.Info("[cpumanager] static policy: RemoveContainer (container id: %s)", containerID) if toRelease, ok := s.GetCPUSet(containerID); ok { s.Delete(containerID) isolated := toRelease.Intersection(s.sys.Isolated()) ordinary := toRelease.Difference(isolated) // Mutate the shared pool, adding released cpus. s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(ordinary)) s.isolatedCpus = s.isolatedCpus.Union(isolated) } return nil } // Notes: // By default we assume workloads are not isolation-aware. We // only allocate isolated CPUs exclusively to containers if // // - we globally prefer isolated exclusive CPUs, or // - the pod prefers isolated exclusive CPUs, or // - the container asks for a single hyperthread's worth of CPU // cpuPreference checks if isolated CPUs should be tried and are preferred for an allocation. func (s *static) cpuPreference(containerID string, numCPUs int) (bool, bool) { var try, prefer bool // Check if we prefer isolated CPUs (globally or for this container's pod).
if opt.RelaxedIsolation { prefer = true } else { if c, ok := s.state.LookupContainer(containerID); ok { p, found := c.GetPod() if !found { s.Warn("can't find pod for container %s", c.GetID()) return false, false } if value, ok := p.GetResmgrAnnotation(keyPreferIsolated); ok { if isolated, err := strconv.ParseBool(value); isolated { prefer = true } else { if err != nil { s.Error("invalid annotation '%s' on container %s, expecting boolean: %v", keyPreferIsolated, c.PrettyName(), err) } } } } } // Try isolated cpus when explicitly preferred, or when a single HT worth of CPU is requested. if prefer || (numCPUs == 1 && s.isolatedCpus.Size() >= 1) { try = true } return try, prefer } // allocateOrdinaryCPUs tries to take a number of non-isolated CPUs. func (s *static) allocateOrdinaryCPUs(numCPUs int) (cpuset.CPUSet, error) { assignable := s.assignableCPUs(numCPUs) result, err := s.takeByTopology(assignable, numCPUs, cpuallocator.PriorityHigh) if err != nil { return cpuset.New(), err } s.Info("allocated %d ordinary CPUs: %s", numCPUs, result.String()) return result, nil } // allocateIsolatedCPUs tries to take a number of isolated CPUs, falling back to ordinary ones. func (s *static) allocateIsolatedCPUs(numCPUs int, prefer bool) (cpuset.CPUSet, error) { result, err := s.takeByTopology(s.isolatedCpus, numCPUs, cpuallocator.PriorityHigh) switch { case err != nil: s.Info("falling back to %d ordinary CPUs", numCPUs) return s.allocateOrdinaryCPUs(numCPUs) case numCPUs == 1 || prefer: s.Info("allocated %d isolated CPUs: %s", numCPUs, result.String()) return result, nil default: s.Info("falling back to %d ordinary CPUs", numCPUs) return s.allocateOrdinaryCPUs(numCPUs) } } // allocateCPUs allocates the requested number of CPUs. func (s *static) allocateCPUs(numCPUs int, containerID string) (cpuset.CPUSet, error) { var result cpuset.CPUSet var err error s.Info("[cpumanager] allocateCpus: (numCPUs: %d)", numCPUs) if try, prefer := s.cpuPreference(containerID, numCPUs); !try { result, err = s.allocateOrdinaryCPUs(numCPUs) } else { result, err = s.allocateIsolatedCPUs(numCPUs, prefer) } if err != nil { return result, err } // Remove allocated CPUs from the shared and/or isolated CPUSet. s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result)) s.isolatedCpus = s.isolatedCpus.Difference(result) s.Info("[cpumanager] allocateCPUs: returning \"%v\"", result) return result, nil } func (s *static) guaranteedCPUs(pod cache.Pod, container cache.Container) int { qos := pod.GetQOSClass() s.Debug("* QoS class for pod %s (%s) is %s", pod.GetID(), pod.GetName(), qos) if qos != corev1.PodQOSGuaranteed { return 0 } cpuQuantity := container.GetResourceRequirements().Requests[corev1.ResourceCPU] if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() { return 0 } // Safe downcast to do for all systems with < 2.1 billion CPUs. // Per the language spec, `int` is guaranteed to be at least 32 bits wide. // https://golang.org/ref/spec#Numeric_types return int(cpuQuantity.Value()) }
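// Editorial example, not part of the original file: which containers
// guaranteedCPUs above treats as candidates for exclusive CPUs. Only a
// container of a Guaranteed-QoS pod with an integer CPU request qualifies:
//
//	requests: cpu: 2     in a Guaranteed pod -> guaranteedCPUs == 2 (exclusive)
//	requests: cpu: 1500m in a Guaranteed pod -> 0 (stays in the shared pool)
//	requests: cpu: 2     in a Burstable pod  -> 0 (stays in the shared pool)
//
// Check our allocation constraints.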
func (s *static) checkConstraints() error { online := s.sys.CPUSet().Difference(s.sys.Offlined()) isolated := s.sys.Isolated().Intersection(online) online = online.Difference(isolated) cpus, ok := s.available[policy.DomainCPU] if !ok { s.availableCpus = online } else { switch cpus.(type) { case cpuset.CPUSet: s.availableCpus = cpus.(cpuset.CPUSet).Intersection(online) default: return policyError("invalid type for available CPU set: %T", cpus) } } s.isolatedCpus = isolated s.Info("system isolated CPUs: %s", s.isolatedCpus) return nil } // Allocate the requested reserved cpus. func (s *static) allocateReserved() error { var err error var reserved cpuset.CPUSet cpus, ok := s.reserved[policy.DomainCPU] if !ok { return policyError("static policy cannot start without reserved CPUs") } switch cpus.(type) { case cpuset.CPUSet: reserved = cpus.(cpuset.CPUSet) if !reserved.Intersection(s.availableCpus).Equals(reserved) { return policyError("some reserved CPUs (%s) are unavailable", reserved.Difference(s.availableCpus).String()) } case resource.Quantity: qty := cpus.(resource.Quantity) count := (int(qty.MilliValue()) + 999) / 1000 from := s.availableCpus.Clone() if reserved, err = s.takeByTopology(from, count, cpuallocator.PriorityNormal); err != nil { return policyError("failed to reserve %d CPUs: %v", count, err) } } s.reservedCpus = reserved return nil } // Validate the cache/state supplied for starting. func (s *static) validateState(state cache.Cache) error { s.state = state tmpAssignments := s.GetCPUAssignments() tmpDefaultCPUset := s.GetDefaultCPUSet() allCPUs := s.availableCpus.Clone() isolated := s.isolatedCpus.Clone() // Default cpuset cannot be empty when assignments exist if tmpDefaultCPUset.IsEmpty() { if len(tmpAssignments) != 0 { return fmt.Errorf("default cpuset cannot be empty") } // state is empty, initialize it s.SetDefaultCPUSet(allCPUs) return nil } // State has already been initialized from file (is not empty) // 1. Check if the reserved cpuset is not part of default cpuset because: // - kube/system reserved have changed (increased) - may lead to some containers not being able to start // - user tampered with the state file if !s.reservedCpus.Intersection(tmpDefaultCPUset).Equals(s.reservedCpus) { return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", s.reservedCpus.String(), tmpDefaultCPUset.String()) } // 2. Check if state for static policy is consistent for cID, cset := range tmpAssignments { // None of the cpus in the DEFAULT cset should be in s.assignments if !tmpDefaultCPUset.Intersection(cset).IsEmpty() { return fmt.Errorf("container id: %s cpuset: \"%s\" overlaps with default cpuset \"%s\"", cID, cset.String(), tmpDefaultCPUset.String()) } // Remove any potentially taken isolated CPUs from the available isolated set. s.isolatedCpus = s.isolatedCpus.Difference(cset) } s.Info("available (unallocated) isolated CPUs: %s", s.isolatedCpus) // 3. It's possible that the set of available CPUs has changed since // the state was written. This can happen, for example, when a CPU is // offlined while kubelet is not running. If this happens, the CPU // manager will run into trouble when it later tries to assign // non-existent CPUs to containers. Validate that the topology that // was received during CPU manager startup matches the set of CPUs // stored in the state.
totalKnownCPUs := tmpDefaultCPUset.Clone() for _, cset := range tmpAssignments { totalKnownCPUs = totalKnownCPUs.Union(cset) } if !totalKnownCPUs.Equals(allCPUs) { if totalKnownCPUs.IsSubsetOf(allCPUs.Union(isolated)) { return nil } return fmt.Errorf("current available CPUs \"%s\" are not a superset of CPUs in state \"%s\"", allCPUs.Union(isolated).String(), totalKnownCPUs.String()) } return nil } // Topology-aware-like allocation wrapper. func (s *static) takeByTopology(available cpuset.CPUSet, numCPUs int, preferredPrio cpuallocator.CPUPriority) (cpuset.CPUSet, error) { from := &available return s.cpuAllocator.AllocateCpus(from, numCPUs, preferredPrio) } // Validate static assignments, purge stale ones. func (s *static) validateAssignments() { // Instead of relying/waiting for an external reconciliation loop to // clean up stale container/assignments, we do it ourselves upon startup. ca := s.GetCPUAssignments() for id, cset := range ca { if _, ok := s.state.LookupContainer(id); !ok { s.Info("Removing stale assignment of container %s (cpus %s)", id, cset.String()) s.RemoveContainer(id) } } } // policyError creates a policy-specific formatted error func policyError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } // // Kubelet CPU Manager / policy_static.go adaptation // // A set of rudimentary functions to get policy_static.go up and running // with small enough changes that the code (above) remains recognisable // for those who are already familiar with the original. These functions // basically implement a CPU Manager state-like interface on top of our // cache. // // ContainerCPUAssignments tracks assigned CPU sets per container id. type ContainerCPUAssignments map[string]cpuset.CPUSet // // Cache keys for storing the default cpuset (one for containers // without exclusive allocations) and static assignments (cpusets // for containers with exclusive allocations). const ( keyAssignments = "CPUAssignments" keyDefaultCPUs = "DefaultCPUSet" ) // GetCPUAssignments gets the current CPU assignments from our state. func (s *static) GetCPUAssignments() ContainerCPUAssignments { var ca map[string]cpuset.CPUSet if !s.state.GetPolicyEntry(keyAssignments, &ca) { s.Error("no cached CPU assignments") } if ca == nil { ca = make(map[string]cpuset.CPUSet) s.state.SetPolicyEntry(keyAssignments, ca) } return ca } // SetCPUAssignments sets the current CPU assignments in our state. func (s *static) SetCPUAssignments(ca ContainerCPUAssignments) { s.state.SetPolicyEntry(keyAssignments, map[string]cpuset.CPUSet(ca)) } // GetDefaultCPUSet gets the current default CPUSet from our state. func (s *static) GetDefaultCPUSet() cpuset.CPUSet { var cset cpuset.CPUSet if !s.state.GetPolicyEntry(keyDefaultCPUs, &cset) { s.Error("no cached default CPU set") } return cset } // SetDefaultCPUSet sets the current default CPUSet in our state. func (s *static) SetDefaultCPUSet(cset cpuset.CPUSet) { s.state.SetPolicyEntry(keyDefaultCPUs, cset) // update cpuset for containers with default assignment ca := s.GetCPUAssignments() for _, id := range s.state.GetContainerCacheIds() { if _, ok := ca[id]; !ok { s.SetCpusetCpus(id, cset.String()) } } } // GetCPUSet gets the CPUSet for a container from our state. func (s *static) GetCPUSet(containerID string) (cpuset.CPUSet, bool) { ca := s.GetCPUAssignments() cset, ok := ca[containerID] return cset.Clone(), ok } // SetCPUSet sets the CPUSet for a container in our state.
func (s *static) SetCPUSet(containerID string, cset cpuset.CPUSet) { ca := s.GetCPUAssignments() ca[containerID] = cset s.SetCPUAssignments(ca) s.SetCpusetCpus(containerID, cset.String()) } // Delete deletes the given container from our state. func (s *static) Delete(containerID string) { s.Debug("deleting container %s from assignments", containerID) ca := s.GetCPUAssignments() delete(ca, containerID) s.SetCPUAssignments(ca) } // SetCpusetCpus updates cpuset.cpus for a container. func (s *static) SetCpusetCpus(id, value string) error { c, ok := s.state.LookupContainer(id) if !ok { return policyError("can't find container '%s'", id) } c.SetCpusetCpus(value) s.Info("container %s: CpusetCpus set to %s", c.PrettyName(), value) return nil } // Register us as a policy implementation. func init() { policy.Register(PolicyName, PolicyDescription, NewStaticPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-plus/static-plus-policy.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package staticplus import ( "encoding/json" "fmt" "strconv" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/prometheus/client_golang/prometheus" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) const ( // PolicyName is the name used to activate this policy implementation. PolicyName = "static-plus" // PolicyDescription is a short description of this policy. PolicyDescription = "A simple policy supporting exclusive/pinned and shared allocations." // Cache key for storing container resource allocations. keyAllocations = "allocations" // Cache key for storing the shared pool. keySharedPool = "shared-pool" // keyPreferIsolated is the annotation used to mark pods preferring isolated CPUs. keyPreferIsolated = "prefer-isolated-cpus" ) // Assignment tracks resource assignments for a single container. type Assignment struct { exclusive cpuset.CPUSet // exclusively allocated cpus shared int // milli-cpus to allocate from shared cpus } // Allocations tracks all resource allocations by the static+ policy. type Allocations map[string]*Assignment // static-plus policy runtime state.
type staticplus struct { logger.Logger offline cpuset.CPUSet // offlined cpus available cpuset.CPUSet // bounding set of cpus available for us reserved cpuset.CPUSet // pool (primarily) for system-/kube-tasks isolated cpuset.CPUSet // primary pool for exclusive allocations allocations Allocations // container cpu allocations sys sysfs.System // system/topology information cache cache.Cache // system state/cache shared cpuset.CPUSet // pool for fractional and shared allocations cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy } // Make sure staticplus implements the policy backend interface. var _ policy.Backend = &staticplus{} // CreateStaticPlusPolicy creates a new policy instance. func CreateStaticPlusPolicy(opts *policy.BackendOptions) policy.Backend { p := &staticplus{ Logger: logger.NewLogger(PolicyName), cache: opts.Cache, sys: opts.System, cpuAllocator: cpuallocator.NewCPUAllocator(opts.System), } p.Info("creating policy...") if err := p.setupPools(opts.Available, opts.Reserved); err != nil { p.Fatal("failed to set up cpu pools: %v", err) } p.dumpPools() return p } // Name returns the name of this policy. func (p *staticplus) Name() string { return PolicyName } // Description returns the description for this policy. func (p *staticplus) Description() string { return PolicyDescription } // Start prepares this policy for accepting allocation/release requests. func (p *staticplus) Start(add []cache.Container, del []cache.Container) error { if err := p.restoreCache(); err != nil { return policyError("failed to start: %v", err) } if err := p.updatePools(); err != nil { return policyError("failed to start: %v", err) } return p.Sync(add, del) } // Sync synchronizes the state of this policy. func (p *staticplus) Sync(add []cache.Container, del []cache.Container) error { p.Debug("synchronizing state...") for _, c := range del { p.ReleaseResources(c) } for _, c := range add { p.AllocateResources(c) } return nil } // AllocateResources allocates resources for the given container. func (p *staticplus) AllocateResources(c cache.Container) error { var a *Assignment id := c.GetCacheID() p.Debug("allocating container %s...", id) if _, ok := p.allocations[id]; ok { return nil } a, err := p.assignCpus(c) if err != nil { return err } return p.addAssignment(c, a) } // ReleaseResources releases resources assigned to the given container. func (p *staticplus) ReleaseResources(c cache.Container) error { id := c.GetCacheID() p.Debug("releasing container %s...", id) a, ok := p.allocations[id] if !ok { return nil } return p.delAssignment(a, id) } // UpdateResources is a resource allocation update request for this policy. func (p *staticplus) UpdateResources(c cache.Container) error { p.Debug("(not) updating container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (p *staticplus) Rebalance() (bool, error) { p.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (p *staticplus) HandleEvent(*events.Policy) (bool, error) { p.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. func (p *staticplus) ExportResourceData(c cache.Container) map[string]string { a, ok := p.allocations[c.GetCacheID()] if !ok { // Hmm...
p.Warn("can't find allocation for container %s", c.PrettyName()) return nil } data := map[string]string{} if a.shared != 0 { data[policy.ExportSharedCPUs] = p.shared.String() } if a != nil && !a.exclusive.IsEmpty() { isolated := a.exclusive.Intersection(p.sys.Isolated()).String() if isolated != "" { data[policy.ExportIsolatedCPUs] = isolated } exclusive := a.exclusive.Difference(p.sys.Isolated()).String() if exclusive != "" { data[policy.ExportExclusiveCPUs] = exclusive } } return data } // Introspect provides data for external introspection. func (p *staticplus) Introspect(*introspect.State) { return } // DescribeMetrics generates policy-specific prometheus metrics data descriptors. func (p *staticplus) DescribeMetrics() []*prometheus.Desc { return nil } // PollMetrics provides policy metrics for monitoring. func (p *staticplus) PollMetrics() policy.Metrics { return nil } // CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data. func (p *staticplus) CollectMetrics(policy.Metrics) ([]prometheus.Metric, error) { return nil, nil } // policyError creates a formatted policy-specific error. func policyError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } // setupPools sets up the pools we allocate resources from. func (p *staticplus) setupPools(available, reserved policy.ConstraintSet) error { // Set up three disjoint CPU pools for allocating CPU to containers. These // three pools are: // // 1) reserved pool: kube- and system-tasks // Pods in the kube-system namespace are assigned to this pool. The // size of this pool is the requested reservation rounded up to the // closest integer. Any unused fractional part of this pool is used // as a shared pool if the shared pool ever gets fully allocated. // // 2) isolated pool: primary exclusive allocations // Exclusive CPU allocations are primarily done from this pool. Pods // that request at least 1 full CPU get their exclusive (integer) // CPU shares allocated from this pool unless the pool has already // been exhausted (in which case we try to slice off exclusive CPUs // from the shared pool). // // 3) shared pool: shared allocations, secondary exclusive allocations // Shared CPU allocations are served from this pool. Pods fractional // CPU shares are allocated from this pool. If the isolated pool has // been exhausted exclusive allocations are sliced off from this // pool. If this pool has been fully allocated, shared allocations // are oversubscribed to the reserved pool. 
p.offline = p.sys.Offlined() cpus, ok := available[policy.DomainCPU] if !ok { p.available = p.sys.CPUSet().Difference(p.offline) } else { p.available = cpus.(cpuset.CPUSet).Difference(p.offline) } p.isolated = p.sys.Isolated().Intersection(p.available) p.available = p.available.Difference(p.isolated) cpus, ok = reserved[policy.DomainCPU] if !ok { return policyError("cannot start without any reserved CPUs") } switch cpus.(type) { case cpuset.CPUSet: p.reserved = cpus.(cpuset.CPUSet).Intersection(p.available) if !p.reserved.Equals(cpus.(cpuset.CPUSet)) { return policyError("part of the reserved CPUs (%s) are not available: %s", cpus.(cpuset.CPUSet).String(), cpus.(cpuset.CPUSet).Difference(p.available)) } p.available = p.available.Difference(p.reserved) case resource.Quantity: var err error qty := cpus.(resource.Quantity) count := (int(qty.MilliValue()) + 999) / 1000 if count < 2 && p.available.Contains(0) { p.reserved = cpuset.New(0) p.available = p.available.Difference(p.reserved) } else { p.reserved, err = p.takeCPUs(&p.available, nil, count, cpuallocator.PriorityNormal) if err != nil { return policyError("failed to reserve %d CPUs from %s: %v", count, p.available.String(), err) } } } p.shared = p.available return nil } // restoreCache restores saved policy state from the cache. func (p *staticplus) restoreCache() error { if !p.cache.GetPolicyEntry(keySharedPool, &p.shared) { p.Warn("initializing empty policy state...") p.shared = p.available p.allocations = make(Allocations) p.cache.SetPolicyEntry(keySharedPool, &p.shared) p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&cachedAllocations{a: p.allocations})) } else { p.Info("restoring cached policy state...") ca := cachedAllocations{} if !p.cache.GetPolicyEntry(keyAllocations, &ca) { return policyError("failed to restore state from cache, no allocations") } p.allocations = ca.a } p.dumpPools() p.dumpAllocations() return nil } // requestedCpus calculates the exclusive and shared cpu allocations for a container. func (p *staticplus) requestedCpus(c cache.Container) (int, int) { cpuReq, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU] if !ok { return 0, 0 } full := int(cpuReq.MilliValue()) / 1000 part := int(cpuReq.MilliValue()) - 1000*full return full, part } // optOutFromIsolation checks if a container opts out from isolated CPUs. func (p *staticplus) optOutFromIsolation(c cache.Container) bool { preferIsolated := true if pod, found := c.GetPod(); !found { p.Warn("can't find pod for container %s", c.PrettyName()) } else { if value, ok := pod.GetResmgrAnnotation(keyPreferIsolated); ok { if isolated, err := strconv.ParseBool(value); !isolated { if err != nil { p.Error("invalid annotation '%s' on container %s, expecting boolean: %v", keyPreferIsolated, c.PrettyName(), err) } else { p.Info("container %s is opted-out from isolation", c.PrettyName()) } preferIsolated = false } else { p.Info("container %s explicitly opted-in for isolation", c.PrettyName()) } } else { p.Info("container %s goes with default isolation", c.PrettyName()) } } return !preferIsolated }
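// Editorial example, not part of the original file: how requestedCpus above
// splits a container's CPU request into an exclusive candidate part (full
// CPUs) and a shared part (leftover milli-CPUs):
//
//	requests: cpu: 2500m -> full = 2, part = 500 (2 exclusive CPUs + 500 mCPU shared)
//	requests: cpu: 750m  -> full = 0, part = 750 (shared pool only)
//
// assignCpus allocates cpus for a container.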
func (p *staticplus) assignCpus(c cache.Container) (*Assignment, error) { full, part := p.requestedCpus(c) // system containers always share (the reserved) cpus if c.GetNamespace() == metav1.NamespaceSystem { return &Assignment{shared: 1000*full + part}, nil } // assign to the shared pool if less than a single cpu was requested if full == 0 { return &Assignment{shared: part}, nil } // if there is capacity in the isolated pool, slice cpus off from it if p.isolated.Size() >= full && !p.optOutFromIsolation(c) { cpus, err := p.takeCPUs(&p.isolated, nil, full, cpuallocator.PriorityHigh) if err != nil { return nil, policyError("failed to allocate %d isolated CPUs: %v", full, err) } return &Assignment{exclusive: cpus, shared: part}, nil } // otherwise, try to slice off cpus from the shared pool if p.shared.Size() >= full { cpus, err := p.takeCPUs(&p.shared, nil, full, cpuallocator.PriorityHigh) if err != nil { return nil, policyError("failed to allocate %d exclusive CPUs: %v", full, err) } return &Assignment{exclusive: cpus, shared: part}, nil } // we're screwed, not enough cpu in either isolated or shared pool return nil, policyError("failed to allocate %d exclusive CPUs: %s", full, "not enough capacity") } // addAssignment updates container allocations for a newly added container assignment. func (p *staticplus) addAssignment(c cache.Container, a *Assignment) error { switch { // always assign system containers to the reserved pool case c.GetNamespace() == metav1.NamespaceSystem: c.SetCpusetCpus(p.reserved.String()) c.SetCPUShares(int64(MilliCPUToShares(a.shared))) p.Info("system container %s allocated (%d mCPU) to reserved pool %s", c.PrettyName(), a.shared, p.reserved.String()) // for shared-only assignments, it's enough to update the container case a.exclusive.IsEmpty(): c.SetCpusetCpus(p.shared.String()) c.SetCPUShares(int64(MilliCPUToShares(a.shared))) p.Info("container %s allocated (%d mCPU) to shared pool %s", c.PrettyName(), a.shared, p.shared.String()) // isolated, sliced-off exclusive, or mixed allocation default: var kind string var isolated bool if isolated = !a.exclusive.Intersection(p.sys.Isolated()).IsEmpty(); isolated { kind = "isolated" } else { kind = "exclusive" } if a.shared != 0 { c.SetCpusetCpus(a.exclusive.Union(p.shared).String()) c.SetCPUShares(int64(MilliCPUToShares(a.shared))) p.Info("container %s allocated to %s (%s) and shared (%d mCPU) pool %s", c.PrettyName(), kind, a.exclusive.String(), a.shared, p.shared.String()) } else { c.SetCpusetCpus(a.exclusive.String()) c.SetCPUShares(int64(MilliCPUToShares(1000 * a.exclusive.Size()))) p.Info("container %s allocated to %s CPUs %s", c.PrettyName(), kind, a.exclusive.String()) } // for sliced-off exclusive we might need to update other containers shared allocations if !a.exclusive.IsEmpty() && a.exclusive.Intersection(p.sys.Isolated()).IsEmpty() { if err := p.updateSharedAllocations(); err != nil { return err } } } p.allocations[c.GetCacheID()] = a p.cache.SetPolicyEntry(keySharedPool, p.shared) p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&cachedAllocations{a: p.allocations})) return nil } // delAssignment updates container allocations for a deleted container assignment. func (p *staticplus) delAssignment(a *Assignment, id string) error { delete(p.allocations, id) switch { // for shared-only allocations there is not much to do... 
case a.exclusive.IsEmpty(): p.Info("freed shared-only (%d mCPU) allocations of container %s", a.shared, id) // for isolated exclusive cpus, return them to the pool case !a.exclusive.Intersection(p.sys.Isolated()).IsEmpty(): p.isolated = p.isolated.Union(a.exclusive) p.Info("freed isolated allocations (%s) of container %s", a.exclusive.String(), id) // for cpus sliced off the shared pool, return them and update others default: p.shared = p.shared.Union(a.exclusive) p.Info("freed exclusive allocations (%s) of container %s", a.exclusive.String(), id) if err := p.updateSharedAllocations(); err != nil { return err } } p.cache.SetPolicyEntry(keySharedPool, p.shared) p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&cachedAllocations{a: p.allocations})) return nil } // updateSharedAllocations updates containers with shared allocations. func (p *staticplus) updateSharedAllocations() error { avail := 1000 * p.shared.Size() for id, ca := range p.allocations { cac, ok := p.cache.LookupContainer(id) if !ok { p.Warn("can't find allocated container %s", id) // remove and recalculate shared CPUs p.delAssignment(ca, id) return p.updateSharedAllocations() } if !ca.exclusive.Intersection(p.sys.Isolated()).IsEmpty() && ca.shared == 0 { continue } cset := p.shared.Union(ca.exclusive) if avail <= 0 { cset = cset.Union(p.reserved) p.Warn("out of free shared (%s) capacity, using reserved pool (%s) as well", p.shared.String(), p.reserved.String()) } if cac.GetCpusetCpus() != cset.String() { cac.SetCpusetCpus(cset.String()) p.Info("container %s reallocated to exclusive (%s) and shared (%d mCPU) pool %s", cac.PrettyName(), ca.exclusive.String(), ca.shared, cset.String()) } avail -= ca.shared } if avail < 0 { p.Warn("not enough free capacity in shared pool (%s): lacking %d mCPU", p.shared.String(), -avail) } else { p.Info("free shared (%s) capacity left: %d mCPU", p.shared.String(), avail) } return nil } // updatePools updates the pools according to the current assignments. func (p *staticplus) updatePools() error { for id, ca := range p.allocations { if ca.exclusive.IsEmpty() { continue } isolated := ca.exclusive.Intersection(p.sys.Isolated()) excshare := ca.exclusive.Difference(isolated) if !isolated.IsEmpty() && !excshare.IsEmpty() { return policyError("container %s has exclusive isolated (%s) and shareable (%s) cpus", id, isolated.String(), excshare.String()) } p.isolated = p.isolated.Difference(isolated) p.shared = p.shared.Difference(excshare) } if err := p.updateSharedAllocations(); err != nil { return err } p.cache.SetPolicyEntry(keySharedPool, p.shared) p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&cachedAllocations{a: p.allocations})) return nil } // dumpPools dumps the current state of pools. func (p *staticplus) dumpPools() { p.Info("current CPU pools:") offline := p.offline.String() if offline == "" { offline = "<none>" } isolated := p.isolated.String() if isolated == "" { isolated = "<none>" } p.Info(" offline: %s", offline) p.Info(" reserved: %s", p.reserved.String()) p.Info(" shared: %s", p.shared.String()) p.Info(" isolated: %s", isolated) } // dumpAllocations dumps the current allocations. func (p *staticplus) dumpAllocations() { p.Info("container CPU allocations:") switch { case p.allocations == nil: p.Info(" <none>") case len(p.allocations) == 0: p.Info(" <none>") default: for id, ca := range p.allocations { e := ca.exclusive.String() if e == "" { e = "<none>" } p.Info(" %s: exclusive: %s, shared: %d milli-cpu", id, e, ca.shared) } } }
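// NOTE (editor's illustration, not part of the original source): a minimal
// sketch of the shared-pool bookkeeping done by updateSharedAllocations
// above, assuming each shared CPU provides 1000 mCPU of capacity. A negative
// result corresponds to the "not enough free capacity" warning. The function
// name is hypothetical.
func exampleSharedCapacity(sharedPoolSize int, sharedMilliRequests []int) int {
	avail := 1000 * sharedPoolSize // total shared capacity in millicores
	for _, m := range sharedMilliRequests {
		avail -= m // each shared assignment draws capacity down
	}
	return avail // < 0 means the shared pool is oversubscribed
}

// Take up to cnt CPUs from a given CPU set to another.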
func (p *staticplus) takeCPUs(from, to *cpuset.CPUSet, cnt int, preferredPrio cpuallocator.CPUPriority) (cpuset.CPUSet, error) { cset, err := p.cpuAllocator.AllocateCpus(from, cnt, preferredPrio) if err != nil { return cset, err } if to != nil { *to = to.Union(cset) } return cset, err } // // Cachable data types for storing private static-plus policy data in the cache. // // CachedAllocations implements Cache.Cachable boilerplate for Allocations. type CachedAllocations interface { cache.Cachable } type cachedAllocations struct { a Allocations } var _ cache.Cachable = &cachedAllocations{} var _ json.Marshaler = &cachedAllocations{} var _ json.Unmarshaler = &cachedAllocations{} func (ca *cachedAllocations) Get() interface{} { return *ca } func (ca *cachedAllocations) Set(value interface{}) { switch value.(type) { case cachedAllocations: ca.a = value.(cachedAllocations).a case *cachedAllocations: ca.a = value.(*cachedAllocations).a } } type marshallableAssignment struct { Exclusive string Shared int } func (ca *cachedAllocations) MarshalJSON() ([]byte, error) { dst := make(map[string]*marshallableAssignment) for id, r := range ca.a { dst[id] = &marshallableAssignment{ Exclusive: r.exclusive.String(), Shared: r.shared, } } return json.Marshal(dst) } func (ca *cachedAllocations) UnmarshalJSON(data []byte) error { var err error dst := make(map[string]*marshallableAssignment) if err = json.Unmarshal(data, &dst); err != nil { return err } ca.a = make(map[string]*Assignment) for id, r := range dst { if r == nil { continue } cset, err := cpuset.Parse(r.Exclusive) if err != nil { return policyError("failed to unmarshal cpuset '%s': %v", r.Exclusive, err) } ca.a[id] = &Assignment{ exclusive: cset, shared: r.Shared, } } return nil } // Functions for calculating CFS cpu.shares and cpu.cfs_quota_us. // // Notes: These functions are taken almost verbatim from the kubelet // code (from k8s.io/kubernetes/pkg/kubelet/cm/helpers_linux.go). // Since these are exported there, we could try to import them, set the // related feature gates (kubefeatures.CPUCFSQuotaPeriod) for ourselves // into the desired positions (disabled most probably for now) and use // the imported code. const ( MinShares = 2 SharesPerCPU = 1024 MilliCPUToCPU = 1000 // 100000 is equivalent to 100ms QuotaPeriod = 100000 MinQuotaPeriod = 1000 ) // MilliCPUToQuota converts milliCPU to CFS quota and period values. func MilliCPUToQuota(milliCPU int64, period int64) (quota int64) { // CFS quota is measured in two values: // - cfs_period_us=100ms (the amount of time to measure usage across given by period) // - cfs_quota=20ms (the amount of cpu time allowed to be used across a period) // so in the above example, you are limited to 20% of a single CPU // for multi-cpu environments, you just scale equivalent amounts // see https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt for details if milliCPU == 0 { return } if true /*!utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod)*/ { period = QuotaPeriod } // we then convert your milliCPU to a value normalized over a period quota = (milliCPU * period) / MilliCPUToCPU // quota needs to be a minimum of 1ms. if quota < MinQuotaPeriod { quota = MinQuotaPeriod } return } // MilliCPUToShares converts the milliCPU to CFS shares. func MilliCPUToShares(milliCPU int) int64 { if milliCPU == 0 { // Docker converts zero milliCPU to unset, which maps to kernel default // for unset: 1024. Return 2 here to really match kernel default for // zero milliCPU.
return MinShares } // Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU, but factored to improve rounding. shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU if shares < MinShares { return MinShares } return int64(shares) } // Register us as a policy implementation. func init() { policy.Register(PolicyName, PolicyDescription, CreateStaticPlusPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-pools/config.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package stp import ( "fmt" "os" "path" "regexp" "strconv" "strings" "sigs.k8s.io/yaml" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" ) // config captures our runtime configurable parameters. type config struct { // Pools defines our set of pools in use. Pools pools `json:"pools,omitempty"` // ConfDirPath is the filesystem path to the legacy configuration directory structure. ConfDirPath string // ConfFilePath is the filesystem path to the legacy configuration file. ConfFilePath string // LabelNode controls whether backwards-compatible CMK node label is created. LabelNode bool // TaintNode controls whether backwards-compatible CMK node taint is created. TaintNode bool } type pools map[string]poolConfig type cpuList struct { Socket uint64 Cpuset string // TODO: might want to use cpuset from kubelet containers map[string]struct{} } // STP policy runtime configuration with its defaults. var conf = defaultConfig().(*config)
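// NOTE (editor's illustration, not part of the original source): a minimal
// usage sketch for parseConfData, defined later in this file, assuming the
// pools schema declared above. Key matching is effectively case-insensitive
// because sigs.k8s.io/yaml round-trips the YAML through encoding/json. The
// function name and the pool layout shown are hypothetical.
func exampleParsePools() (pools, error) {
	raw := []byte(`
pools:
  shared:
    exclusive: false
    cpuLists:
      - socket: 0
        cpuset: "0-3"
`)
	return parseConfData(raw)
}

// defaultConfig returns a new conf instance, all initialized to defaults.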
func defaultConfig() interface{} { return &config{ Pools: make(pools), ConfDirPath: "/etc/cmk", } } func (c *cpuList) addContainer(id string) { if c.containers == nil { c.containers = make(map[string]struct{}) } c.containers[id] = struct{}{} } func (c *cpuList) removeContainer(id string) { if c.containers == nil { return } delete(c.containers, id) } func (c *cpuList) getContainers() []string { if c.containers == nil { return []string{} } ret := make([]string, len(c.containers)) i := 0 for k := range c.containers { ret[i] = k i++ } return ret } type poolConfig struct { Exclusive bool `json:"exclusive"` // Per-socket cpu lists CPULists []*cpuList `json:"cpuLists"` } func (p *poolConfig) cpuSet() string { cpuset := "" delim := "" for _, cl := range p.CPULists { cpuset += delim + cl.Cpuset delim = "," } return cpuset } var ( cpusetValidationRe = regexp.MustCompile(`^(([\d]+)|([\d]+-[\d]+))(,(([\d]+)|([\d]+-[\d]+)))*$`) ) func parseConfData(raw []byte) (pools, error) { conf := &struct { Pools pools }{} err := yaml.Unmarshal(raw, &conf) if err != nil { return nil, stpError("Failed to parse config file: %v", err) } return conf.Pools, nil } func readConfFile(filepath string) (pools, error) { // Read config data data, err := os.ReadFile(filepath) if err != nil { return nil, stpError("Failed to read config file: %v", err) } return parseConfData(data) } func readConfDir(confDir string) (pools, error) { conf := pools{} // List pools in the pools configuration directory poolsDir := path.Join(confDir, "pools") pools, err := os.ReadDir(poolsDir) if err != nil { return nil, stpError("Failed to list pools config directory %s: %v", poolsDir, err) } // Read pool configurations for _, pool := range pools { poolConf, err := readPoolConfDir(path.Join(poolsDir, pool.Name())) if err != nil { return nil, stpError("Failed to read pool configuration: %v", err) } conf[pool.Name()] = poolConf } return conf, nil } // Read configuration of one pool from original CMK configuration directory tree func readPoolConfDir(poolDir string) (poolConfig, error) { conf := poolConfig{Exclusive: false, CPULists: []*cpuList{}} // Read pool's exclusivity flag exclusive, err := os.ReadFile(path.Join(poolDir, "exclusive")) if err != nil { return conf, fmt.Errorf("Failed to read pool exclusive setting in %s: %v", poolDir, err) } if len(exclusive) == 1 && exclusive[0] == '1' { conf.Exclusive = true } // Read socket configurations (per-socket cpu lists) files, err := os.ReadDir(poolDir) if err != nil { return conf, fmt.Errorf("Failed to list pool config directory %s: %v", poolDir, err) } for _, file := range files { if !file.IsDir() { // Skip non-directory files (e.g. 'exclusive' file) continue } socketPath := path.Join(poolDir, file.Name()) socketCPULists, err := readSocketConfDir(socketPath) if err != nil { return conf, fmt.Errorf("Failed to list pool socket config: %s", err) } conf.CPULists = append(conf.CPULists, socketCPULists...)
} return conf, nil } // Read configuration (cpu lists) of a socket of one pool in original CMK // configuration directory tree func readSocketConfDir(socketDir string) ([]*cpuList, error) { // Get socket number from the name of the directory socketNum, err := strconv.ParseUint(path.Base(socketDir), 10, 32) if err != nil { return nil, fmt.Errorf("Invalid socket id %s: %v", socketDir, err) } // Socket directory contains a set of subdirectories, one per cpu list cpuListDirs, err := os.ReadDir(socketDir) if err != nil { return nil, fmt.Errorf("Failed to list socket directory %s: %v", socketDir, err) } conf := make([]*cpuList, len(cpuListDirs)) for i, cpuListDir := range cpuListDirs { // Validate that the cpulist conforms to cpuset formatting if err := validateCPUList(cpuListDir.Name()); err != nil { return nil, fmt.Errorf("Invalid cpu list in %s: %v", socketDir, err) } conf[i] = &cpuList{Socket: socketNum, Cpuset: cpuListDir.Name(), containers: map[string]struct{}{}} } return conf, nil } func validateCPUList(name string) error { // NOTE: A naive implementation, we only check that it "looks right", we don't // check that the actual numbers make sense, i.e. that numbers are in // ascending order if !cpusetValidationRe.MatchString(name) { return fmt.Errorf("%q does not look like a cpuset", name) } return nil } // Convert cpu list configuration directory name into a cpuList func parseCPUListName(name string) ([]uint, error) { // The name should be a list of cpu ids (non-negative integers) separated by commas cpuListMembers := strings.Split(name, ",") cpus := make([]uint, len(cpuListMembers)) // Convert cpu ids to a list of integers for i, cpuStr := range cpuListMembers { cpu, err := strconv.ParseUint(cpuStr, 10, 32) if err != nil { return cpus, fmt.Errorf("Invalid cpu id in %s: %v", name, err) } cpus[i] = uint(cpu) } return cpus, nil } // Register us for command line option processing and configuration management. func init() { pkgcfg.Register(PolicyPath, PolicyDescription, conf, defaultConfig) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-pools/node.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ package stp import ( "strconv" "time" core_v1 "k8s.io/api/core/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/agent" "github.com/intel/cri-resource-manager/pkg/log" ) const ( exclusiveCoreResourceName = "cmk.intel.com/exclusive-cores" cmkLegacyNodeLabelName = "cmk.intel.com/cmk-node" ) type nodeUpdater struct { log.Logger agent agent.Interface conf chan config } func newNodeUpdater(agent agent.Interface) *nodeUpdater { return &nodeUpdater{ Logger: log.NewLogger("static-pools-nu"), agent: agent, conf: make(chan config, 1), } } func (u *nodeUpdater) start() error { u.Info("starting node updater") if u.agent == nil || u.agent.IsDisabled() { return stpError("cri-resmgr-agent connection required") } go func() { var pending *config var retry <-chan time.Time for { select { case c := <-u.conf: pending = &c retry = time.After(0) case _ = <-retry: if pending != nil { err := u.updateNode(pending, -1) if err != nil { u.Info("node update failed: %v", err) retry = time.After(5 * time.Second) } else { u.Info("node successfully updated") pending = nil retry = nil } } else { u.Panic("BUG: node update with nil config requested") } } } }() return nil } func (u *nodeUpdater) update(c config) { // Pop possibly pending value from the channel select { case <-u.conf: default: } u.conf <- c } // Update Node object with STP/CMK-specific things func (u *nodeUpdater) updateNode(conf *config, opTimeout time.Duration) error { // Count total number of cpu lists of all exclusive pools numExclusiveCPULists := 0 for _, pool := range conf.Pools { if pool.Exclusive { numExclusiveCPULists += len(pool.CPULists) } } // Update extended resources resources := map[string]string{ exclusiveCoreResourceName: strconv.Itoa(numExclusiveCPULists)} u.Info("updating node capacity (extended resources)") if err := u.agent.UpdateNodeCapacity(resources, opTimeout); err != nil { return err } // Manage legacy node label if conf.LabelNode { u.Info("creating CMK node label") err := u.agent.SetLabels(map[string]string{cmkLegacyNodeLabelName: "true"}, opTimeout) if err != nil { return stpError("failed to update legacy node label: %v", err) } } else { u.Info("removing CMK node label") err := u.agent.RemoveLabels([]string{cmkLegacyNodeLabelName}, opTimeout) if err != nil { return stpError("failed to update legacy node label: %v", err) } } // Manage legacy node taint nodeTaints, err := u.agent.GetTaints(opTimeout) if err != nil { return stpError("failed to fetch node taints: %v", err) } legacyTaint := core_v1.Taint{ Key: "cmk", Value: "true", Effect: core_v1.TaintEffectNoSchedule, } cmkTaints := []core_v1.Taint{legacyTaint} _, tainted := u.agent.FindTaintIndex(nodeTaints, &legacyTaint) if !tainted && conf.TaintNode { u.Info("creating CMK node taint") if err := u.agent.SetTaints(cmkTaints, opTimeout); err != nil { return stpError("failed to set legacy node taint: %v", err) } } if tainted && !conf.TaintNode { u.Debug("removing CMK node taint") if err := u.agent.RemoveTaints(cmkTaints, opTimeout); err != nil { return stpError("failed to clear legacy node taint: %v", err) } } return nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-pools/stp-policy.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stp import ( "flag" "fmt" "io" "math/rand" "strconv" "strings" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/prometheus/client_golang/prometheus" ) const ( // PolicyName is the name used to activate this policy implementation. PolicyName = "static-pools" // PolicyDescription is a short description of this policy. PolicyDescription = "A reimplementation of CMK (CPU Manager for Kubernetes)." // PolicyPath is the path of this policy in the configuration hierarchy. PolicyPath = "policy." + PolicyName // StpEnvPool is the name of the env variable for selecting STP pool of a container StpEnvPool = "STP_POOL" // StpEnvSocketID is the name of the env variable for selecting cpu socket of a container StpEnvSocketID = "STP_SOCKET_ID" // StpEnvNoAffinity is the name of the env variable for switching off cpuset enforcement StpEnvNoAffinity = "STP_NO_AFFINITY" // CmkEnvAssigned is the name of the env variable that the original CMK // sets to communicate the selected cpuset to the workload. We use the same // environment variable for compatibility. CmkEnvAssigned = "CMK_CPUS_ASSIGNED" // CmkEnvInfra is the name of the env variable that the original CMK sets // to communicate all CPUs of the infra pool to the workload. We use the // same environment variable for compatibility. CmkEnvInfra = "CMK_CPUS_INFRA" // CmkEnvShared is the name of the env variable that the original CMK sets // to communicate all CPUs of the shared pool to the workload. We use the // same environment variable for compatibility. CmkEnvShared = "CMK_CPUS_SHARED" // CmkEnvNumCores is the name of the env used in the original CMK to select // the number of exclusive CPUs, deprecated here CmkEnvNumCores = "CMK_NUM_CORES" // CmkPoolInfra is the hardcoded name of the 'infra' pool CmkPoolInfra = "infra" // CmkPoolShared is the hardcoded name of the 'shared' pool CmkPoolShared = "shared" ) type stp struct { logger.Logger conf *config // STP policy configuration nodeUpdater *nodeUpdater // node updater thread state cache.Cache // state cache } var _ policy.Backend = &stp{} // // Policy backend implementation // // CreateStpPolicy creates a new policy instance. func CreateStpPolicy(opts *policy.BackendOptions) policy.Backend { stp := &stp{ Logger: logger.NewLogger(PolicyName), state: opts.Cache, nodeUpdater: newNodeUpdater(opts.AgentCli), } stp.Info("creating policy...") pkgcfg.GetModule(PolicyPath).AddNotify(stp.configNotify) return stp } // Name returns the name of this policy. func (stp *stp) Name() string { return PolicyName } // Description returns the description for this policy. func (stp *stp) Description() string { return PolicyDescription }
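// NOTE (editor's illustration, not part of the original source): the STP_*
// environment variables above are the runtime interface of this policy. A
// hypothetical pod spec fragment selecting a pool named "exclusive" on
// socket 0 might look like:
//
//   env:
//   - name: STP_POOL
//     value: exclusive
//   - name: STP_SOCKET_ID
//     value: "0"
//
// STP_NO_AFFINITY only needs to be present to take effect; its value is
// ignored (see AllocateResources below).

// Start prepares this policy for accepting allocation/release requests.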
func (stp *stp) Start(add []cache.Container, del []cache.Container) error { if err := stp.nodeUpdater.start(); err != nil { return err } if stp.conf == nil { if err := stp.setConfig(conf); err != nil { return err } } if err := stp.initializeState(); err != nil { return err } stp.Debug("retrieved stp container states from cache:\n%s", utils.DumpJSON(*stp.getContainerRegistry())) if err := stp.Sync(add, del); err != nil { return err } stp.Debug("preparing for making decisions...") return nil } // Sync synchronizes the state of this policy. func (stp *stp) Sync(add []cache.Container, del []cache.Container) error { stp.Debug("synchronizing state...") for _, c := range del { stp.ReleaseResources(c) } for _, c := range add { stp.AllocateResources(c) } return nil } // AllocateResources is a resource allocation request for this policy. func (stp *stp) AllocateResources(c cache.Container) error { containerID := c.GetCacheID() stp.Debug("allocating resources for container %s...", containerID) cs := stpContainerStatus{Socket: -1} // Default pool name poolName := CmkPoolShared // Get resource requests stp.Debug("RESOURCE REQUESTS: %s", c.GetResourceRequirements().Requests) requestedCPUs, ok := c.GetResourceRequirements().Requests[exclusiveCoreResourceName] if ok { nCPUs, _ := requestedCPUs.AsInt64() cs.NExclusiveCPUs = nCPUs } // Parse container command line. Backwards compatibility for old CMK // workloads cmkArgs := stp.parseContainerCmdline(c.GetCommand(), c.GetArgs()) if cmkArgs != nil { poolName = cmkArgs.Pool cs.Socket = cmkArgs.SocketID cs.NoAffinity = cmkArgs.NoAffinity // Overwrite container commandline c.SetCommand(cmkArgs.Command) c.SetArgs([]string{}) stp.Debug("parsed options from container command line: %v", cmkArgs) } // Get STP options from container env envVal, ok := c.GetEnv(StpEnvSocketID) if ok { socketID, err := strconv.ParseInt(envVal, 10, 32) if err != nil { stp.Warn("unable to parse socket id from %q: %v", StpEnvSocketID, err) } else { cs.Socket = socketID } } envVal, ok = c.GetEnv(StpEnvPool) if ok { poolName = envVal } _, ok = c.GetEnv(StpEnvNoAffinity) if ok { // We do not care about the value of the env variable here cs.NoAffinity = true } // Force socket to -1 if pool is not "socket aware" if poolName == CmkPoolInfra { cs.Socket = -1 } // Get pool configuration if _, ok := stp.conf.Pools[poolName]; !ok { return stpError("non-existent pool %q", poolName) } cs.Pool = poolName // Allocate (CPU) resources for the container err := stp.allocateStpResources(c, cs) if err != nil { return err } return nil } // ReleaseResources is a resource release request for this policy. func (stp *stp) ReleaseResources(c cache.Container) error { stp.Debug("releasing resources of container %s...", c.PrettyName()) stp.releaseStpResources(c.GetCacheID()) return nil } // UpdateResources is a resource allocation update request for this policy. func (stp *stp) UpdateResources(c cache.Container) error { stp.Debug("updating resource allocations of container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (stp *stp) Rebalance() (bool, error) { stp.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (stp *stp) HandleEvent(*events.Policy) (bool, error) { stp.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. 
func (stp *stp) ExportResourceData(c cache.Container) map[string]string { return nil } // Introspect provides data for external introspection. func (stp *stp) Introspect(*introspect.State) { return } // DescribeMetrics generates policy-specific prometheus metrics data descriptors. func (p *stp) DescribeMetrics() []*prometheus.Desc { return nil } // PollMetrics provides policy metrics for monitoring. func (p *stp) PollMetrics() policy.Metrics { return nil } // CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data. func (p *stp) CollectMetrics(policy.Metrics) ([]prometheus.Metric, error) { return nil, nil } func (stp *stp) configNotify(event pkgcfg.Event, source pkgcfg.Source) error { stp.Info("configuration %s", event) if err := stp.setConfig(conf); err != nil { return err } stp.Info("config updated successfully") return nil } func (stp *stp) setConfig(cfg *config) error { // Read legacy pools configuration if the given config has no pools configured if cfg.Pools == nil || len(cfg.Pools) == 0 { if len(cfg.ConfDirPath) > 0 { stp.Debug("Reading legacy configuration directory tree %q", cfg.ConfDirPath) p, err := readConfDir(cfg.ConfDirPath) if err != nil { stp.Warn("failed to read configuration directory: %v", err) } else { cfg.Pools = p } } if len(cfg.ConfFilePath) > 0 { stp.Debug("Reading legacy configuration file %q", cfg.ConfFilePath) p, err := readConfFile(cfg.ConfFilePath) if err != nil { stp.Warn("failed to read configuration file: %v", err) } else { if len(cfg.Pools) > 0 { stp.Info("Overriding pool configuration from %q with configuration from %q", cfg.ConfDirPath, cfg.ConfFilePath) } cfg.Pools = p } } } if err := stp.verifyConfig(cfg); err != nil { return err } stp.conf = cfg stp.Debug("policy configuration:\n%s", utils.DumpJSON(stp.conf)) stp.nodeUpdater.update(*stp.conf) return nil } // // Helper functions for STP policy backend // func stpError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...)
} func (stp *stp) initializeState() error { ccr := stp.getContainerRegistry() for id := range *ccr { // Remove orphaned containers if _, ok := stp.state.LookupContainer(id); !ok { stp.Info("removing orphaned container %s from policy cache", id) stp.releaseStpResources(id) } } return stp.verifyConfig(stp.conf) } // Verify configuration against the existing set of containers func (stp *stp) verifyConfig(cfg *config) error { // Sanity check for config if cfg == nil || cfg.Pools == nil || len(cfg.Pools) == 0 { return stpError("invalid config, no pools configured") } // Loop through all existing containers ccr := stp.getContainerRegistry() for id, cs := range *ccr { // Check that pool for container exists pool, ok := cfg.Pools[cs.Pool] if !ok { return stpError("invalid stp configuration: pool %q for container %q not found", cs.Pool, id) } // Check that pool exclusivity is compatible with container configuration if pool.Exclusive && cs.NExclusiveCPUs < 1 { return stpError("invalid stp configuration: container %q with no exclusive CPUs set to run in exclusive pool %q", id, cs.Pool) } else if !pool.Exclusive && cs.NExclusiveCPUs > 0 { return stpError("invalid stp configuration: container %q with exclusive CPUs set to run in non-exclusive pool %q", id, cs.Pool) } // Check that cpu lists (cpuset) of container can be satisfied by the pool // NOTE: we do not try to do any migration to possibly free cpu lists // if the originally allocated cpu lists are not available // TODO: for non-exclusive pools it might be feasible just to alter the // cpuset (i.e. reconcile new cpu list using the existing pool/socket // spec for container) in case cpu lists do not match exactly for _, cCpuset := range cs.Cpusets { for i, pClist := range pool.CPULists { if cCpuset == pClist.Cpuset { pool.CPULists[i].addContainer(id) break } if i == len(pool.CPULists)-1 { return stpError("invalid stp configuration: cpu list %q configured for container %q not found in pool %q", cCpuset, id, cs.Pool) } } } } return nil } type cmkLegacyArgs struct { Pool string SocketID int64 Command []string NoAffinity bool } // parseContainerCmdline tries to parse the pool name and socket id parameters // from container command line func (stp *stp) parseContainerCmdline(cmd, args []string) *cmkLegacyArgs { // NOTE: This is a naive implementation and not foolproof. E.g. args could be // defined through env variables cmdLine := append(cmd, args...) stp.Debug("Parsing container command line %v\n", cmdLine) cmkArgs := parseCmkCmdline(cmdLine) // If we didn't find cmk arguments, try to parse each argument separately // in case cmk was invoked like 'bash -c "cmk isolate ..."'
// NOTE: We do a somewhat naive strings.Fields() here, there is room for // improvement by using go-shellquote or similar if cmkArgs == nil { for _, arg := range cmdLine { cmkArgs = parseCmkCmdline(strings.Fields(arg)) if cmkArgs != nil { break } } } return cmkArgs } func parseCmkCmdline(args []string) *cmkLegacyArgs { parsedArgs := cmkLegacyArgs{} // Create parser cmkCmd := flag.NewFlagSet("cmk-legacy", flag.ContinueOnError) cmkCmd.SetOutput(io.Discard) cmkCmd.StringVar(&parsedArgs.Pool, "pool", "", "pool to use") cmkCmd.Int64Var(&parsedArgs.SocketID, "socket-id", -1, "socket id to use") cmkCmd.BoolVar(&parsedArgs.NoAffinity, "no-affinity", false, "Do not set cpu affinity before forking the child command") // Args that we're not really interested in _ = cmkCmd.String("conf-dir", "", "CMK configuration directory") if len(args) > 1 && args[0] == "cmk" && args[1] == "isolate" { err := cmkCmd.Parse(args[2:]) // Parse out (i.e. ignore) all unknown args for err != nil { err = cmkCmd.Parse(cmkCmd.Args()) } // Pool needs to be defined if parsedArgs.Pool != "" { parsedArgs.Command = cmkCmd.Args() return &parsedArgs } } return nil } func (stp *stp) allocateStpResources(c cache.Container, cs stpContainerStatus) error { var CPULists [](*cpuList) // Get pool configuration for this container pool, ok := stp.conf.Pools[cs.Pool] if !ok { return stpError("BUG: pool %q not found", cs.Pool) } availableCPULists := getAvailableCPULists(cs.Socket, &pool) if pool.Exclusive { if cs.NExclusiveCPUs < 1 { return stpError("exclusive pool specified but the number of exclusive CPUs requested is 0") } // Check the possible deprecated CMK_NUM_CORES setting. Print a warning // if this does not match what was requested through extended resources envNumCores, ok := c.GetEnv(CmkEnvNumCores) if ok { iNumCores, err := strconv.ParseInt(envNumCores, 10, 64) if err != nil || iNumCores != cs.NExclusiveCPUs { stp.Warn("Ignoring deprecated env variable setting, %s=%q does "+ "not match the number of cores (%d) from resource request", CmkEnvNumCores, envNumCores, cs.NExclusiveCPUs) } } if int64(len(availableCPULists)) < cs.NExclusiveCPUs { if cs.Socket < 0 { return stpError("not enough free cpu lists in pool %q", cs.Pool) } return stpError("not enough free cpu lists in pool %q with socket id %d", cs.Pool, cs.Socket) } CPULists = availableCPULists[0:cs.NExclusiveCPUs] } else { /* NOTE (from CMK): This allocation algorithm is probably an oversimplification, however for known use cases the non-exclusive pools should never have more than one cpu list anyhow. If that ceases to hold in the future, we could explore population or load-based spreading. Keeping it simple for now.
*/ if len(availableCPULists) == 0 { return stpError("no available cpu lists in pool %q with socket id %d", cs.Pool, cs.Socket) } i := rand.Int31n(int32(len(availableCPULists))) CPULists = availableCPULists[i : i+1] } containerID := c.GetCacheID() cpuset := "" sep := "" for _, cl := range CPULists { cl.addContainer(containerID) cpuset += sep + cl.Cpuset sep = "," cs.Cpusets = append(cs.Cpusets, cl.Cpuset) } // Commit our changes containers := stp.getContainerRegistry() (*containers)[containerID] = cs stp.setContainerRegistry(containers) if cs.NoAffinity { stp.Info("not setting cpuset for container %q as --no-affinity was specified", containerID) } else { stp.Info("setting cpuset of container %q to %q", containerID, cpuset) c.SetCpusetCpus(cpuset) } c.SetEnv(CmkEnvAssigned, cpuset) // Advertise CPUs belonging to the infra pool pool, ok = stp.conf.Pools[CmkPoolInfra] if ok { c.SetEnv(CmkEnvInfra, pool.cpuSet()) } // Advertise CPUs belonging to the shared pool pool, ok = stp.conf.Pools[CmkPoolShared] if ok { c.SetEnv(CmkEnvShared, pool.cpuSet()) } return nil } // getAvailableCPULists constructs a list of available cpu lists that satisfy // the possible socket constraint func getAvailableCPULists(socket int64, pool *poolConfig) [](*cpuList) { availableCPULists := make([](*cpuList), 0, len(pool.CPULists)) for _, c := range pool.CPULists { if socket < 0 || socket == int64(c.Socket) { if pool.Exclusive && len(c.getContainers()) > 0 { continue } availableCPULists = append(availableCPULists, c) } } return availableCPULists } func (stp *stp) releaseStpResources(containerID string) error { ccr := *stp.getContainerRegistry() if cs, ok := ccr[containerID]; ok { pool, ok := stp.conf.Pools[cs.Pool] if !ok { return stpError("BUG: pool %q for container %q not found", cs.Pool, containerID) } for _, clist := range pool.CPULists { clist.removeContainer(containerID) } delete(ccr, containerID) // Commit our changes to stp cache stp.setContainerRegistry(&ccr) } return nil } // // Handling of cached data // const ( cacheKeyContainerRegistry = "ContainerRegistry" ) type stpContainerStatus struct { Pool string // pool configuration Socket int64 // physical socket id NExclusiveCPUs int64 // number of exclusive cpus Cpusets []string // cpusets (cpu lists) assigned to this container NoAffinity bool // disable cpuset enforcing } // stpContainerCache contains STP-specific data of containers type stpContainerCache map[string]stpContainerStatus // Set the value of cached cachableContainerRegistry object func (c *stpContainerCache) Set(value interface{}) { switch value.(type) { case stpContainerCache: *c = value.(stpContainerCache) case *stpContainerCache: cp := value.(*stpContainerCache) *c = *cp } } // Get the cached cachableContainerRegistry object func (c *stpContainerCache) Get() interface{} { return *c } // getContainerRegistry gets the current state of our container registry func (stp *stp) getContainerRegistry() *stpContainerCache { ccr := &stpContainerCache{} if !stp.state.GetPolicyEntry(cacheKeyContainerRegistry, ccr) { stp.Error("no cached container registry found") } return ccr } // setContainerRegistry caches the state of our container registry func (stp *stp) setContainerRegistry(ccr *stpContainerCache) { stp.state.SetPolicyEntry(cacheKeyContainerRegistry, cache.Cachable(ccr)) }
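// NOTE (editor's illustration, not part of the original source): a minimal
// usage sketch for getAvailableCPULists above. A socket argument of -1
// disables socket filtering, and for exclusive pools only cpu lists with no
// containers assigned are considered free. The function name is hypothetical.
func exampleCountFreeCPULists(pool *poolConfig) int {
	anySocket := int64(-1) // no socket constraint
	return len(getAvailableCPULists(anySocket, pool))
}

// Register us as a policy implementation.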
func init() { policy.Register(PolicyName, PolicyDescription, CreateStpPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-pools/stp-policy_test.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package stp import ( "encoding/json" "testing" "github.com/google/go-cmp/cmp" logger "github.com/intel/cri-resource-manager/pkg/log" ) func TestParseContainerCmdline(t *testing.T) { stp := &stp{Logger: logger.NewLogger(PolicyName + "-test")} // 1. empty command line should return a nil pointer args := stp.parseContainerCmdline([]string{}, []string{}) if args != nil { t.Errorf("Expected <nil> but got %v", *args) } // 2. case where cmk isolate command is in container "Command" args = stp.parseContainerCmdline([]string{"cmk", "isolate", "--pool", "foo", "--socket-id=2", "--conf-dir=/etc", "cmd", "-arg"}, []string{}) expected := cmkLegacyArgs{Pool: "foo", SocketID: 2, Command: []string{"cmd", "-arg"}} if args == nil || !cmp.Equal(expected, *args) { t.Errorf("Expected %v but got %v", expected, *args) } // 3. we should ignore unknown cmk options args = stp.parseContainerCmdline([]string{"cmk", "isolate", "--invalid-1=inv1", "--pool", "foo", "--invalid-2=inv2", "cmd", "--arg"}, []string{}) expected = cmkLegacyArgs{Pool: "foo", SocketID: -1, Command: []string{"cmd", "--arg"}} if args == nil || !cmp.Equal(expected, *args) { t.Errorf("Expected %v but got %v", expected, *args) } // 4. --pool should be defined in cmk options args = stp.parseContainerCmdline([]string{"cmk", "isolate", "--socket-id=2", "cmd", "--arg"}, []string{}) if args != nil { t.Errorf("Expected <nil> but got %v", *args) } // 5. parsing from container "Args" args = stp.parseContainerCmdline([]string{"bash"}, []string{"-c", "cmk isolate --pool=foo --socket-id=2 cmd --arg"}) expected = cmkLegacyArgs{Pool: "foo", SocketID: 2, Command: []string{"cmd", "--arg"}} if args == nil || !cmp.Equal(expected, *args) { t.Errorf("Expected %v but got %v", expected, *args) } // 6.
Only _cmk_ isolate should be accepted args = stp.parseContainerCmdline([]string{"bash"}, []string{"-c", "dmk isolate --pool=foo cmd --arg"}) if args != nil { t.Errorf("Expected <nil> but got %v", *args) } } func TestCachableData(t *testing.T) { ccr := &stpContainerCache{"id1": stpContainerStatus{Pool: "p", Socket: 1}} // Test JSON marshalling of cached data data, err := json.Marshal(ccr) if err != nil { t.Errorf("JSON marshal failed: %v", err) } expected := []byte(`{"id1":{"Pool":"p","Socket":1,"NExclusiveCPUs":0,"Cpusets":null,"NoAffinity":false}}`) if !cmp.Equal(expected, data) { t.Errorf("Expected %s but got %s", expected, data) } // Test JSON unmarshalling of cached data ccr2 := &stpContainerCache{} err = json.Unmarshal(data, ccr2) if err != nil { t.Errorf("JSON unmarshal failed: %v", err) } if !cmp.Equal(*ccr, *ccr2) { t.Errorf("Expected %v but got %v", *ccr, *ccr2) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/affinity.go ================================================ // Copyright Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" ) // Calculate pool affinities for the given container. func (p *policy) calculatePoolAffinities(container cache.Container) (map[int]int32, error) { log.Debug("=> calculating pool affinities...") affinities, err := p.calculateContainerAffinity(container) if err != nil { return nil, err } result := make(map[int]int32, len(p.nodes)) for id, w := range affinities { grant, ok := p.allocations.grants[id] if !ok { continue } node := grant.GetCPUNode() result[node.NodeID()] += w // TODO: calculate affinity for memory here too? } return result, nil } // Calculate affinity of this container (against all other containers). func (p *policy) calculateContainerAffinity(container cache.Container) (map[string]int32, error) { log.Debug("* calculating affinity for container %s...", container.PrettyName()) ca, err := container.GetAffinity() if err != nil { return nil, err } result := make(map[string]int32) for _, a := range ca { for id, w := range p.cache.EvaluateAffinity(a) { result[id] += w } } // self-affinity does not make sense, so remove any delete(result, container.GetCacheID()) log.Debug(" => affinity: %v", result) return result, nil }
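// NOTE (editor's illustration, not part of the original source): a minimal
// sketch of how the per-container weights from calculateContainerAffinity
// above fold into the per-pool scores of calculatePoolAffinities: each
// container with a CPU grant contributes its weight to the pool the grant
// lives in. The function name and parameters are hypothetical.
func exampleFoldAffinities(weights map[string]int32, poolOfContainer map[string]int) map[int]int32 {
	result := make(map[int]int32, len(poolOfContainer))
	for id, w := range weights {
		if pool, ok := poolOfContainer[id]; ok { // skip containers without a grant
			result[pool] += w
		}
	}
	return result
}

// Register our policy-specific implicit affinities with the Cache.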
func (p *policy) registerImplicitAffinities() error { affinities := []struct { name string disabled bool affinity cache.ImplicitAffinity }{ { name: "AVX512-pull/push", affinity: func(c cache.Container, hasExplicit bool) *cache.Affinity { _, tagged := c.GetTag(cache.TagAVX512) if tagged { return cache.GlobalAffinity("tags/"+cache.TagAVX512, 5) } return cache.GlobalAntiAffinity("tags/"+cache.TagAVX512, 5) }, }, { name: "colocate-pods", disabled: !opt.ColocatePods, affinity: func(c cache.Container, hasExplicit bool) *cache.Affinity { if hasExplicit { return nil } pod, ok := c.GetPod() if !ok { log.Error("failed to inject pod-colocation affinity, can't find pod") return nil } return &cache.Affinity{ Scope: pod.ScopeExpression(), Match: &resmgr.Expression{ Op: resmgr.AlwaysTrue, }, Weight: 10, } }, }, { name: "colocate-namespaces", disabled: !opt.ColocateNamespaces, affinity: func(c cache.Container, hasExplicit bool) *cache.Affinity { if hasExplicit { return nil } return &cache.Affinity{ Scope: &resmgr.Expression{ Op: resmgr.AlwaysTrue, }, Match: &resmgr.Expression{ Key: resmgr.KeyNamespace, Op: resmgr.Equals, Values: []string{ c.GetNamespace(), }, }, Weight: 10, } }, }, } enabled := map[string]cache.ImplicitAffinity{} for _, a := range affinities { if a.disabled { log.Info("implicit affinity %s is disabled", a.name) continue } enabled[PolicyName+":"+a.name] = a.affinity } if err := p.cache.AddImplicitAffinities(enabled); err != nil { return policyError("failed to register implicit affinities: %v", err) } return nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/cache.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "encoding/json" "time" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) const ( keyAllocations = "allocations" keyConfig = "config" ) func (p *policy) saveAllocations() { p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&p.allocations)) p.cache.Save() } func (p *policy) restoreAllocations(allocations *allocations) error { savedAllocations := allocations.clone() p.allocations = p.newAllocations() // // Try to reinstate all grants with the exact same resource assignments // as saved. If that fails, release and try to reallocate all corresponding // containers with pool hints pointing to the currently assigned pools. If // this fails too, save the original allocations unchanged to the cache and // return an error. 
// if err := p.reinstateGrants(allocations.grants); err != nil { log.Error("failed to reinstate grants verbatim: %v", err) containers, poolHints := allocations.getContainerPoolHints() if err := p.reallocateResources(containers, poolHints); err != nil { p.allocations = savedAllocations p.saveAllocations() // undo any potential changes in saved cache return err } } return nil } // reinstateGrants tries to restore the given grants exactly as saved. func (p *policy) reinstateGrants(grants map[string]Grant) error { for id, grant := range grants { c := grant.GetContainer() pool := grant.GetCPUNode() supply := pool.FreeSupply() if err := supply.Reserve(grant); err != nil { return policyError("failed to update pool %q with CPU grant of %q: %v", pool.Name(), c.PrettyName(), err) } log.Info("updated pool %q with reinstated CPU grant of %q", pool.Name(), c.PrettyName()) pool = grant.GetMemoryNode() if err := supply.ReserveMemory(grant); err != nil { grant.GetCPUNode().FreeSupply().ReleaseCPU(grant) return policyError("failed to update pool %q with extra memory of %q: %v", pool.Name(), c.PrettyName(), err) } log.Info("updated pool %q with reinstated memory reservation of %q", pool.Name(), c.PrettyName()) p.allocations.grants[id] = grant p.applyGrant(grant) } p.updateSharedAllocations(nil) return nil } type cachedGrant struct { Exclusive string Part int CPUType cpuClass Container string Pool string MemoryPool string MemType memoryType Memset idset.IDSet MemoryLimit memoryMap ColdStart time.Duration } func newCachedGrant(cg Grant) *cachedGrant { ccg := &cachedGrant{} ccg.Exclusive = cg.ExclusiveCPUs().String() ccg.Part = cg.CPUPortion() ccg.CPUType = cg.CPUType() ccg.Container = cg.GetContainer().GetCacheID() ccg.Pool = cg.GetCPUNode().Name() ccg.MemoryPool = cg.GetMemoryNode().Name() ccg.MemType = cg.MemoryType() ccg.Memset = cg.Memset().Clone() ccg.MemoryLimit = make(memoryMap) for key, value := range cg.MemLimit() { ccg.MemoryLimit[key] = value } ccg.ColdStart = cg.ColdStart() return ccg } func (ccg *cachedGrant) ToGrant(policy *policy) (Grant, error) { node, ok := policy.nodes[ccg.Pool] if !ok { return nil, policyError("cache error: failed to restore %v, unknown pool/node", *ccg) } container, ok := policy.cache.LookupContainer(ccg.Container) if !ok { return nil, policyError("cache error: failed to restore %v, unknown container", *ccg) } g := newGrant( node, container, ccg.CPUType, cpuset.MustParse(ccg.Exclusive), ccg.Part, ccg.MemType, ccg.MemoryLimit, ccg.ColdStart, ) if g.Memset().String() != ccg.Memset.String() { log.Error("cache error: mismatch in stored/recalculated memset: %s != %s", ccg.Memset, g.Memset()) } return g, nil } func (cg *grant) MarshalJSON() ([]byte, error) { return json.Marshal(newCachedGrant(cg)) } func (cg *grant) UnmarshalJSON(data []byte) error { ccg := cachedGrant{} if err := json.Unmarshal(data, &ccg); err != nil { return policyError("failed to restore grant: %v", err) } cg.exclusive = cpuset.MustParse(ccg.Exclusive) return nil } func (a *allocations) MarshalJSON() ([]byte, error) { cgrants := make(map[string]*cachedGrant) for id, cg := range a.grants { cgrants[id] = newCachedGrant(cg) } return json.Marshal(cgrants) } func (a *allocations) UnmarshalJSON(data []byte) error { var err error cgrants := make(map[string]*cachedGrant) if err := json.Unmarshal(data, &cgrants); err != nil { return policyError("failed to restore allocations: %v", err) } a.grants = make(map[string]Grant, 32) for id, ccg := range cgrants { a.grants[id], err = ccg.ToGrant(a.policy) if err != nil {
log.Error("removing unresolvable cached grant %v: %v", *ccg, err) delete(a.grants, id) } else { log.Debug("resolved cached grant: %v", a.grants[id].String()) } } return nil } func (a *allocations) Get() interface{} { return a } func (a *allocations) Set(value interface{}) { var from *allocations switch value.(type) { case allocations: v := value.(allocations) from = &v case *allocations: from = value.(*allocations) } a.grants = make(map[string]Grant, 32) for id, cg := range from.grants { a.grants[id] = cg } } func (a *allocations) Dump(logfn func(format string, args ...interface{}), prefix string) { for _, cg := range a.grants { logfn(prefix+"%s", cg) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/cache_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "bytes" "testing" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) func TestToGrant(t *testing.T) { tcases := []struct { name string policy *policy cgrant *cachedGrant expectedError bool }{ { name: "unknown node", cgrant: &cachedGrant{}, policy: &policy{ nodes: map[string]Node{ "node1": &node{}, }, }, expectedError: true, }, { name: "known node but failed lookup", cgrant: &cachedGrant{ Pool: "node1", }, policy: &policy{ nodes: map[string]Node{ "node1": &node{}, }, cache: &mockCache{}, }, expectedError: true, }, { name: "known node", cgrant: &cachedGrant{ Pool: "node1", }, policy: &policy{ nodes: map[string]Node{ "node1": &node{}, }, cache: &mockCache{ returnValue2ForLookupContainer: true, }, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { _, err := tc.cgrant.ToGrant(tc.policy) if tc.expectedError && err == nil { t.Errorf("Expected error, but got success") } if !tc.expectedError && err != nil { t.Errorf("Unexpected error: %+v", err) } }) } } func TestAllocationMarshalling(t *testing.T) { tcases := []struct { name string data []byte expectedUnmarshallingError bool expectedMarshallingError bool }{ { name: "non-zero Exclusive", data: []byte(`{"key1":{"Exclusive":"1","Part":1,"CPUType":0,"Container":"1","Pool":"testnode","MemoryPool":"testnode","MemType":"DRAM,PMEM,HBM","Memset":"","MemoryLimit":{},"ColdStart":0}}`), }, { name: "zero Exclusive", data: []byte(`{"key1":{"Exclusive":"","Part":1,"CPUType":0,"Container":"1","Pool":"testnode","MemoryPool":"testnode","MemType":"DRAM,PMEM,HBM","Memset":"","MemoryLimit":{},"ColdStart":0}}`), }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { alloc := &allocations{ policy: &policy{ nodes: map[string]Node{ "testnode": &virtualnode{ node: node{ name: "testnode", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(0, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(0, 0, 0), createMemoryMap(0, 0, 0)), }, }, }, cache:
&mockCache{ returnValue1ForLookupContainer: &mockContainer{ returnValueForGetCacheID: "1", }, returnValue2ForLookupContainer: true, }, }, } unmarshallingErr := alloc.UnmarshalJSON(tc.data) if tc.expectedUnmarshallingError && unmarshallingErr == nil { t.Errorf("Expected unmarshalling error, but got success") } if !tc.expectedUnmarshallingError && unmarshallingErr != nil { t.Errorf("Unexpected unmarshalling error: %+v", unmarshallingErr) } out, marshallingErr := alloc.MarshalJSON() if !bytes.Equal(out, tc.data) { t.Errorf("Expected\n%q\nBut got\n%q", tc.data, out) } if tc.expectedMarshallingError && marshallingErr == nil { t.Errorf("Expected marshalling error, but got success") } if !tc.expectedMarshallingError && marshallingErr != nil { t.Errorf("Unexpected marshalling error: %+v", marshallingErr) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/coldstart.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "time" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" ) // trigger cold start for the container if necessary. func (p *policy) triggerColdStart(c cache.Container) error { log.Info("coldstart: triggering coldstart for %s...", c.PrettyName()) g, ok := p.allocations.grants[c.GetCacheID()] if !ok { log.Warn("coldstart: no grant found, nothing to do...") return nil } coldStart := g.ColdStart() if coldStart <= 0 { log.Info("coldstart: no coldstart, nothing to do...") return nil } // Start a timer to restore the grant memset to full. Store the // timer so that we can release it if the grant is destroyed before // the timer elapses. duration := coldStart timer := time.AfterFunc(duration, func() { e := &events.Policy{ Type: ColdStartDone, Source: PolicyName, Data: c.GetID(), } if err := p.options.SendEvent(e); err != nil { // we should retry this later, the channel is probably full... log.Error("Ouch... we should retry this later.") } }) g.AddTimer(timer) return nil } // finish an ongoing coldstart for the container. func (p *policy) finishColdStart(c cache.Container) (bool, error) { g, ok := p.allocations.grants[c.GetCacheID()] if !ok { log.Warn("coldstart: no grant found, nothing to do...") return false, policyError("coldstart: no grant found for %s", c.PrettyName()) } log.Info("restoring memset to grant %v", g) g.RestoreMemset() g.ClearTimer() return true, nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/coldstart_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "fmt" "sync" "testing" "time" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" system "github.com/intel/cri-resource-manager/pkg/sysfs" idset "github.com/intel/goresctrl/pkg/utils" ) var globalPolicy *policy var mutex sync.Mutex func sendEvent(param interface{}) error { // Simulate event synchronization in the upper levels. mutex.Lock() defer mutex.Unlock() fmt.Printf("Event received: %v\n", param) event := param.(*events.Policy) globalPolicy.HandleEvent(event) return nil } func TestColdStart(t *testing.T) { // The idea with cold start is that the workload is first allocated only the PMEM node. Only when the timer expires // (or some other event is triggered) is the DRAM node added to the memset. This causes the initial // memory allocations to be made from PMEM only. tcases := []struct { name string numaNodes []system.Node req Request affinities map[int]int32 container cache.Container expectedColdStartTimeout time.Duration expectedDRAMNodeID int expectedPMEMNodeID int expectedDRAMSystemNodeID idset.ID expectedPMEMSystemNodeID idset.ID }{ { name: "three node cold start", numaNodes: []system.Node{ &mockSystemNode{id: 1, memFree: 10000, memTotal: 10000, memType: system.MemoryTypeDRAM, distance: []int{5, 5, 1}}, &mockSystemNode{id: 2, memFree: 50000, memTotal: 50000, memType: system.MemoryTypePMEM, distance: []int{5, 1, 5}}, }, container: &mockContainer{ name: "demo-coldstart-container", returnValueForGetCacheID: "1234", pod: &mockPod{ coldStartTimeout: 1000 * time.Millisecond, returnValue1FotGetResmgrAnnotation: "demo-coldstart-container: pmem,dram", returnValue2FotGetResmgrAnnotation: true, coldStartContainerName: "demo-coldstart-container", }, }, expectedColdStartTimeout: 1000 * time.Millisecond, expectedDRAMNodeID: 101, expectedDRAMSystemNodeID: idset.ID(1), expectedPMEMSystemNodeID: idset.ID(2), expectedPMEMNodeID: 102, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { policy := &policy{ sys: &mockSystem{ nodes: tc.numaNodes, }, cache: &mockCache{ returnValue1ForLookupContainer: tc.container, returnValue2ForLookupContainer: true, }, allocations: allocations{ grants: make(map[string]Grant, 0), }, options: &policyapi.BackendOptions{}, } policy.allocations.policy = policy policy.options.SendEvent = sendEvent if err := policy.buildPoolsByTopology(); err != nil { t.Errorf("failed to build topology pool") } grant, err := policy.allocatePool(tc.container, "") if err != nil { panic(err) } if grant.ColdStart() != tc.expectedColdStartTimeout { t.Errorf("Expected coldstart value '%v', but got '%v'", tc.expectedColdStartTimeout, grant.ColdStart()) } policy.allocations.grants[tc.container.GetCacheID()] = grant mems := grant.Memset() if len(mems) != 1 || mems.Members()[0] != tc.expectedPMEMSystemNodeID { t.Errorf("Expected one memory controller %v, got: %v", tc.expectedPMEMSystemNodeID, mems) } if grant.MemoryType()&memoryDRAM != 0 { // FIXME: should we report only the limited memory
types or the granted types // while the cold start is going on? // t.Errorf("No DRAM was expected before coldstart timer: %v", grant.MemoryType()) } globalPolicy = policy policy.options.SendEvent(&events.Policy{ Type: events.ContainerStarted, Data: tc.container, }) time.Sleep(tc.expectedColdStartTimeout * 2) newMems := grant.Memset() if len(newMems) != 2 { t.Errorf("Expected two memory controllers, got %d: %v", len(newMems), newMems) } if !newMems.Has(tc.expectedPMEMSystemNodeID) || !newMems.Has(tc.expectedDRAMSystemNodeID) { t.Errorf("Didn't get all expected system nodes in mems, got: %v", newMems) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "fmt" ) // policyError creates a formatted policy-specific error. func policyError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/flags.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( config "github.com/intel/cri-resource-manager/pkg/config" ) // Options captures our configurable policy parameters. type options struct { // PinCPU controls CPU pinning in this policy. PinCPU bool // PinMemory controls memory pinning in this policy. PinMemory bool // PreferIsolated controls whether isolated CPUs are preferred for isolated allocations. PreferIsolated bool `json:"PreferIsolatedCPUs"` // PreferShared controls whether shared CPU allocation is always preferred by default. PreferShared bool `json:"PreferSharedCPUs"` // ReservedPoolNamespaces is a list of namespace globs that will be allocated to reserved CPUs ReservedPoolNamespaces []string `json:"ReservedPoolNamespaces,omitempty"` // ColocatePods causes all containers in a pod to have affinity for each other. ColocatePods bool `json:"ColocatePods"` // ColocateNamespaces causes all containers in a namespace to have affinity for each other. ColocateNamespaces bool `json:"ColocateNamespaces"` } // Our runtime configuration. 
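To illustrate how the JSON tags above shape external configuration, here is a minimal standalone sketch (illustrative only, not from the repository) that unmarshals a configuration fragment into a trimmed copy of the options struct; the real plumbing goes through config.Register just below.

package main

import (
	"encoding/json"
	"fmt"
)

// trimmed copy of the policy options struct, for illustration only
type options struct {
	PinCPU                 bool
	PinMemory              bool
	PreferIsolated         bool     `json:"PreferIsolatedCPUs"`
	PreferShared           bool     `json:"PreferSharedCPUs"`
	ReservedPoolNamespaces []string `json:"ReservedPoolNamespaces,omitempty"`
}

func main() {
	// a configuration fragment using the external (tagged) key names
	raw := []byte(`{"PreferIsolatedCPUs": false, "ReservedPoolNamespaces": ["kube-system", "monitoring-*"]}`)
	o := options{PinCPU: true, PinMemory: true, PreferIsolated: true}
	if err := json.Unmarshal(raw, &o); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", o) // PreferIsolated overridden via its PreferIsolatedCPUs alias
}

// Our runtime configuration.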
var opt = defaultOptions().(*options)
var aliasOpt = defaultOptions().(*options)

// defaultOptions returns a new options instance, all initialized to defaults.
func defaultOptions() interface{} {
	return &options{
		PinCPU:                 true,
		PinMemory:              true,
		PreferIsolated:         true,
		PreferShared:           false,
		ReservedPoolNamespaces: []string{"kube-system"},
	}
}

// Register us for configuration handling.
func init() {
	config.Register(PolicyPath, PolicyDescription, opt, defaultOptions)
	config.Register(AliasPath, PolicyDescription, aliasOpt, defaultOptions)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/hint.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
	"strconv"
	"strings"

	system "github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/topology"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	idset "github.com/intel/goresctrl/pkg/utils"
)

// Calculate the hint score of the given hint and CPUSet.
func cpuHintScore(hint topology.Hint, CPUs cpuset.CPUSet) float64 {
	hCPUs, err := cpuset.Parse(hint.CPUs)
	if err != nil {
		log.Warn("invalid hint CPUs '%s' from %s", hint.CPUs, hint.Provider)
		return 0.0
	}
	common := hCPUs.Intersection(CPUs)
	return float64(common.Size()) / float64(hCPUs.Size())
}

// Calculate the NUMA node score of the given hint and NUMA node.
func numaHintScore(hint topology.Hint, sysIDs ...idset.ID) float64 {
	for _, idstr := range strings.Split(hint.NUMAs, ",") {
		hID, err := strconv.ParseInt(idstr, 0, 0)
		if err != nil {
			log.Warn("invalid hint NUMA node %s from %s", idstr, hint.Provider)
			return 0.0
		}
		for _, id := range sysIDs {
			if hID == int64(id) {
				return 1.0
			}
		}
	}
	return 0.0
}

// Calculate the die score of the given hint and die.
func dieHintScore(hint topology.Hint, sysID idset.ID, socket system.CPUPackage) float64 {
	numaNodes := idset.NewIDSet(socket.DieNodeIDs(sysID)...)
	for _, idstr := range strings.Split(hint.NUMAs, ",") {
		hID, err := strconv.ParseInt(idstr, 0, 0)
		if err != nil {
			log.Warn("invalid hint NUMA node %s from %s", idstr, hint.Provider)
			return 0.0
		}
		if numaNodes.Has(idset.ID(hID)) {
			return 1.0
		}
	}
	return 0.0
}

// Calculate the socket score of the given hint and socket.
func socketHintScore(hint topology.Hint, sysID idset.ID) float64 {
	for _, idstr := range strings.Split(hint.Sockets, ",") {
		id, err := strconv.ParseInt(idstr, 0, 0)
		if err != nil {
			log.Warn("invalid hint socket '%s' from %s", idstr, hint.Provider)
			return 0.0
		}
		if id == int64(sysID) {
			return 1.0
		}
	}
	return 0.0
}
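For a feel of the arithmetic in cpuHintScore, here is a minimal standalone sketch (invented helper, not from the repository) of the same fraction computation over plain int slices instead of cpusets.

package main

import "fmt"

// fractionInSet mirrors the arithmetic of cpuHintScore above: the score is
// the fraction of the hinted CPUs that fall within the candidate cpuset.
func fractionInSet(hinted, pool []int) float64 {
	in := map[int]bool{}
	for _, c := range pool {
		in[c] = true
	}
	common := 0
	for _, c := range hinted {
		if in[c] {
			common++
		}
	}
	// note: like cpuHintScore, this divides by the hint size, so an empty
	// hint would yield NaN; the disabled case in hint_test.go tracks that
	return float64(common) / float64(len(hinted))
}

func main() {
	// a hint for CPUs 1 and 2 scored against a pool that only owns CPU 1
	fmt.Println(fractionInSet([]int{1, 2}, []int{1})) // 0.5
}

// return the cpuset for the CPU, NUMA or socket hints, preferred in this particular order.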
func (cs *supply) hintCpus(h topology.Hint) cpuset.CPUSet { var cpus cpuset.CPUSet switch { case h.CPUs != "": cpus = cpuset.MustParse(h.CPUs) case h.NUMAs != "": for _, idstr := range strings.Split(h.NUMAs, ",") { if id, err := strconv.ParseInt(idstr, 0, 0); err == nil { if node := cs.node.System().Node(idset.ID(id)); node != nil { cpus = cpus.Union(node.CPUSet()) } } } case h.Sockets != "": for _, idstr := range strings.Split(h.Sockets, ",") { if id, err := strconv.ParseInt(idstr, 0, 0); err == nil { if pkg := cs.node.System().Package(idset.ID(id)); pkg != nil { cpus = cpus.Union(pkg.CPUSet()) } } } } return cpus } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/hint_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "testing" "github.com/intel/cri-resource-manager/pkg/topology" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) func TestCpuHintScore(t *testing.T) { tcases := []struct { name string expected float64 hint topology.Hint cpus cpuset.CPUSet disabled bool // TODO(rojkov): remove this field when the code is fixed. }{ { name: "handle zero cpu size gracefully", disabled: true, }, { name: "handle unparsable cpu size gracefully", hint: topology.Hint{ CPUs: "unparsable", }, }, { name: "non-zero cpu size hint and empty CPUs", hint: topology.Hint{ CPUs: "1", }, }, { name: "hint corresponding to given CPU", hint: topology.Hint{ CPUs: "1,2", }, cpus: cpuset.New(1), expected: 0.5, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skipf("The case '%s' is skipped", tc.name) } actual := cpuHintScore(tc.hint, tc.cpus) if actual != tc.expected { t.Errorf("Expected %f, but got %f", tc.expected, actual) } }) } } func TestNumaHintScore(t *testing.T) { tcases := []struct { name string expected float64 hint topology.Hint ids []idset.ID }{ { name: "handle unparsable NUMAs gracefully", hint: topology.Hint{ NUMAs: "unparsable", }, }, { name: "non-zero NUMA hint and empty NUMAs", hint: topology.Hint{ NUMAs: "1", }, }, { name: "hint corresponding to a given ID", ids: []idset.ID{1}, hint: topology.Hint{ NUMAs: "1,2", }, expected: 1.0, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { actual := numaHintScore(tc.hint, tc.ids...) 
if actual != tc.expected { t.Errorf("Expected %f, but got %f", tc.expected, actual) } }) } } func TestSocketHintScore(t *testing.T) { tcases := []struct { name string expected float64 hint topology.Hint id idset.ID }{ { name: "handle unparsable Sockets gracefully", hint: topology.Hint{ Sockets: "unparsable", }, }, { name: "non-zero Sockets hint and empty Sockets", hint: topology.Hint{ Sockets: "1", }, }, { name: "hint corresponding to a given ID", id: 1, hint: topology.Hint{ Sockets: "1,2", }, expected: 1.0, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { actual := socketHintScore(tc.hint, tc.id) if actual != tc.expected { t.Errorf("Expected %f, but got %f", tc.expected, actual) } }) } } func TestHintCpus(t *testing.T) { tcases := []struct { name string supply *supply hint topology.Hint expected cpuset.CPUSet }{ { name: "handle unparsable Sockets gracefully", supply: &supply{}, hint: topology.Hint{ Sockets: "unparsable", }, }, { name: "non-zero Sockets hint and empty system.Package", supply: &supply{ node: &node{ policy: &policy{ sys: &mockSystem{}, }, }, }, hint: topology.Hint{ Sockets: "1", }, }, { name: "handle unparsable NUMAs gracefully", supply: &supply{}, hint: topology.Hint{ NUMAs: "unparsable", }, }, { name: "non-zero NUMAs hint and empty system.Node", supply: &supply{ node: &node{ policy: &policy{ sys: &mockSystem{}, }, }, }, hint: topology.Hint{ NUMAs: "1", }, }, // TODO(rojkov): add tests for non-empty system.Package's (can't be done while system.Package is closed struct) { name: "non-zero CPUs hint", supply: &supply{}, hint: topology.Hint{ CPUs: "1", }, expected: cpuset.New(1), }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { actual := tc.supply.hintCpus(tc.hint) if tc.expected.IsEmpty() && actual.IsEmpty() { return } if !tc.expected.Equals(actual) { t.Errorf("Expected %+v, but got %+v", tc.expected, actual) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/logging.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "fmt" logger "github.com/intel/cri-resource-manager/pkg/log" ) // Create our logger instance. var log logger.Logger = logger.NewLogger("policy") // indent produces an indentation string for the given level. const ( IndentDepth = 4 ) func indent(prefix string, level ...int) string { if len(level) < 1 { return prefix } depth := level[0] * IndentDepth return prefix + fmt.Sprintf("%*.*s", depth, depth, "") } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/mocks_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "os" "time" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config" system "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/topology" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" "github.com/intel/goresctrl/pkg/sst" idset "github.com/intel/goresctrl/pkg/utils" v1 "k8s.io/api/core/v1" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" ) type mockSystemNode struct { id idset.ID // node id memFree uint64 memTotal uint64 memType system.MemoryType distance []int } func (fake *mockSystemNode) MemoryInfo() (*system.MemInfo, error) { return &system.MemInfo{MemFree: fake.memFree, MemTotal: fake.memTotal}, nil } func (fake *mockSystemNode) PackageID() idset.ID { return 0 } func (fake *mockSystemNode) DieID() idset.ID { return 0 } func (fake *mockSystemNode) ID() idset.ID { return fake.id } func (fake *mockSystemNode) GetMemoryType() system.MemoryType { return fake.memType } func (fake *mockSystemNode) HasNormalMemory() bool { return true } func (fake *mockSystemNode) CPUSet() cpuset.CPUSet { return cpuset.New() } func (fake *mockSystemNode) Distance() []int { if len(fake.distance) == 0 { return []int{0} } return fake.distance } func (fake *mockSystemNode) DistanceFrom(id idset.ID) int { return 0 } type mockCPUPackage struct { } func (p *mockCPUPackage) ID() idset.ID { return idset.ID(0) } func (p *mockCPUPackage) CPUSet() cpuset.CPUSet { return cpuset.New() } func (p *mockCPUPackage) NodeIDs() []idset.ID { return []idset.ID{} } func (p *mockCPUPackage) DieIDs() []idset.ID { return []idset.ID{0} } func (p *mockCPUPackage) DieCPUSet(idset.ID) cpuset.CPUSet { return cpuset.New() } func (p *mockCPUPackage) DieNodeIDs(idset.ID) []idset.ID { return []idset.ID{} } func (p *mockCPUPackage) SstInfo() *sst.SstPackageInfo { return &sst.SstPackageInfo{} } type mockCPU struct { isolated cpuset.CPUSet online cpuset.CPUSet id idset.ID node mockSystemNode pkg mockCPUPackage } func (c *mockCPU) BaseFrequency() uint64 { return 0 } func (c *mockCPU) EPP() system.EPP { return system.EPPUnknown } func (c *mockCPU) ID() idset.ID { return idset.ID(0) } func (c *mockCPU) PackageID() idset.ID { return c.pkg.ID() } func (c *mockCPU) DieID() idset.ID { return idset.ID(0) } func (c *mockCPU) NodeID() idset.ID { return c.node.ID() } func (c *mockCPU) CoreID() idset.ID { return c.id } func (c *mockCPU) ThreadCPUSet() cpuset.CPUSet { return cpuset.New() } func (c *mockCPU) FrequencyRange() system.CPUFreq { return system.CPUFreq{} } func (c *mockCPU) Online() bool { return true } func (c *mockCPU) Isolated() bool { return false } func (c *mockCPU) SetFrequencyLimits(min, max uint64) error { return nil } func (c *mockCPU) SstClos() int { return -1 } type mockSystem struct { isolatedCPU int nodes []system.Node cpuCount int packageCount int socketCount int } func (fake *mockSystem) Node(id idset.ID) system.Node { for _, node := range fake.nodes { if node.ID() == id { return node } } return &mockSystemNode{} 
} func (fake *mockSystem) CPU(idset.ID) system.CPU { return &mockCPU{} } func (fake *mockSystem) CPUCount() int { if fake.cpuCount == 0 { return 1 } return fake.cpuCount } func (fake *mockSystem) Discover() error { return nil } func (fake *mockSystem) Package(idset.ID) system.CPUPackage { return &mockCPUPackage{} } func (fake *mockSystem) Offlined() cpuset.CPUSet { return cpuset.New() } func (fake *mockSystem) Isolated() cpuset.CPUSet { if fake.isolatedCPU > 0 { return cpuset.New(fake.isolatedCPU) } return cpuset.New() } func (fake *mockSystem) CPUSet() cpuset.CPUSet { return cpuset.New() } func (fake *mockSystem) CPUIDs() []idset.ID { return []idset.ID{} } func (fake *mockSystem) PackageCount() int { if fake.packageCount == 0 { return 1 } return fake.packageCount } func (fake *mockSystem) SocketCount() int { if fake.socketCount == 0 { return 1 } return fake.socketCount } func (fake *mockSystem) NUMANodeCount() int { return len(fake.nodes) } func (fake *mockSystem) ThreadCount() int { if fake.cpuCount == 0 { return 1 } return fake.cpuCount } func (fake *mockSystem) PackageIDs() []idset.ID { ids := make([]idset.ID, len(fake.nodes)) for i, node := range fake.nodes { ids[i] = node.PackageID() } return ids } func (fake *mockSystem) NodeIDs() []idset.ID { ids := make([]idset.ID, len(fake.nodes)) for i, node := range fake.nodes { ids[i] = node.ID() } return ids } func (fake *mockSystem) SetCPUFrequencyLimits(min, max uint64, cpus idset.IDSet) error { return nil } func (fake *mockSystem) SetCpusOnline(online bool, cpus idset.IDSet) (idset.IDSet, error) { return idset.NewIDSet(), nil } func (fake *mockSystem) NodeDistance(idset.ID, idset.ID) int { return 10 } type mockContainer struct { name string namespace string returnValueForGetResourceRequirements v1.ResourceRequirements returnValueForGetCacheID string returnValueForGetID string memoryLimit int64 cpuset cpuset.CPUSet returnValueForQOSClass v1.PodQOSClass pod cache.Pod } func (m *mockContainer) PrettyName() string { return m.name } func (m *mockContainer) GetPod() (cache.Pod, bool) { if m.pod == nil { return &mockPod{}, false } return m.pod, true } func (m *mockContainer) GetID() string { return m.returnValueForGetID } func (m *mockContainer) GetPodID() string { panic("unimplemented") } func (m *mockContainer) GetCacheID() string { if len(m.returnValueForGetCacheID) == 0 { return "0" } return m.returnValueForGetCacheID } func (m *mockContainer) GetName() string { return m.name } func (m *mockContainer) GetNamespace() string { return m.namespace } func (m *mockContainer) UpdateState(cache.ContainerState) { panic("unimplemented") } func (m *mockContainer) GetState() cache.ContainerState { panic("unimplemented") } func (m *mockContainer) GetQOSClass() v1.PodQOSClass { if len(m.returnValueForQOSClass) == 0 { return v1.PodQOSGuaranteed } return m.returnValueForQOSClass } func (m *mockContainer) GetImage() string { panic("unimplemented") } func (m *mockContainer) GetCommand() []string { panic("unimplemented") } func (m *mockContainer) GetArgs() []string { panic("unimplemented") } func (m *mockContainer) GetLabelKeys() []string { panic("unimplemented") } func (m *mockContainer) GetLabel(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetLabels() map[string]string { panic("unimplemented") } func (m *mockContainer) GetResmgrLabelKeys() []string { panic("unimplemented") } func (m *mockContainer) GetResmgrLabel(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetAnnotationKeys() []string { 
panic("unimplemented") } func (m *mockContainer) GetAnnotation(string, interface{}) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetResmgrAnnotationKeys() []string { panic("unimplemented") } func (m *mockContainer) GetResmgrAnnotation(string, interface{}) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetEffectiveAnnotation(key string) (string, bool) { pod, ok := m.GetPod() if !ok { return "", false } return pod.GetEffectiveAnnotation(key, m.name) } func (m *mockContainer) GetAnnotations() map[string]string { panic("unimplemented") } func (m *mockContainer) GetEnvKeys() []string { panic("unimplemented") } func (m *mockContainer) GetEnv(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetMounts() []cache.Mount { panic("unimplemented") } func (m *mockContainer) GetMountByHost(string) *cache.Mount { panic("unimplemented") } func (m *mockContainer) GetMountByContainer(string) *cache.Mount { panic("unimplemented") } func (m *mockContainer) GetDevices() []cache.Device { panic("unimplemented") } func (m *mockContainer) GetDeviceByHost(string) *cache.Device { panic("unimplemented") } func (m *mockContainer) GetDeviceByContainer(string) *cache.Device { panic("unimplemented") } func (m *mockContainer) GetResourceRequirements() v1.ResourceRequirements { return m.returnValueForGetResourceRequirements } func (m *mockContainer) GetLinuxResources() *criv1.LinuxContainerResources { panic("unimplemented") } func (m *mockContainer) SetCommand([]string) { panic("unimplemented") } func (m *mockContainer) SetArgs([]string) { panic("unimplemented") } func (m *mockContainer) SetLabel(string, string) { panic("unimplemented") } func (m *mockContainer) DeleteLabel(string) { panic("unimplemented") } func (m *mockContainer) SetAnnotation(string, string) { panic("unimplemented") } func (m *mockContainer) DeleteAnnotation(string) { panic("unimplemented") } func (m *mockContainer) SetEnv(string, string) { panic("unimplemented") } func (m *mockContainer) UnsetEnv(string) { panic("unimplemented") } func (m *mockContainer) InsertMount(*cache.Mount) { panic("unimplemented") } func (m *mockContainer) DeleteMount(string) { panic("unimplemented") } func (m *mockContainer) InsertDevice(*cache.Device) { panic("unimplemented") } func (m *mockContainer) DeleteDevice(string) { panic("unimplemented") } func (m *mockContainer) GetTopologyHints() topology.Hints { return topology.Hints{} } func (m *mockContainer) GetCPUPeriod() int64 { panic("unimplemented") } func (m *mockContainer) GetCPUQuota() int64 { panic("unimplemented") } func (m *mockContainer) GetCPUShares() int64 { panic("unimplemented") } func (m *mockContainer) GetMemoryLimit() int64 { return m.memoryLimit } func (m *mockContainer) GetOomScoreAdj() int64 { panic("unimplemented") } func (m *mockContainer) GetCpusetCpus() string { return m.cpuset.String() } func (m *mockContainer) GetCpusetMems() string { panic("unimplemented") } func (m *mockContainer) SetLinuxResources(*criv1.LinuxContainerResources) { panic("unimplemented") } func (m *mockContainer) SetCPUPeriod(int64) { panic("unimplemented") } func (m *mockContainer) SetCPUQuota(int64) { panic("unimplemented") } func (m *mockContainer) SetCPUShares(int64) { } func (m *mockContainer) SetMemoryLimit(int64) { panic("unimplemented") } func (m *mockContainer) SetOomScoreAdj(int64) { panic("unimplemented") } func (m *mockContainer) SetCpusetCpus(string) { } func (m *mockContainer) SetCpusetMems(string) { } func (m *mockContainer) 
UpdateCriCreateRequest(*criv1.CreateContainerRequest) error { panic("unimplemented") } func (m *mockContainer) CriUpdateRequest() (*criv1.UpdateContainerResourcesRequest, error) { panic("unimplemented") } func (m *mockContainer) GetAffinity() ([]*cache.Affinity, error) { return nil, nil } func (m *mockContainer) SetRDTClass(string) { panic("unimplemented") } func (m *mockContainer) GetRDTClass() string { panic("unimplemented") } func (m *mockContainer) SetBlockIOClass(string) { panic("unimplemented") } func (m *mockContainer) GetBlockIOClass() string { panic("unimplemented") } func (m *mockContainer) SetToptierLimit(int64) { panic("unimplemented") } func (m *mockContainer) GetToptierLimit() int64 { panic("unimplemented") } func (m *mockContainer) SetPageMigration(*cache.PageMigrate) { return } func (m *mockContainer) GetPageMigration() *cache.PageMigrate { return nil } func (m *mockContainer) SetCRIRequest(req interface{}) error { panic("unimplemented") } func (m *mockContainer) GetCRIRequest() (interface{}, bool) { panic("unimplemented") } func (m *mockContainer) ClearCRIRequest() (interface{}, bool) { panic("unimplemented") } func (m *mockContainer) GetCRIEnvs() []*criv1.KeyValue { panic("unimplemented") } func (m *mockContainer) GetCRIMounts() []*criv1.Mount { panic("unimplemented") } func (m *mockContainer) GetCRIDevices() []*criv1.Device { panic("unimplemented") } func (m *mockContainer) GetPending() []string { panic("unimplemented") } func (m *mockContainer) HasPending(string) bool { panic("unimplemented") } func (m *mockContainer) ClearPending(string) { panic("unimplemented") } func (m *mockContainer) GetTag(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) SetTag(string, string) (string, bool) { panic("unimplemented") } func (m *mockContainer) DeleteTag(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) String() string { return "mockContainer" } func (m *mockContainer) Eval(string) interface{} { panic("unimplemented") } func (m *mockContainer) GetProcesses() ([]string, error) { panic("unimplemented") } func (m *mockContainer) GetTasks() ([]string, error) { panic("unimplemented") } func (m *mockContainer) GetCgroupDir() string { panic("unimplemented") } type mockPod struct { name string returnValueFotGetQOSClass v1.PodQOSClass returnValue1FotGetResmgrAnnotation string returnValue2FotGetResmgrAnnotation bool coldStartTimeout time.Duration coldStartContainerName string annotations map[string]string } func (m *mockPod) GetInitContainers() []cache.Container { panic("unimplemented") } func (m *mockPod) GetContainers() []cache.Container { panic("unimplemented") } func (m *mockPod) GetContainer(string) (cache.Container, bool) { panic("unimplemented") } func (m *mockPod) GetID() string { panic("unimplemented") } func (m *mockPod) GetUID() string { panic("unimplemented") } func (m *mockPod) GetName() string { return m.name } func (m *mockPod) GetNamespace() string { panic("unimplemented") } func (m *mockPod) GetState() cache.PodState { panic("unimplemented") } func (m *mockPod) GetQOSClass() v1.PodQOSClass { return m.returnValueFotGetQOSClass } func (m *mockPod) GetLabelKeys() []string { panic("unimplemented") } func (m *mockPod) GetLabel(string) (string, bool) { panic("unimplemented") } func (m *mockPod) GetResmgrLabelKeys() []string { panic("unimplemented") } func (m *mockPod) GetResmgrLabel(string) (string, bool) { panic("unimplemented") } func (m *mockPod) GetAnnotationKeys() []string { panic("unimplemented") } func (m *mockPod) 
GetAnnotation(string) (string, bool) { panic("unimplemented") } func (m *mockPod) GetAnnotationObject(string, interface{}, func([]byte, interface{}) error) (bool, error) { panic("unimplemented") } func (m *mockPod) GetResmgrAnnotationKeys() []string { panic("unimplemented") } func (m *mockPod) GetResmgrAnnotation(key string) (string, bool) { if key == keyColdStartPreference && len(m.coldStartContainerName) > 0 { return m.coldStartContainerName + ": { duration: " + m.coldStartTimeout.String() + " }", true } return m.returnValue1FotGetResmgrAnnotation, m.returnValue2FotGetResmgrAnnotation } func (m *mockPod) GetResmgrAnnotationObject(string, interface{}, func([]byte, interface{}) error) (bool, error) { panic("unimplemented") } func (m *mockPod) GetEffectiveAnnotation(key, container string) (string, bool) { if v, ok := m.annotations[key+"/container."+container]; ok { return v, true } if v, ok := m.annotations[key+"/pod"]; ok { return v, true } v, ok := m.annotations[key] return v, ok } func (m *mockPod) GetCgroupParentDir() string { panic("unimplemented") } func (m *mockPod) GetPodResourceRequirements() cache.PodResourceRequirements { panic("unimplemented") } func (m *mockPod) GetContainerAffinity(string) ([]*cache.Affinity, error) { panic("unimplemented") } func (m *mockPod) ScopeExpression() *resmgr.Expression { panic("unimplemented") } func (m *mockPod) String() string { return "mockPod" } func (m *mockPod) Eval(string) interface{} { panic("unimplemented") } func (m *mockPod) GetProcesses(bool) ([]string, error) { panic("unimplemented") } func (m *mockPod) GetTasks(bool) ([]string, error) { panic("unimplemented") } type mockCache struct { returnValueForGetPolicyEntry bool returnValue1ForLookupContainer cache.Container returnValue2ForLookupContainer bool } func (m *mockCache) InsertPod(string, interface{}, *cache.PodStatus) (cache.Pod, error) { panic("unimplemented") } func (m *mockCache) DeletePod(string) cache.Pod { panic("unimplemented") } func (m *mockCache) LookupPod(string) (cache.Pod, bool) { panic("unimplemented") } func (m *mockCache) InsertContainer(interface{}) (cache.Container, error) { panic("unimplemented") } func (m *mockCache) UpdateContainerID(string, interface{}) (cache.Container, error) { panic("unimplemented") } func (m *mockCache) DeleteContainer(string) cache.Container { panic("unimplemented") } func (m *mockCache) LookupContainer(string) (cache.Container, bool) { return m.returnValue1ForLookupContainer, m.returnValue2ForLookupContainer } func (m *mockCache) LookupContainerByCgroup(path string) (cache.Container, bool) { panic("unimplemented") } func (m *mockCache) GetPendingContainers() []cache.Container { panic("unimplemented") } func (m *mockCache) GetPods() []cache.Pod { panic("unimplemented") } func (m *mockCache) GetContainers() []cache.Container { panic("unimplemented") } func (m *mockCache) GetContainerCacheIds() []string { panic("unimplemented") } func (m *mockCache) GetContainerIds() []string { panic("unimplemented") } func (m *mockCache) FilterScope(*resmgr.Expression) []cache.Container { panic("unimplemented") } func (m *mockCache) EvaluateAffinity(*cache.Affinity) map[string]int32 { return map[string]int32{ "fake key": 1, } } func (m *mockCache) AddImplicitAffinities(map[string]cache.ImplicitAffinity) error { return nil } func (m *mockCache) GetActivePolicy() string { panic("unimplemented") } func (m *mockCache) SetActivePolicy(string) error { panic("unimplemented") } func (m *mockCache) ResetActivePolicy() error { panic("unimplemented") } func (m 
*mockCache) SetPolicyEntry(string, interface{}) {
}
func (m *mockCache) GetPolicyEntry(string, interface{}) bool {
	return m.returnValueForGetPolicyEntry
}
func (m *mockCache) SetConfig(*config.RawConfig) error {
	panic("unimplemented")
}
func (m *mockCache) GetConfig() *config.RawConfig {
	panic("unimplemented")
}
func (m *mockCache) ResetConfig() error {
	panic("unimplemented")
}
func (m *mockCache) SetAdjustment(*config.Adjustment) (bool, map[string]error) {
	panic("unimplemented")
}
func (m *mockCache) Save() error {
	return nil
}
func (m *mockCache) RefreshPods(*criv1.ListPodSandboxResponse, map[string]*cache.PodStatus) ([]cache.Pod, []cache.Pod, []cache.Container) {
	panic("unimplemented")
}
func (m *mockCache) RefreshContainers(*criv1.ListContainersResponse) ([]cache.Container, []cache.Container) {
	panic("unimplemented")
}
func (m *mockCache) ContainerDirectory(string) string {
	panic("unimplemented")
}
func (m *mockCache) OpenFile(string, string, os.FileMode) (*os.File, error) {
	panic("unimplemented")
}
func (m *mockCache) WriteFile(string, string, os.FileMode, []byte) error {
	panic("unimplemented")
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/node.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
	"fmt"

	system "github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/topology"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	idset "github.com/intel/goresctrl/pkg/utils"
)

//
// Nodes (currently) correspond to some tangible entity in the hardware topology
// hierarchy: the full machine (a virtual root in multi-socket systems), individual
// sockets, and NUMA nodes. These nodes are linked into a tree resembling the topology
// tree, with the full machine at the top and CPU cores at the bottom. In a single
// socket system, the virtual root is replaced with the single socket. In a single
// NUMA node case, the single node is omitted. Also, CPU cores are not modelled as
// nodes; instead they are properties of the nodes (as capacity and free CPU).
//

// NodeKind represents a unique node type.
type NodeKind string

const (
	// NilNode is the kind of a nil node.
	NilNode NodeKind = ""
	// UnknownNode is the kind of a node of unknown type.
	UnknownNode NodeKind = "unknown"
	// SocketNode represents a physical CPU package/socket in the system.
	SocketNode NodeKind = "socket"
	// DieNode represents a die within a physical CPU package/socket in the system.
	DieNode NodeKind = "die"
	// NumaNode represents a NUMA node in the system.
	NumaNode NodeKind = "numa node"
	// VirtualNode represents a virtual node, currently the root of multi-socket setups.
	VirtualNode NodeKind = "virtual node"
)

const (
	// OverfitPenalty is the per-layer penalty for overfitting in the node tree.
	OverfitPenalty = 0.9
)
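The comment above describes the pool-node tree. As a rough standalone sketch (types invented here, not from the repository), a virtual root over two sockets with two NUMA nodes each can be modelled and walked the same children-first way node.DepthFirst does further below.

package main

import "fmt"

// poolNode is a simplified stand-in for the policy's pool-node tree.
type poolNode struct {
	name     string
	children []*poolNode
}

// depthFirst visits children first, then the node itself, matching node.DepthFirst.
func (n *poolNode) depthFirst(fn func(*poolNode)) {
	for _, c := range n.children {
		c.depthFirst(fn)
	}
	fn(n)
}

func main() {
	numa := func(id int) *poolNode { return &poolNode{name: fmt.Sprintf("NUMA node #%d", id)} }
	root := &poolNode{
		name: "root",
		children: []*poolNode{
			{name: "socket #0", children: []*poolNode{numa(0), numa(1)}},
			{name: "socket #1", children: []*poolNode{numa(2), numa(3)}},
		},
	}
	// NUMA nodes print before their socket, sockets before the root
	root.depthFirst(func(n *poolNode) { fmt.Println(n.name) })
}

// Node is the abstract interface our partition tree nodes implement.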
type Node interface {
	// IsNil tests if this node is nil.
	IsNil() bool
	// Name returns the name of this node.
	Name() string
	// Kind returns the type of this node.
	Kind() NodeKind
	// NodeID returns the (enumerated) node id of this node.
	NodeID() int
	// Parent returns the parent node of this node.
	Parent() Node
	// Children returns the child nodes of this node.
	Children() []Node
	// LinkParent sets the given node as the parent node, and appends this node as its child.
	LinkParent(Node)
	// AddChildren appends the nodes to the children, *WITHOUT* updating their parents.
	AddChildren([]Node)
	// IsSameNode returns true if the given node is the same as this one.
	IsSameNode(Node) bool
	// IsRootNode returns true if this node has no parent.
	IsRootNode() bool
	// IsLeafNode returns true if this node has no children.
	IsLeafNode() bool
	// Get the distance of this node from the root node.
	RootDistance() int
	// Get the height of this node (inverse of depth: tree depth - node depth).
	NodeHeight() int
	// System returns the policy sysfs instance.
	System() system.System
	// Policy returns the policy back pointer.
	Policy() *policy
	// DiscoverSupply discovers the resource supply assigned to this node.
	DiscoverSupply(assignedNUMANodes []idset.ID) Supply
	// GetSupply returns the full CPU supply at this node.
	GetSupply() Supply
	// FreeSupply returns the available CPU supply of this node.
	FreeSupply() Supply
	// GrantedReservedCPU returns the amount of granted reserved CPU of this node and its children.
	GrantedReservedCPU() int
	// GrantedSharedCPU returns the amount of granted shared CPU of this node and its children.
	GrantedSharedCPU() int
	// GetMemset returns the set of memory of the given type attached to this node.
	GetMemset(mtype memoryType) idset.IDSet
	// AssignNUMANodes assigns the given set of NUMA nodes to this one.
	AssignNUMANodes(ids []idset.ID)
	// DepthFirst traverses the tree at this node, calling the function at each node.
	DepthFirst(func(Node) error) error
	// BreadthFirst traverses the tree at this node, calling the function at each node.
	BreadthFirst(func(Node) error) error
	// Dump state of the node.
	Dump(string, ...int)
	// Dump type-specific state of the node.
	dump(string, ...int)

	GetMemoryType() memoryType
	HasMemoryType(memoryType) bool
	GetPhysicalNodeIDs() []idset.ID

	GetScore(Request) Score
	HintScore(topology.Hint) float64
}

// node represents data common to all node types.
type node struct {
	policy   *policy     // policy back pointer
	self     nodeself    // upcasted/type-specific interface
	name     string      // node name
	id       int         // node id
	kind     NodeKind    // node type
	depth    int         // node depth in the tree
	parent   Node        // parent node
	children []Node      // child nodes
	noderes  Supply      // CPU and memory available at this node
	freeres  Supply      // CPU and memory allocatable at this node
	mem      idset.IDSet // controllers with normal DRAM attached
	pMem     idset.IDSet // controllers with PMEM attached
	hbm      idset.IDSet // controllers with HBM attached
}

// nodeself is used to 'upcast' a generic Node interface to a type-specific one.
type nodeself struct {
	node Node
}

// socketnode represents a physical CPU package/socket in the system.
type socketnode struct {
	node                     // common node data
	id     idset.ID          // socket id
	syspkg system.CPUPackage // corresponding system.Package
}

// dienode represents a die within a physical CPU package/socket in the system.
type dienode struct {
	node                     // common node data
	id     idset.ID          // die id within socket
	syspkg system.CPUPackage // corresponding system.Package
}
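The nodeself field above lets the embedded base type dispatch back to the concrete node type's methods. A minimal standalone sketch of the same idiom, with invented types, looks like this; the constructors later in this file wire the link the same way via n.self.node = n.

package main

import "fmt"

type animal interface{ Sound() string }

// base plays the role of node: it holds a 'self' reference back to the
// concrete type so shared methods can reach type-specific behavior.
type base struct {
	self animal
}

func (b *base) Describe() string { return "it says " + b.self.Sound() }

type dog struct{ base }

func (d *dog) Sound() string { return "woof" }

func main() {
	d := &dog{}
	d.self = d // link the upcast, as the New*Node constructors do
	fmt.Println(d.Describe()) // "it says woof"
}

// numanode represents a NUMA node in the system.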
type numanode struct {
	node                // common node data
	id      idset.ID    // NUMA node system id
	sysnode system.Node // corresponding system.Node
}

// virtualnode represents a virtual node (ATM only the root in a multi-socket system).
type virtualnode struct {
	node // common node data
}

// special node instance to represent a nonexistent node
var nilnode Node = &node{
	name:     "",
	id:       -1,
	kind:     NilNode,
	depth:    -1,
	children: nil,
}

// init initializes the node with common node data.
func (n *node) init(p *policy, name string, kind NodeKind, parent Node) {
	n.policy = p
	n.name = name
	n.kind = kind
	n.parent = parent
	n.id = -1
	n.LinkParent(parent)
	n.mem = idset.NewIDSet()
	n.pMem = idset.NewIDSet()
	n.hbm = idset.NewIDSet()
}

// IsNil tests if this node is nil.
func (n *node) IsNil() bool {
	return n.kind == NilNode
}

// Name returns the name of this node.
func (n *node) Name() string {
	if n.IsNil() {
		return ""
	}
	return n.name
}

// Kind returns the kind of this node.
func (n *node) Kind() NodeKind {
	return n.kind
}

// NodeID returns the node id of this node.
func (n *node) NodeID() int {
	if n.IsNil() {
		return -1
	}
	return n.id
}

// IsSameNode checks if the given node is the same as this one.
func (n *node) IsSameNode(other Node) bool {
	return n.NodeID() == other.NodeID()
}

// IsRootNode returns true if this node has no parent.
func (n *node) IsRootNode() bool {
	return n.parent.IsNil()
}

// IsLeafNode returns true if this node has no children.
func (n *node) IsLeafNode() bool {
	return len(n.children) == 0
}

// RootDistance returns the distance of this node from the root node.
func (n *node) RootDistance() int {
	if n.IsNil() {
		return -1
	}
	return n.depth
}

// NodeHeight returns the height of this node (tree depth - node depth).
func (n *node) NodeHeight() int {
	if n.IsNil() {
		return -1
	}
	return n.policy.depth - n.depth
}

// Parent returns the parent of this node.
func (n *node) Parent() Node {
	if n.IsNil() {
		return nil
	}
	return n.parent
}

// Children returns the children of this node.
func (n *node) Children() []Node {
	if n.IsNil() {
		return nil
	}
	return n.children
}

// LinkParent sets the given node as the node parent and appends this node to the parent's children.
func (n *node) LinkParent(parent Node) {
	n.parent = parent
	if !parent.IsNil() {
		parent.AddChildren([]Node{n})
	}
	n.depth = parent.RootDistance() + 1
}

// AddChildren appends the nodes to the children, *WITHOUT* setting their parent.
func (n *node) AddChildren(nodes []Node) {
	n.children = append(n.children, nodes...)
}

// Dump information/state of the node.
func (n *node) Dump(prefix string, level ...int) {
	if !log.DebugEnabled() {
		return
	}

	lvl := 0
	if len(level) > 0 {
		lvl = level[0]
	}
	idt := indent(prefix, lvl)
	n.self.node.dump(prefix, lvl)
	log.Debug("%s - %s", idt, n.noderes.DumpCapacity())
	log.Debug("%s - %s", idt, n.freeres.DumpAllocatable())
	n.freeres.DumpMemoryState(idt + " ")
	if n.mem.Size() > 0 {
		log.Debug("%s - normal memory: %v", idt, n.mem)
	}
	if n.hbm.Size() > 0 {
		log.Debug("%s - HBM memory: %v", idt, n.hbm)
	}
	if n.pMem.Size() > 0 {
		log.Debug("%s - PMEM memory: %v", idt, n.pMem)
	}
	for _, grant := range n.policy.allocations.grants {
		cpuNodeID := grant.GetCPUNode().NodeID()
		memNodeID := grant.GetMemoryNode().NodeID()
		switch {
		case cpuNodeID == n.id && memNodeID == n.id:
			log.Debug("%s + cpu+mem %s", idt, grant)
		case cpuNodeID == n.id:
			log.Debug("%s + cpuonly %s", idt, grant)
		case memNodeID == n.id:
			log.Debug("%s + memonly %s", idt, grant)
		}
	}
	if !n.Parent().IsNil() {
		log.Debug("%s - parent: <%s>", idt, n.Parent().Name())
	}
	if len(n.children) > 0 {
		log.Debug("%s - children:", idt)
		for _, c := range n.children {
			c.Dump(prefix, lvl+1)
		}
	}
}

// Dump type-specific information about the node.
func (n *node) dump(prefix string, level ...int) {
	n.self.node.dump(prefix, level...)
}

// Do a depth-first traversal starting at node calling the given function at each node.
func (n *node) DepthFirst(fn func(Node) error) error {
	for _, c := range n.children {
		if err := c.DepthFirst(fn); err != nil {
			return err
		}
	}
	return fn(n)
}

// Do a breadth-first traversal starting at node calling the given function at each node.
func (n *node) BreadthFirst(fn func(Node) error) error {
	if err := fn(n); err != nil {
		return err
	}
	for _, c := range n.children {
		if err := c.BreadthFirst(fn); err != nil {
			return err
		}
	}
	return nil
}

// System returns the policy System instance.
func (n *node) System() system.System {
	return n.policy.sys
}

// Policy returns the policy back pointer.
func (n *node) Policy() *policy {
	return n.policy
}

// GetSupply returns the full CPU supply of this node.
func (n *node) GetSupply() Supply {
	return n.self.node.GetSupply()
}

// Discover CPU available at this node.
func (n *node) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.self.node.DiscoverSupply(assignedNUMANodes)
}

// discoverSupply discovers the resource supply assigned to this pool node.
func (n *node) discoverSupply(assignedNUMANodes []idset.ID) Supply {
	if n.noderes != nil {
		return n.noderes.Clone()
	}

	if !n.IsLeafNode() {
		log.Debug("%s: cumulating child resources...", n.Name())
		if len(assignedNUMANodes) > 0 {
			log.Fatal("invalid pool setup: trying to attach NUMA nodes to non-leaf node %s", n.Name())
		}
		n.noderes = newSupply(n, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, nil, nil)
		for _, c := range n.children {
			supply := c.GetSupply()
			n.noderes.Cumulate(supply)
			n.mem.Add(c.GetMemset(memoryDRAM).Members()...)
			n.hbm.Add(c.GetMemset(memoryHBM).Members()...)
			n.pMem.Add(c.GetMemset(memoryPMEM).Members()...)
log.Debug(" + %s", supply.DumpCapacity()) } log.Debug(" = %s", n.noderes.DumpCapacity()) } else { log.Debug("%s: discovering attached/assigned resources...", n.Name()) mmap := createMemoryMap(0, 0, 0) cpus := cpuset.New() for _, nodeID := range assignedNUMANodes { node := n.System().Node(nodeID) nodeCPUs := node.CPUSet() meminfo, err := node.MemoryInfo() if err != nil { log.Fatal("%s: failed to get memory info for NUMA node #%d", n.Name(), nodeID) } switch node.GetMemoryType() { case system.MemoryTypeDRAM: n.mem.Add(nodeID) mmap.AddDRAM(meminfo.MemTotal) shortCPUs := cpuset.ShortCPUSet(nodeCPUs) log.Debug(" + assigned DRAM NUMA node #%d (cpuset: %s, DRAM %.2fM)", nodeID, shortCPUs, float64(meminfo.MemTotal)/float64(1024*1024)) case system.MemoryTypePMEM: n.pMem.Add(nodeID) mmap.AddPMEM(meminfo.MemTotal) log.Debug(" + assigned PMEM NUMA node #%d (DRAM %.2fM)", nodeID, float64(meminfo.MemTotal)/float64(1024*1024)) case system.MemoryTypeHBM: n.hbm.Add(nodeID) mmap.AddHBM(meminfo.MemTotal) log.Debug(" + assigned HBMEM NUMA node #%d (DRAM %.2fM)", nodeID, float64(meminfo.MemTotal)/float64(1024*1024)) default: log.Fatal("NUMA node #%d with unknown memory type %v", node.GetMemoryType()) } allowed := nodeCPUs.Intersection(n.policy.allowed) isolated := allowed.Intersection(n.policy.isolated) reserved := allowed.Intersection(n.policy.reserved).Difference(isolated) sharable := allowed.Difference(isolated).Difference(reserved) if !reserved.IsEmpty() { log.Debug(" allowed reserved CPUs: %s", cpuset.ShortCPUSet(reserved)) } if !sharable.IsEmpty() { log.Debug(" allowed sharable CPUs: %s", cpuset.ShortCPUSet(sharable)) } if !isolated.IsEmpty() { log.Debug(" allowed isolated CPUs: %s", cpuset.ShortCPUSet(isolated)) } cpus = cpus.Union(allowed) } isolated := cpus.Intersection(n.policy.isolated) reserved := cpus.Intersection(n.policy.reserved).Difference(isolated) sharable := cpus.Difference(isolated).Difference(reserved) n.noderes = newSupply(n, isolated, reserved, sharable, 0, 0, mmap, nil) log.Debug(" = %s", n.noderes.DumpCapacity()) } n.freeres = n.noderes.Clone() return n.noderes.Clone() } // FreeSupply returns the available CPU supply of this node. func (n *node) FreeSupply() Supply { return n.freeres } // Get the set of memory attached to this node. func (n *node) GetMemset(mtype memoryType) idset.IDSet { if n.self.node == nil { // protect against &node{}-abuse by test cases... return idset.NewIDSet() } return n.self.node.GetMemset(mtype) } // AssignNUMANodes assigns the given set of NUMA nodes to this one. func (n *node) AssignNUMANodes(ids []idset.ID) { n.self.node.AssignNUMANodes(ids) } // assignNUMANodes assigns the given set of NUMA nodes to this one. 
func (n *node) assignNUMANodes(ids []idset.ID) {
	mem := createMemoryMap(0, 0, 0)
	for _, numaNodeID := range ids {
		if n.mem.Has(numaNodeID) || n.pMem.Has(numaNodeID) || n.hbm.Has(numaNodeID) {
			log.Warn("*** NUMA node #%d already discovered by or assigned to %s", numaNodeID, n.Name())
			continue
		}
		numaNode := n.policy.sys.Node(numaNodeID)
		memTotal := uint64(0)
		if meminfo, err := numaNode.MemoryInfo(); err != nil {
			log.Error("%s: failed to get memory info for NUMA node #%d", n.Name(), numaNodeID)
		} else {
			memTotal = meminfo.MemTotal
		}
		switch numaNode.GetMemoryType() {
		case system.MemoryTypeDRAM:
			mem.Add(memTotal, 0, 0)
			n.mem.Add(numaNodeID)
			log.Info("*** DRAM NUMA node #%d assigned to pool node %q", numaNodeID, n.Name())
		case system.MemoryTypePMEM:
			n.pMem.Add(numaNodeID)
			mem.Add(0, memTotal, 0)
			log.Info("*** PMEM NUMA node #%d assigned to pool node %q", numaNodeID, n.Name())
		case system.MemoryTypeHBM:
			n.hbm.Add(numaNodeID)
			mem.Add(0, 0, memTotal)
			log.Info("*** HBM NUMA node #%d assigned to pool node %q", numaNodeID, n.Name())
		default:
			log.Fatal("can't assign NUMA node #%d of type %v to pool node %q",
				numaNodeID, numaNode.GetMemoryType(), n.Name())
		}
	}
	n.noderes.AssignMemory(mem)
	n.freeres.AssignMemory(mem)
}

// GetPhysicalNodeIDs returns the IDs of the physical nodes this node covers.
func (n *node) GetPhysicalNodeIDs() []idset.ID {
	return n.self.node.GetPhysicalNodeIDs()
}

// GrantedReservedCPU returns the amount of granted reserved CPU of this node and its children.
func (n *node) GrantedReservedCPU() int {
	grantedReserved := n.freeres.GrantedReserved()
	for _, c := range n.children {
		grantedReserved += c.GrantedReservedCPU()
	}
	return grantedReserved
}

// GrantedSharedCPU returns the amount of granted shared CPU of this node and its children.
func (n *node) GrantedSharedCPU() int {
	grantedShared := n.freeres.GrantedShared()
	for _, c := range n.children {
		grantedShared += c.GrantedSharedCPU()
	}
	return grantedShared
}

// GetScore calculates the score of this node for a CPU request.
func (n *node) GetScore(req Request) Score {
	f := n.FreeSupply()
	return f.GetScore(req)
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *node) HintScore(hint topology.Hint) float64 {
	return n.self.node.HintScore(hint)
}

func (n *node) GetMemoryType() memoryType {
	var memoryMask memoryType = 0x0
	if n.pMem.Size() > 0 {
		memoryMask |= memoryPMEM
	}
	if n.mem.Size() > 0 {
		memoryMask |= memoryDRAM
	}
	if n.hbm.Size() > 0 {
		memoryMask |= memoryHBM
	}
	return memoryMask
}

func (n *node) HasMemoryType(reqType memoryType) bool {
	nodeType := n.GetMemoryType()
	return (nodeType & reqType) == reqType
}

// NewNumaNode creates a node for a NUMA node.
func (p *policy) NewNumaNode(id idset.ID, parent Node) Node {
	n := &numanode{}
	n.self.node = n
	n.node.init(p, fmt.Sprintf("NUMA node #%v", id), NumaNode, parent)
	n.id = id
	n.sysnode = p.sys.Node(id)
	return n
}

// Dump (the NUMA-specific parts of) this node.
func (n *numanode) dump(prefix string, level ...int) {
	log.Debug("%s<NUMA node #%v>", indent(prefix, level...), n.id)
}

// Get CPU supply available at this node.
func (n *numanode) GetSupply() Supply {
	return n.noderes.Clone()
}

func (n *numanode) GetPhysicalNodeIDs() []idset.ID {
	return []idset.ID{n.id}
}

// DiscoverSupply discovers the CPU supply available at this node.
func (n *numanode) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.node.discoverSupply(assignedNUMANodes)
}
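Since the memoryType values are bit flags, a node's memory type above is a mask and HasMemoryType requires every requested bit to be present. A small standalone sketch of the same checks, with the flag names shortened to avoid clashing with the real constants:

package main

import "fmt"

const (
	dram = 1 << iota
	pmem
	hbm
)

// hasMemoryType performs the same test as node.HasMemoryType: every
// requested bit must be set in the node's memory type mask.
func hasMemoryType(nodeType, reqType int) bool {
	return nodeType&reqType == reqType
}

func main() {
	nodeType := dram | pmem
	fmt.Println(hasMemoryType(nodeType, pmem))          // true
	fmt.Println(hasMemoryType(nodeType, dram|hbm))      // false: the node has no HBM
}

// GetMemset returns the set of memory attached to this node.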
func (n *numanode) GetMemset(mtype memoryType) idset.IDSet {
	mset := idset.NewIDSet()
	if mtype&memoryDRAM != 0 {
		mset.Add(n.mem.Members()...)
	}
	if mtype&memoryHBM != 0 {
		mset.Add(n.hbm.Members()...)
	}
	if mtype&memoryPMEM != 0 {
		mset.Add(n.pMem.Members()...)
	}
	return mset
}

// AssignNUMANodes assigns the given NUMA nodes to this one.
func (n *numanode) AssignNUMANodes(ids []idset.ID) {
	n.node.assignNUMANodes(ids)
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *numanode) HintScore(hint topology.Hint) float64 {
	switch {
	case hint.CPUs != "":
		return cpuHintScore(hint, n.sysnode.CPUSet())
	case hint.NUMAs != "":
		return numaHintScore(hint, n.id)
	case hint.Sockets != "":
		pkgID := n.sysnode.PackageID()
		score := socketHintScore(hint, n.sysnode.PackageID())
		if score > 0.0 {
			// penalize underfit reciprocally (inverse-proportionally) to the socket size
			score /= float64(len(n.System().Package(pkgID).NodeIDs()))
		}
		return score
	}
	return 0.0
}

// NewDieNode creates a node for a CPU die.
func (p *policy) NewDieNode(id idset.ID, parent Node) Node {
	pkg := parent.(*socketnode)
	n := &dienode{}
	n.self.node = n
	n.node.init(p, fmt.Sprintf("die #%v/%v", pkg.id, id), DieNode, parent)
	n.id = id
	n.syspkg = p.sys.Package(pkg.id)
	return n
}

// Dump (the die-specific parts of) this node.
func (n *dienode) dump(prefix string, level ...int) {
	log.Debug("%s<die #%v/%v>", indent(prefix, level...), n.syspkg.ID(), n.id)
}

// Get CPU supply available at this node.
func (n *dienode) GetSupply() Supply {
	return n.noderes.Clone()
}

func (n *dienode) GetPhysicalNodeIDs() []idset.ID {
	ids := make([]idset.ID, 0)
	ids = append(ids, n.id)
	for _, c := range n.children {
		cIds := c.GetPhysicalNodeIDs()
		ids = append(ids, cIds...)
	}
	return ids
}

// DiscoverSupply discovers the CPU supply available at this die.
func (n *dienode) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.node.discoverSupply(assignedNUMANodes)
}

// GetMemset returns the set of memory attached to this die.
func (n *dienode) GetMemset(mtype memoryType) idset.IDSet {
	mset := idset.NewIDSet()
	if mtype&memoryDRAM != 0 {
		mset.Add(n.mem.Members()...)
	}
	if mtype&memoryHBM != 0 {
		mset.Add(n.hbm.Members()...)
	}
	if mtype&memoryPMEM != 0 {
		mset.Add(n.pMem.Members()...)
	}
	return mset
}

// AssignNUMANodes assigns the given NUMA nodes to this one.
func (n *dienode) AssignNUMANodes(ids []idset.ID) {
	n.node.assignNUMANodes(ids)
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *dienode) HintScore(hint topology.Hint) float64 {
	switch {
	case hint.CPUs != "":
		return cpuHintScore(hint, n.syspkg.CPUSet())
	case hint.NUMAs != "":
		return OverfitPenalty * dieHintScore(hint, n.id, n.syspkg)
	case hint.Sockets != "":
		score := socketHintScore(hint, n.syspkg.ID())
		if score > 0.0 {
			// penalize underfit reciprocally (inverse-proportionally) to the socket size in dies
			score /= float64(len(n.syspkg.DieNodeIDs(n.id)))
		}
		return score
	}
	return 0.0
}

// NewSocketNode creates a node for a CPU socket.
func (p *policy) NewSocketNode(id idset.ID, parent Node) Node {
	n := &socketnode{}
	n.self.node = n
	n.node.init(p, fmt.Sprintf("socket #%v", id), SocketNode, parent)
	n.id = id
	n.syspkg = p.sys.Package(id)
	return n
}

// Dump (the socket-specific parts of) this node.
func (n *socketnode) dump(prefix string, level ...int) {
	log.Debug("%s<socket #%v>", indent(prefix, level...), n.id)
}
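As the HintScore implementations show, a hint is damped by OverfitPenalty once per tree layer above the hinted granularity: a NUMA hint scores 1.0 on the matching NUMA node, OverfitPenalty on its socket, and OverfitPenalty squared at the virtual root. A tiny standalone sketch of that decay:

package main

import "fmt"

const overfitPenalty = 0.9 // mirrors OverfitPenalty above

func main() {
	score := 1.0
	for _, level := range []string{"NUMA node", "socket", "virtual root"} {
		fmt.Printf("%-12s %.2f\n", level, score) // 1.00, 0.90, 0.81
		score *= overfitPenalty
	}
}

// Get CPU supply available at this node.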
func (n *socketnode) GetSupply() Supply {
	return n.noderes.Clone()
}

func (n *socketnode) GetPhysicalNodeIDs() []idset.ID {
	ids := make([]idset.ID, 0)
	ids = append(ids, n.id)
	for _, c := range n.children {
		cIds := c.GetPhysicalNodeIDs()
		ids = append(ids, cIds...)
	}
	return ids
}

// DiscoverSupply discovers the CPU supply available at this socket.
func (n *socketnode) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.node.discoverSupply(assignedNUMANodes)
}

// GetMemset returns the set of memory attached to this socket.
func (n *socketnode) GetMemset(mtype memoryType) idset.IDSet {
	mset := idset.NewIDSet()
	if mtype&memoryDRAM != 0 {
		mset.Add(n.mem.Members()...)
	}
	if mtype&memoryHBM != 0 {
		mset.Add(n.hbm.Members()...)
	}
	if mtype&memoryPMEM != 0 {
		mset.Add(n.pMem.Members()...)
	}
	return mset
}

// AssignNUMANodes assigns the given NUMA nodes to this one.
func (n *socketnode) AssignNUMANodes(ids []idset.ID) {
	n.node.assignNUMANodes(ids)
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *socketnode) HintScore(hint topology.Hint) float64 {
	switch {
	case hint.CPUs != "":
		return cpuHintScore(hint, n.syspkg.CPUSet())
	case hint.NUMAs != "":
		return OverfitPenalty * numaHintScore(hint, n.syspkg.NodeIDs()...)
	case hint.Sockets != "":
		return socketHintScore(hint, n.id)
	}
	return 0.0
}

// NewVirtualNode creates a new virtual node.
func (p *policy) NewVirtualNode(name string, parent Node) Node {
	n := &virtualnode{}
	n.self.node = n
	n.node.init(p, name, VirtualNode, parent)
	return n
}

// Dump (the virtual-node specific parts of) this node.
func (n *virtualnode) dump(prefix string, level ...int) {
	log.Debug("%s<virtual node %s>", indent(prefix, level...), n.name)
}

// Get CPU supply available at this node.
func (n *virtualnode) GetSupply() Supply {
	return n.noderes.Clone()
}

// DiscoverSupply discovers the CPU supply available at this node.
func (n *virtualnode) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.node.discoverSupply(assignedNUMANodes)
}

// GetMemset returns the set of memory attached to this node.
func (n *virtualnode) GetMemset(mtype memoryType) idset.IDSet {
	mset := idset.NewIDSet()
	if mtype&memoryDRAM != 0 {
		mset.Add(n.mem.Members()...)
	}
	if mtype&memoryHBM != 0 {
		mset.Add(n.hbm.Members()...)
	}
	if mtype&memoryPMEM != 0 {
		mset.Add(n.pMem.Members()...)
	}
	return mset
}

// AssignNUMANodes assigns the given NUMA nodes to this one.
func (n *virtualnode) AssignNUMANodes(ids []idset.ID) {
	log.Panic("cannot assign NUMA nodes #%s to %s", idset.NewIDSet(ids...).String(), n.Name())
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *virtualnode) HintScore(hint topology.Hint) float64 {
	// don't bother calculating any scores, the root should always score 1.0
	switch {
	case hint.CPUs != "":
		return cpuHintScore(hint, n.System().CPUSet())
	case hint.NUMAs != "":
		return OverfitPenalty * OverfitPenalty
	case hint.Sockets != "":
		return OverfitPenalty
	}
	return 0.0
}

func (n *virtualnode) GetPhysicalNodeIDs() []idset.ID {
	ids := make([]idset.ID, 0)
	for _, c := range n.children {
		cIds := c.GetPhysicalNodeIDs()
		ids = append(ids, cIds...)
	}
	return ids
}

// Finalize the setup of nilnode.
func init() {
	nilnode.(*node).self.node = nilnode
	nilnode.(*node).parent = nilnode.(*node).self.node
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/pod-preferences.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
	"encoding/json"
	"fmt"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"sigs.k8s.io/yaml"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes"
)

const (
	// annotation key for opting in to multiple isolated exclusive CPUs per container.
	keyIsolationPreference = "prefer-isolated-cpus"
	// annotation key for opting out of exclusive allocation and relaxed topology fitting.
	keySharedCPUPreference = "prefer-shared-cpus"
	// annotation key for the type of memory to allocate.
	keyMemoryTypePreference = "memory-type"
	// annotation key for the "cold start" preference of workloads.
	keyColdStartPreference = "cold-start"
	// annotation key for the reserved CPU preference.
	keyReservedCPUsPreference = "prefer-reserved-cpus"

	// effective annotation key for isolated CPU preference
	preferIsolatedCPUsKey = keyIsolationPreference + "." + kubernetes.ResmgrKeyNamespace
	// effective annotation key for shared CPU preference
	preferSharedCPUsKey = keySharedCPUPreference + "." + kubernetes.ResmgrKeyNamespace
	// effective annotation key for memory type preference
	preferMemoryTypeKey = keyMemoryTypePreference + "." + kubernetes.ResmgrKeyNamespace
	// effective annotation key for "cold start" preference
	preferColdStartKey = keyColdStartPreference + "." + kubernetes.ResmgrKeyNamespace
	// effective annotation key for reserved CPU preference
	preferReservedCPUsKey = keyReservedCPUsPreference + "." + kubernetes.ResmgrKeyNamespace
)

// cpuClass is a type of CPU to allocate
type cpuClass int

// names by cpu class
var cpuClassNames = map[cpuClass]string{
	cpuNormal:   "normal",
	cpuReserved: "reserved",
}

const (
	cpuNormal cpuClass = iota
	cpuReserved
)

// types by memory type name
var memoryNamedTypes = map[string]memoryType{
	"dram":  memoryDRAM,
	"pmem":  memoryPMEM,
	"hbm":   memoryHBM,
	"mixed": memoryAll,
}

// names by memory type
var memoryTypeNames = map[memoryType]string{
	memoryDRAM: "DRAM",
	memoryPMEM: "PMEM",
	memoryHBM:  "HBM",
}

// memoryType is a bitmask of the types of memory to allocate
type memoryType int

// memoryType bits
const (
	memoryUnspec memoryType = (0x1 << iota) >> 1
	memoryDRAM
	memoryPMEM
	memoryHBM
	memoryFirstUnusedBit

	memoryAll = memoryFirstUnusedBit - 1

	// type of memory to use if none specified
	defaultMemoryType = memoryAll
)
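The preference helpers below all resolve annotations through the pod's effective-annotation lookup, where a container-scoped key wins over a pod-scoped key, which wins over the bare key. A hedged standalone sketch of that resolution order (the namespaced key shown is hypothetical; the real suffix comes from kubernetes.ResmgrKeyNamespace):

package main

import "fmt"

// effectiveAnnotation mirrors the lookup order of GetEffectiveAnnotation:
// container-scoped, then pod-scoped, then the bare key.
func effectiveAnnotation(annotations map[string]string, key, container string) (string, bool) {
	for _, k := range []string{key + "/container." + container, key + "/pod", key} {
		if v, ok := annotations[k]; ok {
			return v, true
		}
	}
	return "", false
}

func main() {
	key := "prefer-isolated-cpus.example.invalid" // hypothetical namespaced key
	pod := map[string]string{
		key + "/pod":            "true",
		key + "/container.main": "false",
	}
	v, _ := effectiveAnnotation(pod, key, "main")
	fmt.Println(v) // "false": the container-scoped value takes precedence
}

// isolatedCPUsPreference returns whether isolated CPUs should be preferred for
// containers that allocate multiple CPUs, and if the container was explicitly
// annotated with this setting.
//
// If the effective annotations are not found, this function falls back to
// looking for the deprecated syntax by calling podIsolationPreference.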
func isolatedCPUsPreference(pod cache.Pod, container cache.Container) (bool, bool) { key := preferIsolatedCPUsKey value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) if !ok { return podIsolationPreference(pod, container) } preference, err := strconv.ParseBool(value) if err != nil { log.Error("invalid CPU isolation preference annotation (%q, %q): %v", key, value, err) return opt.PreferIsolated, false } log.Debug("%s: effective CPU isolation preference %v", container.PrettyName(), preference) return preference, true } // sharedCPUsPreference returns whether shared CPUs should be preferred for // containers otherwise eligible for exclusive allocation, and whether the // container was explicitly annotated with this setting. // // If the effective annotations are not found, this function falls back to // looking for the deprecated syntax by calling podSharedCPUPreference. func sharedCPUsPreference(pod cache.Pod, container cache.Container) (bool, bool) { key := preferSharedCPUsKey value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) if !ok { return podSharedCPUPreference(pod, container) } preference, err := strconv.ParseBool(value) if err != nil { log.Error("invalid shared CPU preference annotation (%q, %q): %v", key, value, err) return opt.PreferShared, false } log.Debug("%s: effective shared CPU preference %v", container.PrettyName(), preference) return preference, true } // memoryTypePreference returns what type of memory should be allocated for the container. // // If the effective annotations are not found, this function falls back to // looking for the deprecated syntax by calling podMemoryTypePreference. func memoryTypePreference(pod cache.Pod, container cache.Container) memoryType { key := preferMemoryTypeKey value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) if !ok { return podMemoryTypePreference(pod, container) } mtype, err := parseMemoryType(value) if err != nil { log.Error("invalid memory type preference (%q, %q): %v", key, value, err) return memoryUnspec } log.Debug("%s: effective memory type preference %v", container.PrettyName(), mtype) return mtype } // coldStartPreference figures out 'cold start' preferences for the container, IOW // if the container memory should be allocated for an initial 'cold start' period // from PMEM, and how long this initial period should be. // // If the effective annotations are not found, this function falls back to // looking for the deprecated syntax by calling podColdStartPreference. func coldStartPreference(pod cache.Pod, container cache.Container) (ColdStartPreference, error) { key := preferColdStartKey value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) if !ok { return podColdStartPreference(pod, container) } preference := ColdStartPreference{} if err := yaml.Unmarshal([]byte(value), &preference); err != nil { log.Error("failed to parse cold start preference (%q, %q): %v", keyColdStartPreference, value, err) return ColdStartPreference{}, policyError("invalid cold start preference %q: %v", value, err) } if preference.Duration < 0 || time.Duration(preference.Duration) > time.Hour { return ColdStartPreference{}, policyError("cold start duration %s out of range", preference.Duration.String()) } log.Debug("%s: effective cold start preference %v", container.PrettyName(), preference.Duration.String()) return preference, nil } // podIsolationPreference checks if a container explicitly prefers to run on multiple isolated CPUs.
// The first return value indicates whether the container is isolated or not. // The second return value indicates whether that decision was explicit (true) or implicit (false). func podIsolationPreference(pod cache.Pod, container cache.Container) (bool, bool) { key := keyIsolationPreference value, ok := pod.GetResmgrAnnotation(key) if !ok { return opt.PreferIsolated, false } log.Warn("WARNING: using deprecated annotation %q", key) log.Warn("WARNING: consider using instead") log.Warn("WARNING: %q, or", preferIsolatedCPUsKey+"/container."+container.GetName()) log.Warn("WARNING: %q", preferIsolatedCPUsKey+"/pod") if value == "false" || value == "true" { return (value[0] == 't'), true } preferences := map[string]bool{} if err := yaml.Unmarshal([]byte(value), &preferences); err != nil { log.Error("failed to parse isolation preference %s = '%s': %v", keyIsolationPreference, value, err) return opt.PreferIsolated, false } name := container.GetName() if pref, ok := preferences[name]; ok { log.Debug("%s per-container isolation preference '%v'", name, pref) return pref, true } log.Debug("%s defaults to isolation preference '%v'", name, opt.PreferIsolated) return opt.PreferIsolated, false } // podSharedCPUPreference checks if a container wants to opt-out from exclusive allocation. // The first return value indicates if the container prefers to opt-out from // exclusive (sliced-off or isolated) CPU allocation even if it was otherwise // eligible for it. func podSharedCPUPreference(pod cache.Pod, container cache.Container) (bool, bool) { key := keySharedCPUPreference value, ok := pod.GetResmgrAnnotation(key) if !ok { return opt.PreferShared, false } log.Warn("WARNING: using deprecated annotation %q", key) log.Warn("WARNING: consider using instead") log.Warn("WARNING: %q, or", preferSharedCPUsKey+"/container."+container.GetName()) log.Warn("WARNING: %q", preferSharedCPUsKey+"/pod") if value == "false" || value == "true" { return value[0] == 't', true } preferences := map[string]string{} if err := yaml.Unmarshal([]byte(value), &preferences); err != nil { log.Error("failed to parse shared CPU preference %s = '%s': %v", keySharedCPUPreference, value, err) return opt.PreferShared, false } name := container.GetName() pref, ok := preferences[name] if !ok { return opt.PreferShared, false } if pref == "false" || pref == "true" { return pref[0] == 't', true } log.Error("invalid shared CPU boolean preference for container %s: %s", name, pref) return opt.PreferShared, false } // ColdStartPreference lists the various ways the container can be configured to trigger // cold start. Currently, only timer is supported. If the "duration" is set to a duration // greater than 0, cold start is enabled and the DRAM controller is added to the container // after the duration has passed. type ColdStartPreference struct { Duration config.Duration // `json:"duration,omitempty"` } // podColdStartPreference figures out if the container memory should be first allocated from PMEM. // It returns the time (in milliseconds) after which DRAM controller should be added to the mix. 
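//
// Illustrative sketch of the deprecated syntax (the container name "cnt0"
// is hypothetical): the annotation value is a YAML map from container name
// to a preference object, roughly
//
//	cnt0:
//	  duration: 30s
//
// where the duration must be non-negative and at most one hour to pass the
// validation below.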
func podColdStartPreference(pod cache.Pod, container cache.Container) (ColdStartPreference, error) { key := keyColdStartPreference value, ok := pod.GetResmgrAnnotation(key) if !ok { return ColdStartPreference{}, nil } log.Warn("WARNING: using deprecated annotation %q", key) log.Warn("WARNING: consider using instead") log.Warn("WARNING: %q, or", preferColdStartKey+"/container."+container.GetName()) log.Warn("WARNING: %q", preferColdStartKey+"/pod") preferences := map[string]ColdStartPreference{} if err := yaml.Unmarshal([]byte(value), &preferences); err != nil { log.Error("failed to parse cold start preference %s = '%s': %v", key, value, err) return ColdStartPreference{}, err } name := container.GetName() preference, ok := preferences[name] if !ok { log.Debug("container %s has no entry among cold start preferences", container.PrettyName()) return ColdStartPreference{}, nil } if preference.Duration < 0 || time.Duration(preference.Duration) > time.Hour { // Duration can't be negative. We also reject durations which are longer than one hour. return ColdStartPreference{}, fmt.Errorf("failed to validate cold start timeout %s: value out of scope", preference.Duration.String()) } return preference, nil } func checkReservedPoolNamespaces(namespace string) bool { if namespace == metav1.NamespaceSystem { return true } for _, str := range opt.ReservedPoolNamespaces { ret, err := filepath.Match(str, namespace) if err != nil { return false } if ret { return true } } return false } func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) { hintSetting, ok := c.GetEffectiveAnnotation(preferReservedCPUsKey) if !ok { return false, false } preference, err := strconv.ParseBool(hintSetting) if err != nil { log.Error("failed to parse reserved CPU preference %s = '%s': %v", keyReservedCPUsPreference, hintSetting, err) return false, false } return preference, true } // cpuAllocationPreferences figures out the amount and kind of CPU to allocate. // Returned values: // 1. full: number of full CPUs // 2. fraction: amount of fractional CPU in milli-CPU // 3. isolate: (bool) whether to prefer isolated full CPUs // 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal) func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass) { // // CPU allocation preferences for a container consist of // // - the number of exclusive cores to allocate // - the amount of fractional cores to allocate (in milli-CPU) // - whether kernel-isolated cores are preferred for exclusive allocation // - cpu class IOW, whether reserved or normal cores should be allocated // // The rules for determining these preferences are: // // - reserved cores are only and always preferred for kube-system namespace containers // - kube-system namespace containers: // => fractional/shared (reserved) cores // - BestEffort QoS class containers: // => fractional/shared cores // - Burstable QoS class containers: // => fractional/shared cores // - Guaranteed QoS class containers: // - 1 full core > CPU request // => fractional/shared cores // - 1 full core <= CPU request < 2 full cores: // a. fractional allocation: // - shared preference explicitly annotated/configured false: // => mixed cores, prefer isolated, unless annotated/configured otherwise (*) // - shared preference explicitly annotated/configured true: // => shared cores // b. 
non-fractional allocation: // - shared preference explicitly annotated true: // => shared cores // - isolated default preference false or explicitly annotated false: // => exclusive cores // - isolated default preference true or explicitly annotated true: // => exclusive cores, prefer isolated (*) // - 2 full cores <= CPU request // a. fractional allocation: // - shared preference explicitly annotated false: // => mixed cores, prefer isolated only if explicitly annotated (**) // - otherwise (no shared annotation): // => shared cores // b. non-fractional allocation: // - shared preference explicitly annotated true: // => shared cores // - otherwise (no shared annotation): // => exclusive cores, prefer isolated only if explicitly annotated (**) // // - Rationale for isolation defaults: // *) // In the single core case, a workload does not need to do anything extra to // benefit from running on isolated vs. ordinary exclusive cores. Therefore, // allocating isolated cores is a safe default choice. // **) // In the multiple cores case, a workload needs to be 'isolation-aware' to // benefit (or actually to not even get hindered) by running on isolated vs. // ordinary exclusive cores. If it gets isolated cores allocated, it needs // to actively spread itself/its correct processes over the cores, because // the scheduler is not going to do load-balancing for it. Therefore, the // safe choice in this case is to not allocate isolated cores by default. // namespace := container.GetNamespace() request := container.GetResourceRequirements().Requests[corev1.ResourceCPU] qosClass := pod.GetQOSClass() fraction := int(request.MilliValue()) // easy cases: kube-system namespace, Burstable or BestEffort QoS class containers preferReserved, explicitReservation := checkReservedCPUsAnnotations(container) switch { case preferReserved == true: return 0, fraction, false, cpuReserved case checkReservedPoolNamespaces(namespace) && !explicitReservation: return 0, fraction, false, cpuReserved case qosClass == corev1.PodQOSBurstable: return 0, fraction, false, cpuNormal case qosClass == corev1.PodQOSBestEffort: return 0, 0, false, cpuNormal } // complex case: Guaranteed QoS class containers cores := fraction / 1000 fraction = fraction % 1000 preferIsolated, explicitIsolated := isolatedCPUsPreference(pod, container) preferShared, explicitShared := sharedCPUsPreference(pod, container) switch { // sub-core CPU request case cores == 0: return 0, fraction, false, cpuNormal // 1 <= CPU request < 2 case cores < 2: // fractional allocation, potentially mixed if fraction > 0 { if preferShared { return 0, 1000*cores + fraction, false, cpuNormal } return cores, fraction, preferIsolated, cpuNormal } // non-fractional allocation if preferShared && explicitShared { return 0, 1000*cores + fraction, false, cpuNormal } return cores, fraction, preferIsolated, cpuNormal // CPU request >= 2 default: // fractional allocation, only mixed if explicitly annotated as unshared if fraction > 0 { if !preferShared && explicitShared { return cores, fraction, preferIsolated && explicitIsolated, cpuNormal } return 0, 1000*cores + fraction, false, cpuNormal } // non-fractional allocation if preferShared && explicitShared { return 0, 1000 * cores, false, cpuNormal } return cores, fraction, preferIsolated && explicitIsolated, cpuNormal } } // podMemoryTypePreference returns what type of memory should be allocated for the container. 
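//
// Illustrative sketch (container names hypothetical): the deprecated
// annotation value is either a plain comma-separated type list applied to
// all containers, e.g. "dram,pmem", or a YAML map keyed by container name:
//
//	cnt0: dram,hbm
//	cnt1: pmem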
func podMemoryTypePreference(pod cache.Pod, c cache.Container) memoryType { key := keyMemoryTypePreference value, ok := pod.GetResmgrAnnotation(key) if !ok { log.Debug("pod %s has no memory preference annotations", pod.GetName()) return memoryUnspec } log.Warn("WARNING: using deprecated annotation %q", key) log.Warn("WARNING: consider using instead") log.Warn("WARNING: %q, or", keyMemoryTypePreference+"/container."+c.GetName()) log.Warn("WARNING: %q", keyMemoryTypePreference+"/pod") // Try to parse as a per-container preference; if that fails, assume the value is common to all containers. pref := "" preferences := map[string]string{} if err := yaml.Unmarshal([]byte(value), &preferences); err == nil { name := c.GetName() p, ok := preferences[name] if !ok { log.Debug("container %s has no entry among memory preferences", c.PrettyName()) return memoryUnspec } pref = p } else { pref = value } mtype, err := parseMemoryType(pref) if err != nil { log.Error("invalid memory type preference ('%s') in annotation %s: %v", pref, keyMemoryTypePreference, err) return memoryUnspec } log.Debug("container %s has effective memory preference: %s", c.PrettyName(), mtype) return mtype } // memoryAllocationPreference returns the amount and kind of memory to allocate. func memoryAllocationPreference(pod cache.Pod, c cache.Container) (uint64, uint64, memoryType) { resources := c.GetResourceRequirements() mtype := memoryTypePreference(pod, c) req, lim := uint64(0), uint64(0) if memReq, ok := resources.Requests[corev1.ResourceMemory]; ok { req = uint64(memReq.Value()) } if memLim, ok := resources.Limits[corev1.ResourceMemory]; ok { lim = uint64(memLim.Value()) } return req, lim, mtype } // String stringifies a cpuClass. func (t cpuClass) String() string { if cpuClassName, ok := cpuClassNames[t]; ok { return cpuClassName } return fmt.Sprintf("#UNNAMED-CPUCLASS(%d)", int(t)) } // String stringifies a memoryType. func (t memoryType) String() string { str := "" sep := "" for _, bit := range []memoryType{memoryDRAM, memoryPMEM, memoryHBM} { if int(t)&int(bit) != 0 { str += sep + memoryTypeNames[bit] sep = "," } } return str } // parseMemoryType parses a memory type string, ideally produced by String() func parseMemoryType(value string) (memoryType, error) { if value == "" { return memoryUnspec, nil } mtype := 0 for _, typestr := range strings.Split(value, ",") { t, ok := memoryNamedTypes[strings.ToLower(typestr)] if !ok { return memoryUnspec, policyError("unknown memory type value '%s'", typestr) } mtype |= int(t) } return memoryType(mtype), nil } // MarshalJSON is the JSON marshaller for memoryType. func (t memoryType) MarshalJSON() ([]byte, error) { value := t.String() return json.Marshal(value) } // UnmarshalJSON is the JSON unmarshaller for memoryType. func (t *memoryType) UnmarshalJSON(data []byte) error { ival := 0 if err := json.Unmarshal(data, &ival); err == nil { *t = memoryType(ival) return nil } value := "" if err := json.Unmarshal(data, &value); err != nil { return policyError("failed to unmarshal memoryType '%s': %v", string(data), err) } mtype, err := parseMemoryType(value) if err != nil { return policyError("failed to parse memoryType '%s': %v", value, err) } *t = mtype return nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/pod-preferences_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "testing" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestPodIsolationPreference(t *testing.T) { tcases := []struct { name string pod *mockPod container *mockContainer expectedIsolate bool expectedExplicit bool disabled bool }{ { name: "podIsolationPreference() should handle nil pod arg gracefully", disabled: true, }, { name: "return defaults", pod: &mockPod{}, container: &mockContainer{}, expectedIsolate: opt.PreferIsolated, }, { name: "prefer resmgr's annotation value", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedIsolate: true, expectedExplicit: true, }, { name: "return defaults for unparsable", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "UNPARSABLE", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedIsolate: opt.PreferIsolated, }, { name: "podIsolationPreference() should handle nil container arg gracefully", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "key: true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, disabled: true, }, { name: "return defaults for missing preferences", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "key: true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedIsolate: opt.PreferIsolated, }, { name: "return defined preferences", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "testcontainer: false", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{ name: "testcontainer", }, expectedExplicit: true, }, // effective annotation tests { name: "prefer resmgr's annotation value", pod: &mockPod{ annotations: map[string]string{ preferIsolatedCPUsKey + "/container.c0": "true", }, }, container: &mockContainer{name: "c0"}, expectedIsolate: true, expectedExplicit: true, }, { name: "prefer resmgr's annotation value", pod: &mockPod{ annotations: map[string]string{ preferIsolatedCPUsKey + "/container.c0": "false", }, }, container: &mockContainer{name: "c0"}, expectedIsolate: false, expectedExplicit: true, }, { name: "return defaults for unparsable annotation value", pod: &mockPod{ annotations: map[string]string{ preferIsolatedCPUsKey + "/container.c0": "blah", }, }, container: &mockContainer{name: "c0"}, expectedIsolate: opt.PreferIsolated, }, { name: "return defaults for missing preferences", pod: &mockPod{ annotations: map[string]string{ preferIsolatedCPUsKey + "/container.c0": "true", }, }, container: &mockContainer{name: "c1"}, expectedIsolate: opt.PreferIsolated, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skipf("The case '%s' is skipped", tc.name) } isolate, explicit := isolatedCPUsPreference(tc.pod, tc.container) if isolate != tc.expectedIsolate || explicit != tc.expectedExplicit { t.Errorf("Expected (%v, %v), but got 
(%v, %v)", tc.expectedIsolate, tc.expectedExplicit, isolate, explicit) } }) } } func TestPodSharedCPUPreference(t *testing.T) { tcases := []struct { name string pod *mockPod container *mockContainer expectedShared bool disabled bool }{ { name: "podSharedCPUPreference() should handle nil pod arg gracefully", disabled: true, }, { name: "return defaults", pod: &mockPod{}, container: &mockContainer{}, expectedShared: opt.PreferShared, }, { name: "prefer resmgr's annotation value", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedShared: true, }, { name: "return defaults for unparsable", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "UNPARSABLE", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedShared: opt.PreferShared, }, { name: "podSharedCPUPreference() should handle nil container arg gracefully", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "key: true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, disabled: true, }, { name: "return defaults for missing preferences", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "key: true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedShared: opt.PreferShared, }, { name: "return defined preferences", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "testcontainer: false", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{ name: "testcontainer", }, }, { name: "return defaults for unparsable annotation value", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "testcontainer: UNPARSABLE", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{ name: "testcontainer", }, expectedShared: opt.PreferShared, }, // effective annotation tests { name: "prefer resmgr's annotation value", pod: &mockPod{ annotations: map[string]string{ preferSharedCPUsKey + "/container.c0": "true", }, }, container: &mockContainer{name: "c0"}, expectedShared: true, }, { name: "prefer resmgr's annotation value", pod: &mockPod{ annotations: map[string]string{ preferSharedCPUsKey + "/container.c0": "false", }, }, container: &mockContainer{name: "c0"}, expectedShared: false, }, { name: "return defaults for unparsable annotation value", pod: &mockPod{ annotations: map[string]string{ preferSharedCPUsKey + "/container.c0": "blah", }, }, container: &mockContainer{name: "c0"}, expectedShared: opt.PreferShared, }, { name: "return defaults for missing preferences", pod: &mockPod{ annotations: map[string]string{ preferSharedCPUsKey + "/container.c0": "true", }, }, container: &mockContainer{name: "c1"}, expectedShared: opt.PreferShared, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skipf("The case '%s' is skipped", tc.name) } shared, _ := sharedCPUsPreference(tc.pod, tc.container) if shared != tc.expectedShared { t.Errorf("Expected %v, but got %v", tc.expectedShared, shared) } }) } } func TestCpuAllocationPreferences(t *testing.T) { tcases := []struct { name string pod *mockPod container *mockContainer preferIsolated bool preferShared bool expectedFull int expectedFraction int expectedIsolate bool expectedCpuType cpuClass disabled bool reservedPoolNamespaces []string }{ { name: "cpuAllocationPreferences() should handle nil container arg gracefully", disabled: true, }, { name: "no resource requirements", pod: &mockPod{}, container: &mockContainer{}, }, { name: "cpuAllocationPreferences() should handle nil pod arg 
gracefully", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, disabled: true, }, { name: "return defaults", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 1000, expectedIsolate: false, }, { name: "return request's value for system container", container: &mockContainer{ namespace: metav1.NamespaceSystem, returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuReserved, }, { name: "return request's value for burstable QoS", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, }, { name: "guaranteed QoS with sub-core request", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("750m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 750, expectedIsolate: false, }, { name: "guaranteed QoS with sub-core request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("750m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, expectedFull: 0, expectedFraction: 750, expectedIsolate: false, }, { name: "guaranteed QoS with sub-core request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("750m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferShared: true, expectedFull: 0, expectedFraction: 750, expectedIsolate: false, }, { name: "guaranteed QoS with sub-core request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("750m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, preferShared: true, expectedFull: 0, expectedFraction: 750, expectedIsolate: false, }, { name: "guaranteed QoS with single full core request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, expectedFull: 1, expectedIsolate: true, }, { name: "guaranteed QoS with single full core request, prefer no isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: false, expectedFull: 1, expectedIsolate: false, 
}, { name: "guaranteed QoS with single full core request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferShared: true, expectedFull: 1, expectedFraction: 0, expectedIsolate: false, }, { name: "guaranteed QoS with single full core request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, preferShared: true, expectedFull: 1, expectedFraction: 0, expectedIsolate: true, }, { name: "guaranteed QoS with single full core request, annotated shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferSharedCPUsKey + "/container.testcontainer": "true", }, }, preferIsolated: true, preferShared: true, expectedFull: 0, expectedFraction: 1000, expectedIsolate: false, }, { name: "guaranteed QoS with single full core request, annotated no isolated", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "false", }, }, preferIsolated: true, preferShared: true, expectedFull: 1, expectedFraction: 0, expectedIsolate: false, }, { name: "guaranteed QoS with potential mixed request", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 1, expectedFraction: 500, expectedIsolate: false, }, { name: "guaranteed QoS with potential mixed request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, expectedFull: 1, expectedFraction: 500, expectedIsolate: true, }, { name: "guaranteed QoS with potential mixed request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferShared: true, expectedFull: 0, expectedFraction: 1500, expectedIsolate: false, }, { name: "guaranteed QoS with potential mixed request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, preferShared: true, expectedFull: 0, expectedFraction: 1500, expectedIsolate: false, }, { name: "guaranteed QoS with 
multi-core full request", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 2, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, expectedFull: 2, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferShared: true, expectedFull: 2, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, preferShared: true, expectedFull: 2, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, annotate isolated", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 2, expectedIsolate: true, }, { name: "guaranteed QoS with multi-core full request, annotate shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferSharedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2000, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, annotate isolated & shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", preferSharedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2000, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: 
v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate isolated", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferSharedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate isolated & shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", preferSharedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate no shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferSharedCPUsKey + "/container.testcontainer": "false", }, }, expectedFull: 2, expectedFraction: 500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate isolated, no shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", preferSharedCPUsKey + 
"/container.testcontainer": "false", }, }, expectedFull: 2, expectedFraction: 500, expectedIsolate: true, }, { name: "return request's value for reserved pool namespace container", container: &mockContainer{ namespace: "foobar", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuReserved, reservedPoolNamespaces: []string{"foobar"}, }, { name: "return request's value for reserved pool namespace container using a glob 1", container: &mockContainer{ namespace: "foobar2", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuReserved, reservedPoolNamespaces: []string{"foobar*"}, }, { name: "return request's value for reserved pool namespace container using a glob 2", container: &mockContainer{ namespace: "foobar-testing", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuReserved, reservedPoolNamespaces: []string{"barfoo", "foobar*"}, }, { name: "return request's value for reserved pool namespace container using a glob 3", container: &mockContainer{ namespace: "testing", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuNormal, reservedPoolNamespaces: []string{"barfoo", "foobar?"}, }, { name: "return request's value for reserved pool namespace container using a glob 4", container: &mockContainer{ namespace: "1foobar2", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuNormal, reservedPoolNamespaces: []string{"barfoo", "foobar?"}, }, { name: "return request's value for reserved pool namespace container using a glob 5", container: &mockContainer{ namespace: "foobar12", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuNormal, reservedPoolNamespaces: []string{"barfoo", "foobar?", "testing"}, }, { name: "return request's value for reserved cpu annotation container", container: &mockContainer{ name: "testcontainer", pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferReservedCPUsKey + "/container.special": "false", }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 0, expectedCpuType: cpuNormal, }, { name: "return request's value for reserved cpu annotation container", container: &mockContainer{ pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferReservedCPUsKey + "/pod": "true", }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: 
corev1.PodQOSBurstable, }, expectedFraction: 0, expectedCpuType: cpuReserved, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skipf("The case '%s' is skipped", tc.name) } opt.PreferIsolated, opt.PreferShared = tc.preferIsolated, tc.preferShared opt.ReservedPoolNamespaces = tc.reservedPoolNamespaces full, fraction, isolate, cpuType := cpuAllocationPreferences(tc.pod, tc.container) if full != tc.expectedFull || fraction != tc.expectedFraction || isolate != tc.expectedIsolate || cpuType != tc.expectedCpuType { t.Errorf("Expected (%v, %v, %v, %s), but got (%v, %v, %v, %s)", tc.expectedFull, tc.expectedFraction, tc.expectedIsolate, tc.expectedCpuType, full, fraction, isolate, cpuType) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/pools.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "math" "sort" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" system "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) // buildPoolsByTopology builds a hierarchical tree of pools based on HW topology. func (p *policy) buildPoolsByTopology() error { if err := p.checkHWTopology(); err != nil { return err } // Notes: // we never create pool nodes for PMEM-only NUMA nodes (as these // are always without any close/local set of CPUs). We instead // assign the PMEM memory of such a node to one of the closest // normal (DRAM) pool NUMA nodes. // // Akin to omitting lone dies from their parent, we omit from the // pool tree each NUMA node that would end up being the only child // of its parent (a die or a socket pool node). Resources for each // such node will get discovered by and assigned to the would-be // parent which is now a leaf (die or socket) node in the tree. // // The PMEM memory of (omitted) PMEM-only nodes is assigned // to one of the closest normal (DRAM) NUMA nodes. The right // assignment has already been calculated by assignNUMANodes(). // However, making the corresponding assignment in the pool // tree is a bit more involved as the DRAM node where a PMEM // node has been assigned to might have gotten omitted from the // tree if it ended up being a lone child. We use the recorded // per-NUMA-node surrogates to find out both whether and where the // resources of omitted DRAM NUMA nodes need to be assigned, and // also where PMEM NUMA node resources need to be assigned.
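//
// For illustration (a hypothetical two-socket box with one die and one
// CPU-ful DRAM NUMA node per socket, plus one PMEM-only node closest to
// socket #0), the tree built below would degenerate to
//
//	root (virtual)
//	├── socket #0  (DRAM node #0 omitted, socket is its surrogate;
//	│               PMEM node #2 assigned here)
//	└── socket #1  (DRAM node #1 omitted, socket is its surrogate)
//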
log.Debug("building topology pool tree...") p.nodes = make(map[string]Node) // create a virtual root node, if we have a multi-socket system if p.sys.SocketCount() > 1 { p.root = p.NewVirtualNode("root", nilnode) p.nodes[p.root.Name()] = p.root log.Debug(" + created pool (root) %q", p.root.Name()) } else { log.Debug(" - omitted pool virtual root (single-socket system)") } // create socket nodes, for a single-socket system set the only socket as the root sockets := map[idset.ID]Node{} for _, socketID := range p.sys.PackageIDs() { var socket Node if p.root != nil { socket = p.NewSocketNode(socketID, p.root) log.Debug(" + created pool %q", socket.Parent().Name()+"/"+socket.Name()) } else { socket = p.NewSocketNode(socketID, nilnode) p.root = socket log.Debug(" + created pool %q (as root)", socket.Name()) } p.nodes[socket.Name()] = socket sockets[socketID] = socket } // create dies for every socket, but only if we have more than one die in the socket numaDies := map[idset.ID]Node{} // created die Nodes per NUMA node id for socketID, socket := range sockets { dieIDs := p.sys.Package(socketID).DieIDs() if len(dieIDs) < 2 { log.Debug(" - omitted pool %q (die count: %d)", socket.Name()+"/die #0", len(dieIDs)) continue } for _, dieID := range dieIDs { die := p.NewDieNode(dieID, socket) p.nodes[die.Name()] = die for _, numaNodeID := range p.sys.Package(socketID).DieNodeIDs(dieID) { numaDies[numaNodeID] = die } log.Debug(" + created pool %q", die.Parent().Name()+"/"+die.Name()) } } // create pool nodes for NUMA nodes pmemNodes := map[idset.ID]system.Node{} // collected PMEM-only nodes dramNodes := map[idset.ID]system.Node{} // collected DRAM-only nodes numaSurrogates := map[idset.ID]Node{} // surrogate leaf nodes for omitted NUMA nodes for _, numaNodeID := range p.sys.NodeIDs() { var numaNode Node numaSysNode := p.sys.Node(numaNodeID) switch numaSysNode.GetMemoryType() { case system.MemoryTypeDRAM: dramNodes[numaNodeID] = numaSysNode case system.MemoryTypePMEM: pmemNodes[numaNodeID] = numaSysNode log.Debug(" - omitted pool \"NUMA node #%d\": PMEM node", numaNodeID) continue // don't create pool, will assign to a closest DRAM node default: log.Warn(" - ignored pool \"NUMA node #%d\": unhandled memory type %v", numaNodeID, numaSysNode.GetMemoryType()) continue } // // Notes: // We omit inserting NUMA nodes (as leaf nodes) in the tree, if that NUMA node // would be the only child of its parent. In this case, we record the would-be // parent as the surrogate for the NUMA node. This surrogate will get assigned // any closest PMEM-only NUMA node that the original one would have received. 
// if die, ok := numaDies[numaNodeID]; ok { if p.parentNumaNodeCountWithCPUs(numaSysNode) < 2 { numaSurrogates[numaNodeID] = die log.Debug(" - omitted pool \"NUMA node #%d\": using surrogate %q", numaNodeID, numaSurrogates[numaNodeID].Name()) continue } numaNode = p.NewNumaNode(numaNodeID, die) } else { socket := sockets[p.sys.Node(numaNodeID).PackageID()] if p.parentNumaNodeCountWithCPUs(numaSysNode) < 2 { numaSurrogates[numaNodeID] = socket log.Debug(" - omitted pool \"NUMA node #%d\": using surrogate %q", numaNodeID, numaSurrogates[numaNodeID].Name()) continue } numaNode = p.NewNumaNode(numaNodeID, socket) } p.nodes[numaNode.Name()] = numaNode numaSurrogates[numaNodeID] = numaNode log.Debug(" + created pool %q", numaNode.Parent().Name()+"/"+numaNode.Name()) } // set up assignment of PMEM and DRAM node resources to pool nodes and surrogates assigned := p.assignNUMANodes(numaSurrogates, pmemNodes, dramNodes) log.Debug("NUMA node to pool assignment:") for n, numaNodeIDs := range assigned { log.Debug(" pool %q: NUMA nodes #%s", n.Name(), idset.NewIDSet(numaNodeIDs...)) } // enumerate pools, calculate depth, discover resource capacity, assign NUMA nodes p.pools = make([]Node, 0) p.root.DepthFirst(func(n Node) error { p.pools = append(p.pools, n) n.(*node).id = p.nodeCnt p.nodeCnt++ if p.depth < n.(*node).depth { p.depth = n.(*node).depth } n.DiscoverSupply(assigned[n.(*node).self.node]) delete(assigned, n.(*node).self.node) return nil }) // make sure all PMEM nodes got assigned if len(assigned) > 0 { for node, pmem := range assigned { log.Error("failed to assign PMEM NUMA nodes #%s (to NUMA node/surrogate %s %v)", idset.NewIDSet(pmem...), node.Name(), node) } log.Fatal("internal error: unassigned PMEM NUMA nodes remaining") } p.root.Dump("") return nil } // parentNumaNodeCountWithCPUs returns the number of CPU-ful NUMA nodes in the parent die/socket. func (p *policy) parentNumaNodeCountWithCPUs(numaNode system.Node) int { socketID := numaNode.PackageID() socket := p.sys.Package(socketID) count := 0 for _, nodeID := range socket.DieNodeIDs(numaNode.DieID()) { node := p.sys.Node(nodeID) if !node.CPUSet().IsEmpty() { count++ } } return count } // assignNUMANodes assigns each PMEM node to one of the closest DRAM nodes func (p *policy) assignNUMANodes(surrogates map[idset.ID]Node, pmem, dram map[idset.ID]system.Node) map[Node][]idset.ID { // collect the closest DRAM NUMA nodes (sorted by idset.ID) for each PMEM NUMA node. 
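// Worked example with hypothetical distances: if PMEM node #4 sits at
// distance 17 from DRAM node #0 and 28 from DRAM node #1, min becomes
// [#0]; on a tie (17 vs. 17) both are kept, and the assignment pass
// below then picks the surrogate with the least PMEM assigned so far.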
closest := map[idset.ID][]idset.ID{} for pmemID := range pmem { var min []idset.ID for dramID := range dram { if len(min) < 1 { min = []idset.ID{dramID} } else { minDist := p.sys.NodeDistance(pmemID, min[0]) newDist := p.sys.NodeDistance(pmemID, dramID) switch { case newDist == minDist: min = append(min, dramID) case newDist < minDist: min = []idset.ID{dramID} } } } sort.Slice(min, func(i, j int) bool { return min[i] < min[j] }) closest[pmemID] = min } assigned := map[Node][]idset.ID{} // assign each PMEM node to the closest DRAM surrogate with the least PMEM assigned for pmemID, min := range closest { var taker Node var takerID idset.ID for _, dramID := range min { if taker == nil { taker = surrogates[dramID] takerID = dramID } else { if len(assigned[taker]) > len(assigned[surrogates[dramID]]) { taker = surrogates[dramID] takerID = dramID } } } if taker == nil { log.Panic("failed to assign CPU-less PMEM node #%d to any surrogate", pmemID) } assigned[taker] = append(assigned[taker], pmemID) log.Debug(" + PMEM node #%d assigned to %s with distance %v", pmemID, taker.Name(), p.sys.NodeDistance(pmemID, takerID)) } // assign each DRAM node to its own surrogate (can be the DRAM node itself) for dramID := range dram { taker := surrogates[dramID] assigned[taker] = append([]idset.ID{dramID}, assigned[taker]...) log.Debug(" + DRAM node #%d assigned to %s", dramID, taker.Name()) } return assigned } // checkHWTopology verifies our otherwise implicit assumptions about the HW. func (p *policy) checkHWTopology() error { // NUMA nodes (memory controllers) should not be shared by multiple sockets. socketNodes := map[idset.ID]cpuset.CPUSet{} for _, socketID := range p.sys.PackageIDs() { pkg := p.sys.Package(socketID) socketNodes[socketID] = system.CPUSetFromIDSet(idset.NewIDSet(pkg.NodeIDs()...)) } for id1, nodes1 := range socketNodes { for id2, nodes2 := range socketNodes { if id1 == id2 { continue } if shared := nodes1.Intersection(nodes2); !shared.IsEmpty() { log.Error("can't handle HW topology: sockets #%v, #%v share NUMA node(s) #%s", id1, id2, shared.String()) return policyError("unhandled HW topology: sockets #%v, #%v share NUMA node(s) #%s", id1, id2, shared.String()) } } } // NUMA nodes (memory controllers) should not be shared by multiple dies. for _, socketID := range p.sys.PackageIDs() { pkg := p.sys.Package(socketID) for _, id1 := range pkg.DieIDs() { nodes1 := idset.NewIDSet(pkg.DieNodeIDs(id1)...) for _, id2 := range pkg.DieIDs() { if id1 == id2 { continue } nodes2 := idset.NewIDSet(pkg.DieNodeIDs(id2)...) if shared := system.CPUSetFromIDSet(nodes1).Intersection(system.CPUSetFromIDSet(nodes2)); !shared.IsEmpty() { log.Error("can't handle HW topology: "+ "socket #%v, dies #%v,%v share NUMA node(s) #%s", socketID, id1, id2, shared.String()) return policyError("unhandled HW topology: "+ "socket #%v, dies #%v,#%v share NUMA node(s) #%s", socketID, id1, id2, shared.String()) } } } } // NUMA distance matrix should be symmetric. for _, from := range p.sys.NodeIDs() { for _, to := range p.sys.NodeIDs() { d1 := p.sys.NodeDistance(from, to) d2 := p.sys.NodeDistance(to, from) if d1 != d2 { log.Error("asymmetric NUMA distance (#%d, #%d): %d != %d", from, to, d1, d2) return policyError("asymmetric NUMA distance (#%d, #%d): %d != %d", from, to, d1, d2) } } } return nil } // Pick a pool and allocate resource from it to the container. 
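//
// Illustrative flow (pool names and numbers hypothetical): for a guaranteed
// container requesting 2 CPUs with an affinity towards "socket #0", the
// scored candidates might be logged as
//
//	- #0: node socket #0, score <...>, affinity: 2
//	- #1: node root, score <...>, affinity: 0
//
// and, absent a usable pool hint, pools[0] ("socket #0") is picked.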
func (p *policy) allocatePool(container cache.Container, poolHint string) (Grant, error) { var pool Node request := newRequest(container) if p.root.FreeSupply().ReservedCPUs().IsEmpty() && request.CPUType() == cpuReserved { // Fallback to allocating reserved CPUs from the shared pool // if there are no reserved CPUs. request.SetCPUType(cpuNormal) } // Assumption: in the beginning the CPUs and memory will be allocated from // the same pool. This assumption can be relaxed later, requires separate // (but connected) scoring of memory and CPU. if request.CPUType() == cpuReserved || container.GetNamespace() == kubernetes.NamespaceSystem { pool = p.root } else { affinity, err := p.calculatePoolAffinities(request.GetContainer()) if err != nil { return nil, policyError("failed to calculate affinity for container %s: %v", container.PrettyName(), err) } scores, pools := p.sortPoolsByScore(request, affinity) if log.DebugEnabled() { log.Debug("* node fitting for %s", request) for idx, n := range pools { log.Debug(" - #%d: node %s, score %s, affinity: %d", idx, n.Name(), scores[n.NodeID()], affinity[n.NodeID()]) } } if len(pools) == 0 { return nil, policyError("no suitable pool found for container %s", container.PrettyName()) } if poolHint != "" { for idx, p := range pools { if p.Name() == poolHint { log.Debug("* using hinted pool %q (#%d best fit)", poolHint, idx+1) pool = p break } } if pool == nil { log.Debug("* cannot use hinted pool %q", poolHint) } } if pool == nil { pool = pools[0] } } supply := pool.FreeSupply() grant, err := supply.Allocate(request) if err != nil { return nil, policyError("failed to allocate %s from %s: %v", request, supply.DumpAllocatable(), err) } log.Debug("allocated req '%s' to memory node '%s' (memset %s,%s,%s)", container.PrettyName(), grant.GetMemoryNode().Name(), grant.GetMemoryNode().GetMemset(memoryDRAM), grant.GetMemoryNode().GetMemset(memoryPMEM), grant.GetMemoryNode().GetMemset(memoryHBM)) // In case the workload is assigned to a memory node with multiple // child nodes, there is no guarantee that the workload will // allocate memory "nicely". Instead we'll have to make the // conservative assumption that the memory will all be allocated // from one single node, and that node can be any of the child // nodes in the system. Thus, we'll need to reserve the memory // from all child nodes, and move the containers already // assigned to the child nodes upwards in the topology tree, if // they no longer fit to the child node that they are in. In // other words, they'll need to have a wider range of memory // node options in order to fit to memory. // // // Example: // // Workload 1 and Workload 2 are running on the leaf nodes: // // +----------------+ // |Total mem: 4G | // |Total CPUs: 4 | Workload 1: // |Reserved: | // | 1.5G | 1G mem // | | // | | Workload 2: // | | // +----------------+ 0.5G mem // / \ // / \ // / \ // / \ // / \ // / \ // / \ // / \ // +----------------+ +----------------+ // |Total mem: 2G | |Total mem: 2G | // |Total CPUs: 2 | |Total CPUs: 2 | // |Reserved: | |Reserved: | // | 1G | | 0.5G | // | | | | // | | | | // | * WL 1 | | * WL 2 | // +----------------+ +----------------+ // // // Then Workload 3 comes in and is assigned to the root node. 
Memory // reservations are done on the leaf nodes: // // +----------------+ // |Total mem: 4G | // |Total CPUs: 4 | Workload 1: // |Reserved: | // | 3G | 1G mem // | | // | | Workload 2: // | * WL 3 | // +----------------+ 0.5G mem // / \ // / \ Workload 3: // / \ // / \ 1.5G mem // / \ // / \ // / \ // / \ // +----------------+ +----------------+ // |Total mem: 2G | |Total mem: 2G | // |Total CPUs: 2 | |Total CPUs: 2 | // |Reserved: | |Reserved: | // | 2.5G | | 2G | // | | | | // | | | | // | * WL 1 | | * WL 2 | // +----------------+ +----------------+ // // // Workload 1 no longer fits to the leaf node, because the total // reservation from the leaf node is over the memory maximum. // Thus, it's moved upwards in the tree to the root node. Memory // resevations are again updated accordingly: // // +----------------+ // |Total mem: 4G | // |Total CPUs: 4 | Workload 1: // |Reserved: | // | 3G | 1G mem // | | // | * WL 1 | Workload 2: // | * WL 3 | // +----------------+ 0.5G mem // / \ // / \ Workload 3: // / \ // / \ 1.5G mem // / \ // / \ // / \ // / \ // +----------------+ +----------------+ // |Total mem: 2G | |Total mem: 2G | // |Total CPUs: 2 | |Total CPUs: 2 | // |Reserved: | |Reserved: | // | 2.5G | | 3G | // | | | | // | | | | // | | | * WL 2 | // +----------------+ +----------------+ // // // Now Workload 2 doesn't fit to the leaf node either. It's also moved // to the root node: // // +----------------+ // |Total mem: 4G | // |Total CPUs: 4 | Workload 1: // |Reserved: | // | 3G | 1G mem // | * WL 2 | // | * WL 1 | Workload 2: // | * WL 3 | // +----------------+ 0.5G mem // / \ // / \ Workload 3: // / \ // / \ 1.5G mem // / \ // / \ // / \ // / \ // +----------------+ +----------------+ // |Total mem: 2G | |Total mem: 2G | // |Total CPUs: 2 | |Total CPUs: 2 | // |Reserved: | |Reserved: | // | 3G | | 3G | // | | | | // | | | | // | | | | // +----------------+ +----------------+ // // We need to analyze all existing containers which are a subset of current grant. memset := grant.GetMemoryNode().GetMemset(grant.MemoryType()) // Add an extra memory reservation to all subnodes. // TODO: no need to do any of this if no memory request grant.UpdateExtraMemoryReservation() // See how much memory reservations the workloads on the // nodes up from this one cause to the node. We only need to // analyze the workloads up until this node, because it's // guaranteed that the subtree can hold the workloads. // If it turns out that the current workloads no longer fit // to the node with the reservations from nodes from above // in the tree, move all nodes upward. Note that this // creates a reservation of the same size to the node, so in // effect the node has to be empty of its "own" workloads. // In this case move all the workloads one level up in the tree. changed := true for changed { changed = false for _, oldGrant := range p.allocations.grants { oldMemset := oldGrant.GetMemoryNode().GetMemset(grant.MemoryType()) if oldMemset.Size() < memset.Size() && memset.Has(oldMemset.Members()...) { changed, err = oldGrant.ExpandMemset() if err != nil { return nil, err } if changed { log.Debug("* moved container %s upward to node %s to guarantee memory", oldGrant.GetContainer().PrettyName(), oldGrant.GetMemoryNode().Name()) break } } } } p.allocations.grants[container.GetCacheID()] = grant p.saveAllocations() return grant, nil } // Apply the result of allocation to the requesting container. 
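//
// For example (values hypothetical): a normal-class grant of exclusive CPUs
// {4,5} plus a 500 mCPU shared portion drawn from shared CPUs {6,7} would,
// with PinCPU enabled, pin the container to cpuset "4-7" and set its CPU
// shares to correspond to the 500 mCPU portion.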
func (p *policy) applyGrant(grant Grant) { log.Debug("* applying grant %s", grant) container := grant.GetContainer() cpuType := grant.CPUType() exclusive := grant.ExclusiveCPUs() reserved := grant.ReservedCPUs() shared := grant.SharedCPUs() cpuPortion := grant.SharedPortion() cpus := "" kind := "" if cpuType == cpuNormal { if exclusive.IsEmpty() { cpus = shared.String() kind = "shared" } else { kind = "exclusive" if cpuPortion > 0 { kind += "+shared" cpus = exclusive.Union(shared).String() } else { cpus = exclusive.String() } } } else if cpuType == cpuReserved { kind = "reserved" cpus = reserved.String() cpuPortion = grant.ReservedPortion() } else { log.Debug("unsupported granted cpuType %s", cpuType) return } mems := "" if opt.PinMemory { mems = grant.Memset().String() } if opt.PinCPU { if cpus != "" { log.Debug(" => pinning to (%s) cpuset %s", kind, cpus) } else { log.Debug(" => not pinning CPUs, allocated cpuset is empty...") } container.SetCpusetCpus(cpus) // Notes: // It is extremely important to ensure that the exclusive subset of mixed // CPU allocations is really exclusive at the level of the whole system // and not just the orchestration. This is something we can't really do // from here reliably ATM. // // We set the CPU scheduling weight for the whole container (all processes // within the container) according to the container's partial allocation. // This is typically a sub-CPU allocation (< 1000 mCPU) which is meant to be // consumed by an 'infra/mgmt' process within the container from the shared subset // of CPUs assigned to the container. The container entry point or the processes // within the container are supposed to arrange things so that the 'infra' process(es) // are pinned to the shared CPUs and the 'data/performance critical' // process(es) to the exclusive CPU(s). // // With this setup the kernel will slice out the correct amount of CPU from // the shared pool for the 'infra' process as it competes with other workloads' // processes in the same pool. Also the 'data' process should run fine, since // it does not need to compete for CPU with any other processes in the system // as long as that allocation is genuinely system-wide exclusive. container.SetCPUShares(int64(cache.MilliCPUToShares(int64(cpuPortion)))) } if mems != "" { log.Debug(" => pinning to memory %s", mems) container.SetCpusetMems(mems) p.setDemotionPreferences(container, grant) } else { log.Debug(" => not pinning memory, memory set is empty...") } } // Release resources allocated by this grant. func (p *policy) releasePool(container cache.Container) (Grant, bool) { log.Debug("* releasing resources allocated to %s", container.PrettyName()) grant, ok := p.allocations.grants[container.GetCacheID()] if !ok { log.Debug(" => no grant found, nothing to do...") return nil, false } log.Debug(" => releasing grant %s...", grant) // Remove the grant from all supplies it uses. grant.Release() delete(p.allocations.grants, container.GetCacheID()) p.saveAllocations() return grant, true } // Update shared allocations affected by a grant.
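//
// For example (illustrative numbers): if containers A and B draw shared CPU
// from a pool whose sharable set is 2-7 and A is then granted CPUs 2-3
// exclusively, the pool's free sharable set shrinks to 4-7 and B must be
// re-pinned from 2-7 to 4-7. That is what the loop below does for every
// remaining grant that consumes shared capacity:
//
//	shared := other.GetCPUNode().FreeSupply().SharableCPUs()
//	other.GetContainer().SetCpusetCpus(shared.String())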
func (p *policy) updateSharedAllocations(grant *Grant) { if grant != nil { log.Debug("* updating shared allocations affected by %s", (*grant).String()) if (*grant).CPUType() == cpuReserved { log.Debug(" this grant uses reserved CPUs, does not affect shared allocations") return } } else { log.Debug("* updating shared allocations") } for _, other := range p.allocations.grants { if grant != nil { if other.GetContainer().GetCacheID() == (*grant).GetContainer().GetCacheID() { continue } } if other.CPUType() == cpuReserved { log.Debug(" => %s not affected (only reserved CPUs)...", other) continue } if other.SharedPortion() == 0 && !other.ExclusiveCPUs().IsEmpty() { log.Debug(" => %s not affected (only exclusive CPUs)...", other) continue } if opt.PinCPU { shared := other.GetCPUNode().FreeSupply().SharableCPUs() exclusive := other.ExclusiveCPUs() if exclusive.IsEmpty() { log.Debug(" => updating %s with shared CPUs of %s: %s...", other, other.GetCPUNode().Name(), shared.String()) other.GetContainer().SetCpusetCpus(shared.String()) } else { log.Debug(" => updating %s with exclusive+shared CPUs of %s: %s+%s...", other, other.GetCPUNode().Name(), exclusive.String(), shared.String()) other.GetContainer().SetCpusetCpus(exclusive.Union(shared).String()) } } } } // setDemotionPreferences sets the dynamic demotion preferences for a container. func (p *policy) setDemotionPreferences(c cache.Container, g Grant) { log.Debug("%s: setting demotion preferences...", c.PrettyName()) // System containers should not be demoted. if c.GetNamespace() == kubernetes.NamespaceSystem { c.SetPageMigration(nil) return } memType := g.GetMemoryNode().GetMemoryType() if memType&memoryDRAM == 0 || memType&memoryPMEM == 0 { c.SetPageMigration(nil) return } dram := g.GetMemoryNode().GetMemset(memoryDRAM) pmem := g.GetMemoryNode().GetMemset(memoryPMEM) log.Debug("%s: eligible for demotion from %s to %s NUMA node(s)", c.PrettyName(), dram, pmem) c.SetPageMigration(&cache.PageMigrate{ SourceNodes: dram, TargetNodes: pmem, }) } func (p *policy) filterInsufficientResources(req Request, originals []Node) []Node { sufficient := make([]Node, 0) for _, node := range originals { // TODO: Need to filter based on the memory demotion scheme here. For example, if the request is // of memory type memoryAll, the memory used might be PMEM until it's full and after that DRAM. If // it's DRAM, the amount of PMEM should not be considered and so on. How to find this out in a live // system? supply := node.FreeSupply() reqMemType := req.MemoryType() if reqMemType == memoryUnspec { // The algorithm for handling unspecified memory allocations is the same as for handling a request // with memory type all. reqMemType = memoryAll } required := req.MemAmountToAllocate() for _, memType := range []memoryType{memoryPMEM, memoryDRAM, memoryHBM} { if reqMemType&memType != 0 { extra := supply.ExtraMemoryReservation(memType) free := supply.MemoryLimit()[memType] if extra > free { continue } if required+extra <= free { sufficient = append(sufficient, node) required = 0 break } if req.ColdStart() > 0 { // For a "cold start" request, the memory request must fit completely in the PMEM. So reject the node. break } // Subtracting unsigned integers. // Here free >= extra, that is, (free - extra) is non-negative, // and required > free - extra, that is, required stays positive.
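// For example (illustrative numbers): with required=10G and a node
// offering free=8G of PMEM of which extra=2G is already promised to
// containers from above, the node can cover 6G, so required drops to
// 4G here and the loop tries to satisfy the rest from DRAM and HBM.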
required -= (free - extra) } } if required > 0 { log.Debug("%s: filtered out %s with insufficient memory", req.GetContainer().PrettyName(), node.Name()) } } return sufficient } // Score pools against the request and sort them by score. func (p *policy) sortPoolsByScore(req Request, aff map[int]int32) (map[int]Score, []Node) { scores := make(map[int]Score, p.nodeCnt) p.root.DepthFirst(func(n Node) error { scores[n.NodeID()] = n.GetScore(req) return nil }) // Filter out pools which don't have enough uncompressible resources // (memory) to satisfy the request. filteredPools := p.filterInsufficientResources(req, p.pools) sort.Slice(filteredPools, func(i, j int) bool { return p.compareScores(req, filteredPools, scores, aff, i, j) }) return scores, filteredPools } // Compare two pools by scores for allocation preference. func (p *policy) compareScores(request Request, pools []Node, scores map[int]Score, affinity map[int]int32, i int, j int) bool { node1, node2 := pools[i], pools[j] depth1, depth2 := node1.RootDistance(), node2.RootDistance() id1, id2 := node1.NodeID(), node2.NodeID() score1, score2 := scores[id1], scores[id2] cpuType := request.CPUType() isolated1, reserved1, shared1 := score1.IsolatedCapacity(), score1.ReservedCapacity(), score1.SharedCapacity() isolated2, reserved2, shared2 := score2.IsolatedCapacity(), score2.ReservedCapacity(), score2.SharedCapacity() a1 := affinityScore(affinity, node1) a2 := affinityScore(affinity, node2) log.Debug("comparing scores for %s and %s", node1.Name(), node2.Name()) log.Debug(" %s: %s, affinity score %f", node1.Name(), score1.String(), a1) log.Debug(" %s: %s, affinity score %f", node2.Name(), score2.String(), a2) // // Notes: // // Our scoring/score sorting algorithm is: // // 1) - insufficient isolated, reserved or shared capacity loses // 2) - if we have affinity, the higher affinity score wins // 3) - if only one node matches the memory type request, it wins // 4) - if we have topology hints // * better hint score wins // * for a tie, prefer the lower node then the smaller id // 5) - if a node is lower in the tree it wins // 6) - for reserved allocations // * more unallocated reserved capacity per colocated container wins // 7) - for (non-reserved) isolated allocations // * more isolated capacity wins // * for a tie, prefer the smaller id // 8) - for (non-reserved) exclusive allocations // * more slicable (shared) capacity wins // * for a tie, prefer the smaller id // 9) - for (non-reserved) shared-only allocations // * fewer colocated containers win // * for a tie prefer more shared capacity // 10) - lower id wins // // Before this comparison is reached, nodes with insufficient uncompressible resources // (memory) have been filtered out. 
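//
// For example (illustrative): if node A has affinity score 3.0 and
// node B only 1.5, A wins at step 2 and the later steps are never
// consulted; only when every earlier step is a tie does the comparison
// fall through to the deterministic node-id tie-breaker of step 10.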
// 1) a node with insufficient isolated or shared capacity loses switch { case cpuType == cpuNormal && ((isolated2 < 0 && isolated1 >= 0) || (shared2 <= 0 && shared1 > 0)): log.Debug(" => %s loses, insufficient isolated or shared", node2.Name()) return true case cpuType == cpuNormal && ((isolated1 < 0 && isolated2 >= 0) || (shared1 <= 0 && shared2 > 0)): log.Debug(" => %s loses, insufficient isolated or shared", node1.Name()) return false case cpuType == cpuReserved && reserved2 < 0 && reserved1 >= 0: log.Debug(" => %s loses, insufficient reserved", node2.Name()) return true case cpuType == cpuReserved && reserved1 < 0 && reserved2 >= 0: log.Debug(" => %s loses, insufficient reserved", node1.Name()) return false } log.Debug(" - isolated/reserved/shared insufficiency is a TIE") // 2) higher affinity score wins if a1 > a2 { log.Debug(" => %s loses on affinity", node2.Name()) return true } if a2 > a1 { log.Debug(" => %s loses on affinity", node1.Name()) return false } log.Debug(" - affinity is a TIE") // 3) matching memory type wins if reqType := request.MemoryType(); reqType != memoryUnspec { if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) { log.Debug(" => %s WINS on memory type", node1.Name()) return true } if !node1.HasMemoryType(reqType) && node2.HasMemoryType(reqType) { log.Debug(" => %s WINS on memory type", node2.Name()) return false } log.Debug(" - memory type is a TIE") } // 4) better topology hint score wins hScores1 := score1.HintScores() if len(hScores1) > 0 { hScores2 := score2.HintScores() hs1, nz1 := combineHintScores(hScores1) hs2, nz2 := combineHintScores(hScores2) if hs1 > hs2 { log.Debug(" => %s WINS on hints", node1.Name()) return true } if hs2 > hs1 { log.Debug(" => %s WINS on hints", node2.Name()) return false } log.Debug(" - hints are a TIE") if hs1 == 0 { if nz1 > nz2 { log.Debug(" => %s WINS on non-zero hints", node1.Name()) return true } if nz2 > nz1 { log.Debug(" => %s WINS on non-zero hints", node2.Name()) return false } log.Debug(" - non-zero hints are a TIE") } // for a tie, prefer lower nodes and smaller ids if hs1 == hs2 && nz1 == nz2 && (hs1 != 0 || nz1 != 0) { if depth1 > depth2 { log.Debug(" => %s WINS as it is lower", node1.Name()) return true } if depth1 < depth2 { log.Debug(" => %s WINS as it is lower", node2.Name()) return false } log.Debug(" => %s WINS based on equal hint scores, lower id", map[bool]string{true: node1.Name(), false: node2.Name()}[id1 < id2]) return id1 < id2 } } // 5) a lower node wins if depth1 > depth2 { log.Debug(" => %s WINS on depth", node1.Name()) return true } if depth1 < depth2 { log.Debug(" => %s WINS on depth", node2.Name()) return false } log.Debug(" - depth is a TIE") if request.CPUType() == cpuReserved { // 6) if requesting reserved CPUs, more reserved // capacity per colocated container wins. Reserved // CPUs cannot be precisely accounted for, as they also // run BestEffort containers that do not carry // information on their CPU needs.
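// For example (illustrative numbers): a node with 2000m of unallocated
// reserved capacity and 3 colocated containers offers 2000/(3+1) = 500m
// per container, while a node with 1500m and 1 colocated container
// offers 1500/(1+1) = 750m and therefore wins. The +1 accounts for the
// container being placed; integer division is good enough for this
// heuristic.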
if reserved1/(score1.Colocated()+1) > reserved2/(score2.Colocated()+1) { return true } if reserved2/(score2.Colocated()+1) > reserved1/(score1.Colocated()+1) { return false } log.Debug(" - reserved capacity is a TIE") } else if request.CPUType() == cpuNormal { // 7) more isolated capacity wins if request.Isolate() && (isolated1 > 0 || isolated2 > 0) { if isolated1 > isolated2 { return true } if isolated2 > isolated1 { return false } log.Debug(" => %s WINS based on equal isolated capacity, lower id", map[bool]string{true: node1.Name(), false: node2.Name()}[id1 < id2]) return id1 < id2 } // 8) more slicable shared capacity wins if request.FullCPUs() > 0 && (shared1 > 0 || shared2 > 0) { if shared1 > shared2 { log.Debug(" => %s WINS on more slicable capacity", node1.Name()) return true } if shared2 > shared1 { log.Debug(" => %s WINS on more slicable capacity", node2.Name()) return false } log.Debug(" => %s WINS based on equal slicable capacity, lower id", map[bool]string{true: node1.Name(), false: node2.Name()}[id1 < id2]) return id1 < id2 } // 9) fewer colocated containers win if score1.Colocated() < score2.Colocated() { log.Debug(" => %s WINS on colocation score", node1.Name()) return true } if score2.Colocated() < score1.Colocated() { log.Debug(" => %s WINS on colocation score", node2.Name()) return false } log.Debug(" - colocation score is a TIE") // more shared capacity wins if shared1 > shared2 { log.Debug(" => %s WINS on more shared capacity", node1.Name()) return true } if shared2 > shared1 { log.Debug(" => %s WINS on more shared capacity", node2.Name()) return false } } // 10) lower id wins log.Debug(" => %s WINS based on lower id", map[bool]string{true: node1.Name(), false: node2.Name()}[id1 < id2]) return id1 < id2 } // affinityScore calculates the 'goodness' of the affinity for a node. func affinityScore(affinities map[int]int32, node Node) float64 { Q := 0.75 // Calculate affinity for every node as a combination of // affinities of the nodes on the path from the node to // the root and the nodes in the subtree under the node. // // The combined affinity for node n is Sum_x(A_x*D_x), // where for every node x, A_x is the affinity for x and // D_x is Q ** (number of links from node to x). IOW, the // effective affinity is the sum of the affinity of n and // the affinity of each node x of the above mentioned set // diluted proportionally to the distance of x to n, with // Q being 0.75. var score float64 for n, q := node.Parent(), Q; !n.IsNil(); n, q = n.Parent(), q*Q { a := affinities[n.NodeID()] score += q * float64(a) } node.BreadthFirst(func(n Node) error { diff := float64(n.RootDistance() - node.RootDistance()) q := math.Pow(Q, diff) a := affinities[n.NodeID()] score += q * float64(a) return nil }) return score } // combineHintScores calculates combined full and zero-filtered hint scores. func combineHintScores(scores map[string]float64) (float64, float64) { if len(scores) == 0 { return 0.0, 0.0 } combined, filtered := 1.0, 0.0 for _, score := range scores { combined *= score if score != 0.0 { if filtered == 0.0 { filtered = score } else { filtered *= score } } } return combined, filtered } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/pools_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "fmt" "os" "path" "testing" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" v1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" system "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) func findNodeWithID(id int, nodes []Node) Node { for _, node := range nodes { if node.NodeID() == id { return node } } panic("No node found with id " + fmt.Sprintf("%d", id)) } func findNodeWithName(name string, nodes []Node) Node { for _, node := range nodes { if node.Name() == name { return node } } panic("No node found with name " + name) } func setLinks(nodes []Node, tree map[int][]int) { hasParent := map[int]struct{}{} for parent, children := range tree { parentNode := findNodeWithID(parent, nodes) for _, child := range children { childNode := findNodeWithID(child, nodes) childNode.LinkParent(parentNode) hasParent[child] = struct{}{} } } orphans := []int{} for id := range tree { if _, ok := hasParent[id]; !ok { node := findNodeWithID(id, nodes) node.LinkParent(nilnode) orphans = append(orphans, id) } } if len(orphans) != 1 { panic(fmt.Sprintf("expected one root node, got %d with IDs %v", len(orphans), orphans)) } } func TestMemoryLimitFiltering(t *testing.T) { // Test the scoring algorithm with synthetic data. The assumptions are: // 1. The first node in "nodes" is the root of the tree. 
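// 2. The pool tree is encoded as a parent -> children map of node IDs;
//    setLinks() links children to parents and expects exactly one root.
//
// For example (synthetic IDs, mirroring the cases below):
//
//	tree: map[int][]int{100: {101, 102}, 101: {}, 102: {}}
//
// describes a root node 100 with two leaf children 101 and 102.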
tcases := []struct { name string nodes []Node numaNodes []system.Node req Request affinities map[int]int32 tree map[int][]int expectedRemainingNodes []int }{ { name: "single node memory limit (fits)", nodes: []Node{ &numanode{ node: node{ id: 100, name: "testnode0", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), }, id: 0, // system node id }, }, numaNodes: []system.Node{ &mockSystemNode{id: 0, memFree: 10001, memTotal: 10001}, }, req: &request{ memReq: 10000, memLim: 10000, memType: defaultMemoryType, container: &mockContainer{}, }, expectedRemainingNodes: []int{100}, tree: map[int][]int{100: {}}, }, { name: "single node memory limit (doesn't fit)", nodes: []Node{ &numanode{ node: node{ id: 100, name: "testnode0", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(9999, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(9999, 0, 0), createMemoryMap(0, 0, 0)), }, id: 0, // system node id }, }, numaNodes: []system.Node{ &mockSystemNode{id: 0, memFree: 9999, memTotal: 9999}, }, req: &request{ memReq: 10000, memLim: 10000, memType: defaultMemoryType, container: &mockContainer{}, }, expectedRemainingNodes: []int{}, tree: map[int][]int{100: {}}, }, { name: "two node memory limit (fits to leaf)", nodes: []Node{ &virtualnode{ node: node{ id: 100, name: "testnode0", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), }, }, &numanode{ node: node{ id: 101, name: "testnode1", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), }, id: 0, // system node id }, }, numaNodes: []system.Node{ &mockSystemNode{id: 0, memFree: 10001, memTotal: 10001}, }, req: &request{ memReq: 10000, memLim: 10000, memType: defaultMemoryType, container: &mockContainer{}, }, expectedRemainingNodes: []int{100, 101}, tree: map[int][]int{100: {101}, 101: {}}, }, { name: "three node memory limit (fits to root)", nodes: []Node{ &virtualnode{ node: node{ id: 100, name: "testnode0", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(12000, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(12000, 0, 0), createMemoryMap(0, 0, 0)), }, }, &numanode{ node: node{ id: 101, name: "testnode1", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(6000, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(6000, 0, 0), createMemoryMap(0, 0, 0)), }, id: 0, // system node id }, &numanode{ node: node{ id: 102, name: "testnode2", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(6000, 0, 0), 
createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(6000, 0, 0), createMemoryMap(0, 0, 0)), }, id: 1, // system node id }, }, numaNodes: []system.Node{ &mockSystemNode{id: 0, memFree: 6000, memTotal: 6000}, &mockSystemNode{id: 1, memFree: 6000, memTotal: 6000}, }, req: &request{ memReq: 10000, memLim: 10000, memType: defaultMemoryType, container: &mockContainer{}, }, expectedRemainingNodes: []int{100}, tree: map[int][]int{100: {101, 102}, 101: {}, 102: {}}, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { setLinks(tc.nodes, tc.tree) policy := &policy{ sys: &mockSystem{ nodes: tc.numaNodes, }, pools: tc.nodes, cache: &mockCache{}, root: tc.nodes[0], nodeCnt: len(tc.nodes), allocations: allocations{}, } // back pointers for _, node := range tc.nodes { switch node.(type) { case *numanode: numaNode := node.(*numanode) numaNode.self.node = numaNode noderes := numaNode.noderes.(*supply) noderes.node = node freeres := numaNode.freeres.(*supply) freeres.node = node numaNode.policy = policy case *virtualnode: virtualNode := node.(*virtualnode) virtualNode.self.node = virtualNode noderes := virtualNode.noderes.(*supply) noderes.node = node freeres := virtualNode.freeres.(*supply) freeres.node = node virtualNode.policy = policy } } policy.allocations.policy = policy scores, filteredPools := policy.sortPoolsByScore(tc.req, tc.affinities) fmt.Printf("scores: %v, remaining pools: %v\n", scores, filteredPools) if len(filteredPools) != len(tc.expectedRemainingNodes) { t.Errorf("Wrong nodes in the filtered pool: expected %v but got %v", tc.expectedRemainingNodes, filteredPools) } for _, id := range tc.expectedRemainingNodes { found := false for _, node := range filteredPools { if node.NodeID() == id { found = true break } } if !found { t.Errorf("Did not find id %d in filtered pools: %v", id, filteredPools) } } }) } } func TestPoolCreation(t *testing.T) { // Test pool creation with "real" sysfs data. // Create a temporary directory for the test data. dir, err := os.MkdirTemp("", "cri-resource-manager-test-sysfs-") if err != nil { panic(err) } defer os.RemoveAll(dir) // Uncompress the test data to the directory. 
err = utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), dir) if err != nil { panic(err) } tcases := []struct { path string name string req Request affinities map[int]int32 expectedRemainingNodes []int expectedFirstNodeMemory memoryType expectedLeafNodeCPUs int expectedRootNodeCPUs int // TODO: expectedRootNodeMemory int }{ { path: path.Join(dir, "sysfs", "desktop", "sys"), name: "sysfs pool creation from a desktop system", req: &request{ memReq: 10000, memLim: 10000, memType: memoryAll, container: &mockContainer{}, }, expectedRemainingNodes: []int{0}, expectedFirstNodeMemory: memoryDRAM, expectedLeafNodeCPUs: 20, expectedRootNodeCPUs: 20, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "sysfs pool creation from a server system", req: &request{ memReq: 10000, memLim: 10000, memType: memoryDRAM, container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4, 5, 6}, expectedFirstNodeMemory: memoryDRAM | memoryPMEM, expectedLeafNodeCPUs: 28, expectedRootNodeCPUs: 112, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "pmem request on a server system", req: &request{ memReq: 10000, memLim: 10000, memType: memoryDRAM | memoryPMEM, container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4, 5, 6}, expectedFirstNodeMemory: memoryDRAM | memoryPMEM, expectedLeafNodeCPUs: 28, expectedRootNodeCPUs: 112, }, { path: path.Join(dir, "sysfs", "4-socket-server-nosnc", "sys"), name: "sysfs pool creation from a 4 socket server with SNC disabled", req: &request{ memReq: 10000, memLim: 10000, memType: memoryAll, container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4}, expectedFirstNodeMemory: memoryDRAM, expectedLeafNodeCPUs: 36, expectedRootNodeCPUs: 36 * 4, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { sys, err := system.DiscoverSystemAt(tc.path) if err != nil { panic(err) } reserved, _ := resapi.ParseQuantity("750m") policyOptions := &policyapi.BackendOptions{ Cache: &mockCache{}, System: sys, Reserved: policyapi.ConstraintSet{ policyapi.DomainCPU: reserved, }, } log.EnableDebug() policy := CreateTopologyAwarePolicy(policyOptions).(*policy) if policy.root.GetSupply().SharableCPUs().Size()+policy.root.GetSupply().IsolatedCPUs().Size()+policy.root.GetSupply().ReservedCPUs().Size() != tc.expectedRootNodeCPUs { t.Errorf("Expected %d CPUs, got %d", tc.expectedRootNodeCPUs, policy.root.GetSupply().SharableCPUs().Size()+policy.root.GetSupply().IsolatedCPUs().Size()+policy.root.GetSupply().ReservedCPUs().Size()) } for _, p := range policy.pools { if p.IsLeafNode() { if len(p.Children()) != 0 { t.Errorf("Leaf node %v had %d children", p, len(p.Children())) } if p.GetSupply().SharableCPUs().Size()+p.GetSupply().IsolatedCPUs().Size()+p.GetSupply().ReservedCPUs().Size() != tc.expectedLeafNodeCPUs { t.Errorf("Expected %d CPUs, got %d (%s)", tc.expectedLeafNodeCPUs, p.GetSupply().SharableCPUs().Size()+p.GetSupply().IsolatedCPUs().Size()+p.GetSupply().ReservedCPUs().Size(), p.GetSupply().DumpCapacity()) } } } scores, filteredPools := policy.sortPoolsByScore(tc.req, tc.affinities) fmt.Printf("scores: %v, remaining pools: %v\n", scores, filteredPools) if len(filteredPools) != len(tc.expectedRemainingNodes) { t.Errorf("Wrong number of nodes in the filtered pool: expected %d but got %d", len(tc.expectedRemainingNodes), len(filteredPools)) } for _, id := range tc.expectedRemainingNodes { found := false for _, node := range filteredPools { if node.NodeID() == id { found = true break } } if !found { t.Errorf("Did not find id 
%d in filtered pools: %s", id, filteredPools) } } if len(filteredPools) > 0 && filteredPools[0].GetMemoryType() != tc.expectedFirstNodeMemory { t.Errorf("Expected first node memory type %v, got %v", tc.expectedFirstNodeMemory, filteredPools[0].GetMemoryType()) } }) } } func TestWorkloadPlacement(t *testing.T) { // Do some workloads (containers) and see how they are placed in the // server system. // Create a temporary directory for the test data. dir, err := os.MkdirTemp("", "cri-resource-manager-test-sysfs-") if err != nil { panic(err) } defer os.RemoveAll(dir) // Uncompress the test data to the directory. err = utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), dir) if err != nil { panic(err) } tcases := []struct { path string name string req Request affinities map[int]int32 expectedRemainingNodes []int expectedLeafNode bool }{ { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system leaf node", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 25, // 28 - 2 isolated = 26: but fully exhausting the shared CPU subpool is disallowed container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4, 5, 6}, expectedLeafNode: true, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system root node: CPUs don't fit to leaf", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 29, container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4, 5, 6}, expectedLeafNode: false, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system root node: memory doesn't fit to leaf", req: &request{ memReq: 190000000000, memLim: 190000000000, memType: memoryUnspec, isolate: false, full: 28, container: &mockContainer{}, }, expectedRemainingNodes: []int{2, 6}, expectedLeafNode: false, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { sys, err := system.DiscoverSystemAt(tc.path) if err != nil { panic(err) } reserved, _ := resapi.ParseQuantity("750m") policyOptions := &policyapi.BackendOptions{ Cache: &mockCache{}, System: sys, Reserved: policyapi.ConstraintSet{ policyapi.DomainCPU: reserved, }, } log.EnableDebug() policy := CreateTopologyAwarePolicy(policyOptions).(*policy) scores, filteredPools := policy.sortPoolsByScore(tc.req, tc.affinities) fmt.Printf("scores: %v, remaining pools: %v\n", scores, filteredPools) if len(filteredPools) != len(tc.expectedRemainingNodes) { t.Errorf("Wrong number of nodes in the filtered pool: expected %d but got %d", len(tc.expectedRemainingNodes), len(filteredPools)) } for _, id := range tc.expectedRemainingNodes { found := false for _, node := range filteredPools { if node.NodeID() == id { found = true break } } if !found { t.Errorf("Did not find id %d in filtered pools: %s", id, filteredPools) } } if filteredPools[0].IsLeafNode() != tc.expectedLeafNode { t.Errorf("Workload should have been placed in a leaf node: %t", tc.expectedLeafNode) } }) } } func TestContainerMove(t *testing.T) { // In case there's not enough memory to guarantee that the // containers running on child nodes won't get OOM killed, they need // to be moved upwards in the tree. // Create a temporary directory for the test data. dir, err := os.MkdirTemp("", "cri-resource-manager-test-sysfs-") if err != nil { panic(err) } defer os.RemoveAll(dir) // Uncompress the test data to the directory. 
err = utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), dir) if err != nil { panic(err) } tcases := []struct { path string name string container1 cache.Container container2 cache.Container container3 cache.Container affinities map[int]int32 expectedLeafNodeForContainer1 bool expectedLeafNodeForContainer2 bool expectedLeafNodeForContainer3 bool expectedChangeForContainer1 bool expectedChangeForContainer2 bool expectedChangeForContainer3 bool }{ { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system leaf node", container1: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("1000"), }, }, returnValueForGetCacheID: "first", }, container2: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("1000"), }, }, returnValueForGetCacheID: "second", }, container3: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("1000"), }, }, returnValueForGetCacheID: "third", }, expectedLeafNodeForContainer1: true, expectedLeafNodeForContainer2: true, expectedLeafNodeForContainer3: true, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system non-leaf node", container1: &mockContainer{ name: "c1", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("1000"), }, }, returnValueForGetCacheID: "first", }, container2: &mockContainer{ name: "c2", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("190000000000"), // 180 GB }, }, returnValueForGetCacheID: "second", }, container3: &mockContainer{ name: "c3", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("140000000000"), // 130 GB }, }, returnValueForGetCacheID: "third", }, expectedLeafNodeForContainer1: false, expectedLeafNodeForContainer2: false, expectedLeafNodeForContainer3: false, expectedChangeForContainer1: true, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { sys, err := system.DiscoverSystemAt(tc.path) if err != nil { panic(err) } reserved, _ := resapi.ParseQuantity("750m") policyOptions := &policyapi.BackendOptions{ Cache: &mockCache{}, System: sys, Reserved: policyapi.ConstraintSet{ policyapi.DomainCPU: reserved, }, } log.EnableDebug() policy := CreateTopologyAwarePolicy(policyOptions).(*policy) grant1, err := policy.allocatePool(tc.container1, "") if err != nil { panic(err) } fmt.Printf("grant 1 memsets: dram %s, pmem %s\n", grant1.GetMemoryNode().GetMemset(memoryDRAM), grant1.GetMemoryNode().GetMemset(memoryPMEM)) grant2, err := policy.allocatePool(tc.container2, "") if err != nil { panic(err) } fmt.Printf("grant 2 memsets: dram %s, pmem %s\n", grant2.GetMemoryNode().GetMemset(memoryDRAM), grant2.GetMemoryNode().GetMemset(memoryPMEM)) grant3, err := policy.allocatePool(tc.container3, "") if err != nil { panic(err) } fmt.Printf("grant 3 memsets: dram %s, pmem %s\n", grant3.GetMemoryNode().GetMemset(memoryDRAM), 
grant3.GetMemoryNode().GetMemset(memoryPMEM)) if (grant1.GetCPUNode().IsSameNode(grant1.GetMemoryNode())) && tc.expectedChangeForContainer1 { t.Errorf("Workload 1 should have been relocated: %t, node: %s", tc.expectedChangeForContainer1, grant1.GetMemoryNode().Name()) } if (grant2.GetCPUNode().IsSameNode(grant2.GetMemoryNode())) && tc.expectedChangeForContainer2 { t.Errorf("Workload 2 should have been relocated: %t, node: %s", tc.expectedChangeForContainer2, grant2.GetMemoryNode().Name()) } if (grant3.GetCPUNode().IsSameNode(grant3.GetMemoryNode())) && tc.expectedChangeForContainer3 { t.Errorf("Workload 3 should have been relocated: %t, node: %s", tc.expectedChangeForContainer3, grant3.GetMemoryNode().Name()) } if grant1.GetMemoryNode().IsLeafNode() != tc.expectedLeafNodeForContainer1 { t.Errorf("Workload 1 should have been placed in a leaf node: %t, node: %s", tc.expectedLeafNodeForContainer1, grant1.GetMemoryNode().Name()) } if grant2.GetMemoryNode().IsLeafNode() != tc.expectedLeafNodeForContainer2 { t.Errorf("Workload 2 should have been placed in a leaf node: %t, node: %s", tc.expectedLeafNodeForContainer2, grant2.GetMemoryNode().Name()) } if grant3.GetMemoryNode().IsLeafNode() != tc.expectedLeafNodeForContainer3 { t.Errorf("Workload 3 should have been placed in a leaf node: %t, node: %s", tc.expectedLeafNodeForContainer3, grant3.GetMemoryNode().Name()) } }) } } func TestAffinities(t *testing.T) { // // Test how (already pre-calculated) affinities affect workload placement. // // Create a temporary directory for the test data. dir, err := os.MkdirTemp("", "cri-resource-manager-test-sysfs-") if err != nil { panic(err) } defer os.RemoveAll(dir) // Uncompress the test data to the directory. err = utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), dir) if err != nil { panic(err) } tcases := []struct { path string name string req Request affinities map[string]int32 expected string }{ { path: path.Join(dir, "sysfs", "server", "sys"), name: "no affinities", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{}, expected: "NUMA node #2", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "reserved - no affinities", req: &request{ cpuType: cpuReserved, memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 0, container: &mockContainer{}, }, affinities: map[string]int32{}, expected: "NUMA node #0", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "affinity to NUMA node #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #1": 1, }, expected: "NUMA node #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "affinity to socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "socket #1": 1, }, expected: "socket #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "equal affinities to NUMA node #1, socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "socket #1": 1, "NUMA node #1": 1, }, expected: "NUMA node #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "equal affinities to NUMA node #1, NUMA node #3", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: 
false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #1": 1, "NUMA node #3": 1, }, expected: "socket #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "double affinity to NUMA node #1 vs. #3", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #1": 2, "NUMA node #3": 1, }, expected: "socket #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "triple affinity to NUMA node #1 vs. #3", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #1": 3, "NUMA node #3": 1, }, expected: "NUMA node #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "double affinity to NUMA node #0,#3 vs. socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #0": 2, "NUMA node #3": 2, "socket #1": 1, }, expected: "root", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "equal affinity to NUMA node #0,#3 vs. socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #0": 1, "NUMA node #3": 1, "socket #1": 1, }, expected: "root", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "half the affinity to NUMA node #0,#3 vs. socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #0": 1, "NUMA node #3": 1, "socket #1": 2, }, expected: "socket #1", }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { sys, err := system.DiscoverSystemAt(tc.path) if err != nil { panic(err) } reserved, _ := resapi.ParseQuantity("750m") policyOptions := &policyapi.BackendOptions{ Cache: &mockCache{}, System: sys, Reserved: policyapi.ConstraintSet{ policyapi.DomainCPU: reserved, }, } log.EnableDebug() policy := CreateTopologyAwarePolicy(policyOptions).(*policy) affinities := map[int]int32{} for name, weight := range tc.affinities { affinities[findNodeWithName(name, policy.pools).NodeID()] = weight } log.EnableDebug() scores, filteredPools := policy.sortPoolsByScore(tc.req, affinities) fmt.Printf("scores: %v, remaining pools: %v\n", scores, filteredPools) if len(filteredPools) < 1 { t.Errorf("pool scoring failed to find any pools") } node := filteredPools[0] if node.Name() != tc.expected { t.Errorf("expected best pool %s, got %s", tc.expected, node.Name()) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/resources.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
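// The types below model the topology-aware policy's resource bookkeeping:
// a Supply describes what a pool node has to offer, a Request what a
// container asks for, a Grant what was actually handed out, and a Score
// how well a Supply can satisfy a Request. As a rough, illustrative
// sketch of the allocation round trip (not part of the original source):
//
//	supply := pool.FreeSupply()
//	grant, err := supply.Allocate(newRequest(container))
//	if err == nil {
//		// ... and eventually, when the container goes away:
//		grant.Release()
//	}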
package topologyaware import ( "fmt" "strconv" "time" v1 "k8s.io/api/core/v1" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/topology" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) // Supply represents available CPU and memory capacity of a node. type Supply interface { // GetNode returns the node supplying this capacity. GetNode() Node // Clone creates a copy of this supply. Clone() Supply // IsolatedCPUs returns the isolated cpuset in this supply. IsolatedCPUs() cpuset.CPUSet // ReservedCPUs returns the reserved cpuset in this supply. ReservedCPUs() cpuset.CPUSet // SharableCPUs returns the sharable cpuset in this supply. SharableCPUs() cpuset.CPUSet // GrantedReserved returns the locally granted reserved CPU capacity in this supply. GrantedReserved() int // GrantedShared returns the locally granted shared CPU capacity in this supply. GrantedShared() int // GrantedMemory returns the locally granted memory capacity in this supply. GrantedMemory(memoryType) uint64 // Cumulate cumulates the given supply into this one. Cumulate(Supply) // AssignMemory adds extra memory to this supply (for extra NUMA nodes assigned to a pool). AssignMemory(mem memoryMap) // AccountAllocateCPU accounts for (removes) allocated exclusive capacity from the supply. AccountAllocateCPU(Grant) // AccountReleaseCPU accounts for (reinserts) released exclusive capacity into the supply. AccountReleaseCPU(Grant) // GetScore calculates how well this supply fits/fulfills the given request. GetScore(Request) Score // AllocatableSharedCPU calculates the allocatable amount of shared CPU of this supply. AllocatableSharedCPU(...bool) int // Allocate allocates CPU capacity from this supply and returns it as a grant. Allocate(Request) (Grant, error) // ReleaseCPU releases a previously allocated CPU grant from this supply. ReleaseCPU(Grant) // ReleaseMemory releases a previously allocated memory grant from this supply. ReleaseMemory(Grant) // ReallocateMemory updates the Grant to allocate memory from this supply. ReallocateMemory(Grant) error // ExtraMemoryReservation returns the extra memory reservation for the given memory type. ExtraMemoryReservation(memoryType) uint64 // SetExtraMemoryReservation sets the extra memory reservation based on the granted memory. SetExtraMemoryReservation(Grant) // ReleaseExtraMemoryReservation removes the extra memory reservations based on the granted memory. ReleaseExtraMemoryReservation(Grant) // MemoryLimit returns the amount of various memory types belonging to this grant. MemoryLimit() memoryMap // Reserve accounts for CPU grants after reloading cached allocations. Reserve(Grant) error // ReserveMemory accounts for memory grants after reloading cached allocations. ReserveMemory(Grant) error // DumpCapacity returns a printable representation of the supply's resource capacity. DumpCapacity() string // DumpAllocatable returns a printable representation of the supply's allocatable resources. DumpAllocatable() string // DumpMemoryState dumps the state of the available and allocated memory. DumpMemoryState(string) } // Request represents CPU and memory resources requested by a container. type Request interface { // GetContainer returns the container requesting CPU capacity. GetContainer() cache.Container // String returns a printable representation of this request. String() string // CPUType returns the type of requested CPU.
CPUType() cpuClass // SetCPUType sets the type of requested CPU. SetCPUType(cpuType cpuClass) // FullCPUs returns the number of full CPUs requested. FullCPUs() int // CPUFraction returns the amount of fractional milli-CPU requested. CPUFraction() int // Isolate returns whether isolated CPUs are preferred for this request. Isolate() bool // MemoryType returns the type(s) of requested memory. MemoryType() memoryType // MemAmountToAllocate returns how much memory we need to reserve for a request. MemAmountToAllocate() uint64 // ColdStart returns the cold start timeout. ColdStart() time.Duration } // Grant represents CPU and memory capacity allocated to a container from a node. type Grant interface { // SetCPUPortion sets the fractional CPU portion for the grant. SetCPUPortion(fraction int) // SetMemoryAllocation sets the memory allocation for the grant. SetMemoryAllocation(memoryType, memoryMap, time.Duration) // Clone creates a copy of this grant. Clone() Grant // RefetchNodes updates the stored cpu and memory nodes of this grant by name. RefetchNodes() error // GetContainer returns the container CPU capacity is granted to. GetContainer() cache.Container // GetCPUNode returns the node that granted CPU capacity to the container. GetCPUNode() Node // GetMemoryNode returns the node which granted memory capacity to // the container. GetMemoryNode() Node // CPUType returns the type of granted CPUs CPUType() cpuClass // CPUPortion returns granted milli-CPUs of non-full CPUs of CPUType(). // CPUPortion() == ReservedPortion() + SharedPortion(). CPUPortion() int // ExclusiveCPUs returns the exclusively granted non-isolated cpuset. ExclusiveCPUs() cpuset.CPUSet // ReservedCPUs returns the reserved granted cpuset. ReservedCPUs() cpuset.CPUSet // ReservedPortion returns the amount of reserved CPUs in milli-CPU granted. ReservedPortion() int // SharedCPUs returns the shared granted cpuset. SharedCPUs() cpuset.CPUSet // SharedPortion returns the amount of shared CPUs in milli-CPU granted. SharedPortion() int // IsolatedCPUs returns the exclusively granted isolated cpuset. IsolatedCPUs() cpuset.CPUSet // MemoryType returns the type(s) of granted memory. MemoryType() memoryType // SetMemoryNode updates the grant memory controllers. SetMemoryNode(Node) // Memset returns the granted memory controllers as an IDSet. Memset() idset.IDSet // ExpandMemset() makes the memory controller set larger as the grant // is moved up in the node hierarchy. ExpandMemset() (bool, error) // MemLimit returns the amount of memory that the container is // allowed to use. MemLimit() memoryMap // String returns a printable representation of this grant. String() string // Release releases the grant from all the Supplies it uses. Release() // AccountAllocateCPU accounts for (removes) allocated exclusive capacity for this grant. AccountAllocateCPU() // AccountReleaseCPU accounts for (reinserts) released exclusive capacity for this grant. AccountReleaseCPU() // UpdateExtraMemoryReservation() updates the reservations in the subtree // of nodes under the node from which the memory was granted. UpdateExtraMemoryReservation() // RestoreMemset restores the granted memory set to node maximum // and reapplies the grant. RestoreMemset() // ColdStart returns the cold start timeout. ColdStart() time.Duration // AddTimer adds a cold start timer. AddTimer(*time.Timer) // StopTimer stops a cold start timer. StopTimer() // ClearTimer clears the cold start timer pointer. ClearTimer() } // Score represents how well a supply can satisfy a request.
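//
// Scores are computed per pool node and consumed by the comparison
// logic in pools.go. An illustrative use (negative remaining capacity
// marks a supply that cannot satisfy the request):
//
//	score := node.FreeSupply().GetScore(req)
//	if score.IsolatedCapacity() < 0 || score.SharedCapacity() <= 0 {
//		// this node cannot fulfill the request's CPU needs
//	}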
type Score interface { // Calculate the actual score from the collected parameters. Eval() float64 // Supply returns the supply associated with this score. Supply() Supply // Request returns the request associated with this score. Request() Request IsolatedCapacity() int ReservedCapacity() int SharedCapacity() int Colocated() int HintScores() map[string]float64 String() string } type memoryMap map[memoryType]uint64 // supply implements our Supply interface. type supply struct { node Node // node supplying CPUs and memory isolated cpuset.CPUSet // isolated CPUs at this node reserved cpuset.CPUSet // reserved CPUs at this node sharable cpuset.CPUSet // sharable CPUs at this node grantedReserved int // amount of reserved CPUs allocated grantedShared int // amount of shareable CPUs allocated mem memoryMap // available memory for this node grantedMem memoryMap // total memory granted extraMemReservations map[Grant]memoryMap // how much memory each workload above has requested } var _ Supply = &supply{} // request implements our Request interface. type request struct { container cache.Container // container for this request full int // number of full CPUs requested fraction int // amount of fractional CPU requested isolate bool // prefer isolated exclusive CPUs cpuType cpuClass // preferred CPU type (normal, reserved) memReq uint64 // memory request memLim uint64 // memory limit memType memoryType // requested types of memory // coldStart tells how long to wait until a DRAM memory controller // should be added to a container asking for a mixed DRAM/PMEM memory // allocation. This allows for a "cold start" where initial memory // requests are made to the PMEM memory. A value of 0 indicates that // cold start is not explicitly requested. coldStart time.Duration } var _ Request = &request{} // grant implements our Grant interface. type grant struct { container cache.Container // container CPU is granted to node Node // node CPU is supplied from memoryNode Node // node memory is supplied from exclusive cpuset.CPUSet // exclusive CPUs cpuType cpuClass // type of CPUs (normal, reserved, ...) cpuPortion int // milliCPUs granted from CPUs of cpuType memType memoryType // requested types of memory memset idset.IDSet // assigned memory nodes allocatedMem memoryMap // memory limit coldStart time.Duration // how long until cold start is done coldStartTimer *time.Timer // timer to trigger cold start timeout } var _ Grant = &grant{} // score implements our Score interface. type score struct { supply Supply // CPU supply (node) req Request // CPU request (container) isolated int // remaining isolated CPUs reserved int // remaining reserved CPUs shared int // remaining shared capacity colocated int // number of colocated containers hints map[string]float64 // hint scores } var _ Score = &score{} // newSupply creates CPU supply for the given node, cpusets and existing grant.
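//
// For example (synthetic values, in the style of the unit tests):
//
//	s := newSupply(node, cpuset.New(), cpuset.New(), cpuset.New(0, 1, 2, 3),
//		0, 0, createMemoryMap(8<<30, 0, 0), nil)
//
// creates a supply of four sharable CPUs and 8G of DRAM with nothing
// granted yet; passing nil memory maps makes newSupply substitute
// empty ones.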
func newSupply(n Node, isolated, reserved, sharable cpuset.CPUSet, grantedReserved int, grantedShared int, mem, grantedMem memoryMap) Supply { if mem == nil { mem = createMemoryMap(0, 0, 0) } if grantedMem == nil { grantedMem = createMemoryMap(0, 0, 0) } return &supply{ node: n, isolated: isolated.Clone(), reserved: reserved.Clone(), sharable: sharable.Clone(), grantedReserved: grantedReserved, grantedShared: grantedShared, mem: mem, grantedMem: grantedMem, extraMemReservations: make(map[Grant]memoryMap), } } func createMemoryMap(dram, pmem, hbm uint64) memoryMap { return memoryMap{ memoryDRAM: dram, memoryPMEM: pmem, memoryHBM: hbm, memoryAll: dram + pmem + hbm, memoryUnspec: 0, } } func (m memoryMap) Add(dram, pmem, hbm uint64) { m[memoryDRAM] += dram m[memoryPMEM] += pmem m[memoryHBM] += hbm m[memoryAll] += dram + pmem + hbm } func (m memoryMap) AddDRAM(dram uint64) { m[memoryDRAM] += dram m[memoryAll] += dram } func (m memoryMap) AddPMEM(pmem uint64) { m[memoryPMEM] += pmem m[memoryAll] += pmem } func (m memoryMap) AddHBM(hbm uint64) { m[memoryHBM] += hbm m[memoryAll] += hbm } func (m memoryMap) String() string { mem, sep := "", "" dram, pmem, hbm, types := m[memoryDRAM], m[memoryPMEM], m[memoryHBM], 0 if dram > 0 || pmem > 0 || hbm > 0 { if dram > 0 { mem += "DRAM " + prettyMem(dram) sep = ", " types++ } if pmem > 0 { mem += sep + "PMEM " + prettyMem(pmem) sep = ", " types++ } if hbm > 0 { mem += sep + "HBM " + prettyMem(hbm) types++ } if types > 1 { mem += sep + "total " + prettyMem(pmem+dram+hbm) } } return mem } // GetNode returns the node supplying CPU and memory. func (cs *supply) GetNode() Node { return cs.node } // Clone clones the given CPU supply. func (cs *supply) Clone() Supply { // Copy the maps. mem := make(memoryMap) for key, value := range cs.mem { mem[key] = value } grantedMem := make(memoryMap) for key, value := range cs.grantedMem { grantedMem[key] = value } return newSupply(cs.node, cs.isolated, cs.reserved, cs.sharable, cs.grantedReserved, cs.grantedShared, mem, grantedMem) } // IsolatedCPUs returns the isolated CPUSet of this supply. func (cs *supply) IsolatedCPUs() cpuset.CPUSet { return cs.isolated.Clone() } // ReservedCPUs returns the reserved CPUSet of this supply. func (cs *supply) ReservedCPUs() cpuset.CPUSet { return cs.reserved.Clone() } // SharableCPUs returns the sharable CPUSet of this supply. func (cs *supply) SharableCPUs() cpuset.CPUSet { return cs.sharable.Clone() } // GrantedReserved returns the locally granted reserved CPU capacity. func (cs *supply) GrantedReserved() int { return cs.grantedReserved } // GrantedShared returns the locally granted sharable CPU capacity. func (cs *supply) GrantedShared() int { return cs.grantedShared } func (cs *supply) GrantedMemory(memType memoryType) uint64 { // Return only granted memory of correct type return cs.grantedMem[memType] } func (cs *supply) MemoryLimit() memoryMap { return cs.mem } // Cumulate more CPU to supply. func (cs *supply) Cumulate(more Supply) { mcs := more.(*supply) cs.isolated = cs.isolated.Union(mcs.isolated) cs.reserved = cs.reserved.Union(mcs.reserved) cs.sharable = cs.sharable.Union(mcs.sharable) cs.grantedReserved += mcs.grantedReserved cs.grantedShared += mcs.grantedShared for key, value := range mcs.mem { cs.mem[key] += value } for key, value := range mcs.grantedMem { cs.grantedMem[key] += value } } // AssignMemory adds memory (for extra NUMA nodes assigned to a pool node).
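//
// Note that a memoryMap tracks the per-type amounts and keeps the
// memoryAll entry as their running sum, so for example (illustrative):
//
//	m := createMemoryMap(4<<30, 2<<30, 0) // 4G DRAM + 2G PMEM
//	m.AddDRAM(1 << 30)                    // m[memoryAll] is now 7G
//
// AssignMemory below relies on this invariant holding for both maps.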
func (cs *supply) AssignMemory(mem memoryMap) { for key, value := range mem { cs.mem[key] += value } } // AccountAllocateCPU accounts for (removes) allocated exclusive capacity from the supply. func (cs *supply) AccountAllocateCPU(g Grant) { if cs.node.IsSameNode(g.GetCPUNode()) { return } exclusive := g.ExclusiveCPUs() cs.isolated = cs.isolated.Difference(exclusive) cs.sharable = cs.sharable.Difference(exclusive) } // AccountReleaseCPU accounts for (reinserts) released exclusive capacity into the supply. func (cs *supply) AccountReleaseCPU(g Grant) { if cs.node.IsSameNode(g.GetCPUNode()) { return } ncs := cs.node.GetSupply() nodecpus := ncs.IsolatedCPUs().Union(ncs.SharableCPUs()) grantcpus := g.ExclusiveCPUs().Intersection(nodecpus) isolated := grantcpus.Intersection(ncs.IsolatedCPUs()) sharable := grantcpus.Intersection(ncs.SharableCPUs()) cs.isolated = cs.isolated.Union(isolated) cs.sharable = cs.sharable.Union(sharable) } // allocateMemory tries to fulfill the memory allocation part of a request. func (cs *supply) allocateMemory(r Request) (memoryMap, error) { reqType := r.MemoryType() if reqType == memoryUnspec { reqType = memoryAll } allocated := createMemoryMap(0, 0, 0) requested := r.MemAmountToAllocate() remaining := requested // // Notes: // We try to allocate PMEM, then DRAM, and finally HBM, honoring // the types allowed by the request. We don't need to care about // extra memory reservations for this node as all the nodes with // insufficient memory have been filtered out before allocation. // // However, for cold started containers we do check if there is // enough PMEM free to accommodate the full request and bail out // if that check fails. // for _, memType := range []memoryType{memoryPMEM, memoryDRAM, memoryHBM} { if remaining > 0 && (reqType&memType) != 0 { available := cs.mem[memType] log.Debug("%s: trying %s %s of %s available", r.GetContainer().PrettyName(), prettyMem(remaining), memType.String(), prettyMem(available)) if remaining <= available { allocated[memType] = remaining } else { allocated[memType] = available } cs.grantedMem[memType] += allocated[memType] cs.mem[memType] -= allocated[memType] remaining -= allocated[memType] } if remaining > 0 { if r.ColdStart() > 0 && memType == memoryPMEM { return nil, policyError("internal error: "+ "not enough PMEM for cold start at %s", cs.GetNode().Name()) } } else { break } } if remaining > 0 { log.Debug("%s: %s allocation from %s fell short by %s", r.GetContainer().PrettyName(), reqType.String(), cs.GetNode().Name(), prettyMem(remaining)) for memType, amount := range allocated { if amount > 0 { cs.grantedMem[memType] -= amount cs.mem[memType] += amount } } return nil, policyError("internal error: "+ "not enough memory at %s", cs.node.Name()) } cs.grantedMem[memoryAll] += requested cs.mem[memoryAll] -= requested return allocated, nil } // Allocate allocates a grant from the supply. func (cs *supply) Allocate(r Request) (Grant, error) { grant, err := cs.AllocateCPU(r) if err != nil { return nil, err } memory, err := cs.allocateMemory(r) if err != nil { cs.ReleaseCPU(grant) return nil, err } grant.SetMemoryAllocation(r.MemoryType(), memory, r.ColdStart()) return grant, nil } // AllocateCPU allocates CPU for a grant from the supply.
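//
// Exclusive CPUs are taken from the isolated set when the request asks
// for isolation and enough isolated CPUs exist, and are otherwise
// sliced off the sharable set; fractional (milli-CPU) portions are
// accounted against the sharable or reserved capacity. For example
// (illustrative): a request with full=2, isolate=true and fraction=500
// consumes two isolated CPUs and bumps grantedShared by 500m.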
func (cs *supply) AllocateCPU(r Request) (Grant, error) {
	var exclusive cpuset.CPUSet
	var err error

	cr := r.(*request)

	full := cr.full
	fraction := cr.fraction
	cpuType := cr.cpuType

	if cpuType == cpuReserved && full > 0 {
		log.Warn("exclusive reserved CPUs not supported, allocating %d full CPUs as fractions", full)
		fraction += full * 1000
		full = 0
	}

	if cpuType == cpuReserved && fraction > 0 && cs.AllocatableReservedCPU() < fraction {
		log.Warn("possible misconfiguration of reserved resources:")
		log.Warn(" %s: allocatable %s", cs.GetNode().Name(), cs.DumpAllocatable())
		log.Warn(" %s: needs %d reserved, only %d available",
			cr.GetContainer().PrettyName(), fraction, cs.AllocatableReservedCPU())
		log.Warn(" falling back to using normal unreserved CPUs instead...")
		cpuType = cpuNormal
	}

	// allocate isolated exclusive CPUs or slice them off the sharable set
	switch {
	case full > 0 && cs.isolated.Size() >= full && cr.isolate:
		exclusive, err = cs.takeCPUs(&cs.isolated, nil, full)
		if err != nil {
			return nil, policyError("internal error: "+
				"%s: can't take %d exclusive isolated CPUs from %s: %v",
				cs.node.Name(), full, cs.isolated, err)
		}

	case full > 0 && cs.AllocatableSharedCPU() > 1000*full:
		exclusive, err = cs.takeCPUs(&cs.sharable, nil, full)
		if err != nil {
			return nil, policyError("internal error: "+
				"%s: can't take %d exclusive CPUs from %s: %v",
				cs.node.Name(), full, cs.sharable, err)
		}

	case full > 0:
		return nil, policyError("internal error: "+
			"%s: can't slice %d exclusive CPUs from %s, %dm available",
			cs.node.Name(), full, cs.sharable, cs.AllocatableSharedCPU())
	}

	grant := newGrant(cs.node, cr.GetContainer(), cpuType, exclusive, 0, 0, nil, 0)
	grant.AccountAllocateCPU()

	if fraction > 0 {
		if cpuType == cpuNormal {
			// allocate the requested portion of shared CPUs
			if cs.AllocatableSharedCPU() < fraction {
				cs.ReleaseCPU(grant)
				return nil, policyError("internal error: "+
					"%s: not enough sharable CPU for %dm in %s, only %dm available",
					cs.node.Name(), fraction, cs.sharable, cs.AllocatableSharedCPU())
			}
			cs.grantedShared += fraction
		} else if cpuType == cpuReserved {
			// allocate the requested portion of reserved CPUs
			if cs.AllocatableReservedCPU() < fraction {
				cs.ReleaseCPU(grant)
				return nil, policyError("internal error: "+
					"%s: not enough reserved CPU: %dm requested, %dm available",
					cs.node.Name(), fraction, cs.AllocatableReservedCPU())
			}
			cs.grantedReserved += fraction
		}
		grant.SetCPUPortion(fraction)
	}

	return grant, nil
}

func (cs *supply) ReallocateMemory(g Grant) error {
	log.Debug("%s: reallocating memory (%s) from %s to %s",
		g.GetContainer().PrettyName(), g.MemLimit().String(),
		g.GetMemoryNode().Name(), cs.GetNode().Name())

	// The grant has been previously allocated from another supply. Reallocate it here.
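	// The steps below: first return the granted memory to its current
	// supply, then re-account each memory type against this supply,
	// failing if some type no longer fits here.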
	g.GetMemoryNode().FreeSupply().ReleaseMemory(g)

	mem := uint64(0)
	allocatedMemory := g.MemLimit()
	for key, value := range allocatedMemory {
		if cs.mem[key] < value {
			return policyError("internal error: not enough memory for reallocation at %s (released from %s)",
				cs.GetNode().Name(), g.GetMemoryNode().Name())
		}
		cs.mem[key] -= value
		cs.grantedMem[key] += value
		mem += value
	}
	cs.grantedMem[memoryAll] += mem
	cs.mem[memoryAll] -= mem

	return nil
}

func (cs *supply) ReleaseCPU(g Grant) {
	isolated := g.ExclusiveCPUs().Intersection(cs.node.GetSupply().IsolatedCPUs())
	sharable := g.ExclusiveCPUs().Difference(isolated)

	cs.isolated = cs.isolated.Union(isolated)
	cs.sharable = cs.sharable.Union(sharable)
	cs.grantedReserved -= g.ReservedPortion()
	cs.grantedShared -= g.SharedPortion()

	g.AccountReleaseCPU()
}

// ReleaseMemory returns memory from the given grant to the supply.
func (cs *supply) ReleaseMemory(g Grant) {
	releasedMemory := uint64(0)
	log.Debug("%s: releasing granted memory (%s) from %s",
		g.GetContainer().PrettyName(), g.MemLimit().String(), cs.GetNode().Name())
	for key, value := range g.MemLimit() {
		cs.grantedMem[key] -= value
		cs.mem[key] += value
		releasedMemory += value
	}
	cs.grantedMem[memoryAll] -= releasedMemory
	cs.mem[memoryAll] += releasedMemory

	cs.node.DepthFirst(func(n Node) error {
		n.FreeSupply().ReleaseExtraMemoryReservation(g)
		return nil
	})
}

func (cs *supply) ExtraMemoryReservation(memType memoryType) uint64 {
	extra := uint64(0)
	for _, res := range cs.extraMemReservations {
		extra += res[memType]
	}
	return extra
}

func (cs *supply) ReleaseExtraMemoryReservation(g Grant) {
	if mems, ok := cs.extraMemReservations[g]; ok {
		log.Debug("%s: releasing extra memory reservation (%s) from %s",
			g.GetContainer().PrettyName(), mems.String(), cs.GetNode().Name())
		delete(cs.extraMemReservations, g)
	}
}

func (cs *supply) SetExtraMemoryReservation(g Grant) {
	res := make(memoryMap)
	extraMemory := uint64(0)
	for key, value := range g.MemLimit() {
		res[key] = value
		extraMemory += value
	}
	res[memoryAll] = extraMemory
	cs.extraMemReservations[g] = res
}

func (cs *supply) Reserve(g Grant) error {
	if g.CPUType() == cpuNormal {
		isolated := g.IsolatedCPUs()
		exclusive := g.ExclusiveCPUs().Difference(isolated)
		sharedPortion := g.SharedPortion()

		if !cs.isolated.Intersection(isolated).Equals(isolated) {
			return policyError("can't reserve isolated CPUs (%s) of %s from %s",
				isolated.String(), g.String(), cs.DumpAllocatable())
		}
		if !cs.sharable.Intersection(exclusive).Equals(exclusive) {
			return policyError("can't reserve exclusive CPUs (%s) of %s from %s",
				exclusive.String(), g.String(), cs.DumpAllocatable())
		}
		if cs.AllocatableSharedCPU() < 1000*exclusive.Size()+sharedPortion {
			return policyError("can't reserve %dm of shared CPUs of %s from %s",
				sharedPortion, g.String(), cs.DumpAllocatable())
		}

		cs.isolated = cs.isolated.Difference(isolated)
		cs.sharable = cs.sharable.Difference(exclusive)
		cs.grantedShared += sharedPortion
	} else if g.CPUType() == cpuReserved {
		sharedPortion := 1000*g.ExclusiveCPUs().Size() + g.SharedPortion()
		if sharedPortion > 0 && cs.AllocatableReservedCPU() < sharedPortion {
			return policyError("can't reserve %dm of reserved CPUs of %s from %s",
				sharedPortion, g.String(), cs.DumpAllocatable())
		}
		cs.grantedReserved += sharedPortion
	}

	g.AccountAllocateCPU()

	return nil
}

func (cs *supply) ReserveMemory(g Grant) error {
	mem := uint64(0)
	allocatedMemory := g.MemLimit()
	for key, value := range allocatedMemory {
		if cs.mem[key] < value {
			return policyError("internal error: not enough memory for allocation at %s",
				g.GetMemoryNode().Name())
		}
		cs.mem[key] -= value
		cs.grantedMem[key] += value
		mem += value
	}
	cs.grantedMem[memoryAll] += mem
	cs.mem[memoryAll] -= mem

	g.UpdateExtraMemoryReservation()

	return nil
}

// takeCPUs takes up to cnt CPUs from a given CPU set to another.
func (cs *supply) takeCPUs(from, to *cpuset.CPUSet, cnt int) (cpuset.CPUSet, error) {
	cset, err := cs.node.Policy().cpuAllocator.AllocateCpus(from, cnt, cpuallocator.PriorityHigh)
	if err != nil {
		return cset, err
	}

	if to != nil {
		*to = to.Union(cset)
	}

	return cset, err
}

// DumpCapacity returns a printable representation of the supply's resource capacity.
func (cs *supply) DumpCapacity() string {
	cpu, mem, sep := "", cs.mem.String(), ""

	if !cs.isolated.IsEmpty() {
		cpu = fmt.Sprintf("isolated:%s", cpuset.ShortCPUSet(cs.isolated))
		sep = ", "
	}
	if !cs.reserved.IsEmpty() {
		cpu += sep + fmt.Sprintf("reserved:%s (%dm)",
			cpuset.ShortCPUSet(cs.reserved), 1000*cs.reserved.Size())
		sep = ", "
	}
	if !cs.sharable.IsEmpty() {
		cpu += sep + fmt.Sprintf("sharable:%s (%dm)",
			cpuset.ShortCPUSet(cs.sharable), 1000*cs.sharable.Size())
	}

	capacity := "<" + cs.node.Name() + " capacity: "
	if cpu == "" && mem == "" {
		capacity += "-"
	} else {
		sep = ""
		if cpu != "" {
			capacity += "CPU: " + cpu
			sep = ", "
		}
		if mem != "" {
			capacity += sep + "MemLimit: " + mem
		}
	}
	capacity += ">"

	return capacity
}

// DumpAllocatable returns a printable representation of the supply's allocatable resources.
func (cs *supply) DumpAllocatable() string {
	cpu, mem, sep := "", cs.mem.String(), ""

	if !cs.isolated.IsEmpty() {
		cpu = fmt.Sprintf("isolated:%s", cpuset.ShortCPUSet(cs.isolated))
		sep = ", "
	}
	if !cs.reserved.IsEmpty() {
		cpu += sep + fmt.Sprintf("reserved:%s (allocatable: %dm)",
			cpuset.ShortCPUSet(cs.reserved), cs.AllocatableReservedCPU())
		sep = ", "
		if cs.grantedReserved > 0 {
			cpu += sep + fmt.Sprintf("grantedReserved:%dm", cs.grantedReserved)
		}
	}
	localGrantedShared := cs.grantedShared
	totalGrantedShared := cs.node.GrantedSharedCPU()
	if !cs.sharable.IsEmpty() {
		cpu += sep + fmt.Sprintf("sharable:%s (", cpuset.ShortCPUSet(cs.sharable))
		sep = ""
		if localGrantedShared > 0 || totalGrantedShared > 0 {
			cpu += "grantedShared:"
			kind := ""
			if localGrantedShared > 0 {
				cpu += fmt.Sprintf("%dm", localGrantedShared)
				kind = "local"
				sep = "/"
			}
			if totalGrantedShared > 0 {
				cpu += sep + fmt.Sprintf("%dm", totalGrantedShared)
				kind += sep + "subtree"
			}
			cpu += " " + kind
			sep = ", "
		}
		cpu += sep + fmt.Sprintf("allocatable:%dm)", cs.AllocatableSharedCPU(true))
	}

	allocatable := "<" + cs.node.Name() + " allocatable: "
	if cpu == "" && mem == "" {
		allocatable += "-"
	} else {
		sep = ""
		if cpu != "" {
			allocatable += "CPU: " + cpu
			sep = ", "
		}
		if mem != "" {
			allocatable += sep + "MemLimit: " + mem
		}
	}
	allocatable += ">"

	return allocatable
}

// prettyMem formats the given amount in k, M, G, or T units.
func prettyMem(value uint64) string {
	units := []string{"k", "M", "G", "T"}
	coeffs := []uint64{1 << 10, 1 << 20, 1 << 30, 1 << 40}

	c, u := uint64(1), ""
	for i := 0; i < len(units); i++ {
		if coeffs[i] > value {
			break
		}
		c, u = coeffs[i], units[i]
	}
	v := float64(value) / float64(c)

	return strconv.FormatFloat(v, 'f', 2, 64) + u
}

// DumpMemoryState dumps the state of the available and allocated memory.
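//
// With illustrative values, the per-type debug output produced below
// looks like
//
//	<prefix>- DRAM: free: 6.00G, granted 2.00G
//	<prefix>- total free: 6.00G, total granted 2.00G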
func (cs *supply) DumpMemoryState(prefix string) {
	memTypes := []memoryType{memoryDRAM, memoryPMEM, memoryHBM}
	totalFree := uint64(0)
	totalGranted := uint64(0)
	for _, kind := range memTypes {
		free := cs.mem[kind]
		granted := cs.grantedMem[kind]
		if free != 0 || granted != 0 {
			log.Debug(prefix+"- %s: free: %s, granted %s", kind, prettyMem(free), prettyMem(granted))
		}
		totalFree += free
		totalGranted += granted
	}
	log.Debug(prefix+"- total free: %s, total granted %s", prettyMem(totalFree), prettyMem(totalGranted))

	printHdr := true
	if len(cs.extraMemReservations) > 0 {
		for g, memMap := range cs.extraMemReservations {
			split := ""
			sep := ""
			total := uint64(0)
			if mem := memMap[memoryDRAM]; mem > 0 {
				split = "DRAM " + prettyMem(mem)
				sep = ", "
				total += mem
			}
			if mem := memMap[memoryPMEM]; mem > 0 {
				split += sep + "PMEM " + prettyMem(mem)
				sep = ", "
				total += mem
			}
			if mem := memMap[memoryHBM]; mem > 0 {
				split += sep + "HBM " + prettyMem(mem)
				sep = ", "
				total += mem
			}
			if total > 0 {
				if printHdr {
					log.Debug(prefix + "- extra reservations:")
					printHdr = false
				}
				log.Debug(prefix+" - %s: %s (%s)", g.GetContainer().PrettyName(), prettyMem(total), split)
			}
		}
	}
}

// newRequest creates a new request for the given container.
func newRequest(container cache.Container) Request {
	pod, _ := container.GetPod()
	full, fraction, isolate, cpuType := cpuAllocationPreferences(pod, container)
	req, lim, mtype := memoryAllocationPreference(pod, container)
	coldStart := time.Duration(0)

	log.Debug("%s: CPU preferences: cpuType=%s, full=%v, fraction=%v, isolate=%v",
		container.PrettyName(), cpuType, full, fraction, isolate)

	if mtype == memoryUnspec {
		mtype = defaultMemoryType
	}

	if mtype&memoryPMEM != 0 && mtype&memoryDRAM != 0 {
		parsedColdStart, err := coldStartPreference(pod, container)
		if err != nil {
			log.Error("failed to parse cold start preference: %v", err)
		} else {
			if parsedColdStart.Duration > 0 {
				if coldStartOff {
					log.Error("coldstart disabled (movable non-DRAM memory zones present)")
				} else {
					coldStart = time.Duration(parsedColdStart.Duration)
				}
			}
		}
	} else if mtype == memoryPMEM {
		if coldStartOff {
			mtype = mtype | memoryDRAM
			log.Error("%s: forced also DRAM usage (movable non-DRAM memory zones present)",
				container.PrettyName())
		}
	}

	return &request{
		container: container,
		full:      full,
		fraction:  fraction,
		isolate:   isolate,
		cpuType:   cpuType,
		memReq:    req,
		memLim:    lim,
		memType:   mtype,
		coldStart: coldStart,
	}
}

// GetContainer returns the container requesting CPU.
func (cr *request) GetContainer() cache.Container {
	return cr.container
}

// String returns a printable representation of the CPU request.
func (cr *request) String() string {
	mem := ""
	isolated := map[bool]string{false: "", true: "isolated "}[cr.isolate]
	switch {
	case cr.full == 0 && cr.fraction == 0:
		return "<CPU request: none>" + mem
	case cr.full > 0 && cr.fraction > 0:
		return fmt.Sprintf("<CPU request: %s%d full CPUs + %dm of shared CPUs>",
			isolated, cr.full, cr.fraction) + mem
	case cr.full > 0:
		return fmt.Sprintf("<CPU request: %s%d full CPUs>", isolated, cr.full) + mem
	default:
		return fmt.Sprintf("<CPU request: %dm of shared CPUs>", cr.fraction) + mem
	}
}

// CPUType returns the requested type of CPU for the grant.
func (cr *request) CPUType() cpuClass {
	return cr.cpuType
}

// SetCPUType sets the requested type of CPU for the grant.
func (cr *request) SetCPUType(cpuType cpuClass) {
	cr.cpuType = cpuType
}

// FullCPUs returns the number of full CPUs requested.
func (cr *request) FullCPUs() int {
	return cr.full
}

// CPUFraction returns the amount of fractional milli-CPU requested.
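//
// A request is expressed as full CPUs plus a fractional milli-CPU part.
// As an illustrative example (the exact split depends on the allocation
// preferences), a 2500m CPU limit for which exclusive allocation is
// preferred would split into FullCPUs() == 2 and CPUFraction() == 500.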
func (cr *request) CPUFraction() int {
	return cr.fraction
}

// Isolate returns whether isolated CPUs are preferred for this request.
func (cr *request) Isolate() bool {
	return cr.isolate
}

// MemAmountToAllocate returns how much memory we need to reserve for a request.
func (cr *request) MemAmountToAllocate() uint64 {
	var amount uint64 = 0
	switch cr.GetContainer().GetQOSClass() {
	case v1.PodQOSBurstable:
		// May be a request and/or limit. We focus on the limit because we
		// need to prepare for the case when all containers are using all
		// the memory they are allowed to. If the limit is not set then we'll
		// allocate the request (which the container will get).
		if cr.memLim > 0 {
			amount = cr.memLim
		} else {
			amount = cr.memReq
		}
	case v1.PodQOSGuaranteed:
		// Limit and request are the same.
		amount = cr.memLim
	case v1.PodQOSBestEffort:
		// No requests or limits.
		amount = 0
	}
	return amount
}

// MemoryType returns the requested type of memory for the grant.
func (cr *request) MemoryType() memoryType {
	return cr.memType
}

// ColdStart returns the cold start timeout.
func (cr *request) ColdStart() time.Duration {
	return cr.coldStart
}

// GetScore collects data for scoring this supply with respect to the given request.
func (cs *supply) GetScore(req Request) Score {
	score := &score{
		supply: cs,
		req:    req,
	}

	cr := req.(*request)
	full, part := cr.full, cr.fraction
	if full == 0 && part == 0 {
		part = 1
	}

	score.reserved = cs.AllocatableReservedCPU()
	score.shared = cs.AllocatableSharedCPU()

	if cr.CPUType() == cpuReserved {
		// calculate free reserved capacity
		score.reserved -= part
	} else {
		// calculate isolated node CPU capacity
		if cr.isolate {
			score.isolated = cs.isolated.Size() - full
		}

		// if we don't want isolated or there is not enough, calculate sliceable capacity
		if !cr.isolate || score.isolated < 0 {
			score.shared -= 1000 * full
		}

		// calculate fractional capacity
		score.shared -= part
	}

	// calculate colocation score
	for _, grant := range cs.node.Policy().allocations.grants {
		if cr.CPUType() == grant.CPUType() && grant.GetCPUNode().NodeID() == cs.node.NodeID() {
			score.colocated++
		}
	}

	// calculate real hint scores
	hints := cr.container.GetTopologyHints()
	score.hints = make(map[string]float64, len(hints))

	for provider, hint := range cr.container.GetTopologyHints() {
		if provider == topology.ProviderKubelet {
			log.Warn(" - ignoring topology pseudo-hint from kubelet allocation %s", hint)
			continue
		}
		log.Debug(" - evaluating topology hint %s", hint)
		score.hints[provider] = cs.node.HintScore(hint)
	}

	return score
}

// AllocatableReservedCPU calculates the allocatable amount of reserved CPU of this supply.
func (cs *supply) AllocatableReservedCPU() int {
	if cs.reserved.Size() == 0 {
		// This supply has no room for reserved allocations (not even zero-sized ones).
		return -1
	}
	reserved := 1000*cs.reserved.Size() - cs.node.GrantedReservedCPU()
	for node := cs.node.Parent(); !node.IsNil(); node = node.Parent() {
		pSupply := node.FreeSupply()
		pReserved := 1000*pSupply.ReservedCPUs().Size() - pSupply.GetNode().GrantedReservedCPU()
		if pReserved < reserved {
			reserved = pReserved
		}
	}
	return reserved
}

// AllocatableSharedCPU calculates the allocatable amount of shared CPU of this supply.
func (cs *supply) AllocatableSharedCPU(quiet ...bool) int {
	verbose := !(len(quiet) > 0 && quiet[0])
	//
	// Notes:
	//   Take into account the supplies/grants in all ancestors, making sure
	//   none of them gets overcommitted as the result of fulfilling this request.
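	//
	//   An illustrative example: a pool with 4 sharable CPUs (4000m) and
	//   1500m granted in its subtree has 2500m free locally, but if its
	//   parent has only 2000m free after the parent's own subtree grants,
	//   the result is capped at 2000m.
	//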
	shared := 1000*cs.sharable.Size() - cs.node.GrantedSharedCPU()
	if verbose {
		log.Debug("%s: unadjusted free shared CPU: %dm", cs.node.Name(), shared)
	}
	for node := cs.node.Parent(); !node.IsNil(); node = node.Parent() {
		pSupply := node.FreeSupply()
		pShared := 1000*pSupply.SharableCPUs().Size() - pSupply.GetNode().GrantedSharedCPU()
		if pShared < shared {
			if verbose {
				log.Debug("%s: capping free shared CPU (%dm -> %dm) to avoid overcommit of %s",
					cs.node.Name(), shared, pShared, node.Name())
			}
			shared = pShared
		}
	}
	if verbose {
		log.Debug("%s: ancestor-adjusted free shared CPU: %dm", cs.node.Name(), shared)
	}
	return shared
}

// Eval returns the overall score as a single scalar value; currently always 1.0.
func (score *score) Eval() float64 {
	return 1.0
}

func (score *score) Supply() Supply {
	return score.supply
}

func (score *score) Request() Request {
	return score.req
}

func (score *score) IsolatedCapacity() int {
	return score.isolated
}

func (score *score) ReservedCapacity() int {
	return score.reserved
}

func (score *score) SharedCapacity() int {
	return score.shared
}

func (score *score) Colocated() int {
	return score.colocated
}

func (score *score) HintScores() map[string]float64 {
	return score.hints
}

func (score *score) String() string {
	return fmt.Sprintf("<%s: isolated: %d, reserved: %d, shared: %d, colocated: %d, hints: %v>",
		score.supply.GetNode().Name(), score.isolated, score.reserved, score.shared,
		score.colocated, score.hints)
}

// newGrant creates a CPU grant from the given node for the container.
func newGrant(n Node, c cache.Container, cpuType cpuClass, exclusive cpuset.CPUSet, cpuPortion int, mt memoryType, allocated memoryMap, coldstart time.Duration) Grant {
	grant := &grant{
		node:       n,
		memoryNode: n,
		container:  c,
		cpuType:    cpuType,
		exclusive:  exclusive,
		cpuPortion: cpuPortion,
	}
	if allocated != nil {
		grant.SetMemoryAllocation(mt, allocated, coldstart)
	}
	return grant
}

// SetCPUPortion sets the fractional CPU portion for the grant.
func (cg *grant) SetCPUPortion(fraction int) {
	cg.cpuPortion = fraction
}

// SetMemoryAllocation sets the memory allocation for the grant.
func (cg *grant) SetMemoryAllocation(mt memoryType, allocated memoryMap, coldstart time.Duration) {
	initial := memoryPMEM
	if coldstart <= 0 {
		initial = mt
	}

	mems := cg.node.GetMemset(initial)
	if mems.Size() == 0 {
		mems = cg.node.GetMemset(memoryDRAM)
		if mems.Size() == 0 {
			mems = cg.node.GetMemset(memoryAll)
		}
	}
	mems = mems.Clone()

	cg.memType = mt
	cg.memset = mems
	cg.allocatedMem = allocated
	cg.coldStart = coldstart
}

// Clone creates a copy of this grant.
func (cg *grant) Clone() Grant {
	return &grant{
		node:         cg.GetCPUNode(),
		memoryNode:   cg.GetMemoryNode(),
		container:    cg.GetContainer(),
		exclusive:    cg.ExclusiveCPUs(),
		cpuType:      cg.CPUType(),
		cpuPortion:   cg.SharedPortion(),
		memType:      cg.MemoryType(),
		memset:       cg.Memset().Clone(),
		allocatedMem: cg.MemLimit(),
		coldStart:    cg.ColdStart(),
	}
}

// RefetchNodes updates the stored cpu and memory nodes of this grant by name.
func (cg *grant) RefetchNodes() error {
	node, ok := cg.node.Policy().nodes[cg.node.Name()]
	if !ok {
		return policyError("failed to refetch grant cpu node %s", cg.node.Name())
	}
	memoryNode, ok := cg.memoryNode.Policy().nodes[cg.memoryNode.Name()]
	if !ok {
		return policyError("failed to refetch grant memory node %s", cg.memoryNode.Name())
	}
	cg.node = node
	cg.memoryNode = memoryNode
	return nil
}

// GetContainer returns the container this grant is valid for.
func (cg *grant) GetContainer() cache.Container {
	return cg.container
}

// GetCPUNode returns the Node this grant gets its CPU allocation from.
func (cg *grant) GetCPUNode() Node {
	return cg.node
}

// GetMemoryNode returns the Node this grant gets its memory allocation from.
func (cg *grant) GetMemoryNode() Node {
	return cg.memoryNode
}

func (cg *grant) SetMemoryNode(n Node) {
	cg.memoryNode = n
	cg.memset = n.GetMemset(cg.MemoryType())
}

// CPUType returns the requested type of CPU for the grant.
func (cg *grant) CPUType() cpuClass {
	return cg.cpuType
}

// CPUPortion returns granted milli-CPUs of non-full CPUs of CPUType().
func (cg *grant) CPUPortion() int {
	return cg.cpuPortion
}

// ExclusiveCPUs returns the non-isolated exclusive CPUSet in this grant.
func (cg *grant) ExclusiveCPUs() cpuset.CPUSet {
	return cg.exclusive
}

// ReservedCPUs returns the reserved CPUSet in the supply of this grant.
func (cg *grant) ReservedCPUs() cpuset.CPUSet {
	return cg.node.GetSupply().ReservedCPUs()
}

// ReservedPortion returns the milli-CPU allocation for the reserved CPUSet in this grant.
func (cg *grant) ReservedPortion() int {
	if cg.cpuType == cpuReserved {
		return cg.cpuPortion
	}
	return 0
}

// SharedCPUs returns the shared CPUSet in the supply of this grant.
func (cg *grant) SharedCPUs() cpuset.CPUSet {
	return cg.node.FreeSupply().SharableCPUs()
}

// SharedPortion returns the milli-CPU allocation for the shared CPUSet in this grant.
func (cg *grant) SharedPortion() int {
	if cg.cpuType == cpuNormal {
		return cg.cpuPortion
	}
	return 0
}

// IsolatedCPUs returns the isolated exclusive CPUSet in this grant.
func (cg *grant) IsolatedCPUs() cpuset.CPUSet {
	return cg.node.GetSupply().IsolatedCPUs().Intersection(cg.exclusive)
}

// MemoryType returns the requested type of memory for the grant.
func (cg *grant) MemoryType() memoryType {
	return cg.memType
}

// Memset returns the granted memory controllers as an IDSet.
func (cg *grant) Memset() idset.IDSet {
	return cg.memset
}

// MemLimit returns the granted memory.
func (cg *grant) MemLimit() memoryMap {
	return cg.allocatedMem
}

// String returns a printable representation of the CPU grant.
func (cg *grant) String() string {
	var cpuType, isolated, exclusive, reserved, shared string
	cpuType = fmt.Sprintf("cputype: %s", cg.cpuType)
	isol := cg.IsolatedCPUs()
	if !isol.IsEmpty() {
		isolated = fmt.Sprintf(", isolated: %s", isol)
	}
	if !cg.exclusive.IsEmpty() {
		exclusive = fmt.Sprintf(", exclusive: %s", cg.exclusive)
	}
	if cg.ReservedPortion() > 0 {
		reserved = fmt.Sprintf(", reserved: %s (%dm)",
			cg.node.FreeSupply().ReservedCPUs(), cg.ReservedPortion())
	}
	if cg.SharedPortion() > 0 {
		shared = fmt.Sprintf(", shared: %s (%dm)",
			cg.node.FreeSupply().SharableCPUs(), cg.SharedPortion())
	}
	mem := cg.allocatedMem.String()
	if mem != "" {
		mem = ", MemLimit: " + mem
	}

	return fmt.Sprintf("<grant for %s from %s: %s%s%s%s%s%s>",
		cg.container.PrettyName(), cg.node.Name(), cpuType, isolated, exclusive,
		reserved, shared, mem)
}

func (cg *grant) AccountAllocateCPU() {
	cg.node.DepthFirst(func(n Node) error {
		n.FreeSupply().AccountAllocateCPU(cg)
		return nil
	})
	for node := cg.node.Parent(); !node.IsNil(); node = node.Parent() {
		node.FreeSupply().AccountAllocateCPU(cg)
	}
}

func (cg *grant) Release() {
	cg.GetCPUNode().FreeSupply().ReleaseCPU(cg)
	cg.GetMemoryNode().FreeSupply().ReleaseMemory(cg)
	cg.StopTimer()
}

func (cg *grant) AccountReleaseCPU() {
	cg.node.DepthFirst(func(n Node) error {
		n.FreeSupply().AccountReleaseCPU(cg)
		return nil
	})
	for node := cg.node.Parent(); !node.IsNil(); node = node.Parent() {
		node.FreeSupply().AccountReleaseCPU(cg)
	}
}

func (cg *grant) RestoreMemset() {
	mems := cg.GetMemoryNode().GetMemset(cg.memType)
	cg.memset = mems
	cg.GetMemoryNode().Policy().applyGrant(cg)
}

func (cg *grant) ExpandMemset() (bool, error) {
	supply := cg.GetMemoryNode().FreeSupply()
	node := cg.GetMemoryNode()
	parent := node.Parent()

	// We have to assume that the memory has been allocated how we granted it (if PMEM ran out
	// the allocations have been made from DRAM and so on).

	// Figure out if there is enough memory now to keep the grant as-is.
	extra := supply.ExtraMemoryReservation(memoryAll)
	free := supply.MemoryLimit()[memoryAll]
	if extra <= free {
		// The grant fits in the node even with the extra reservations.
		return false, nil
	}
	// Else it doesn't fit, so move the grant up in the memory tree.
	required := uint64(0)
	for _, memType := range []memoryType{memoryPMEM, memoryDRAM, memoryHBM} {
		required += cg.MemLimit()[memType]
	}

	log.Debug("out-of-memory risk in %s: extra reservations %s > free %s -> moving up %s total memory grant from %s",
		cg, prettyMem(extra), prettyMem(free), prettyMem(required), node.Name())

	// Find an ancestor where the grant fits. As reservations in
	// child nodes do not show up in free + extra in parent nodes,
	// releasing the grant is not necessary before searching.
	for ; !parent.IsNil(); parent = parent.Parent() {
		pSupply := parent.FreeSupply()
		parentFree := pSupply.MemoryLimit()[memoryAll]
		parentExtra := pSupply.ExtraMemoryReservation(memoryAll)
		if parentExtra+required <= parentFree {
			required = 0
			break
		}
		log.Debug("- %s has %s free but %s extra reservations, moving further up",
			parent.Name(), prettyMem(parentFree), prettyMem(parentExtra))
	}
	if required > 0 {
		return false, fmt.Errorf("internal error: cannot find enough memory (%s) for %s from ancestors of %s",
			prettyMem(required), cg, node.Name())
	}

	// Release granted memory from the node and allocate it from the parent node.
	err := parent.FreeSupply().ReallocateMemory(cg)
	if err != nil {
		return false, err
	}
	cg.SetMemoryNode(parent)
	cg.UpdateExtraMemoryReservation()

	// Make the container use the new memory set.
	// FIXME: this could be done in a second pass to avoid doing this many times
	cg.GetMemoryNode().Policy().applyGrant(cg)

	return true, nil
}

func (cg *grant) UpdateExtraMemoryReservation() {
	// For every subnode, make sure that this grant is added to the extra memory allocation.
	cg.GetMemoryNode().DepthFirst(func(n Node) error {
		// No extra allocation should be done to the node itself.
		if !n.IsSameNode(cg.GetMemoryNode()) {
			supply := n.FreeSupply()
			supply.SetExtraMemoryReservation(cg)
		}
		return nil
	})
}

func (cg *grant) ColdStart() time.Duration {
	return cg.coldStart
}

func (cg *grant) AddTimer(timer *time.Timer) {
	cg.coldStartTimer = timer
}

func (cg *grant) StopTimer() {
	if cg.coldStartTimer != nil {
		cg.coldStartTimer.Stop()
		cg.coldStartTimer = nil
	}
}

func (cg *grant) ClearTimer() {
	if cg.coldStartTimer != nil {
		cg.coldStartTimer = nil
	}
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/topology-aware-policy.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
	"errors"

	v1 "k8s.io/api/core/v1"
	resapi "k8s.io/apimachinery/pkg/api/resource"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cpuallocator"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"

	policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	system "github.com/intel/cri-resource-manager/pkg/sysfs"
	idset "github.com/intel/goresctrl/pkg/utils"
)

const (
	// PolicyName is the name used to activate this policy implementation.
	PolicyName = "topology-aware"
	// PolicyDescription is a short description of this policy.
	PolicyDescription = "A policy for prototyping memory tiering."
	// PolicyPath is the path of this policy in the configuration hierarchy.
	PolicyPath = "policy." + PolicyName
	// AliasName is the 'memtier' alias name for this policy.
	AliasName = "memtier"
	// AliasPath is the 'memtier' alias configuration path for this policy.
	AliasPath = "policy." + AliasName
	// ColdStartDone is the event generated for the end of a container cold start period.
	ColdStartDone = "cold-start-done"
)

// allocations is our cache.Cachable for saving resource allocations in the cache.
type allocations struct {
	policy *policy
	grants map[string]Grant
}

// policy is our runtime state for this policy.
type policy struct {
	options      *policyapi.BackendOptions // options we were created or reconfigured with
	cache        cache.Cache               // pod/container cache
	sys          system.System             // system/HW topology info
	allowed      cpuset.CPUSet             // bounding set of CPUs we're allowed to use
	reserved     cpuset.CPUSet             // system-/kube-reserved CPUs
	reserveCnt   int                       // number of CPUs to reserve if given as resource.Quantity
	isolated     cpuset.CPUSet             // (our allowed set of) isolated CPUs
	nodes        map[string]Node           // pool nodes by name
	pools        []Node                    // pre-populated node slice for scoring, etc...
	root         Node                      // root of our pool/partition tree
	nodeCnt      int                       // number of pools
	depth        int                       // tree depth
	allocations  allocations               // container pool assignments
	cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy
	coldstartOff bool                      // coldstart forced off (have movable PMEM zones)
	isAlias      bool                      // whether started by referencing AliasName
}

// Make sure policy implements the policy.Backend interface.
var _ policyapi.Backend = &policy{}

// Whether we have coldstart forced off due to PMEM in movable memory zones.
var coldStartOff bool

// CreateTopologyAwarePolicy creates a new policy instance.
func CreateTopologyAwarePolicy(opts *policyapi.BackendOptions) policyapi.Backend {
	return createPolicy(opts, false)
}

// CreateMemtierPolicy creates a new policy instance, aliased as 'memtier'.
func CreateMemtierPolicy(opts *policyapi.BackendOptions) policyapi.Backend {
	return createPolicy(opts, true)
}

// createPolicy creates a new policy instance.
func createPolicy(opts *policyapi.BackendOptions, isAlias bool) policyapi.Backend {
	p := &policy{
		cache:        opts.Cache,
		sys:          opts.System,
		options:      opts,
		cpuAllocator: cpuallocator.NewCPUAllocator(opts.System),
		isAlias:      isAlias,
	}

	if isAlias {
		*opt = *aliasOpt
	}

	if err := p.initialize(); err != nil {
		log.Fatal("failed to initialize %s policy: %v", PolicyName, err)
	}

	p.registerImplicitAffinities()

	config.GetModule(policyapi.ConfigPath).AddNotify(p.configNotify)

	return p
}

// Name returns the name of this policy.
func (p *policy) Name() string {
	return PolicyName
}

// Description returns the description for this policy.
func (p *policy) Description() string {
	return PolicyDescription
}

// Start prepares this policy for accepting allocation/release requests.
func (p *policy) Start(add []cache.Container, del []cache.Container) error {
	if err := p.restoreCache(); err != nil {
		return policyError("failed to start: %v", err)
	}

	// Turn coldstart forcibly off if we have movable non-DRAM memory.
	// Note that although this can change dynamically we only check it
	// during startup and trust users to either not fiddle with memory
	// or restart us if they do.
	p.checkColdstartOff()

	p.root.Dump("")

	return p.Sync(add, del)
}

// Sync synchronizes the state of this policy.
func (p *policy) Sync(add []cache.Container, del []cache.Container) error {
	log.Debug("synchronizing state...")
	for _, c := range del {
		p.ReleaseResources(c)
	}
	for _, c := range add {
		p.AllocateResources(c)
	}

	return nil
}

// AllocateResources is a resource allocation request for this policy.
func (p *policy) AllocateResources(container cache.Container) error {
	log.Debug("allocating resources for %s...", container.PrettyName())

	grant, err := p.allocatePool(container, "")
	if err != nil {
		return policyError("failed to allocate resources for %s: %v",
			container.PrettyName(), err)
	}
	p.applyGrant(grant)
	p.updateSharedAllocations(&grant)

	p.root.Dump("")

	return nil
}

// ReleaseResources is a resource release request for this policy.
func (p *policy) ReleaseResources(container cache.Container) error {
	log.Debug("releasing resources of %s...", container.PrettyName())

	if grant, found := p.releasePool(container); found {
		p.updateSharedAllocations(&grant)
	}

	p.root.Dump("")

	return nil
}

// UpdateResources is a resource allocation update request for this policy.
func (p *policy) UpdateResources(c cache.Container) error {
	log.Debug("(not) updating container %s...", c.PrettyName())
	return nil
}

// Rebalance tries to find an optimal allocation of resources for the current containers.
func (p *policy) Rebalance() (bool, error) {
	var errors error

	containers := p.cache.GetContainers()
	movable := []cache.Container{}

	for _, c := range containers {
		if c.GetQOSClass() != v1.PodQOSGuaranteed {
			p.ReleaseResources(c)
			movable = append(movable, c)
		}
	}

	for _, c := range movable {
		if err := p.AllocateResources(c); err != nil {
			if errors == nil {
				errors = err
			} else {
				errors = policyError("%v, %v", errors, err)
			}
		}
	}

	return true, errors
}

// HandleEvent handles policy-specific events.
func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
	log.Debug("received policy event %s.%s with data %v...", e.Source, e.Type, e.Data)

	switch e.Type {
	case events.ContainerStarted:
		c, ok := e.Data.(cache.Container)
		if !ok {
			return false, policyError("%s event: expecting cache.Container Data, got %T",
				e.Type, e.Data)
		}
		log.Info("triggering coldstart period (if necessary) for %s", c.PrettyName())
		return false, p.triggerColdStart(c)
	case ColdStartDone:
		id, ok := e.Data.(string)
		if !ok {
			return false, policyError("%s event: expecting container ID Data, got %T",
				e.Type, e.Data)
		}
		c, ok := p.cache.LookupContainer(id)
		if !ok {
			// TODO: This is probably a race condition. Should we return nil error here?
			return false, policyError("%s event: failed to lookup container %s",
				e.Type, id)
		}
		log.Info("finishing coldstart period for %s", c.PrettyName())
		return p.finishColdStart(c)
	}
	return false, nil
}

// Introspect provides data for external introspection.
func (p *policy) Introspect(state *introspect.State) {
	pools := make(map[string]*introspect.Pool, len(p.pools))
	for _, node := range p.nodes {
		cpus := node.GetSupply()
		pool := &introspect.Pool{
			Name:   node.Name(),
			CPUs:   cpus.SharableCPUs().Union(cpus.IsolatedCPUs()).String(),
			Memory: node.GetMemset(memoryAll).String(),
		}
		if parent := node.Parent(); !parent.IsNil() {
			pool.Parent = parent.Name()
		}
		if children := node.Children(); len(children) > 0 {
			pool.Children = make([]string, 0, len(children))
			for _, c := range children {
				pool.Children = append(pool.Children, c.Name())
			}
		}
		pools[pool.Name] = pool
	}
	state.Pools = pools

	assignments := make(map[string]*introspect.Assignment, len(p.allocations.grants))
	for _, g := range p.allocations.grants {
		a := &introspect.Assignment{
			ContainerID:   g.GetContainer().GetID(),
			CPUShare:      g.SharedPortion(),
			ExclusiveCPUs: g.ExclusiveCPUs().Union(g.IsolatedCPUs()).String(),
			Pool:          g.GetCPUNode().Name(),
		}
		if g.SharedPortion() > 0 || a.ExclusiveCPUs == "" {
			a.SharedCPUs = g.SharedCPUs().String()
		}
		assignments[a.ContainerID] = a
	}
	state.Assignments = assignments
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
	return nil
}

// PollMetrics provides policy metrics for monitoring.
func (p *policy) PollMetrics() policyapi.Metrics {
	return nil
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *policy) CollectMetrics(policyapi.Metrics) ([]prometheus.Metric, error) {
	return nil, nil
}

// ExportResourceData provides resource data to export for the container.
func (p *policy) ExportResourceData(c cache.Container) map[string]string {
	grant, ok := p.allocations.grants[c.GetCacheID()]
	if !ok {
		return nil
	}

	data := map[string]string{}
	shared := grant.SharedCPUs().String()
	isolated := grant.ExclusiveCPUs().Intersection(grant.GetCPUNode().GetSupply().IsolatedCPUs())
	exclusive := grant.ExclusiveCPUs().Difference(isolated).String()

	if grant.SharedPortion() > 0 && shared != "" {
		data[policyapi.ExportSharedCPUs] = shared
	}
	if isolated.String() != "" {
		data[policyapi.ExportIsolatedCPUs] = isolated.String()
	}
	if exclusive != "" {
		data[policyapi.ExportExclusiveCPUs] = exclusive
	}

	mems := grant.Memset()
	dram := idset.NewIDSet()
	pmem := idset.NewIDSet()
	hbm := idset.NewIDSet()
	for _, id := range mems.SortedMembers() {
		node := p.sys.Node(id)
		switch node.GetMemoryType() {
		case system.MemoryTypeDRAM:
			dram.Add(id)
		case system.MemoryTypePMEM:
			pmem.Add(id)
			/*
				case system.MemoryTypeHBM:
					hbm.Add(id)
			*/
		}
	}
	data["ALL_MEMS"] = mems.String()
	if dram.Size() > 0 {
		data["DRAM_MEMS"] = dram.String()
	}
	if pmem.Size() > 0 {
		data["PMEM_MEMS"] = pmem.String()
	}
	if hbm.Size() > 0 {
		data["HBM_MEMS"] = hbm.String()
	}

	return data
}

// reallocateResources reallocates the given containers using the given pool hints.
func (p *policy) reallocateResources(containers []cache.Container, pools map[string]string) error {
	errs := []error{}

	log.Info("reallocating resources...")

	cache.SortContainers(containers)
	for _, c := range containers {
		p.releasePool(c)
	}
	for _, c := range containers {
		log.Debug("reallocating resources for %s...", c.PrettyName())

		grant, err := p.allocatePool(c, pools[c.GetCacheID()])
		if err != nil {
			errs = append(errs, err)
		} else {
			p.applyGrant(grant)
		}
	}
	if len(errs) > 0 {
		return errors.Join(errs...)
	}
	p.updateSharedAllocations(nil)

	p.root.Dump("")

	return nil
}

func (p *policy) configNotify(event config.Event, source config.Source) error {
	policyName := PolicyName
	if p.isAlias {
		policyName = AliasName
		*opt = *aliasOpt
	}
	log.Info("%s configuration %s:", policyName, event)
	log.Info(" - pin containers to CPUs: %v", opt.PinCPU)
	log.Info(" - pin containers to memory: %v", opt.PinMemory)
	log.Info(" - prefer isolated CPUs: %v", opt.PreferIsolated)
	log.Info(" - prefer shared CPUs: %v", opt.PreferShared)
	log.Info(" - reserved pool namespaces: %v", opt.ReservedPoolNamespaces)

	var allowed, reserved cpuset.CPUSet
	var reinit bool

	if cpus, ok := p.options.Available[policyapi.DomainCPU]; ok {
		if cset, ok := cpus.(cpuset.CPUSet); ok {
			allowed = cset
		}
	}
	if cpus, ok := p.options.Reserved[policyapi.DomainCPU]; ok {
		switch v := cpus.(type) {
		case cpuset.CPUSet:
			reserved = v
		case resapi.Quantity:
			reserveCnt := (int(v.MilliValue()) + 999) / 1000
			if reserveCnt != p.reserveCnt {
				log.Warn("CPU reservation has changed (%v, was %v)", reserveCnt, p.reserveCnt)
				reinit = true
			}
		}
	}

	if !allowed.Equals(p.allowed) {
		if !(allowed.Size() == 0 && p.allowed.Size() == 0) {
			log.Warn("allowed cpuset changed (%s, was %s)",
				allowed.String(), p.allowed.String())
			reinit = true
		}
	}

	if !reserved.Equals(p.reserved) {
		if !(reserved.Size() == 0 && p.reserved.Size() == 0) {
			log.Warn("reserved cpuset changed (%s, was %s)",
				reserved.String(), p.reserved.String())
			reinit = true
		}
	}

	//
	// Notes:
	//   If the allowed or reserved resources have changed, we need to
	//   rebuild our pool hierarchy using the updated constraints and
	//   also update the existing allocations accordingly. We do this
	//   by first reinitializing the policy and then reloading the
	//   allocations from the cache. If we fail, we restore the original
	//   state of the policy and reject the new configuration.
	//
	if reinit {
		log.Warn("reinitializing %s policy...", PolicyName)

		savedPolicy := *p
		allocations := savedPolicy.allocations.clone()
		if err := p.initialize(); err != nil {
			*p = savedPolicy
			return policyError("failed to reconfigure: %v", err)
		}

		for _, grant := range allocations.grants {
			if err := grant.RefetchNodes(); err != nil {
				*p = savedPolicy
				return policyError("failed to reconfigure: %v", err)
			}
		}

		log.Warn("updating existing allocations...")
		if err := p.restoreAllocations(&allocations); err != nil {
			*p = savedPolicy
			return policyError("failed to reconfigure: %v", err)
		}

		p.root.Dump("")
	}

	return nil
}

// Initialize or reinitialize the policy.
func (p *policy) initialize() error {
	p.nodes = nil
	p.pools = nil
	p.root = nil
	p.nodeCnt = 0
	p.depth = 0
	p.allocations = p.newAllocations()

	if err := p.checkConstraints(); err != nil {
		return err
	}

	if err := p.buildPoolsByTopology(); err != nil {
		return err
	}

	return nil
}

// Check the constraints passed to us.
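//
// The CPU reservation may be given either as an explicit cpuset, which
// must be a subset of the allowed CPUs and must not overlap the isolated
// ones, or as a quantity, which is rounded up to full CPUs and picked
// from the allowed set. Hypothetical configuration values:
//
//	ReservedResources: { CPU: "cpuset:0-1" }  # an explicit reserved cpuset
//	ReservedResources: { CPU: "750m" }        # rounded up to 1 full CPU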
func (p *policy) checkConstraints() error {
	if c, ok := p.options.Available[policyapi.DomainCPU]; ok {
		p.allowed = c.(cpuset.CPUSet)
	} else {
		// default to all online cpus
		p.allowed = p.sys.CPUSet().Difference(p.sys.Offlined())
	}

	p.isolated = p.sys.Isolated().Intersection(p.allowed)

	c, ok := p.options.Reserved[policyapi.DomainCPU]
	if !ok {
		return policyError("cannot start without CPU reservation")
	}

	switch v := c.(type) {
	case cpuset.CPUSet:
		p.reserved = v
		// check that all reserved CPUs are in the allowed set
		if !p.reserved.Difference(p.allowed).IsEmpty() {
			return policyError("invalid reserved cpuset %s, some CPUs (%s) are not "+
				"part of the online allowed cpuset (%s)", p.reserved,
				p.reserved.Difference(p.allowed), p.allowed)
		}
		// check that none of the reserved CPUs are isolated
		if !p.reserved.Intersection(p.isolated).IsEmpty() {
			return policyError("invalid reserved cpuset %s, some CPUs (%s) are also isolated",
				p.reserved, p.reserved.Intersection(p.isolated))
		}

	case resapi.Quantity:
		p.reserveCnt = (int(v.MilliValue()) + 999) / 1000
		// Use the CPU allocator to pick reserved CPUs among the allowed
		// ones. Because using those CPUs is allowed, they remain (they
		// are put back) in the allowed set.
		cset, err := p.cpuAllocator.AllocateCpus(&p.allowed, p.reserveCnt, cpuallocator.PriorityNormal)
		if err != nil {
			log.Fatal("cannot reserve %dm CPUs for ReservedResources from AvailableResources: %s",
				v.MilliValue(), err)
		}
		p.allowed = p.allowed.Union(cset)
		p.reserved = cset
	}

	if p.reserved.IsEmpty() {
		return policyError("cannot start without CPU reservation")
	}

	return nil
}

func (p *policy) restoreCache() error {
	allocations := p.newAllocations()
	if p.cache.GetPolicyEntry(keyAllocations, &allocations) {
		if err := p.restoreAllocations(&allocations); err != nil {
			return policyError("failed to restore allocations from cache: %v", err)
		}
		p.allocations.Dump(log.Info, "restored ")
	}
	p.saveAllocations()

	return nil
}

func (p *policy) checkColdstartOff() {
	for _, id := range p.sys.NodeIDs() {
		node := p.sys.Node(id)
		if node.GetMemoryType() == system.MemoryTypePMEM {
			if !node.HasNormalMemory() {
				coldStartOff = true
				log.Error("coldstart forced off: NUMA node #%d does not have normal memory", id)
				return
			}
		}
	}
}

// newAllocations returns a new initialized empty set of allocations.
func (p *policy) newAllocations() allocations {
	return allocations{policy: p, grants: make(map[string]Grant)}
}

// clone creates a copy of the allocation.
func (a *allocations) clone() allocations {
	o := allocations{policy: a.policy, grants: make(map[string]Grant)}
	for id, grant := range a.grants {
		o.grants[id] = grant.Clone()
	}
	return o
}

// getContainerPoolHints creates container pool hints for the current grants.
func (a *allocations) getContainerPoolHints() ([]cache.Container, map[string]string) {
	containers := make([]cache.Container, 0, len(a.grants))
	hints := make(map[string]string)
	for _, grant := range a.grants {
		c := grant.GetContainer()
		containers = append(containers, c)
		hints[c.GetCacheID()] = grant.GetCPUNode().Name()
	}
	return containers, hints
}

// Register us as a policy implementation.
func init() {
	policyapi.Register(PolicyName, PolicyDescription, CreateTopologyAwarePolicy)
	policyapi.Register(AliasName, PolicyDescription, CreateMemtierPolicy)
}

================================================
FILE: pkg/cri/resource-manager/policy/error.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package policy

import (
	"fmt"
)

func policyError(format string, args ...interface{}) error {
	return fmt.Errorf("policy: "+format, args...)
}

================================================
FILE: pkg/cri/resource-manager/policy/flags.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package policy

import (
	"encoding/json"
	"errors"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"k8s.io/apimachinery/pkg/api/resource"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
	"github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
)

const (
	// NonePolicy is the name of our no-op policy.
	NonePolicy = "none"
	// DefaultPolicy is the name of our default policy.
	DefaultPolicy = NonePolicy
	// ConfigPath is the configuration module path for the generic policy layer.
	ConfigPath = "policy"
)

// Options captures our configurable parameters.
type options struct {
	// Policy is the name of the policy backend to activate.
	Policy string `json:"Active"`
	// Available hardware resources to use.
	Available ConstraintSet `json:"AvailableResources,omitempty"`
	// Reserved hardware resources, for system and kube tasks.
	Reserved ConstraintSet `json:"ReservedResources,omitempty"`
}

// Our runtime configuration.
var opt = defaultOptions().(*options)

// MarshalJSON implements JSON marshalling for ConstraintSets.
func (cs ConstraintSet) MarshalJSON() ([]byte, error) {
	obj := map[string]interface{}{}
	for domain, constraint := range cs {
		name := string(domain)
		switch v := constraint.(type) {
		case cpuset.CPUSet:
			obj[name] = "cpuset:" + v.String()
		case resource.Quantity:
			obj[name] = v.String()
		case int:
			obj[name] = strconv.Itoa(v)
		default:
			return nil, policyError("invalid %v constraint of type %T", domain, constraint)
		}
	}
	return json.Marshal(obj)
}

// UnmarshalJSON implements JSON unmarshalling for ConstraintSets.
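//
// Illustrative examples of accepted CPU constraint values:
//
//	{ "CPU": "cpuset:2-63" }  // an explicit cpuset
//	{ "CPU": "1500m" }        // a milli-CPU quantity
//	{ "CPU": 1.5 }            // a float, converted to 1500m
//	{ "CPU": "cgroup:/foo" }  // hypothetical path; read from that cgroup's cpuset.cpus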
func (cs *ConstraintSet) UnmarshalJSON(raw []byte) error {
	set := make(ConstraintSet)

	obj := map[string]interface{}{}
	if err := json.Unmarshal(raw, &obj); err != nil {
		return policyError("failed to unmarshal ConstraintSet: %v", err)
	}

	for name, value := range obj {
		switch strings.ToUpper(name) {
		case string(DomainCPU):
			switch v := value.(type) {
			case string:
				if err := set.parseCPU(v); err != nil {
					return err
				}
			case int:
				set.setCPUMilliQuantity(v)
			case float64:
				set.setCPUMilliQuantity(int(1000.0 * v))
			default:
				return policyError("invalid CPU constraint of type %T", value)
			}
		default:
			return policyError("internal error: unhandled ConstraintSet domain %s", name)
		}
	}

	*cs = set
	return nil
}

func (cs *ConstraintSet) String() string {
	ret := ""
	sep := ""
	for domain, value := range *cs {
		ret += sep + string(domain) + "=" + ConstraintToString(value)
		sep = ","
	}
	return ret
}

func (cs *ConstraintSet) parseCPU(value string) error {
	kind, spec := "", ""
	if sep := strings.IndexByte(value, ':'); sep != -1 {
		kind = value[:sep]
		spec = value[sep+1:]
	} else {
		spec = value
	}

	if len(spec) == 0 {
		return policyError("missing CPU constraint value")
	}

	switch {
	case kind == "cgroup" || spec[0] == '/':
		if err := cs.parseCPUFromCgroup(spec); err != nil {
			return err
		}
	case kind == "cpuset" || strings.IndexAny(spec, "-,") != -1:
		if err := cs.parseCPUSet(spec); err != nil {
			return err
		}
	case kind == "":
		if err := cs.parseCPUQuantity(spec); err != nil {
			return err
		}
	default:
		return policyError("invalid CPU constraint qualifier %q", kind)
	}

	return nil
}

func (cs *ConstraintSet) parseCPUSet(value string) error {
	cset, err := cpuset.Parse(value)
	if err != nil {
		return policyError("failed to parse CPU cpuset constraint %q: %v", value, err)
	}
	(*cs)[DomainCPU] = cset
	return nil
}

func (cs *ConstraintSet) parseCPUQuantity(value string) error {
	qty, err := resource.ParseQuantity(value)
	if err != nil {
		return policyError("failed to parse CPU Quantity constraint %q: %v", value, err)
	}
	(*cs)[DomainCPU] = qty
	return nil
}

func (cs *ConstraintSet) parseCPUFromCgroup(dir string) error {
	pathToCpuset := func(outPath *string, fragments ...string) bool {
		*outPath = filepath.Join(filepath.Join(fragments...), "cpuset.cpus")
		_, err := os.Stat(*outPath)
		return !errors.Is(err, os.ErrNotExist)
	}

	path := ""
	switch {
	case len(dir) == 0:
		return policyError("empty CPU cgroup constraint")
	case dir[0] == '/' && pathToCpuset(&path, dir):
		// dir is a direct, absolute path to an existing cgroup
	case pathToCpuset(&path, cgroups.GetMountDir(), dir):
		// dir is a relative path starting from the cgroup mount point
	case pathToCpuset(&path, cgroups.Cpuset.Path(), dir):
		// dir is a relative path starting from the cpuset controller (cgroup v1)
	default:
		// dir is none of the previous
		return policyError("failed to find cpuset.cpus for CPU cgroup constraint %q", dir)
	}

	bytes, err := os.ReadFile(path)
	if err != nil {
		return policyError("failed to read CPU cpuset cgroup constraint %q: %v", path, err)
	}

	cpus := strings.TrimSuffix(string(bytes), "\n")
	cset, err := cpuset.Parse(cpus)
	if err != nil {
		return policyError("failed to parse cpuset cgroup constraint %q: %v", cpus, err)
	}

	(*cs)[DomainCPU] = cset
	return nil
}

func (cs *ConstraintSet) setCPUMilliQuantity(value int) {
	qty := resource.NewMilliQuantity(int64(value), resource.DecimalSI)
	(*cs)[DomainCPU] = *qty
}

// AvailablePolicy describes an available policy.
type AvailablePolicy struct {
	// Name is the name of the policy.
	Name string
	// Description is a short description of the policy.
	Description string
}

// AvailablePolicies returns the available policies and their descriptions.
func AvailablePolicies() []*AvailablePolicy {
	policies := make([]*AvailablePolicy, 0, len(backends)+1)

	for name, be := range backends {
		policies = append(policies, &AvailablePolicy{
			Name:        name,
			Description: be.description,
		})
	}
	sort.Slice(policies, func(i, j int) bool {
		return policies[i].Name < policies[j].Name
	})

	return policies
}

// defaultOptions returns a new options instance, all initialized to defaults.
func defaultOptions() interface{} {
	return &options{
		Policy:    DefaultPolicy,
		Available: ConstraintSet{},
		Reserved:  ConstraintSet{},
	}
}

// Register us for configuration handling.
func init() {
	config.Register(ConfigPath, "Generic policy layer.", opt, defaultOptions,
		config.WithNotify(configNotify))
}

================================================
FILE: pkg/cri/resource-manager/policy/policy.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package policy

import (
	"bytes"
	"fmt"
	"sort"
	"strconv"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	"github.com/intel/cri-resource-manager/pkg/blockio"
	"github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/agent"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/rdt"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	"github.com/prometheus/client_golang/prometheus"

	logger "github.com/intel/cri-resource-manager/pkg/log"
	system "github.com/intel/cri-resource-manager/pkg/sysfs"
)

// Domain represents a hardware resource domain that can be managed by a policy backend.
type Domain string

const (
	// DomainCPU is the CPU resource domain.
	DomainCPU Domain = "CPU"
	// DomainMemory is the memory resource domain.
	DomainMemory Domain = "Memory"
	// DomainHugePage is the hugepages resource domain.
	DomainHugePage Domain = "HugePages"
	// DomainCache is the CPU cache resource domain.
	DomainCache Domain = "Cache"
	// DomainMemoryBW is the memory bandwidth resource domain.
	DomainMemoryBW Domain = "MBW"
)

// Constraint describes a constraint on one hardware domain.
type Constraint interface{}

// ConstraintSet describes, per hardware domain, the resources available for a policy.
type ConstraintSet map[Domain]Constraint

// Options describes policy options.
type Options struct {
	// Client interface to cri-resmgr agent
	AgentCli agent.Interface
	// SendEvent is the function for delivering events back to the resource manager.
	SendEvent SendEventFn
}

// BackendOptions describes the options for a policy backend instance.
type BackendOptions struct {
	// System provides system/HW/topology information
	System system.System
	// System state/cache
	Cache cache.Cache
	// Resource availability constraint
	Available ConstraintSet
	// Resource reservation constraint
	Reserved ConstraintSet
	// Client interface to cri-resmgr agent
	AgentCli agent.Interface
	// SendEvent is the function for delivering events up to the resource manager.
	SendEvent SendEventFn
}

// CreateFn is the type for functions used to create a policy instance.
type CreateFn func(*BackendOptions) Backend

// SendEventFn is the type for a function to send events back to the resource manager.
type SendEventFn func(interface{}) error

const (
	// ExportedResources is the basename of the file container resources are exported to.
	ExportedResources = "resources.sh"
	// ExportSharedCPUs is the shell variable used to export shared container CPUs.
	ExportSharedCPUs = "SHARED_CPUS"
	// ExportIsolatedCPUs is the shell variable used to export isolated container CPUs.
	ExportIsolatedCPUs = "ISOLATED_CPUS"
	// ExportExclusiveCPUs is the shell variable used to export exclusive container CPUs.
	ExportExclusiveCPUs = "EXCLUSIVE_CPUS"
)

// Backend is the policy (decision making logic) interface exposed by implementations.
//
// A backend operates in a set of policy domains. Currently each policy domain
// corresponds to some particular hardware resource (CPU, memory, cache, etc).
type Backend interface {
	// Name gets the well-known name of this policy.
	Name() string
	// Description gives a verbose description about the policy implementation.
	Description() string
	// Start starts up and synchronizes the policy, using the given cache and resource constraints.
	Start([]cache.Container, []cache.Container) error
	// Sync synchronizes the policy, allocating/releasing the given containers.
	Sync([]cache.Container, []cache.Container) error
	// AllocateResources allocates resources to/for a container.
	AllocateResources(cache.Container) error
	// ReleaseResources releases resources of a container.
	ReleaseResources(cache.Container) error
	// UpdateResources updates resource allocations of a container.
	UpdateResources(cache.Container) error
	// Rebalance tries to find an optimal allocation of resources for the current containers.
	Rebalance() (bool, error)
	// HandleEvent processes the given event. The returned boolean indicates whether
	// changes have been made to any of the containers while handling the event.
	HandleEvent(*events.Policy) (bool, error)
	// ExportResourceData provides resource data to export for the container.
	ExportResourceData(cache.Container) map[string]string
	// Introspect provides data for external introspection.
	Introspect(*introspect.State)
	// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
	DescribeMetrics() []*prometheus.Desc
	// PollMetrics provides policy metrics for monitoring.
	PollMetrics() Metrics
	// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
	CollectMetrics(Metrics) ([]prometheus.Metric, error)
}

// Policy is the exposed interface for container resource allocation decision making.
type Policy interface {
	// Start starts up the policy, preparing it to serve resource management requests.
	Start([]cache.Container, []cache.Container) error
	// Sync synchronizes the state of the active policy.
	Sync([]cache.Container, []cache.Container) error
	// AllocateResources allocates resources to a container.
	AllocateResources(cache.Container) error
	// ReleaseResources releases the resources of a container.
	ReleaseResources(cache.Container) error
	// UpdateResources updates resource allocations of a container.
	UpdateResources(cache.Container) error
	// Rebalance tries to find an optimal allocation of resources for the current containers.
	Rebalance() (bool, error)
	// HandleEvent passes on the given event to the active policy. The returned boolean
	// indicates whether changes have been made to any of the containers while handling
	// the event.
	HandleEvent(*events.Policy) (bool, error)
	// ExportResourceData exports/updates resource data for the container.
	ExportResourceData(cache.Container)
	// Introspect provides data for external introspection.
	Introspect() *introspect.State
	// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
	DescribeMetrics() []*prometheus.Desc
	// PollMetrics provides policy metrics for monitoring.
	PollMetrics() Metrics
	// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
	CollectMetrics(Metrics) ([]prometheus.Metric, error)
}

type Metrics interface{}

// Policy instance/state.
type policy struct {
	options   Options            // policy options
	cache     cache.Cache        // system state cache
	active    Backend            // our active backend
	system    system.System      // system/HW/topology info
	inspsys   *introspect.System // ditto for introspection
	sendEvent SendEventFn        // function to send event up to the resource manager
}

// backend is a registered Backend.
type backend struct {
	name        string   // unique backend name
	description string   // verbose backend description
	create      CreateFn // backend creation function
}

// Our logger instance.
var log logger.Logger = logger.NewLogger("policy")

// Registered backends.
var backends = make(map[string]*backend)

// Options passed to created/activated backend.
var backendOpts = &BackendOptions{}

// ActivePolicy returns the name of the policy to be activated.
func ActivePolicy() string {
	return opt.Policy
}

// NewPolicy creates a policy instance using the selected backend.
func NewPolicy(cache cache.Cache, o *Options) (Policy, error) {
	sys, err := system.DiscoverSystem()
	if err != nil {
		return nil, policyError("failed to discover system topology: %v", err)
	}

	p := &policy{
		cache:   cache,
		system:  sys,
		options: *o,
	}

	active, ok := backends[opt.Policy]
	if !ok {
		return nil, policyError("unknown policy '%s' requested", opt.Policy)
	}

	log.Info("activating '%s' policy...", active.name)

	if len(opt.Available) != 0 {
		log.Info(" with available resources:")
		for n, r := range opt.Available {
			log.Info(" - %s=%s", n, ConstraintToString(r))
		}
	}
	if len(opt.Reserved) != 0 {
		log.Info(" with reserved resources:")
		for n, r := range opt.Reserved {
			log.Info(" - %s=%s", n, ConstraintToString(r))
		}
	}

	if log.DebugEnabled() {
		logger.Get(opt.Policy).EnableDebug()
	}

	backendOpts.Cache = p.cache
	backendOpts.System = p.system
	backendOpts.Available = opt.Available
	backendOpts.Reserved = opt.Reserved
	backendOpts.AgentCli = o.AgentCli
	backendOpts.SendEvent = o.SendEvent
	p.active = active.create(backendOpts)

	return p, nil
}

// Start starts up the policy, preparing it for serving requests.
func (p *policy) Start(add []cache.Container, del []cache.Container) error {
	log.Info("starting policy '%s'...", p.active.Name())
	return p.active.Start(add, del)
}

// Sync synchronizes the active policy state.
func (p *policy) Sync(add []cache.Container, del []cache.Container) error {
	return p.active.Sync(add, del)
}

// AllocateResources allocates resources for a container.
func (p *policy) AllocateResources(c cache.Container) error {
	return p.active.AllocateResources(c)
}

// ReleaseResources releases the resources of a container.
func (p *policy) ReleaseResources(c cache.Container) error {
	return p.active.ReleaseResources(c)
}

// UpdateResources updates resource allocations of a container.
func (p *policy) UpdateResources(c cache.Container) error {
	return p.active.UpdateResources(c)
}

// Rebalance tries to find a more optimal allocation of resources for the current containers.
func (p *policy) Rebalance() (bool, error) {
	return p.active.Rebalance()
}

// HandleEvent passes on the given event to the active policy.
func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
	return p.active.HandleEvent(e)
}

// ExportResourceData exports/updates resource data for the container.
func (p *policy) ExportResourceData(c cache.Container) {
	var buf bytes.Buffer

	data := p.active.ExportResourceData(c)
	keys := []string{}
	for key := range data {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		value := data[key]
		if _, err := buf.WriteString(fmt.Sprintf("%s=%q\n", key, value)); err != nil {
			log.Error("container %s: failed to export resource data (%s=%q)",
				c.PrettyName(), key, value)
			buf.Reset()
			break
		}
	}

	p.cache.WriteFile(c.GetCacheID(), ExportedResources, 0644, buf.Bytes())
}

// Introspect provides data for external introspection/visualization.
func (p *policy) Introspect() *introspect.State {
	pods := p.cache.GetPods()
	state := &introspect.State{Pods: make(map[string]*introspect.Pod, len(pods))}

	for _, p := range pods {
		containers := p.GetContainers()
		if len(containers) == 0 {
			continue
		}
		pod := &introspect.Pod{
			ID:         p.GetID(),
			UID:        p.GetUID(),
			Name:       p.GetName(),
			Containers: make(map[string]*introspect.Container, len(containers)),
		}
		for _, c := range containers {
			container := &introspect.Container{
				ID:      c.GetID(),
				Name:    c.GetName(),
				Command: c.GetCommand(),
				Args:    c.GetArgs(),
				Hints:   introspect.TopologyHints(c.GetTopologyHints()),
			}
			resources := c.GetResourceRequirements()
			if req, ok := resources.Requests[corev1.ResourceCPU]; ok {
				if value := req.MilliValue(); value > 0 {
					container.CPURequest = value
				}
			}
			if lim, ok := resources.Limits[corev1.ResourceCPU]; ok {
				if value := lim.MilliValue(); value > 0 {
					container.CPULimit = value
				}
			}
			if req, ok := resources.Requests[corev1.ResourceMemory]; ok {
				if value := req.Value(); value > 0 {
					container.MemoryRequest = value
				}
			}
			if lim, ok := resources.Limits[corev1.ResourceMemory]; ok {
				if value := lim.Value(); value > 0 {
					container.MemoryLimit = value
				}
			}
			pod.Containers[container.ID] = container
		}
		state.Pods[pod.ID] = pod
	}

	if p.inspsys == nil {
		sys := &introspect.System{
			Sockets: make(map[int]*introspect.Socket, p.system.PackageCount()),
			Nodes:   make(map[int]*introspect.Node, p.system.NUMANodeCount()),
		}
		for _, id := range p.system.PackageIDs() {
			pkg := p.system.Package(id)
			sys.Sockets[int(id)] = &introspect.Socket{ID: int(id), CPUs: pkg.CPUSet().String()}
		}
		for _, id := range p.system.NodeIDs() {
			node := p.system.Node(id)
			sys.Nodes[int(id)] = &introspect.Node{ID: int(id), CPUs: node.CPUSet().String()}
		}
		sys.Isolated = p.system.Isolated().String()
		sys.Offlined = p.system.Offlined().String()
		p.inspsys = sys
	}

	rdtClassNames := []string{}
	for _, rdtClass := range rdt.GetClasses() {
		rdtClassNames = append(rdtClassNames, rdtClass.Name())
	}
	blkioClassNames := []string{}
	for _, blkioClass := range blockio.GetClasses() {
		blkioClassNames = append(blkioClassNames, blkioClass.Name)
	}
	p.inspsys.RDTClasses = rdtClassNames
	p.inspsys.Policy = opt.Policy

	state.System = p.inspsys
	p.active.Introspect(state)
	return state
}

// PollMetrics provides policy metrics for monitoring.
func (p *policy) PollMetrics() Metrics {
	return p.active.PollMetrics()
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
	return p.active.DescribeMetrics()
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *policy) CollectMetrics(m Metrics) ([]prometheus.Metric, error) {
	return p.active.CollectMetrics(m)
}

// Register registers a policy backend.
func Register(name, description string, create CreateFn) error {
	log.Info("registering policy '%s'...", name)
	if o, ok := backends[name]; ok {
		return policyError("policy %s already registered (%s)", name, o.description)
	}
	backends[name] = &backend{
		name:        name,
		description: description,
		create:      create,
	}
	return nil
}

// ConstraintToString returns the given constraint as a string.
func ConstraintToString(value Constraint) string {
	switch value.(type) {
	case cpuset.CPUSet:
		return "#" + value.(cpuset.CPUSet).String()
	case int:
		return strconv.Itoa(value.(int))
	case string:
		return value.(string)
	case resource.Quantity:
		qty := value.(resource.Quantity)
		return qty.String()
	default:
		return fmt.Sprintf("<unknown constraint %v>", value)
	}
}

// configNotify is the configuration change notification callback for the generic policy layer.
func configNotify(_ config.Event, _ config.Source) error {
	// let the active policy know of changes
	backendOpts.Available = opt.Available
	backendOpts.Reserved = opt.Reserved
	return nil
}

================================================
FILE: pkg/cri/resource-manager/requests.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package resmgr

import (
	"context"
	"fmt"
	"strings"

	criv1 "k8s.io/cri-api/pkg/apis/runtime/v1"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	config "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	"github.com/intel/cri-resource-manager/pkg/cri/server"
)

const (
	kubeAPIVersion = "0.1.0"
)

var knownRuntimes = []string{
	"containerd",
	"cri-o",
}

// setupRequestProcessing prepares the resource manager for CRI request processing.
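//
// Each entry in the interceptor map below wraps one CRI method of the relay.
// All interceptors share the server.Interceptor signature used throughout this
// file; a minimal, hypothetical interceptor (for illustration only) would look
// something like:
//
//	func (m *resmgr) MyIntercept(ctx context.Context, method string,
//		request interface{}, handler server.Handler) (interface{}, error) {
//		// examine/alter the request here...
//		reply, err := handler(ctx, request) // pass the request on to the runtime
//		// examine/alter the reply here...
//		return reply, err
//	}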
func (m *resmgr) setupRequestProcessing() error { interceptors := map[string]server.Interceptor{ "RunPodSandbox": m.RunPod, "StopPodSandbox": m.StopPod, "RemovePodSandbox": m.RemovePod, "CreateContainer": m.CreateContainer, "StartContainer": m.StartContainer, "StopContainer": m.StopContainer, "RemoveContainer": m.RemoveContainer, "ListContainers": m.ListContainers, "UpdateContainerResources": m.UpdateContainer, } if err := m.relay.Server().RegisterInterceptors(interceptors); err != nil { return resmgrError("failed to register resource-manager CRI interceptors: %v", err) } return nil } // disambiguate produces disambiguation context for a request/reply dump. func (m *resmgr) disambiguate(msg interface{}) string { var qualifier string m.RLock() defer m.RUnlock() switch req := msg.(type) { case *criv1.RunPodSandboxRequest: if req.Config != nil && req.Config.Metadata != nil { qualifier = req.Config.Metadata.Name } case *criv1.StopPodSandboxRequest: if pod, ok := m.cache.LookupPod(req.PodSandboxId); ok { qualifier = pod.GetName() } else { qualifier = "unknown pod " + req.PodSandboxId } case *criv1.RemovePodSandboxRequest: if pod, ok := m.cache.LookupPod(req.PodSandboxId); ok { qualifier = pod.GetName() } else { qualifier = "unknown pod " + req.PodSandboxId } case *criv1.CreateContainerRequest: switch { case req.SandboxConfig == nil || req.SandboxConfig.Metadata == nil: qualifier = "missing pod metadata in request" case req.Config == nil || req.Config.Metadata == nil: qualifier = "missing metadata in request" default: qualifier = req.SandboxConfig.Metadata.Name + ":" + req.Config.Metadata.Name } case *criv1.StartContainerRequest: if container, ok := m.cache.LookupContainer(req.ContainerId); ok { qualifier = container.PrettyName() } else { qualifier = "unknown container " + req.ContainerId } case *criv1.StopContainerRequest: if container, ok := m.cache.LookupContainer(req.ContainerId); ok { qualifier = container.PrettyName() } else { qualifier = "unknown container " + req.ContainerId } case *criv1.RemoveContainerRequest: if container, ok := m.cache.LookupContainer(req.ContainerId); ok { qualifier = container.PrettyName() } else { qualifier = "unknown container " + req.ContainerId } case *criv1.UpdateContainerResourcesRequest: if container, ok := m.cache.LookupContainer(req.ContainerId); ok { qualifier = container.PrettyName() } else { qualifier = "unknown container " + req.ContainerId } } if qualifier != "" { return "<" + qualifier + ">" } return "" } // startRequestProcessing starts request processing by starting the active policy. func (m *resmgr) startRequestProcessing() error { ctx := context.Background() add, del, err := m.syncWithCRI(ctx) if err != nil { return err } // // Notes: // While normally it is enough to release stale containers and allocate // newly discovered ones, if we are switching policies we need to force // reallocating everything. Otherwise containers already present in the // cache would not get properly updated by the new policy. // if m.policySwitch { containers := m.cache.GetContainers() cache.SortContainers(containers) add, del = containers, containers m.policySwitch = false } if err := m.policy.Start(add, del); err != nil { return resmgrError("failed to start policy %s: %v", policy.ActivePolicy(), err) } if err := m.runPostReleaseHooks(ctx, "startup", del...); err != nil { m.Error("startup: failed to run post-release hooks: %v", err) } return m.cache.Save() } // syncWithCRI synchronizes cache pods and containers with the CRI runtime. 
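//
// The first returned slice contains running containers unknown to the policy
// which should get resources allocated; the second contains stale containers
// whose resources should be released. Both are fed to the active policy's
// Start() by startRequestProcessing() above.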
func (m *resmgr) syncWithCRI(ctx context.Context) ([]cache.Container, []cache.Container, error) {
	if !m.relay.Client().HasRuntimeService() {
		return nil, nil, nil
	}

	m.Info("synchronizing cache state with CRI runtime...")

	add, del := []cache.Container{}, []cache.Container{}
	pods, err := m.relay.Client().ListPodSandbox(ctx, &criv1.ListPodSandboxRequest{})
	if err != nil {
		return nil, nil, resmgrError("cache synchronization pod query failed: %v", err)
	}

	status := map[string]*cache.PodStatus{}
	for _, pod := range pods.Items {
		if s, err := m.queryPodStatus(ctx, pod.Id); err != nil {
			m.Error("%s: failed to query pod status: %v", pod.Id, err)
		} else {
			status[pod.Id] = s
		}
	}

	_, _, deleted := m.cache.RefreshPods(pods, status)
	for _, c := range deleted {
		m.Info("discovered stale container %s...", c.GetID())
		del = append(del, c)
	}

	containers, err := m.relay.Client().ListContainers(ctx, &criv1.ListContainersRequest{})
	if err != nil {
		return nil, nil, resmgrError("cache synchronization container query failed: %v", err)
	}

	added, deleted := m.cache.RefreshContainers(containers)
	for _, c := range added {
		if c.GetState() != cache.ContainerStateRunning {
			m.Info("ignoring discovered container %s (in state %v)...",
				c.GetID(), c.GetState())
			continue
		}
		m.Info("discovered out-of-sync running container %s...", c.GetID())
		add = append(add, c)
	}
	for _, c := range deleted {
		m.Info("discovered stale container %s...", c.GetID())
		del = append(del, c)
	}

	return add, del, nil
}

func (m *resmgr) queryPodStatus(ctx context.Context, podID string) (*cache.PodStatus, error) {
	response, err := m.relay.Client().PodSandboxStatus(ctx, &criv1.PodSandboxStatusRequest{
		PodSandboxId: podID,
		Verbose:      true,
	})
	if err != nil {
		return nil, err
	}

	return cache.ParsePodStatus(response)
}

// RunPod intercepts CRI requests for Pod creation.
func (m *resmgr) RunPod(ctx context.Context, method string, request interface{},
	handler server.Handler) (interface{}, error) {
	reply, rqerr := handler(ctx, request)
	if rqerr != nil {
		m.Error("%s: failed to create pod: %v", method, rqerr)
		return reply, rqerr
	}

	podID := reply.(*criv1.RunPodSandboxResponse).PodSandboxId

	m.Lock()
	defer m.Unlock()

	pod, err := m.cache.InsertPod(podID, request, nil)
	if err != nil {
		m.Error("%s: failed to insert new pod to cache: %v", method, err)
		return nil, resmgrError("%s: failed to insert new pod to cache: %v", method, err)
	}
	m.updateIntrospection()

	// search for any lingering old version and clean up if found
	released := false
	del := []cache.Container{}
	for _, p := range m.cache.GetPods() {
		if p.GetUID() != pod.GetUID() || p == pod {
			continue
		}
		m.Warn("re-creation of pod %s, releasing old one", p.GetName())
		for _, c := range p.GetInitContainers() {
			m.Info("%s: removing stale init-container %s...", method, c.PrettyName())
			m.policy.ReleaseResources(c)
			c.UpdateState(cache.ContainerStateStale)
			released = true
			del = append(del, c)
		}
		for _, c := range p.GetContainers() {
			m.Info("%s: removing stale container %s...", method, c.PrettyName())
			m.policy.ReleaseResources(c)
			c.UpdateState(cache.ContainerStateStale)
			released = true
			del = append(del, c)
		}
		m.cache.DeletePod(p.GetID())
	}

	if released {
		if err := m.runPostReleaseHooks(ctx, method, del...); err != nil {
			m.Error("%s: failed to run post-release hooks for lingering pod %s: %v",
				method, pod.GetName(), err)
		}
	}

	m.Info("created pod %s (%s)", pod.GetName(), podID)

	return reply, nil
}

// StopPod intercepts CRI requests for stopping Pods.
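//
// Note that the request is passed through to the runtime first; the pod's
// containers are marked exited and their resources released only afterwards,
// followed by the post-release hooks.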
func (m *resmgr) StopPod(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { reply, rqerr := handler(ctx, request) m.Lock() defer m.Unlock() podID := request.(*criv1.StopPodSandboxRequest).PodSandboxId pod, ok := m.cache.LookupPod(podID) if !ok { m.Warn("%s: failed to look up pod %s, just passing request through", method, podID) return reply, rqerr } if rqerr != nil { m.Error("%s: failed to stop pod %s: %v", method, podID, rqerr) return reply, rqerr } m.Info("%s: stopped pod %s (%s)...", method, pod.GetName(), podID) released := []cache.Container{} for _, c := range pod.GetInitContainers() { m.Info("%s: releasing resources for %s...", method, c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Warn("%s: failed to release init-container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateExited) released = append(released, c) } for _, c := range pod.GetContainers() { m.Info("%s: releasing resources for container %s...", method, c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Warn("%s: failed to release container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateExited) released = append(released, c) } if err := m.runPostReleaseHooks(ctx, method, released...); err != nil { m.Error("%s: failed to run post-release hooks for pod %s: %v", method, pod.GetName(), err) } m.updateIntrospection() return reply, rqerr } // RemovePod intercepts CRI requests for Pod removal. func (m *resmgr) RemovePod(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { reply, rqerr := handler(ctx, request) m.Lock() defer m.Unlock() podID := request.(*criv1.RemovePodSandboxRequest).PodSandboxId pod, ok := m.cache.LookupPod(podID) if !ok { m.Warn("%s: failed to look up pod %s, just passing request through", method, podID) return reply, rqerr } if rqerr != nil { m.Error("%s: failed to remove pod %s: %v", method, podID, rqerr) } else { m.Info("%s: removed pod %s (%s)...", method, pod.GetName(), podID) } released := []cache.Container{} for _, c := range pod.GetInitContainers() { m.Info("%s: removing stale init-container %s...", method, c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Warn("%s: failed to release init-container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateStale) released = append(released, c) } for _, c := range pod.GetContainers() { m.Info("%s: removing stale container %s...", method, c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Warn("%s: failed to release container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateStale) released = append(released, c) } if err := m.runPostReleaseHooks(ctx, method, released...); err != nil { m.Error("%s: failed to run post-release hooks for pod %s: %v", method, pod.GetName(), err) } m.cache.DeletePod(podID) m.updateIntrospection() return reply, rqerr } // CreateContainer intercepts CRI requests for Container creation. 
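//
// Unlike in the pod stop/remove interceptors, resources are allocated and the
// pre-create/post-allocate hooks are run *before* the request is forwarded to
// the runtime, so the forwarded request already carries the adjusted resource
// assignment. On any failure the allocation is rolled back and the container
// is dropped from the cache.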
func (m *resmgr) CreateContainer(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { m.Lock() defer m.Unlock() // kubelet doesn't always clean up crashed containers so we try doing it here if msg, ok := request.(*criv1.CreateContainerRequest); ok { if pod, ok := m.cache.LookupPod(msg.PodSandboxId); ok { if msg.Config != nil && msg.Config.Metadata != nil { if c, ok := pod.GetContainer(msg.Config.Metadata.Name); ok { m.Warn("re-creation of container %s, releasing old one", c.PrettyName()) m.policy.ReleaseResources(c) } } } } container, err := m.cache.InsertContainer(request) if err != nil { m.Error("%s: failed to insert new container to cache: %v", method, err) return nil, resmgrError("%s: failed to insert new container to cache: %v", method, err) } container.SetCRIRequest(request) m.Info("%s: creating container %s...", method, container.PrettyName()) if err := m.policy.AllocateResources(container); err != nil { m.Error("%s: failed to allocate resources for container %s: %v", method, container.PrettyName(), err) m.cache.DeleteContainer(container.GetCacheID()) return nil, resmgrError("failed to allocate container resources: %v", err) } container.InsertMount(&cache.Mount{ Container: "/.cri-resmgr", Host: m.cache.ContainerDirectory(container.GetCacheID()), Readonly: true, Propagation: cache.MountHostToContainer, }) if err := m.runPostAllocateHooks(ctx, method); err != nil { m.Error("%s: failed to run post-allocate hooks for %s: %v", method, container.PrettyName(), err) m.policy.ReleaseResources(container) m.runPostReleaseHooks(ctx, method, container) m.cache.DeleteContainer(container.GetCacheID()) return nil, resmgrError("failed to allocate container resources: %v", err) } container.ClearCRIRequest() reply, rqerr := handler(ctx, request) if rqerr != nil { m.Error("%s: failed to create container %s: %v", method, container.PrettyName(), rqerr) m.policy.ReleaseResources(container) m.runPostReleaseHooks(ctx, method, container) m.cache.DeleteContainer(container.GetCacheID()) return nil, resmgrError("failed to create container: %v", rqerr) } m.cache.UpdateContainerID(container.GetCacheID(), reply) container.UpdateState(cache.ContainerStateCreated) m.updateIntrospection() return reply, nil } // StartContainer intercepts CRI requests for starting Containers. 
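//
// A container is only started if it is in the created state. On success it is
// marked running, a ContainerStarted policy event is delivered to the active
// policy, and the post-start hooks are run.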
func (m *resmgr) StartContainer(ctx context.Context, method string, request interface{},
	handler server.Handler) (interface{}, error) {
	m.Lock()
	defer m.Unlock()

	containerID := request.(*criv1.StartContainerRequest).ContainerId
	container, ok := m.cache.LookupContainer(containerID)
	if !ok {
		m.Warn("%s: failed to look up container %s, just passing request through",
			method, containerID)
		return handler(ctx, request)
	}

	m.Info("%s: starting container %s...", method, container.PrettyName())

	if container.GetState() != cache.ContainerStateCreated {
		m.Error("%s: refusing to start container %s in unexpected state %v",
			method, container.PrettyName(), container.GetState())
		return nil, resmgrError("refusing to start container %s in unexpected state %v",
			container.PrettyName(), container.GetState())
	}

	reply, rqerr := handler(ctx, request)
	if rqerr != nil {
		m.Error("%s: failed to start container %s: %v", method, container.PrettyName(), rqerr)
		return nil, rqerr
	}

	container.UpdateState(cache.ContainerStateRunning)

	e := &events.Policy{
		Type:   events.ContainerStarted,
		Source: "resource-manager",
		Data:   container,
	}
	if _, err := m.policy.HandleEvent(e); err != nil {
		m.Error("%s: policy failed to handle event %s: %v", method, e.Type, err)
	}

	if err := m.runPostStartHooks(method, container); err != nil {
		m.Error("%s: failed to run post-start hooks for %s: %v",
			method, container.PrettyName(), err)
	}

	m.updateIntrospection()

	return reply, rqerr
}

// StopContainer intercepts CRI requests for stopping Containers.
func (m *resmgr) StopContainer(ctx context.Context, method string, request interface{},
	handler server.Handler) (interface{}, error) {
	reply, rqerr := handler(ctx, request)

	m.Lock()
	defer m.Unlock()

	containerID := request.(*criv1.StopContainerRequest).ContainerId
	container, ok := m.cache.LookupContainer(containerID)
	if !ok {
		m.Warn("%s: failed to look up container %s, just passing request through",
			method, containerID)
		return reply, rqerr
	}

	if rqerr != nil {
		m.Error("%s: failed to stop container %s: %v", method, container.PrettyName(), rqerr)
		return reply, rqerr
	}

	m.Info("%s: stopped container %s...", method, container.PrettyName())

	// Notes:
	//   For now, we assume any error replies from CRI are about the container not
	//   being found, in which case we still go ahead and finish locally stopping it...
	if err := m.policy.ReleaseResources(container); err != nil {
		m.Error("%s: failed to release resources for container %s: %v",
			method, container.PrettyName(), err)
	}

	container.UpdateState(cache.ContainerStateExited)

	if err := m.runPostReleaseHooks(ctx, method, container); err != nil {
		m.Error("%s: failed to run post-release hooks for %s: %v",
			method, container.PrettyName(), err)
	}

	m.updateIntrospection()

	return reply, rqerr
}

// RemoveContainer intercepts CRI requests for Container removal.
func (m *resmgr) RemoveContainer(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { reply, rqerr := handler(ctx, request) m.Lock() defer m.Unlock() containerID := request.(*criv1.RemoveContainerRequest).ContainerId container, ok := m.cache.LookupContainer(containerID) if !ok { m.Warn("%s: failed to look up container %s, just passing request through", method, containerID) return reply, rqerr } if rqerr != nil { m.Error("%s: failed to remove container %s: %v", method, container.PrettyName(), rqerr) } else { m.Info("%s: removed container %s...", method, container.PrettyName()) } if err := m.policy.ReleaseResources(container); err != nil { m.Error("%s: failed to release resources for container %s: %v", method, container.PrettyName(), err) } container.UpdateState(cache.ContainerStateStale) if err := m.runPostReleaseHooks(ctx, method, container); err != nil { m.Error("%s: failed to run post-release hooks for %s: %v", method, container.PrettyName(), err) } m.updateIntrospection() return reply, rqerr } // ListContainers intercepts CRI requests for listing Containers. func (m *resmgr) ListContainers(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { reply, rqerr := handler(ctx, request) if rqerr != nil { return reply, rqerr } if f := request.(*criv1.ListContainersRequest).Filter; f != nil { if f.Id != "" || f.State != nil || f.PodSandboxId != "" || len(f.LabelSelector) > 0 { return reply, nil } } m.Lock() defer m.Unlock() clistmap := map[string]*criv1.Container{} released := []cache.Container{} for _, listed := range reply.(*criv1.ListContainersResponse).Containers { clistmap[listed.Id] = listed if listed.State != criv1.ContainerState_CONTAINER_EXITED { continue } if c, ok := m.cache.LookupContainer(listed.Id); ok { state := c.GetState() if state == cache.ContainerStateRunning || state == cache.ContainerStateCreated { m.Info("%s: exited, releasing its resources...", c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Error("%s: failed to release resources for container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateExited) released = append(released, c) } } } for _, c := range m.cache.GetContainers() { if c.GetState() == cache.ContainerStateRunning { if _, ok := clistmap[c.GetID()]; !ok { m.Info("%s: absent from runtime, releasing its resources...", c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Error("%s: failed to release resources for container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateStale) released = append(released, c) } } } if len(released) > 0 { if err := m.runPostReleaseHooks(ctx, method, released...); err != nil { m.Error("%s: failed to run post-release hooks: %v", method, err) } } m.updateIntrospection() return reply, nil } // UpdateContainer intercepts CRI requests for updating Containers. func (m *resmgr) UpdateContainer(_ context.Context, _ string, _ interface{}, _ server.Handler) (interface{}, error) { m.Lock() defer m.Unlock() // // Notes: // Once VPA is fully implemented, we need to start passing these // requests on to the active policy. 
// // containerID := request.(*criv1.UpdateContainerResourcesRequest).ContainerId // container, ok := m.cache.LookupContainer(containerID) // if !ok { // m.Warn("%s: failed to look up container %s, just passing request through", // method, containerID) // return handler(ctx, request) // } // // err := m.policy.UpdateResources(container) // if err != nil { // m.Error("%s: failed to update resources of container %s: %v", method, containerID, err) // return nil, err // } // // err := m.runPostUpdateHooks(ctx, method) // if err != nil { // m.Warn("%s: failed to run post-update hooks for update of container %s: %v", // method, containerID, err) // } // // return &criv1.UpdateContainerResourcesResponse{}, nil // if !m.warnedCRIUpdate { m.Warn("CRI UpdateContainerResources request received. Unless Vertical") m.Warn("Pod Autoscaling is fully implemented, this usually indicates that") m.Warn("kubelet is running with CPU Manager enabled and 'static' or some") m.Warn("other than 'none' policy active. This does not make much sense when") m.Warn("CRI Resource Manager is also active and on the kubelet-runtime") m.Warn("signalling path. Please consider disabling CPU Manager or setting") m.Warn("its active policy to 'none'.") m.warnedCRIUpdate = true } return &criv1.UpdateContainerResourcesResponse{}, nil } // RebalanceContainers tries to find a more optimal container resource allocation if necessary. func (m *resmgr) RebalanceContainers() error { m.Lock() defer m.Unlock() m.Info("rebalancing (reallocating) containers...") return m.rebalance("Rebalance") } // rebalance triggers a policy-specific rebalancing cycle of containers. func (m *resmgr) rebalance(method string) error { if m.policy == nil { return nil } changes, err := m.policy.Rebalance() if err != nil { m.Error("%s: rebalancing of containers failed: %v", method, err) } if changes { if err := m.runPostUpdateHooks(context.Background(), method); err != nil { m.Error("%s: failed to run post-update hooks: %v", method, err) return resmgrError("%s: failed to run post-update hooks: %v", method, err) } } return m.cache.Save() } // DeliverPolicyEvent delivers a policy-specific event to the active policy. func (m *resmgr) DeliverPolicyEvent(e *events.Policy) error { m.Lock() defer m.Unlock() if m.policy == nil { return nil } if e.Source == "" { e.Source = "unspecified" } m.Info("delivering policy event %s.%s...", e.Source, e.Type) method := "DeliverPolicyEvent" changes, err := m.policy.HandleEvent(e) if err != nil { m.Error("%s: handling event %s.%s failed: %v", method, e.Source, e.Type, err) return err } if changes { if err = m.runPostUpdateHooks(context.Background(), method); err != nil { m.Error("%s: failed to run post-update hooks: %v", method, err) return resmgrError("%s: failed to run post-update hooks: %v", method, err) } } m.cache.Save() return nil } // setConfig activates a new configuration, either from the agent or from a file. 
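//
// The argument selects the configuration source: a *config.RawConfig pushed
// by the node agent, or a string naming a configuration file. For example
// (a hypothetical call with a made-up path, for illustration only):
//
//	if err := m.setConfig("/etc/cri-resmgr/fallback.cfg"); err != nil {
//		m.Error("failed to apply configuration: %v", err)
//	}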
func (m *resmgr) setConfig(v interface{}) error { var err error m.Lock() defer m.Unlock() switch cfg := v.(type) { case *config.RawConfig: err = pkgcfg.SetConfig(cfg.Data) case string: err = pkgcfg.SetConfigFromFile(cfg) default: err = fmt.Errorf("invalid configuration source/type %T", v) } if err != nil { m.Error("configuration rejected: %v", err) return resmgrError("configuration rejected: %v", err) } if m.policy != nil { // synchronize state of controllers with new configuration if err = m.control.StartStopControllers(m.cache, m.relay.Client()); err != nil { m.Error("failed to synchronize controllers with new configuration: %v", err) return resmgrError("failed to synchronize controllers with new configuration: %v", err) } if err = m.runPostUpdateHooks(context.Background(), "setConfig"); err != nil { m.Error("failed to run post-update hooks after reconfiguration: %v", err) return resmgrError("failed to run post-update hooks after reconfiguration: %v", err) } } // if we managed to activate a configuration from the agent, store it in the cache if cfg, ok := v.(*config.RawConfig); ok { m.cache.SetConfig(cfg) } m.Info("successfully switched to new configuration") return nil } // runPostAllocateHooks runs the necessary hooks after allocating resources for some containers. func (m *resmgr) runPostAllocateHooks(ctx context.Context, method string) error { for _, c := range m.cache.GetPendingContainers() { switch c.GetState() { case cache.ContainerStateRunning, cache.ContainerStateCreated: if err := m.control.RunPostUpdateHooks(c); err != nil { m.Warn("%s post-update hook failed for %s: %v", method, c.PrettyName(), err) } if req, ok := c.ClearCRIRequest(); ok { if _, err := m.sendCRIRequest(ctx, req); err != nil { m.Warn("%s update of container %s failed: %v", method, c.PrettyName(), err) } } m.policy.ExportResourceData(c) case cache.ContainerStateCreating: if err := m.control.RunPreCreateHooks(c); err != nil { m.Warn("%s pre-create hook failed for %s: %v", method, c.PrettyName(), err) } m.policy.ExportResourceData(c) default: m.Warn("%s: skipping container %s (in state %v)", method, c.PrettyName(), c.GetState()) } } return nil } // runPostStartHooks runs the necessary hooks after having started a container. 
func (m *resmgr) runPostStartHooks(method string, c cache.Container) error {
	if err := m.control.RunPostStartHooks(c); err != nil {
		m.Error("%s: post-start hook failed for %s: %v", method, c.PrettyName(), err)
	}
	return nil
}

// runPostReleaseHooks runs the necessary hooks after releasing the resources of some containers.
func (m *resmgr) runPostReleaseHooks(ctx context.Context, method string,
	released ...cache.Container) error {
	for _, c := range released {
		if err := m.control.RunPostStopHooks(c); err != nil {
			m.Warn("post-stop hook failed for %s: %v", c.PrettyName(), err)
		}
		if c.GetState() == cache.ContainerStateStale {
			m.cache.DeleteContainer(c.GetCacheID())
		}
	}
	for _, c := range m.cache.GetPendingContainers() {
		switch state := c.GetState(); state {
		case cache.ContainerStateStale, cache.ContainerStateExited:
			if err := m.control.RunPostStopHooks(c); err != nil {
				m.Warn("post-stop hook failed for %s: %v", c.PrettyName(), err)
			}
			if state == cache.ContainerStateStale {
				m.cache.DeleteContainer(c.GetCacheID())
			}
		case cache.ContainerStateRunning, cache.ContainerStateCreated:
			if err := m.control.RunPostUpdateHooks(c); err != nil {
				m.Warn("post-update hook failed for %s: %v", c.PrettyName(), err)
			}
			if req, ok := c.ClearCRIRequest(); ok {
				if _, err := m.sendCRIRequest(ctx, req); err != nil {
					m.Warn("update of container %s failed: %v", c.PrettyName(), err)
				}
			}
			m.policy.ExportResourceData(c)
		default:
			m.Warn("%s: skipping pending container %s (in state %v)",
				method, c.PrettyName(), c.GetState())
		}
	}
	return nil
}

// runPostUpdateHooks runs the necessary hooks after reconciliation.
func (m *resmgr) runPostUpdateHooks(ctx context.Context, method string) error {
	for _, c := range m.cache.GetPendingContainers() {
		switch c.GetState() {
		case cache.ContainerStateRunning, cache.ContainerStateCreated:
			if err := m.control.RunPostUpdateHooks(c); err != nil {
				return err
			}
			if req, ok := c.GetCRIRequest(); ok {
				if _, err := m.sendCRIRequest(ctx, req); err != nil {
					m.Warn("%s update of container %s failed: %v",
						method, c.PrettyName(), err)
				} else {
					c.ClearCRIRequest()
				}
			}
			m.policy.ExportResourceData(c)
		default:
			m.Warn("%s: skipping container %s (in state %v)",
				method, c.PrettyName(), c.GetState())
		}
	}
	return nil
}

// sendCRIRequest sends the given CRI request, returning the received reply and error.
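//
// Currently UpdateContainerResourcesRequest is the only request type that can
// originate from the resource manager itself, so it is the only type handled here.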
func (m *resmgr) sendCRIRequest(ctx context.Context, request interface{}) (interface{}, error) {
	client := m.relay.Client()
	switch req := request.(type) {
	case *criv1.UpdateContainerResourcesRequest:
		m.Debug("sending update request for container %s...", req.ContainerId)
		return client.UpdateContainerResources(ctx, req)
	default:
		return nil, resmgrError("sendCRIRequest: unhandled request type %T", request)
	}
}

func (m *resmgr) checkRuntime(ctx context.Context) error {
	version, err := m.relay.Client().Version(ctx, &criv1.VersionRequest{
		Version: kubeAPIVersion,
	})
	if err != nil {
		return resmgrError("failed to query runtime version: %v", err)
	}

	for _, name := range knownRuntimes {
		if strings.HasPrefix(version.RuntimeName, name) {
			return nil
		}
	}

	if opt.AllowUntestedRuntimes {
		m.Warnf("running with untested/unknown runtime %q", version.RuntimeName)
		return nil
	}

	return rejectRuntimeError(version.RuntimeName)
}

func rejectRuntimeError(name string) error {
	return resmgrError("rejecting untested runtime %s, use --%s to allow it",
		name, allowUntestedRuntimesFlag)
}

================================================
FILE: pkg/cri/resource-manager/resource-manager.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package resmgr

import (
	"context"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"sync"

	"golang.org/x/sys/unix"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/relay"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/agent"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	config "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/metrics"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/visualizer"
	"github.com/intel/cri-resource-manager/pkg/instrumentation"
	logger "github.com/intel/cri-resource-manager/pkg/log"
	"github.com/intel/cri-resource-manager/pkg/pidfile"
	"github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/topology"

	policyCollector "github.com/intel/cri-resource-manager/pkg/policycollector"
	"github.com/intel/cri-resource-manager/pkg/utils"
)

// ResourceManager is the interface we expose for controlling the CRI resource manager.
type ResourceManager interface {
	// Start starts the resource manager.
	Start() error
	// Stop stops the resource manager.
	Stop()
	// SetConfig dynamically updates the resource manager configuration.
	SetConfig(*config.RawConfig) error
	// SetAdjustment dynamically updates external adjustments.
	SetAdjustment(*config.Adjustment) map[string]error
	// SendEvent sends an event to be processed by the resource manager.
	SendEvent(event interface{}) error
	// Add-ons for testing.
	ResourceManagerTestAPI
}

// resmgr is the implementation of ResourceManager.
type resmgr struct {
	logger.Logger
	sync.RWMutex
	relay           relay.Relay        // our CRI relay
	cache           cache.Cache        // cached state
	policy          policy.Policy      // resource manager policy
	policySwitch    bool               // active policy is being switched
	configServer    config.Server      // configuration management server
	control         control.Control    // policy controllers/enforcement
	agent           agent.Interface    // connection to cri-resmgr agent
	conf            *config.RawConfig  // configuration pending to be saved in the cache
	metrics         *metrics.Metrics   // metrics collector/pre-processor
	events          chan interface{}   // channel for delivering events
	stop            chan interface{}   // channel for signalling shutdown to goroutines
	signals         chan os.Signal     // signal channel
	introspect      *introspect.Server // server for external introspection
	warnedCRIUpdate bool               // warned about CRI UpdateContainer calls
}

// NewResourceManager creates a new ResourceManager instance.
func NewResourceManager() (ResourceManager, error) {
	m := &resmgr{Logger: logger.NewLogger("resource-manager")}

	if err := m.setupCache(); err != nil {
		return nil, err
	}

	sysfs.SetSysRoot(opt.HostRoot)
	topology.SetSysRoot(opt.HostRoot)

	switch {
	case opt.ResetPolicy && opt.ResetConfig:
		os.Exit(m.resetCachedPolicy() + m.resetCachedConfig())
	case opt.ResetPolicy:
		os.Exit(m.resetCachedPolicy())
	case opt.ResetConfig:
		os.Exit(m.resetCachedConfig())
	}

	if err := m.checkOpts(); err != nil {
		return nil, err
	}

	if err := m.setupAgentInterface(); err != nil {
		return nil, err
	}

	if err := m.loadConfig(); err != nil {
		return nil, err
	}

	if err := m.setupConfigServer(); err != nil {
		return nil, err
	}

	if err := m.setupPolicy(); err != nil {
		return nil, err
	}

	if err := m.registerPolicyMetricsCollector(); err != nil {
		return nil, err
	}

	if err := m.setupRelay(); err != nil {
		pid, _ := pidfile.OwnerPid()
		if pid > 0 {
			m.Error("looks like we're already running as pid %d...", pid)
		}
		return nil, err
	}

	if err := m.setupRequestProcessing(); err != nil {
		return nil, err
	}

	if err := m.setupEventProcessing(); err != nil {
		return nil, err
	}

	if err := m.setupControllers(); err != nil {
		return nil, err
	}

	if err := m.setupIntrospection(); err != nil {
		return nil, err
	}

	return m, nil
}

// Start starts the resource manager.
func (m *resmgr) Start() error {
	m.Info("starting...")

	m.Lock()
	defer m.Unlock()

	if err := m.checkRuntime(context.Background()); err != nil {
		return err
	}

	if err := m.startControllers(); err != nil {
		return err
	}

	if err := m.startRequestProcessing(); err != nil {
		return err
	}

	if err := m.startEventProcessing(); err != nil {
		return err
	}

	m.startIntrospection()

	if err := m.relay.Start(); err != nil {
		return resmgrError("failed to start CRI relay: %v", err)
	}

	if err := pidfile.Remove(); err != nil {
		return resmgrError("failed to remove stale/old PID file: %v", err)
	}
	if err := pidfile.Write(); err != nil {
		return resmgrError("failed to write PID file: %v", err)
	}

	if opt.ForceConfig == "" {
		if err := m.configServer.Start(opt.ConfigSocket); err != nil {
			return resmgrError("failed to start configuration server: %v", err)
		}

		// We never store a forced configuration in the cache. However, if we're not
		// running with a forced configuration, and the configuration is pending to
		// get stored in the cache (IOW, it is a new one acquired from an agent),
		// then store it in the cache now.
		if m.conf != nil {
			m.cache.SetConfig(m.conf)
			m.conf = nil
		}
	}

	m.Info("up and running")

	return nil
}

// Stop stops the resource manager.
func (m *resmgr) Stop() {
	m.Info("shutting down...")

	m.Lock()
	defer m.Unlock()

	if m.signals != nil {
		close(m.signals)
		m.signals = nil
	}

	m.configServer.Stop()
	m.relay.Stop()
	m.stopIntrospection()
	m.stopEventProcessing()
}

// SetConfig pushes new configuration to the resource manager.
func (m *resmgr) SetConfig(conf *config.RawConfig) error {
	m.Info("applying new configuration from agent...")
	return m.setConfig(conf)
}

// SetAdjustment pushes new external adjustments to the resource manager.
func (m *resmgr) SetAdjustment(adjustment *config.Adjustment) map[string]error {
	m.Info("applying new adjustments from agent...")
	m.Lock()
	defer m.Unlock()
	return m.setAdjustment(adjustment)
}

// setConfigFromFile pushes new configuration to the resource manager from a file.
func (m *resmgr) setConfigFromFile(path string) error {
	m.Info("applying new configuration from file %s...", path)
	return m.setConfig(path)
}

// setAdjustment pushes new external adjustments to the resource manager.
func (m *resmgr) setAdjustment(adjustments *config.Adjustment) map[string]error {
	m.Info("applying new external adjustments from agent...")
	rebalance, errors := m.cache.SetAdjustment(adjustments)
	if rebalance {
		m.rebalance("setAdjustment")
	}
	return errors
}

// resetCachedPolicy resets the cached active policy and all of its data.
func (m *resmgr) resetCachedPolicy() int {
	m.Info("resetting active policy stored in cache...")
	defer logger.Flush()

	if ls, err := utils.IsListeningSocket(opt.RelaySocket); ls || err != nil {
		m.Error("refusing to reset, looks like an instance of %q is active at socket %q...",
			filepath.Base(os.Args[0]), opt.RelaySocket)
		return 1
	}

	if err := m.cache.ResetActivePolicy(); err != nil {
		m.Error("failed to reset active policy: %v", err)
		return 1
	}
	return 0
}

// resetCachedConfig resets any cached configuration.
func (m *resmgr) resetCachedConfig() int {
	m.Info("resetting cached configuration...")
	defer logger.Flush()

	if ls, err := utils.IsListeningSocket(opt.RelaySocket); ls || err != nil {
		m.Error("refusing to reset, looks like an instance of %q is active at socket %q...",
			filepath.Base(os.Args[0]), opt.RelaySocket)
		return 1
	}

	if err := m.cache.ResetConfig(); err != nil {
		m.Error("failed to reset cached configuration: %v", err)
		return 1
	}
	return 0
}

// setupCache creates a cache and reloads its last saved state if found.
func (m *resmgr) setupCache() error {
	var err error

	options := cache.Options{CacheDir: opt.RelayDir}
	if m.cache, err = cache.NewCache(options); err != nil {
		return resmgrError("failed to create cache: %v", err)
	}
	return nil
}

// setupAgentInterface sets up the connection to the node agent.
func (m *resmgr) setupAgentInterface() error {
	var err error

	if m.agent, err = agent.NewAgentInterface(opt.AgentSocket); err != nil {
		return err
	}
	return nil
}

// setupConfigServer sets up our configuration server for agent notifications.
func (m *resmgr) setupConfigServer() error {
	var err error

	if m.configServer, err = config.NewConfigServer(m.SetConfig, m.SetAdjustment); err != nil {
		return resmgrError("failed to create configuration notification server: %v", err)
	}
	return nil
}

// checkOpts checks the command line options for obvious errors.
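//
// Currently the only check is that a forced and a fallback configuration file
// are not given at the same time, since they are mutually exclusive.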
func (m *resmgr) checkOpts() error {
	if opt.ForceConfig != "" && opt.FallbackConfig != "" {
		return resmgrError("both fallback (%s) and forced (%s) configurations given",
			opt.FallbackConfig, opt.ForceConfig)
	}
	return nil
}

// loadConfig tries to pick and load (initial) configuration from a number of sources.
func (m *resmgr) loadConfig() error {
	//
	// We try to load initial configuration from a number of sources:
	//
	//   1. use forced configuration file if we were given one
	//   2. use last configuration stored in cache, if we have one and it applies
	//   3. use fallback configuration file if we were given one
	//   4. use empty/builtin default configuration, whatever that is...
	//

	if opt.ForceConfig != "" {
		m.Info("using forced configuration %s...", opt.ForceConfig)
		if err := pkgcfg.SetConfigFromFile(opt.ForceConfig); err != nil {
			return resmgrError("failed to load forced configuration %s: %v",
				opt.ForceConfig, err)
		}
		return m.setupConfigSignal(opt.ForceConfigSignal)
	}

	m.Info("trying last cached configuration...")
	if conf := m.cache.GetConfig(); conf != nil {
		err := pkgcfg.SetConfig(conf.Data)
		if err == nil {
			return nil
		}
		m.Error("failed to activate cached configuration: %v", err)
	}

	if opt.FallbackConfig != "" {
		m.Info("using fallback configuration %s...", opt.FallbackConfig)
		if err := pkgcfg.SetConfigFromFile(opt.FallbackConfig); err != nil {
			return resmgrError("failed to load fallback configuration %s: %v",
				opt.FallbackConfig, err)
		}
		return nil
	}

	m.Warn("no initial configuration found")
	return nil
}

// setupConfigSignal sets up a signal handler for reloading forced configuration.
func (m *resmgr) setupConfigSignal(signame string) error {
	if signame == "" || strings.HasPrefix(strings.ToLower(signame), "disable") {
		return nil
	}

	m.Info("setting up signal %s to reload forced configuration", signame)

	sig := unix.SignalNum(signame)
	if int(sig) == 0 {
		return resmgrError("invalid forced configuration reload signal '%s'", signame)
	}

	m.signals = make(chan os.Signal, 1)
	signal.Notify(m.signals, sig)

	go func(signals <-chan os.Signal) {
		// reload the forced configuration on every signal until the channel is closed
		for range signals {
			m.Info("reloading forced configuration %s...", opt.ForceConfig)
			if err := m.setConfigFromFile(opt.ForceConfig); err != nil {
				m.Error("failed to reload forced configuration %s: %v",
					opt.ForceConfig, err)
			}
		}
	}(m.signals)

	return nil
}

// setupPolicy sets up the policy with the configured/active backend.
func (m *resmgr) setupPolicy() error {
	var err error

	active := policy.ActivePolicy()
	cached := m.cache.GetActivePolicy()

	if active != cached {
		if cached != "" {
			if opt.DisablePolicySwitch {
				m.Error("can't switch policy from %q to %q: policy switching disabled",
					cached, active)
				return resmgrError("cannot load cache with policy %s for active policy %s",
					cached, active)
			}
			if err := m.cache.ResetActivePolicy(); err != nil {
				return resmgrError("failed to reset cached policy %q: %v", cached, err)
			}
		}
		m.cache.SetActivePolicy(active)
		m.policySwitch = true
	}

	options := &policy.Options{AgentCli: m.agent, SendEvent: m.SendEvent}
	if m.policy, err = policy.NewPolicy(m.cache, options); err != nil {
		return resmgrError("failed to create policy %s: %v", active, err)
	}

	return nil
}

// setupRelay sets up the CRI request relay.
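//
// Socket options may be given with a "unix://" prefix; the prefix is stripped
// here before the relay is created.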
func (m *resmgr) setupRelay() error {
	var err error

	options := relay.Options{
		RelaySocket:   opt.RelaySocket,
		ImageSocket:   opt.ImageSocket,
		RuntimeSocket: opt.RuntimeSocket,
		QualifyReqFn:  m.disambiguate,
	}

	options.ImageSocket = strings.TrimPrefix(options.ImageSocket, "unix://")
	options.RuntimeSocket = strings.TrimPrefix(options.RuntimeSocket, "unix://")
	options.RelaySocket = strings.TrimPrefix(options.RelaySocket, "unix://")

	if m.relay, err = relay.NewRelay(options); err != nil {
		return resmgrError("failed to create CRI relay: %v", err)
	}

	if err = m.relay.Setup(); err != nil {
		return resmgrError("failed to set up CRI relay: %v", err)
	}
	return nil
}

// setupControllers sets up the resource controllers.
func (m *resmgr) setupControllers() error {
	var err error

	if m.control, err = control.NewControl(); err != nil {
		return resmgrError("failed to create resource controller: %v", err)
	}

	return nil
}

// startControllers starts the resource controllers.
func (m *resmgr) startControllers() error {
	if err := m.control.StartStopControllers(m.cache, m.relay.Client()); err != nil {
		return resmgrError("failed to start resource controllers: %v", err)
	}

	return nil
}

// setupIntrospection prepares the resource manager for serving external introspection requests.
func (m *resmgr) setupIntrospection() error {
	mux := instrumentation.GetHTTPMux()

	i, err := introspect.Setup(mux, m.policy.Introspect())
	if err != nil {
		return resmgrError("failed to set up introspection service: %v", err)
	}
	m.introspect = i

	if !opt.DisableUI {
		if err := visualizer.Setup(mux); err != nil {
			m.Error("failed to set up UI for visualization: %v", err)
		}
	} else {
		m.Warn("built-in visualization UIs are disabled")
	}

	return nil
}

// startIntrospection starts serving the external introspection requests.
func (m *resmgr) startIntrospection() {
	m.introspect.Start()
	m.updateIntrospection()
}

// stopIntrospection stops serving external introspection requests.
func (m *resmgr) stopIntrospection() {
	m.introspect.Stop()
}

// updateIntrospection pushes updated data for external introspection.
func (m *resmgr) updateIntrospection() {
	m.introspect.Set(m.policy.Introspect())
}

// registerPolicyMetricsCollector registers the policy metrics collector.
func (m *resmgr) registerPolicyMetricsCollector() error {
	pc := &policyCollector.PolicyCollector{}
	pc.SetPolicy(m.policy)
	if pc.HasPolicySpecificMetrics() {
		return pc.RegisterPolicyMetricsCollector()
	}
	m.Info("%s policy has no policy-specific metrics.", policy.ActivePolicy())
	return nil
}

================================================
FILE: pkg/cri/resource-manager/sockets/sockets.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sockets

const (
	// Containerd is the CRI socket containerd listens on.
	Containerd = "/var/run/containerd/containerd.sock"
	// ResourceManagerRelay is the CRI socket the resource manager listens on.
	ResourceManagerRelay = "/var/run/cri-resmgr/cri-resmgr.sock"
	// ResourceManagerAgent is the socket the resource manager node agent listens on.
	ResourceManagerAgent = "/var/run/cri-resmgr/cri-resmgr-agent.sock"
	// ResourceManagerConfig is the socket for resource manager configuration notifications.
	ResourceManagerConfig = "/var/run/cri-resmgr/cri-resmgr-config.sock"

	// DirPermissions is the permissions to create the directory for sockets with.
	DirPermissions = 0711
)

================================================
FILE: pkg/cri/resource-manager/test-api.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build test
// +build test

package resmgr

import (
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
)

// ResourceManagerTestAPI is a post-test verification helper interface.
type ResourceManagerTestAPI interface {
	// GetCache returns the Cache the resource manager is running with.
	GetCache() cache.Cache
}

func (m *resmgr) GetCache() cache.Cache {
	return m.cache
}

================================================
FILE: pkg/cri/resource-manager/visualizer/bubbles/assets/css/style.css
================================================
body {
  font-family: Arial;
  color: white;
}

a {
  color: white;
  text-decoration: underline;
}

.node {
  cursor: pointer;
}

.node:hover {
  stroke: #000;
  stroke-width: 1.5px;
}

.node--leaf {
  fill: white;
}

.label {
  font: 11px "Helvetica Neue", Helvetica, Arial, sans-serif;
  text-anchor: middle;
  text-shadow: 0 1px 0 #fff, 1px 0 0 #fff, -1px 0 0 #fff, 0 -1px 0 #fff;
}

.label,
.inner--leaf,
.node--root {
  pointer-events: none;
}

.node--leaf:hover {
  fill: gainsboro;
}

.node--leaf:active {
  pointer-events: none;
}

================================================
FILE: pkg/cri/resource-manager/visualizer/bubbles/assets/index.html
================================================
================================================
FILE: pkg/cri/resource-manager/visualizer/bubbles/assets/js/ui-json-adapter.js
================================================
// CRI-RM introspection data to UI JSON data format adaptation.

"use strict";

function AdaptJSON(data) {
    "use strict";
    var root, nodes, containers

    console.log("should translate introspection to d3obj: %o", data)

    root = null
    nodes = new Object()
    containers = new Object()

    // create tree of pools
    for (var name in data.Pools) {
        var p = data.Pools[name]
        var node = new Object()

        console.log("got pool %o: %o", name, p)

        node.name = p.Name
        node.CPUs = p.CPUs
        node.Memory = p.Memory
        node.children = new Array()

        if (p.Parent == "") {
            root = node
            console.log("root set to %o: %o", p.Parent, node)
        }
        nodes[name] = node
    }
    for (var name in data.Pools) {
        var p = data.Pools[name]
        var n = nodes[name]

        if (n == null) {
            console.log("failed to look up node %o", name)
        }
        if (p.Children != null) {
            for (var i = 0; i < p.Children.length; i++) {
                var cname = p.Children[i]
                n.children.push(nodes[cname])
            }
        }
    }

    // create lookup table of containers
    for (var pid in data.Pods) {
        var p = data.Pods[pid]
        console.log("got pod %o", pid)
        for (var cid in p.Containers) {
            var c = p.Containers[cid]
            console.log("got container %o", cid)
            node = new Object()
            node.name = p.Name + ":" + c.Name
            node.CPURequest = c.CPURequest
            node.CPULimit = c.CPULimit
            node.MemoryRequest = c.MemoryRequest
            node.MemoryLimit = c.MemoryLimit
            node.Hints = c.Hints
            node.container = c
            containers[cid] = node
        }
    }

    // attach containers to pools
    for (var cid in data.Assignments) {
        var a = data.Assignments[cid]
        var n = containers[cid]
        var shared = ""
        var exclusive = ""
        var cpu = ""
        var sep = ""

        console.log("got assignment for container %o", cid)

        if (a.SharedCPUs != "") {
            shared = "shared:"+a.SharedCPUs+"(share:"+a.CPUShare+")"
        }
        if (a.ExclusiveCPUs != "") {
            exclusive = "exclusive:"+a.ExclusiveCPUs
        }
        if (exclusive != "") {
            cpu = exclusive
            sep = " + "
        }
        if (shared != "") {
            cpu += sep + shared
        }

        n.CPUs = cpu
        n.Memory = a.Memory
        n.RDTClass = a.RDTClass
        n.BlockIOClass = a.BlockIOClass

        p = nodes[a.Pool]
        p.children.push(n)
    }

    console.log("translated object: %o", root)

    return root
}

================================================
FILE: pkg/cri/resource-manager/visualizer/bubbles/assets/js/ui.js
================================================
var svg = d3.select("svg")
        .attr("preserveAspectRatio", "xMinYMin meet")
        .attr("viewBox", "0 0 800 800"),
    margin = 20,
    diameter = +svg.attr("width"),
    g = svg.append("g").attr("transform", "translate(" + diameter / 2 + "," + diameter / 2 + ")");

var green = d3.color("green");

var color = d3.scaleLinear()
    .domain([-1, 5])
    .range(["hsl(152,80%,80%)", "hsl(228,30%,40%)"])
    .interpolate(d3.interpolateHcl);

var pack = d3.pack()
    .size([diameter - margin, diameter - margin])
    .padding(100);

drawBubbleGraph("/introspect")

function drawBubbleGraph(filename) {
    console.log("redraw")
    g.selectAll("*").remove()
    d3.json(filename, function(error, introspectJSON) {
        if (error) throw error;
        var root = AdaptJSON(introspectJSON)

        root = d3.hierarchy(root)
            .sum(function(d) { return d.CPURequest; })
            .sort(function(a, b) { console.log(b.value + " - " + a.value); return b.value - a.value; });

        var focus = root,
            nodes = pack(root).descendants(),
            view;
        console.log(nodes);
        var circle = g.selectAll("circle")
            .data(nodes)
            .enter().append("circle")
            .attr("class", function(d) {
                console.log("dx: " + d.x + " dy: " + d.y + " dr: " + d.r);
                console.log(d.data.name);
                d.parent ? d.children ?
console.log("node") : console.log("node leaf") : console.log ("node root"); return d.parent ? d.children ? "node" : "node node--leaf" : "node node--root"; }) .on("click", function(d) { if (focus !== d) zoom(d), d3.event.stopPropagation(); }) .on("mouseover", function(d) {return d.children ? null : showData(d);}) .on("mouseout", function(d) {return d.children ? null : clearData(d);}) .style("fill", function(d) { return d.children ? color(d.depth) : null; }) let innercircle = g.selectAll("innercircle") .data(nodes) .enter().append("circle") .attr("class", function(d) { return d.parent ? d.children ? "inner--node" : "inner--leaf" : "inner--root"; }) let innerleaf = g.selectAll(".inner--leaf") .attr("r", function(d) {if (d.data.CPULimit || d.data.CPURequest) return (d.r * d.data.CPULimit / d.data.CPURequest);}) .style("fill-opacity", 0.2) .on("click", function(d) { if (focus !== d) zoom(d), d3.event.stopPropagation(); }) .style("fill", "red"); var text = g.selectAll("text") .data(nodes) .enter().append("text") .attr("class", "label") .style("fill-opacity", function(d) { return d.parent === root ? 1 : 0; }) .style("display", function(d) { return d.parent === root ? "inline" : "none"; }) .text(function(d) { return d.data.name;}); var node = g.selectAll("circle,innerleaf,text"); svg .style("background", color(-1)) .on("click", function() { zoom(root); }); zoomTo([root.x, root.y, root.r * 2 + margin]); function zoom(d) { var focus0 = focus; focus = d; var transition = d3.transition() .duration(d3.event.altKey ? 7500 : 750) .tween("zoom", function(d) { var i = d3.interpolateZoom(view, [focus.x, focus.y, focus.r * 2 + margin]); return function(t) { zoomTo(i(t)); }; }); svg.transition().selectAll("text") .filter(function(d) { return d.parent === focus || this.style.display === "inline"; }) .style("fill-opacity", function(d) { return d.parent === focus ? 
1 : 0; }) .on("start", function(d) { if (d.parent === focus) this.style.display = "inline"; }) .on("end", function(d) { if (d.parent !== focus) this.style.display = "none"; }); } function zoomTo(v) { var k = diameter / v[2]; view = v; node.attr("transform", function(d) { return "translate(" + (d.x - v[0]) * k + "," + (d.y - v[1]) * k + ")"; }); circle.attr("r", function(d) { if (d.children) return d.r *k; if (d.data.CPULimit && d.data.CPURequest) return d.r * k; else return 20 * k ; }) circle.style("fill", function(d) { if (d.children) return color(d.depth); if (!d.data.CPULimit || !d.data.CPURequest)return "grey"; else return color(d.depth);}); innerleaf.attr("r", function(d) { if (d.data.CPULimit && d.data.CPURequest) { if (d.data.CPULimit == d.data.CPURequest) return d.r * k; else return d.r * 2 *k; }}); } let current_circle = undefined; function clearData(d) { console.log("CLEAR DATA"); svg.selectAll("#details-popup").remove(); } function showData(d) { // clean up the previously selected circle if(current_circle !== undefined){ svg.selectAll("#details-popup").remove(); } console.log("here I am" + d.data.name); // select the circle current_circle = d3.select(this); console.log("here"); console.log(current_circle); let textblock = svg.selectAll("#details-popup") .data([d]) .enter() .append("g") .attr("id", "details-popup") .attr("font-size", 14) .attr("font-family", "sans-serif") .attr("text-anchor", "start") .attr("transform", d => `translate(0, 20)`); textblock.append("text") .text("Details:") .attr("font-weight", "bold"); textblock.append("text") .text(d => "Name: " + d.data.name) .attr("y", "16"); textblock.append("text") .text(d => "CPUs: " + d.data.CPUs) .attr("y", "32"); textblock.append("text") .text(d => "CPU Request: " + d.data.CPURequest) .attr("y", "48"); textblock.append("text") .text(d => "CPU Limit: " + d.data.CPULimit) .attr("y", "64"); textblock.append("text") .text(d => "Memory Request: " + d.data.MemoryRequest) .attr("y", "80"); textblock.append("text") .text(d => "Memory Limit: " + d.data.MemoryLimit) .attr("y", "96"); } }); } ================================================ FILE: pkg/cri/resource-manager/visualizer/bubbles/assets.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build test // +build test package bubbles import ( "net/http" ) // Assets is our UI assets for 'bubbles' visualizer, to serve over HTTP. var Assets = http.Dir("assets") ================================================ FILE: pkg/cri/resource-manager/visualizer/bubbles/assets_generate.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build ignore // +build ignore package main import ( "fmt" visualizer "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/visualizer/bubbles" "github.com/shurcooL/vfsgen" "log" ) const ( name = "bubbles" ) func main() { opts := vfsgen.Options{ PackageName: name, BuildTags: "!test", VariableName: "Assets", Filename: "assets_gendata.go", } if err := vfsgen.Generate(visualizer.Assets, opts); err != nil { log.Fatalln(fmt.Sprintf("failed to generate assets for %s UI: %v", name, err)) } } ================================================ FILE: pkg/cri/resource-manager/visualizer/bubbles/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bubbles import ( // The blank import is to make govendor happy. _ "github.com/shurcooL/vfsgen" ) //go:generate go run -tags=test assets_generate.go ================================================ FILE: pkg/cri/resource-manager/visualizer/builtins.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !dev // +build !dev package visualizer import ( // Pull in builtin visualizer implementations. "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/visualizer/bubbles" ) func init() { visualizers.register("bubbles", bubbles.Assets) } ================================================ FILE: pkg/cri/resource-manager/visualizer/flags.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // limitations under the License. package visualizer import ( "flag" ) // externalDirs is a comma-separated list of directories to search for visualizers. var externalDirs string // Register our command line options. func init() { flag.StringVar(&externalDirs, "external-visualizers", "", "comma-separated list of directories to search for external visualizers.") } ================================================ FILE: pkg/cri/resource-manager/visualizer/visualizer.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package visualizer import ( "fmt" "net/http" "os" "path/filepath" "sort" "strings" xhttp "github.com/intel/cri-resource-manager/pkg/instrumentation/http" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // HTTP URI prefix to register all visualizer implementations under. visualizerPrefix = "/ui" ) // Our logger instance. var log = logger.NewLogger("visualizer") // visualizer captures our runtime state. type visualizer struct { builtin map[string]http.FileSystem } // Visualizer singleton instance. var visualizers = &visualizer{ builtin: map[string]http.FileSystem{}, } // Register registers a builtin visualizer implementation. func Register(name string, dir http.FileSystem) { visualizers.register(name, dir) } // Setup sets up the given multiplexer to serve visualization implementations. func Setup(mux *xhttp.ServeMux) error { log.Info("activating visualization interface...") mux.Handle("/", http.RedirectHandler("/ui/index.html", http.StatusFound)) mux.Handle("/ui", http.RedirectHandler("/ui/index.html", http.StatusFound)) mux.Handle("/ui/builtin/", http.FileServer(visualizers)) mux.Handle("/ui/external/", http.FileServer(visualizers)) mux.HandleFunc("/ui/index.html", visualizers.generateIndexHTML) return nil } // Open is the http.FileSystem implementation for our visualizers. func (v *visualizer) Open(path string) (http.File, error) { log.Debug("HTTP request %q", path) relative, err := filepath.Rel(visualizerPrefix+"/", path) if err != nil { return nil, visualizerError("failed to resolve path %q: %v", path, err) } log.Debug("%s => %s", path, relative) split := strings.Split(relative, "/") if len(split) < 2 { return nil, visualizerError("failed to resolve relative path %q", relative) } kind, name := split[0], split[1] fs, err := v.getVisualizerFileSystem(kind, name) if err != nil { return nil, err } return fs.Open(filepath.Join(split[2:]...)) }
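// For example (derived from the code above, not part of the original file): a request for "/ui/builtin/bubbles/css/style.css" is resolved by Open to the relative path "builtin/bubbles/css/style.css", split into kind "builtin", name "bubbles" and the remaining "css/style.css", which is then opened from the http.FileSystem registered for the bubbles visualizer (http.Dir("assets") under the 'test' build tag, the vfsgen-generated filesystem otherwise).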
// getVisualizerFileSystem returns the http.FileSystem for the given visualizer. func (v *visualizer) getVisualizerFileSystem(kind, name string) (http.FileSystem, error) { switch kind { case "builtin": if dir, ok := v.builtin[name]; ok { return dir, nil } return nil, visualizerError("unknown builtin visualization UI %q", name) case "external": external := v.discoverExternalUIs() if path, ok := external[name]; ok { return http.FileSystem(http.Dir(path)), nil } return nil, visualizerError("unknown external visualization UI %q", name) } return nil, visualizerError("unknown visualization UI type %q", kind) } // Index page HTML header and footer. const ( uiPageHTMLHeader = `<html> <head> <title>CRI Resource Manager - Workload Placement Visualization</title> </head> <body> ` uiPageHTMLFooter = ` </body> </html> ` ) // generateIndexHTML generates an HTML page to access all known visualization UIs. func (v *visualizer) generateIndexHTML(w http.ResponseWriter, _ *http.Request) { builtinUIs := []string{} for name := range v.builtin { builtinUIs = append(builtinUIs, name) } sort.Strings(builtinUIs) externalUIs := []string{} for name := range v.discoverExternalUIs() { externalUIs = append(externalUIs, name) } sort.Strings(externalUIs) fmt.Fprintf(w, "%s", uiPageHTMLHeader) if len(builtinUIs)+len(externalUIs) == 0 { fmt.Fprintf(w, "No builtin or external visualization UIs found.") } else { for _, name := range builtinUIs { fmt.Fprintf(w, "&nbsp;&nbsp;&bull; <a href=\"builtin/%s/index.html\">%s</a><br>\n", name, name) } for _, name := range externalUIs { fmt.Fprintf(w, "&nbsp;&nbsp;&bull; <a href=\"external/%s/index.html\">external %s</a><br>\n", name, name) } } fmt.Fprintf(w, "%s\r\n", uiPageHTMLFooter) } // register registers a builtin visualizer implementation. func (v *visualizer) register(name string, dir http.FileSystem) { if _, ok := v.builtin[name]; ok { log.Error("builtin visualizer '%s' already registered", name) return } v.builtin[name] = dir log.Info("registered %s builtin visualizer...", name) } // discoverExternalUIs returns a map of external visualizer implementations. func (v *visualizer) discoverExternalUIs() map[string]string { external := make(map[string]string) for _, root := range strings.Split(externalDirs, ",") { filepath.Walk(root, func(path string, info os.FileInfo, err error) error { if err != nil || info.IsDir() || info.Name() != "index.html" { return nil } dir, err := filepath.Abs(filepath.Dir(path)) if err != nil { log.Error("failed to determine absolute directory for '%s': %v", path, err) return nil } name := v.uniqueExternalUIName(dir, external) external[name] = dir log.Debug("found external visualizer '%s' (%s)", name, dir) return nil }) } return external } // uniqueExternalUIName generates a unique name for the external visualizer. func (v *visualizer) uniqueExternalUIName(dir string, others map[string]string) string { base := filepath.Base(dir) if base == "assets" { base = filepath.Base(filepath.Dir(dir)) } cnt := 0 name := base for { if cnt > 0 { name = base + fmt.Sprintf("-%d", cnt) } if _, ok := others[name]; !ok { return name } cnt++ } } // visualizerError returns a formatted package-specific error. func visualizerError(format string, args ...interface{}) error { return fmt.Errorf("visualizer: "+format, args...) } ================================================ FILE: pkg/cri/server/server.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package server import ( "context" "fmt" "net" "os" "os/user" "path/filepath" "strconv" "strings" "time" "google.golang.org/grpc" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" "github.com/intel/cri-resource-manager/pkg/dump" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/instrumentation" "go.opencensus.io/trace" ) // Options contains the configurable options of our CRI server. type Options struct { // Socket is the path of our gRPC server's unix-domain socket. Socket string // User is the user ID for our gRPC socket. User int // Group is the group ID for our gRPC socket. Group int // Mode is the permission mode bits for our gRPC socket. Mode os.FileMode // QualifyReqFn produces a qualifier for disambiguating a CRI request/reply. QualifyReqFn func(interface{}) string } // Handler is a CRI server generic request handler. type Handler grpc.UnaryHandler // Interceptor is a hook that intercepts processing a request by a handler. type Interceptor func(context.Context, string, interface{}, Handler) (interface{}, error)
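// // For illustration only (a sketch, not part of the original source): a // catch-all interceptor that times each handler could be registered on a // Server created with NewServer as // //	err := srv.RegisterInterceptors(map[string]Interceptor{ //		"*": func(ctx context.Context, name string, req interface{}, h Handler) (interface{}, error) { //			start := time.Now() //			rpl, err := h(ctx, req) //			fmt.Printf("%s took %v\n", name, time.Since(start)) //			return rpl, err //		}, //	}) // // The "*" key acts as a wildcard: getInterceptor below falls back to it for // any method that has no more specific entry.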
// Server is the interface we expose for controlling our CRI server. type Server interface { // RegisterImageService registers the provided image service with the server. RegisterImageService(criv1.ImageServiceServer) error // RegisterRuntimeService registers the provided runtime service with the server. RegisterRuntimeService(criv1.RuntimeServiceServer) error // RegisterInterceptors registers the given interceptors with the server. RegisterInterceptors(map[string]Interceptor) error // Start starts the request processing loop (goroutine) of the server. Start() error // Stop stops the request processing loop (goroutine) of the server. Stop() // Chmod changes the permissions of the server's socket. Chmod(mode os.FileMode) error // Chown changes ownership of the server's socket. Chown(uid, gid int) error } // server is the implementation of Server. type server struct { logger.Logger listener net.Listener // socket our gRPC server listens on server *grpc.Server // our gRPC server options Options // server options interceptors map[string]Interceptor // request intercepting hooks runtime *criv1.RuntimeServiceServer // CRI runtime service image *criv1.ImageServiceServer // CRI image service } // NewServer creates a new server instance. func NewServer(options Options) (Server, error) { if !filepath.IsAbs(options.Socket) { return nil, serverError("invalid socket '%s', expecting absolute path", options.Socket) } s := &server{ Logger: logger.NewLogger("cri/server"), options: options, } return s, nil } // RegisterImageService registers an image service with the server. func (s *server) RegisterImageService(service criv1.ImageServiceServer) error { if s.image != nil { return serverError("can't register image service, already registered") } if err := s.createGrpcServer(); err != nil { return err } is := service s.image = &is criv1.RegisterImageServiceServer(s.server, s) return nil } // RegisterRuntimeService registers a runtime service with the server. func (s *server) RegisterRuntimeService(service criv1.RuntimeServiceServer) error { if s.runtime != nil { return serverError("can't register runtime server, already registered") } if err := s.createGrpcServer(); err != nil { return err } rs := service s.runtime = &rs criv1.RegisterRuntimeServiceServer(s.server, s) return nil } // RegisterInterceptors registers the given interceptors with the server. func (s *server) RegisterInterceptors(intercept map[string]Interceptor) error { if s.interceptors == nil { s.interceptors = make(map[string]Interceptor) } for method, i := range intercept { if _, ok := s.interceptors[method]; ok { return serverError("server already has a registered interceptor for '%s'", method) } s.interceptors[method] = i } return nil } // Start starts the server's request processing goroutine. func (s *server) Start() error { s.trainMessageDumper() s.Debug("starting server on socket %s...", s.options.Socket) go func() { s.server.Serve(s.listener) }() s.Debug("waiting for server to become ready...") if err := utils.WaitForServer(s.options.Socket, time.Second); err != nil { return serverError("starting CRI server failed: %v", err) } return nil } // Stop serving CRI requests. func (s *server) Stop() { s.Debug("stopping server on socket %s...", s.options.Socket) s.server.Stop() } // createGrpcServer creates a gRPC server instance on our socket.
func (s *server) createGrpcServer() error { if s.server != nil { return nil } if err := os.MkdirAll(filepath.Dir(s.options.Socket), sockets.DirPermissions); err != nil { return serverError("failed to create directory for socket %s: %v", s.options.Socket, err) } l, err := net.Listen("unix", s.options.Socket) if err != nil { if ls, lsErr := utils.IsListeningSocket(s.options.Socket); ls || lsErr != nil { return serverError("failed to create server: socket %q already exists", s.options.Socket) } s.Warn("removing abandoned socket %q...", s.options.Socket) os.Remove(s.options.Socket) l, err = net.Listen("unix", s.options.Socket) if err != nil { return serverError("failed to create server on socket %s: %v", s.options.Socket, err) } } s.listener = l if s.options.User >= 0 { if err := s.Chown(s.options.User, s.options.Group); err != nil { l.Close() s.listener = nil return err } } if s.options.Mode != 0 { if err := s.Chmod(s.options.Mode); err != nil { l.Close() s.listener = nil return err } } s.server = grpc.NewServer(instrumentation.InjectGrpcServerTrace()...) return nil } // Chmod changes the permissions of the server's socket. func (s *server) Chmod(mode os.FileMode) error { if s.listener != nil { if err := os.Chmod(s.options.Socket, mode); err != nil { return serverError("failed to change permissions of socket %q to %v: %v", s.options.Socket, mode, err) } s.Info("changed permissions of socket %q to %v", s.options.Socket, mode) } s.options.Mode = mode return nil } // Chown changes ownership of the server's socket. func (s *server) Chown(uid, gid int) error { if s.listener != nil { userName := strconv.FormatInt(int64(uid), 10) if u, err := user.LookupId(userName); u != nil && err == nil { userName = u.Name } groupName := strconv.FormatInt(int64(gid), 10) if g, err := user.LookupGroupId(groupName); g != nil && err == nil { groupName = g.Name } if err := os.Chown(s.options.Socket, uid, gid); err != nil { return serverError("failed to change ownership of socket %q to %s/%s: %v", s.options.Socket, userName, groupName, err) } s.Info("changed ownership of socket %q to %s/%s", s.options.Socket, userName, groupName) } s.options.User = uid s.options.Group = gid return nil } // getInterceptor finds an interceptor for the given method. func (s *server) getInterceptor(method string) (Interceptor, string) { name := method[strings.LastIndex(method, "/")+1:] if fn, ok := s.interceptors[name]; ok { return fn, name } if fn, ok := s.interceptors["*"]; ok { return fn, name } return nil, name } // intercept processes requests with a registered interceptor or the default handler. func (s *server) intercept(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { // Notes: // We record timestamps at various phases of processing a request to later // calculate local, CRI-server and total request processing latencies. We // wrap the original handler to get the pre- and post-communication stamps // with reasonable accuracy without having to get the stamps at the client. // // One thing that we currently fail to measure separately is the latency of // internally generated CRI requests (UpdateContainerResources). These are // now accounted to the local processing latency of the triggering request. 
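// // To make the bookkeeping below concrete (illustrative numbers only): with // start = t0, send = t0+1ms, recv = t0+9ms and end = t0+11ms, preprocessing // took send-start = 1ms, the CRI server took recv-send = 8ms, and // postprocessing took end-recv = 2ms; collectStatistics below reports these // three deltas and their 11ms total for intercepted requests.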
var kind string var start, send, recv, end time.Time var sync bool wrapHandler := func(ctx context.Context, req interface{}) (interface{}, error) { send = time.Now() rpl, err := handler(ctx, req) recv = time.Now() return rpl, err } fn, name := s.getInterceptor(info.FullMethod) if fn != nil { kind = "intercepted" sync = true } else { kind = "passthrough" fn = func(c context.Context, n string, r interface{}, h Handler) (interface{}, error) { rpl, err := h(c, r) return rpl, err } } qualif := s.qualifier(req) dump.RequestMessage(kind, info.FullMethod, qualif, req, sync) if span := trace.FromContext(ctx); span != nil { span.AddAttributes(trace.StringAttribute("kind", kind)) } start = time.Now() rpl, err := fn(ctx, name, req, wrapHandler) end = time.Now() elapsed := end.Sub(start) if err != nil { dump.ReplyMessage(kind, info.FullMethod, qualif, err, elapsed, false) } else { dump.ReplyMessage(kind, info.FullMethod, qualif, rpl, elapsed, false) } s.collectStatistics(kind, name, start, send, recv, end) logger.Flush() return rpl, err } // collectStatistics collects (should collect) request processing statistics. func (s *server) collectStatistics(kind, name string, start, send, recv, end time.Time) { if kind == "passthrough" { return } pre := send.Sub(start) server := recv.Sub(send) post := end.Sub(recv) s.Debug(" * latency for %s: preprocess: %v, CRI server: %v, postprocess: %v, total: %v", name, pre, server, post, pre+server+post) } // trainMessageDumper pre-trains the message dumper with our full set of service methods. func (s server) trainMessageDumper() { methods := []string{} svcinfo := s.server.GetServiceInfo() for _, info := range svcinfo { for _, m := range info.Methods { methods = append(methods, m.Name) } } dump.Train(methods) } // qualifier pulls a qualifier for disambiguation from a CRI request message. func (s server) qualifier(msg interface{}) string { if fn := s.options.QualifyReqFn; fn != nil { return fn(msg) } return "" } // Return a formatted server error. func serverError(format string, args ...interface{}) error { return fmt.Errorf("cri/server: "+format, args...) } ================================================ FILE: pkg/cri/server/services.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
package server import ( "context" "go.opencensus.io/trace" "google.golang.org/grpc" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" ) const ( apiVersion = "v1" imageService = "ImageService" listImages = "ListImages" imageStatus = "ImageStatus" pullImage = "PullImage" removeImage = "RemoveImage" imageFsInfo = "ImageFsInfo" runtimeService = "RuntimeService" version = "Version" runPodSandbox = "RunPodSandbox" stopPodSandbox = "StopPodSandbox" removePodSandbox = "RemovePodSandbox" podSandboxStatus = "PodSandboxStatus" listPodSandbox = "ListPodSandbox" createContainer = "CreateContainer" startContainer = "StartContainer" stopContainer = "StopContainer" removeContainer = "RemoveContainer" listContainers = "ListContainers" containerStatus = "ContainerStatus" updateContainerResources = "UpdateContainerResources" reopenContainerLog = "ReopenContainerLog" execSync = "ExecSync" exec = "Exec" attach = "Attach" portForward = "PortForward" containerStats = "ContainerStats" listContainerStats = "ListContainerStats" podSandboxStats = "PodSandboxStats" listPodSandboxStats = "ListPodSandboxStats" updateRuntimeConfig = "UpdateRuntimeConfig" status = "Status" checkpointContainer = "CheckpointContainer" getContainerEvents = "GetContainerEvents" listMetricDescriptors = "ListMetricDescriptors" listPodSandboxMetrics = "ListPodSandboxMetrics" runtimeConfig = "RuntimeConfig" ) func fqmn(service, method string) string { return "/runtime." + apiVersion + "." + service + "/" + method } func (s *server) interceptRequest(ctx context.Context, service, method string, req interface{}, handler grpc.UnaryHandler) (interface{}, error) { if span := trace.FromContext(ctx); span != nil { span.AddAttributes( trace.StringAttribute("service", service), trace.StringAttribute("method", method)) } return s.intercept(ctx, req, &grpc.UnaryServerInfo{Server: s, FullMethod: fqmn(service, method)}, handler) } func (s *server) ListImages(ctx context.Context, req *criv1.ListImagesRequest) (*criv1.ListImagesResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, listImages, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).ListImages(ctx, req.(*criv1.ListImagesRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListImagesResponse), err } func (s *server) ImageStatus(ctx context.Context, req *criv1.ImageStatusRequest) (*criv1.ImageStatusResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, imageStatus, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).ImageStatus(ctx, req.(*criv1.ImageStatusRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ImageStatusResponse), err } func (s *server) PullImage(ctx context.Context, req *criv1.PullImageRequest) (*criv1.PullImageResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, pullImage, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).PullImage(ctx, req.(*criv1.PullImageRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.PullImageResponse), err } func (s *server) RemoveImage(ctx context.Context, req *criv1.RemoveImageRequest) (*criv1.RemoveImageResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, removeImage, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).RemoveImage(ctx, req.(*criv1.RemoveImageRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RemoveImageResponse), err } func (s *server) ImageFsInfo(ctx context.Context, req 
*criv1.ImageFsInfoRequest) (*criv1.ImageFsInfoResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, imageFsInfo, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).ImageFsInfo(ctx, req.(*criv1.ImageFsInfoRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ImageFsInfoResponse), err } func (s *server) Version(ctx context.Context, req *criv1.VersionRequest) (*criv1.VersionResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, version, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).Version(ctx, req.(*criv1.VersionRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.VersionResponse), err } func (s *server) RunPodSandbox(ctx context.Context, req *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, runPodSandbox, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).RunPodSandbox(ctx, req.(*criv1.RunPodSandboxRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RunPodSandboxResponse), err } func (s *server) StopPodSandbox(ctx context.Context, req *criv1.StopPodSandboxRequest) (*criv1.StopPodSandboxResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, stopPodSandbox, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).StopPodSandbox(ctx, req.(*criv1.StopPodSandboxRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.StopPodSandboxResponse), err } func (s *server) RemovePodSandbox(ctx context.Context, req *criv1.RemovePodSandboxRequest) (*criv1.RemovePodSandboxResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, removePodSandbox, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).RemovePodSandbox(ctx, req.(*criv1.RemovePodSandboxRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RemovePodSandboxResponse), err } func (s *server) PodSandboxStatus(ctx context.Context, req *criv1.PodSandboxStatusRequest) (*criv1.PodSandboxStatusResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, podSandboxStatus, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).PodSandboxStatus(ctx, req.(*criv1.PodSandboxStatusRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.PodSandboxStatusResponse), err } func (s *server) ListPodSandbox(ctx context.Context, req *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listPodSandbox, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListPodSandbox(ctx, req.(*criv1.ListPodSandboxRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListPodSandboxResponse), err } func (s *server) CreateContainer(ctx context.Context, req *criv1.CreateContainerRequest) (*criv1.CreateContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, createContainer, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).CreateContainer(ctx, req.(*criv1.CreateContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.CreateContainerResponse), err } func (s *server) StartContainer(ctx context.Context, req *criv1.StartContainerRequest) (*criv1.StartContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, startContainer, req, func(ctx 
context.Context, req interface{}) (interface{}, error) { return (*s.runtime).StartContainer(ctx, req.(*criv1.StartContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.StartContainerResponse), err } func (s *server) StopContainer(ctx context.Context, req *criv1.StopContainerRequest) (*criv1.StopContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, stopContainer, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).StopContainer(ctx, req.(*criv1.StopContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.StopContainerResponse), err } func (s *server) RemoveContainer(ctx context.Context, req *criv1.RemoveContainerRequest) (*criv1.RemoveContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, removeContainer, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).RemoveContainer(ctx, req.(*criv1.RemoveContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RemoveContainerResponse), err } func (s *server) ListContainers(ctx context.Context, req *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listContainers, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListContainers(ctx, req.(*criv1.ListContainersRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListContainersResponse), err } func (s *server) ContainerStatus(ctx context.Context, req *criv1.ContainerStatusRequest) (*criv1.ContainerStatusResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, containerStatus, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ContainerStatus(ctx, req.(*criv1.ContainerStatusRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ContainerStatusResponse), err } func (s *server) UpdateContainerResources(ctx context.Context, req *criv1.UpdateContainerResourcesRequest) (*criv1.UpdateContainerResourcesResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, updateContainerResources, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).UpdateContainerResources(ctx, req.(*criv1.UpdateContainerResourcesRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.UpdateContainerResourcesResponse), err } func (s *server) ReopenContainerLog(ctx context.Context, req *criv1.ReopenContainerLogRequest) (*criv1.ReopenContainerLogResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, reopenContainerLog, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ReopenContainerLog(ctx, req.(*criv1.ReopenContainerLogRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ReopenContainerLogResponse), err } func (s *server) ExecSync(ctx context.Context, req *criv1.ExecSyncRequest) (*criv1.ExecSyncResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, execSync, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ExecSync(ctx, req.(*criv1.ExecSyncRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ExecSyncResponse), err } func (s *server) Exec(ctx context.Context, req *criv1.ExecRequest) (*criv1.ExecResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, exec, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).Exec(ctx, 
req.(*criv1.ExecRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ExecResponse), err } func (s *server) Attach(ctx context.Context, req *criv1.AttachRequest) (*criv1.AttachResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, attach, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).Attach(ctx, req.(*criv1.AttachRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.AttachResponse), err } func (s *server) PortForward(ctx context.Context, req *criv1.PortForwardRequest) (*criv1.PortForwardResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, portForward, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).PortForward(ctx, req.(*criv1.PortForwardRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.PortForwardResponse), err } func (s *server) ContainerStats(ctx context.Context, req *criv1.ContainerStatsRequest) (*criv1.ContainerStatsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, containerStats, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ContainerStats(ctx, req.(*criv1.ContainerStatsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ContainerStatsResponse), err } func (s *server) ListContainerStats(ctx context.Context, req *criv1.ListContainerStatsRequest) (*criv1.ListContainerStatsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listContainerStats, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListContainerStats(ctx, req.(*criv1.ListContainerStatsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListContainerStatsResponse), err } func (s *server) PodSandboxStats(ctx context.Context, req *criv1.PodSandboxStatsRequest) (*criv1.PodSandboxStatsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, podSandboxStats, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).PodSandboxStats(ctx, req.(*criv1.PodSandboxStatsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.PodSandboxStatsResponse), err } func (s *server) ListPodSandboxStats(ctx context.Context, req *criv1.ListPodSandboxStatsRequest) (*criv1.ListPodSandboxStatsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listPodSandboxStats, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListPodSandboxStats(ctx, req.(*criv1.ListPodSandboxStatsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListPodSandboxStatsResponse), err } func (s *server) UpdateRuntimeConfig(ctx context.Context, req *criv1.UpdateRuntimeConfigRequest) (*criv1.UpdateRuntimeConfigResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, updateRuntimeConfig, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).UpdateRuntimeConfig(ctx, req.(*criv1.UpdateRuntimeConfigRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.UpdateRuntimeConfigResponse), err } func (s *server) Status(ctx context.Context, req *criv1.StatusRequest) (*criv1.StatusResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, status, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).Status(ctx, req.(*criv1.StatusRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.StatusResponse), err } func (s *server) 
CheckpointContainer(ctx context.Context, req *criv1.CheckpointContainerRequest) (*criv1.CheckpointContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, checkpointContainer, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).CheckpointContainer(ctx, req.(*criv1.CheckpointContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.CheckpointContainerResponse), err } func (s *server) GetContainerEvents(req *criv1.GetEventsRequest, srv criv1.RuntimeService_GetContainerEventsServer) error { // TODO(klihub): interceptRequest is a unary interceptor. It can't handle streaming // requests so for now we short-circuit the call to the server here. return (*s.runtime).GetContainerEvents(req, srv) } func (s *server) ListMetricDescriptors(ctx context.Context, req *criv1.ListMetricDescriptorsRequest) (*criv1.ListMetricDescriptorsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listMetricDescriptors, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListMetricDescriptors(ctx, req.(*criv1.ListMetricDescriptorsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListMetricDescriptorsResponse), err } func (s *server) ListPodSandboxMetrics(ctx context.Context, req *criv1.ListPodSandboxMetricsRequest) (*criv1.ListPodSandboxMetricsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listPodSandboxMetrics, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListPodSandboxMetrics(ctx, req.(*criv1.ListPodSandboxMetricsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListPodSandboxMetricsResponse), err } func (s *server) RuntimeConfig(ctx context.Context, req *criv1.RuntimeConfigRequest) (*criv1.RuntimeConfigResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, runtimeConfig, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).RuntimeConfig(ctx, req.(*criv1.RuntimeConfigRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RuntimeConfigResponse), err } ================================================ FILE: pkg/dump/doc.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package dump // // This package implements the dumping of (gRPC) method calls where // each method is called with a single request struct and returns a // single reply struct or an error. Configuring what to dump happens // by specifying a comma-separated dump request on the command line. // // A dump request is a comma-separated list of dump specs: // <spec>[,<spec>,...,<spec>], where each spec is of the form // <[target:]request> // A request is either a request's name (gRPC method name without // the leading path), or a regexp for matching requests. // The dump targets are: 'off', 'name', 'full', 'count' by default.
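// // For example, the dump specification 'full:.*,short:.*Stop.*,off:.*List.*' // dumps all methods in full, except that .*Stop.* methods are dumped as // one-liners and .*List.* methods are suppressed; regexp patterns are // evaluated in order of appearance, with later specs overriding earlier ones.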
// var configHelp = ` Dump CRI gRPC method calls as YAML. This package implements configurable message dumping of CRI gRPC method calls. Both requests and the resulting replies or errors can be dumped. Messages can be both logged and dumped to a given file. Configuring what to dump happens using a dump configuration string of the following format: level1:pattern1[,level2:pattern2,...][,debug] Each level specifies a level of detail for method calls with names matching the corresponding pattern. A pattern can be a method call name to match just a single method, or it can be a regexp to match several methods. For regexps all the patterns are evaluated in order of appearance with the last one staying in effect. Exact method name patterns terminate the evaluation without any regexp processing. The possible levels of dumping detail are: off: suppress dumping of matching requests and replies short: short dump of requests and potential error replies full: full dump of both request and reply content as YAML Additionally including 'debug' in the configuration string will cause messages to be logged as debug messages with the 'message' log source. Note that debugging for this source needs to be explicitly enabled, otherwise messages are suppressed. If a dump file is specified, messages are additionally written to that file. Here is a sample configuration fragment to suppress all .*List.* calls, produce short dumps of all .*Stop.* calls, and full dumps of everything else, with dumps also going to the file '/tmp/cri-dump.log': dump: config: full:.*,short:.*Stop.*,off:.*List.* file: /tmp/cri-dump.log ` ================================================ FILE: pkg/dump/dump.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package dump // // This package implements the dumping of (gRPC) method calls where // each method is called with a single request struct and returns a // single reply struct or an error. Configuring what to dump happens // by specifying a comma-separated dump request on the command line. // // A dump request is a comma-separated list of dump specs: // <spec>[,<spec>,...,<spec>], where each spec is of the form // <[target:]request> // A request is either a request's name (gRPC method name without // the leading path), or a regexp for matching requests. // The dump targets are: 'off', 'name', 'full', 'count' by default. // import ( "fmt" "os" "sigs.k8s.io/yaml" "strings" "sync" "time" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // stampLayout is the timestamp format used in dump files. stampLayout = "2006-Jan-02 15:04:05.000" // stampLen is the length we adjust our printed latencies to. stampLen = len(stampLayout) )
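// // For example (illustrative values only): in a dump file a request line gets // a wall-clock stamp such as "[2020-Jan-02 15:04:05.000]", while its reply // gets the request latency in seconds, right-aligned to stampLen, such as // "[               +0.002304]"; see stamp() later in this file.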
// dumper encapsulates the runtime state of our message dumper. type dumper struct { sync.RWMutex // protect concurrent dumping/reconfiguration rules ruleset // dumping rules details map[string]level // corresponding dump details per method disabled bool // dumping globally disabled debug bool // dump as debug messages path string // extra dump file path file *os.File // extra dump file methods []string // training set for config q chan *dumpreq } // dumpreq is a request to dump a (CRI) request or a reply type dumpreq struct { dir direction kind string method string qualifier string msg interface{} latency time.Duration sync chan struct{} } // direction is a message direction, a request or a reply type direction int const ( request = iota reply nop ) // Our global dumper instance. var dump = newDumper() // Our logger instances, one for generic logging and another for message dumps. var log = logger.NewLogger("dump") var message = logger.NewLogger("message") // Train trains the message dumper for the given set of methods. func Train(methods []string) { dump.Lock() defer dump.Unlock() dump.train(methods) } // RequestMessage dumps a CRI request. func RequestMessage(kind, name, qualifier string, req interface{}, sync bool) { if !dump.disabled { var ch chan struct{} if sync { ch = make(chan struct{}) } dump.q <- &dumpreq{ dir: request, kind: kind, method: name, qualifier: qualifier, msg: req, sync: ch, } if ch != nil { _ = <-ch } } } // ReplyMessage dumps a CRI reply. func ReplyMessage(kind, name, qualifier string, rpl interface{}, latency time.Duration, sync bool) { if !dump.disabled { var ch chan struct{} if sync { ch = make(chan struct{}) } dump.q <- &dumpreq{ dir: reply, kind: kind, method: name, qualifier: qualifier, msg: rpl, latency: latency, sync: ch, } if ch != nil { _ = <-ch } } } // Sync returns once the last message currently being dumped is finished. func Sync() { if !dump.disabled { dump.sync() } } // newDumper creates a dumper instance. func newDumper() *dumper { d := &dumper{q: make(chan *dumpreq, 16)} d.run() return d } // run runs the dumping goroutine of the dumper. func (d *dumper) run() { go func() { for req := range d.q { if req.dir != nop { method := methodName(req.method) d.RLock() detail, ok := d.details[method] if !ok { detail = d.rules.detailOf(method) } d.RUnlock() switch detail { case Name: d.name(req.dir, req.kind, method, req.qualifier, req.msg, req.latency) case Full: d.full(req.dir, req.kind, method, req.qualifier, req.msg, req.latency) } } if req.sync != nil { close(req.sync) } } }() } // sync waits until all the present messages in the queue are dumped. func (d *dumper) sync() { ch := make(chan struct{}) dump.q <- &dumpreq{dir: nop, sync: ch} _ = <-ch } // configure (re)configures the dumper func (d *dumper) configure(o *options) { d.Lock() defer d.Unlock() d.debug = o.Debug d.rules = o.rules.duplicate() if d.path != o.File || d.disabled != o.Disabled { if d.file != nil { log.Info("closing old message dump file %q...", d.path) d.file.Close() d.file = nil } d.disabled = o.Disabled if d.disabled { return } d.path = o.File if d.path != "" { var err error log.Info("opening new message dump file %q...", d.path) d.file, err = os.Create(d.path) if err != nil { log.Error("failed to open file %q: %v", d.path, err) } } } d.train(nil) } // train trains the dumper with the given set of messages.
func (d *dumper) train(names []string) { if names != nil { d.methods = make([]string, len(names), len(names)) } else { names = d.methods } d.details = make(map[string]level) for idx, name := range names { method := methodName(name) detail := d.rules.detailOf(method) log.Info("%s: %v", method, detail) d.methods[idx] = method d.details[method] = detail } } // name does a name-only dump of the given message. func (d *dumper) name(dir direction, kind, method, qualifier string, msg interface{}, latency time.Duration) { var hdr string switch dir { case request: return case reply: if qualifier != "" { hdr = qualifier + " " + method + " " + dir.arrow() + " " } else { hdr = method + " " + dir.arrow() + " " } if err, ok := msg.(error); ok { d.warn(dir, latency, hdr+"(%s) FAILED: %v", kind, err) } else { d.line(dir, latency, hdr+"(%s) REQUEST", kind) } } } // full does a full dump of the given message. func (d *dumper) full(dir direction, kind, method, qualifier string, msg interface{}, latency time.Duration) { var hdr string if qualifier != "" { hdr = qualifier + " " + method + " " + dir.arrow() + " " } else { hdr = method + " " + dir.arrow() + " " } switch dir { case request: raw, _ := yaml.Marshal(msg) str := strings.TrimRight(string(raw), "\n") if strings.LastIndexByte(str, '\n') > 0 { d.line(dir, latency, hdr+"(%s) REQUEST", kind) d.block(dir, latency, hdr+" ", str) } else { d.line(dir, latency, hdr+"(%s) REQUEST %s", kind, str) } case reply: if err, ok := msg.(error); ok { d.warn(dir, latency, hdr+"(%s) FAILED", kind) d.warn(dir, latency, hdr+" %v", err) } else { raw, _ := yaml.Marshal(msg) str := strings.TrimRight(string(raw), "\n") if strings.LastIndexByte(str, '\n') > 0 { d.line(dir, latency, hdr+"(%s) REPLY", kind) d.block(dir, latency, hdr+" ", str) } else { d.line(dir, latency, hdr+"(%s) REPLY %s", kind, str) } } } } // line dumps a single line. func (d *dumper) line(dir direction, latency time.Duration, format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if !d.debug { message.Info("%s", msg) } else { message.Debug("%s", msg) } if d.file != nil { d.tofile(dir, latency, "%s", msg) } } // block dumps a block of lines. func (d *dumper) block(dir direction, latency time.Duration, prefix, msg string) { if !d.debug { message.InfoBlock(prefix, msg) } else { message.DebugBlock(prefix, msg) } if d.file != nil { for _, line := range strings.Split(msg, "\n") { d.tofile(dir, latency, "%s%s", prefix, line) } } } // warn dumps a single line as a warning. func (d *dumper) warn(dir direction, latency time.Duration, format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) message.Warn("%s", msg) if d.file != nil { d.tofile(dir, latency, "%s", msg) } } // tofile dumps a single line to a file. func (d *dumper) tofile(dir direction, latency time.Duration, format string, args ...interface{}) { fmt.Fprintf(d.file, "["+stamp(dir, latency)+"] "+format+"\n", args...) } // stamp produces a stamp from a direction and a latency. func stamp(dir direction, latency time.Duration) string { switch dir { case request: return time.Now().Format(stampLayout) case reply: return fmt.Sprintf("%*s", stampLen, fmt.Sprintf("+%f", latency.Seconds())) } return "" } // String returns a string representing the direction. func (d direction) String() string { switch d { case request: return "request" case reply: return "reply" } return "unknown" } // arrow returns an 'ASCII arrow' for the direction. 
func (d direction) arrow() string { switch d { case request: return "=>" case reply: return "<=" } return "<=???=>" } // methodName returns the basename of a method. func methodName(method string) string { return method[strings.LastIndex(method, "/")+1:] } // dumpError produces a formatted package-specific error. func dumpError(format string, args ...interface{}) error { return fmt.Errorf("dump: "+format, args...) } ================================================ FILE: pkg/dump/dump_test.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package dump import ( "fmt" "os" "strings" "sync" "testing" "time" "github.com/intel/cri-resource-manager/pkg/config" ) // TestConfigParsing tests parsing of dump configuration strings. func TestConfigParsing(t *testing.T) { tcases := []string{ DefaultConfig, "off:.*", "full:.*", "name:.*", "off:.*,full:CreateContainer,StartContainer,StopContainer,RemoveContainer", "off:.*,full:.*((PodSandbox)|(Container)),off:.*((Status)|(List)).*", } for _, cfg := range tcases { t.Run("parse config "+cfg, func(t *testing.T) { r := ruleset{} if err := r.parse(cfg); err != nil { t.Errorf("failed to parse dump config string '%s': %v", cfg, err) } if chk := r.String(); chk != cfg { switch { case strings.Replace(cfg, "short:", "name:", 1) == chk: case strings.Replace(cfg, "suppress:", "off:", 1) == chk: case strings.Replace(cfg, "verbose:", "full:", 1) == chk: default: t.Errorf("expected %s, got %s", cfg, chk) } } }) } } // TestFiltering tests message filtering, and a bit of formatting.
func fooTestFiltering(t *testing.T) { messages := []interface{}{ mkmsg(&Type1Message1{}), mkmsg(&Type1Message2{}), mkmsg(&Type1Message3{}), mkmsg(&Type1Whatever{}), mkmsg(&Type2Message1{}), mkmsg(&Type2Message2{}), mkmsg(&Type2Message3{}), mkmsg(&Type2Whatever{}), mkmsg(&Type3Message1{}), mkmsg(&Type3Message2{}), mkmsg(&Type3Message3{}), mkmsg(&Type3Whatever{}), } tcases := []filterTest{ { messages: messages, config: "off:.*", }, { messages: messages, config: "name:Type1.*", details: map[string]level{ msgmethod(&Type1Message1{}): Name, msgmethod(&Type1Message2{}): Name, msgmethod(&Type1Message3{}): Name, msgmethod(&Type1Whatever{}): Name, }, }, { messages: messages, config: "full:.*Whatever.*", details: map[string]level{ msgmethod(&Type1Whatever{}): Full, msgmethod(&Type2Whatever{}): Full, msgmethod(&Type3Whatever{}): Full, }, }, { messages: messages, config: "full:.*Whatever.*,off:Type1.*", details: map[string]level{ msgmethod(&Type2Whatever{}): Full, msgmethod(&Type3Whatever{}): Full, }, }, { messages: messages, config: "full:.*Message.*,off:Type2.*,name:Type2Whatever", details: map[string]level{ msgmethod(&Type1Message1{}): Full, msgmethod(&Type1Message2{}): Full, msgmethod(&Type1Message3{}): Full, msgmethod(&Type2Whatever{}): Name, msgmethod(&Type3Message1{}): Full, msgmethod(&Type3Message2{}): Full, msgmethod(&Type3Message3{}): Full, }, }, } for _, tc := range tcases { t.Run("filter with config "+tc.config, func(t *testing.T) { tc.run(t) }) } } type filterTest struct { messages []interface{} config string details map[string]level } const ( // test log marker to identify logged messages marker = "" ) func (ft *filterTest) setup(train bool) *testlog { // override message logger logger := &testlog{} message = logger // create training set/reset messages methods := []string{} if train { for _, msg := range ft.messages { methods = append(methods, msgname(msg)) } } Train(methods) // trigger reconfiguration opt.Config = ft.config opt.configNotify(config.UpdateEvent, config.ConfigFile) return logger } func (ft *filterTest) dumpMessages(logger *testlog) []string { // dump all test messages and a fake reply for each for _, msg := range ft.messages { RequestMessage(marker, msgname(msg), "", msg, false) ReplyMessage(marker, msgname(msg), "", Reply, time.Duration(0), false) } dump.sync() return logger.info } func (ft *filterTest) parseLogs(t *testing.T, logged []string) (map[string]int, map[string]int) { // count logged entries and lines per message lines := map[string]int{} entries := map[string]int{} for _, entry := range logged { entry = strings.Trim(entry, " ") split := strings.Split(entry, " ") method := "" switch { // log line: (marker) {REQUEST|REPLY} method case len(split) > 1 && split[0] == "("+marker+")": method = split[2] entries[method] = entries[method] + 1 case len(split) > 1: // log line continuation: method {=>|<=} content... 
method = split[0] } if method == "" { t.Errorf("failed to parse log entry '%s' for config '%s'", entry, ft.config) } detail, ok := ft.details[method] if !ok || detail == Off { t.Errorf("message '%s' should have been filtered for config '%s'", method, ft.config) } } return lines, entries } func (ft *filterTest) checkResult(t *testing.T, entries map[string]int, lines map[string]int) { // check correctness of logged entries and lines per method for method, lineCnt := range lines { logcnt := entries[method] expected := 0 switch ft.details[method] { case Full: expected = logcnt/2*(1+LinesPerRequest) + logcnt/2*(1+LinesPerReply) case Name: expected = logcnt } if lineCnt != expected { t.Errorf("message '%s' expected %d logged lines, got %d for config '%s'", method, expected, lineCnt, ft.config) } } } func (ft *filterTest) run(t *testing.T) { for _, train := range []bool{false, true} { logger := ft.setup(train) logged := ft.dumpMessages(logger) lines, entries := ft.parseLogs(t, logged) ft.checkResult(t, entries, lines) } } // // a few message types for testing // type Message struct { Body []string } type Type1Message1 Message type Type1Message2 Message type Type1Message3 Message type Type1Whatever Message type Type2Message1 Message type Type2Message2 Message type Type2Message3 Message type Type2Whatever Message type Type3Message1 Message type Type3Message2 Message type Type3Message3 Message type Type3Whatever Message const ( LinesPerRequest = 6 LinesPerReply = 2 ) var ( Reply = []string{"reply", "OK"} msgCnt int ) func mkmsg(o interface{}) interface{} { msgCnt++ body := []string{ "this", "is", "message", fmt.Sprintf("#%d", msgCnt), fmt.Sprintf("of type (%T)", o), } switch o.(type) { case *Type1Message1: m := o.(*Type1Message1) m.Body = body case *Type1Message2: m := o.(*Type1Message2) m.Body = body case *Type1Message3: m := o.(*Type1Message3) m.Body = body case *Type1Whatever: m := o.(*Type1Whatever) m.Body = body case *Type2Message1: m := o.(*Type2Message1) m.Body = body case *Type2Message2: m := o.(*Type2Message2) m.Body = body case *Type2Message3: m := o.(*Type2Message3) m.Body = body case *Type2Whatever: m := o.(*Type2Whatever) m.Body = body case *Type3Message1: m := o.(*Type3Message1) m.Body = body case *Type3Message2: m := o.(*Type3Message2) m.Body = body case *Type3Message3: m := o.(*Type3Message3) m.Body = body case *Type3Whatever: m := o.(*Type3Whatever) m.Body = body } return o } func msgname(o interface{}) string { return strings.ReplaceAll(fmt.Sprintf("%T", o), ".", "/") } func msgmethod(o interface{}) string { return methodName(msgname(o)) } // // test logger to override and check dumping/logging for test. // type testlog struct { sync.Mutex info []string warn []string err []string debug []string } func (t *testlog) reset() { t.Lock() defer t.Unlock() t.info = nil t.warn = nil t.err = nil t.debug = nil } func (t *testlog) log(save *[]string, prefix, format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) *save = append(*save, msg) fmt.Println(" " + prefix + " " + msg) } func (t *testlog) Info(format string, args ...interface{}) { t.Lock() defer t.Unlock() t.log(&t.info, "I:", format, args...) } func (t *testlog) Warn(format string, args ...interface{}) { t.Lock() defer t.Unlock() t.log(&t.warn, "W:", format, args...) } func (t *testlog) Error(format string, args ...interface{}) { t.Lock() defer t.Unlock() t.log(&t.err, "E:", format, args...) 
}

func (t *testlog) Debug(format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	t.log(&t.debug, "D:", format, args...)
}

func (t *testlog) Fatal(format string, args ...interface{}) {
	msg := fmt.Sprintf(format, args...)
	fmt.Printf(" Fatal error: %s\n", msg)
	os.Exit(1)
}

func (*testlog) Panic(format string, args ...interface{}) {
	msg := fmt.Sprintf(format, args...)
	fmt.Printf(" Panic: %s\n", msg)
	panic(msg)
}

func (t *testlog) Infof(format string, args ...interface{}) {
	t.Info(format, args...)
}

func (t *testlog) Warnf(format string, args ...interface{}) {
	t.Warn(format, args...)
}

func (t *testlog) Errorf(format string, args ...interface{}) {
	t.Error(format, args...)
}

func (t *testlog) Debugf(format string, args ...interface{}) {
	t.Debug(format, args...)
}

func (t *testlog) Fatalf(format string, args ...interface{}) {
	t.Fatal(format, args...)
}

func (t *testlog) Panicf(format string, args ...interface{}) {
	t.Panic(format, args...)
}

func (*testlog) Block(fn func(string, ...interface{}), prfx string, frmt string, a ...interface{}) {
	for _, line := range strings.Split(fmt.Sprintf(frmt, a...), "\n") {
		fn("%s%s", prfx, line)
	}
}

func (t *testlog) InfoBlock(prefix string, format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	for _, line := range strings.Split(fmt.Sprintf(format, args...), "\n") {
		t.log(&t.info, "I:", "%s%s", prefix, line)
	}
}

func (t *testlog) WarnBlock(prefix string, format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	for _, line := range strings.Split(fmt.Sprintf(format, args...), "\n") {
		// record warnings in the warning slice
		t.log(&t.warn, "W:", "%s%s", prefix, line)
	}
}

func (t *testlog) ErrorBlock(prefix string, format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	for _, line := range strings.Split(fmt.Sprintf(format, args...), "\n") {
		t.log(&t.err, "E:", "%s%s", prefix, line)
	}
}

func (t *testlog) DebugBlock(prefix string, format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	for _, line := range strings.Split(fmt.Sprintf(format, args...), "\n") {
		// record debug lines with the debug prefix
		t.log(&t.debug, "D:", "%s%s", prefix, line)
	}
}

func (*testlog) EnableDebug() bool { return true }
func (*testlog) DebugEnabled() bool { return true }
func (*testlog) Stop() {}
func (*testlog) Source() string { return "" }

================================================
FILE: pkg/dump/flags.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dump

//
// This package implements the dumping of (gRPC) method calls where
// each method is called with a single request struct and returns a
// single reply struct or an error. Configuring what to dump happens
// by specifying a comma-separated dump request on the command line.
//
// A dump request is a comma-separated list of dump specs:
//     <spec>[,<spec>,...,<spec>], where each spec is of the form
//     <[target:]request>
// A request is either a request's name (gRPC method name without
// the leading path), or a regexp for matching requests.
// The dump targets are: 'off' (alias 'suppress'), 'name' (alias
// 'short'), and 'full' (alias 'verbose').
//

import (
	"fmt"
	re "regexp"
	"strings"

	"github.com/intel/cri-resource-manager/pkg/config"
)

const (
	// DefaultConfig is the default dump configuration.
	DefaultConfig = "off:.*,short:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.*"
)

// Dumping options configurable via the command line or pkg/config.
type options struct {
	Debug    bool    // log messages as debug messages
	Disabled bool    // whether dumping is globally disabled
	File     string  // file to also dump to, if set
	Config   string  // dumping configuration
	rules    ruleset // corresponding dumping rules
}

// ruleset is an ordered set of dumping rules.
type ruleset []*rule

// rule is a single dumping rule, declaring verbosity of a single or a set of methods.
type rule struct {
	method string     // method, '*' wildcard, or regexp matching a set of methods
	regexp *re.Regexp // compiled regexp, if applicable
	detail level      // dumping verbosity
}

// level describes the level of detail to dump.
type level int

const (
	// Off suppresses dumping of matching methods.
	Off level = iota
	// Name dumps only success/failure status of matching methods.
	Name
	// Full dumps matching methods with full level of detail.
	Full
)

// Our runtime configuration.
var opt = defaultOptions().(*options)

// parse parses the given string into a ruleset.
func (set *ruleset) parse(value string) error {
	prev := Full
	for _, spec := range strings.Split(value, ",") {
		r := &rule{}
		split := strings.SplitN(spec, ":", 2)
		switch len(split) {
		case 1:
			r.detail = prev
			r.method = split[0]
		case 2:
			switch strings.ToLower(split[0]) {
			case "off", "suppress":
				r.detail = Off
			case "name", "short":
				r.detail = Name
			case "full", "verbose":
				r.detail = Full
			default:
				return dumpError("invalid dump level '%s'", split[0])
			}
			r.method = split[1]
			prev = r.detail
		}
		if strings.ContainsAny(r.method, ".*?+()[]|") && r.method != "*" {
			regexp, err := re.Compile(r.method)
			if err != nil {
				return dumpError("invalid dump method regexp '%s': %v", r.method, err)
			}
			r.regexp = regexp
		}
		*set = append(*set, r)
	}
	return nil
}

// String returns the ruleset as a string.
func (set *ruleset) String() string {
	if set == nil || *set == nil {
		return ""
	}
	prev := Off
	value, sep := "", ""
	for idx, r := range *set {
		detail := ""
		if idx == 0 || r.detail != prev {
			detail = r.detail.String() + ":"
		}
		value += sep + detail + r.method
		sep = ","
		prev = r.detail
	}
	return value
}

// detailOf returns the level of detail for dumping the given method.
func (set *ruleset) detailOf(method string) level {
	log.Debug("%s: checking level of detail...", method)
	if set == nil {
		return Off
	}
	detail := Off
	for _, r := range *set {
		log.Debug(" - checking rule '%s'...", r.method)
		switch {
		case r.method == method:
			log.Debug(" => exact match: %v", r.detail)
			return r.detail
		case r.method == "*":
			log.Debug(" => wildcard match: %v", r.detail)
			detail = r.detail
		case r.regexp != nil && r.regexp.MatchString(method):
			log.Debug(" => regexp match (%s): %v", r.method, r.detail)
			detail = r.detail
		}
	}
	return detail
}

// duplicate creates a (shallow) copy of the ruleset.
func (set *ruleset) duplicate() ruleset {
	if set == nil || *set == nil {
		return nil
	}
	cp := make([]*rule, len(*set))
	copy(cp, *set)
	return cp
}

// String returns the level of detail as a string.
func (detail level) String() string {
	switch detail {
	case Off:
		return "off"
	case Name:
		return "name"
	case Full:
		return "full"
	}
	// fallback for out-of-range values
	return fmt.Sprintf("<unknown level %d>", detail)
}
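// Editor's sketch (not part of the original sources): specs without an
// explicit target inherit the previous one, so the two strings below parse
// into rulesets with the same canonical form. Method names are hypothetical.
func exampleTargetInheritance() bool {
	a, b := ruleset{}, ruleset{}
	_ = a.parse("full:CreateContainer,StartContainer")
	_ = b.parse("full:CreateContainer,full:StartContainer")
	return a.String() == b.String() // true
}

// defaultOptions returns a new options instance, initialized to defaults.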
func defaultOptions() interface{} { o := &options{Config: DefaultConfig} o.rules.parse(DefaultConfig) return o } // configNotify updates our runtime configuration. func (o *options) configNotify(event config.Event, _ config.Source) error { log.Info("message dumper configuration %v", event) log.Info(" * config: %s", o.Config) rules := ruleset{} if err := rules.parse(o.Config); err != nil { return err } o.rules = rules log.Info(" * parsed: %s", o.rules.String()) log.Info(" * dump file: %v", opt.File) log.Info(" * log with debug: %v", opt.Debug) dump.configure(o) return nil } // Register us for command line parsing and configuration handling. func init() { opt.rules.parse(opt.Config) config.Register("dump", configHelp, opt, defaultOptions, config.WithNotify(opt.configNotify)) } ================================================ FILE: pkg/instrumentation/flags.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "encoding/json" "os" "strconv" "strings" "time" "go.opencensus.io/trace" "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/utils" ) // Sampling defines how often trace samples are taken. type Sampling float64 const ( // Disabled is the trace configuration for disabling tracing. Disabled Sampling = 0.0 // Production is a trace configuration for production use. Production Sampling = 0.1 // Testing is a trace configuration for testing. Testing Sampling = 1.0 // defaultSampling is the default sampling frequency. defaultSampling = "0" // defaultReportPeriod is the default report period defaultReportPeriod = "15s" // defaultJaegerCollector is the default Jaeger collector endpoint. defaultJaegerCollector = "" // defaultJaegerAgent is the default Jaeger agent endpoint. defaultJaegerAgent = "" // defaultHTTPEndpoint is the default HTTP endpoint serving Prometheus /metrics. defaultHTTPEndpoint = "" // defaultPrometheusExport is the default state for Prometheus exporting. defaultPrometheusExport = "false" ) // options encapsulates our configurable instrumentation parameters. type options optstruct type optstruct struct { // Sampling is the sampling frequency for traces. Sampling Sampling // ReportPeriod is the OpenCensus view reporting period. ReportPeriod time.Duration // jaegerCollector is the URL to the Jaeger HTTP Thrift collector. JaegerCollector string // jaegerAgent, if set, defines the address of a Jaeger agent to send spans to. JaegerAgent string // HTTPEndpoint is our HTTP endpoint, used among others to export Prometheus /metrics. HTTPEndpoint string // PrometheusExport defines whether we export /metrics to/for Prometheus. PrometheusExport bool `json:"PrometheusExport"` } // UnmarshalJSON is a resetting JSON unmarshaller for options. 
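// Editor's sketch (not part of the original sources): because the
// unmarshaller below resets the whole struct, fields absent from the
// incoming JSON revert to their zero values instead of keeping their
// previous settings. The endpoint value is hypothetical.
func exampleResettingUnmarshal() options {
	o := options{HTTPEndpoint: ":8891", PrometheusExport: true}
	_ = json.Unmarshal([]byte(`{"HTTPEndpoint":":8891"}`), &o)
	return o // o.PrometheusExport is now false; o.HTTPEndpoint is ":8891"
}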
func (o *options) UnmarshalJSON(raw []byte) error { ostruct := optstruct{} if err := json.Unmarshal(raw, &ostruct); err != nil { return instrumentationError("failed to unmashal options: %v", err) } *o = options(ostruct) return nil } // Our instrumentation options. var opt = defaultOptions().(*options) // MarshalJSON is the JSON marshaller for Sampling values. func (s Sampling) MarshalJSON() ([]byte, error) { return json.Marshal(s.String()) } // UnmarshalJSON is the JSON unmarshaller for Sampling values. func (s *Sampling) UnmarshalJSON(raw []byte) error { var obj interface{} if err := json.Unmarshal(raw, &obj); err != nil { return instrumentationError("failed to unmarshal Sampling value: %v", err) } switch v := obj.(type) { case string: if err := s.Parse(v); err != nil { return err } case float64: *s = Sampling(v) default: return instrumentationError("invalid Sampling value of type %T: %v", obj, obj) } return nil } // Parse parses the given string to a Sampling value. func (s *Sampling) Parse(value string) error { switch strings.ToLower(value) { case "disabled": *s = Disabled case "testing": *s = Testing case "production": *s = Production default: f, err := strconv.ParseFloat(value, 64) if err != nil { return instrumentationError("invalid Sampling value '%s': %v", value, err) } *s = Sampling(f) } return nil } // String returns the Sampling value as a string. func (s Sampling) String() string { switch s { case Disabled: return "disabled" case Production: return "production" case Testing: return "testing" } return strconv.FormatFloat(float64(s), 'f', -1, 64) } // Sampler returns a trace.Sampler corresponding to the Sampling value. func (s Sampling) Sampler() trace.Sampler { if s == Disabled { return trace.NeverSample() } return trace.ProbabilitySampler(float64(s)) } // parseEnv parses the environment for default values. func parseEnv(name, defval string, parsefn func(string) error) { if envval := os.Getenv(name); envval != "" { err := parsefn(envval) if err == nil { return } log.Error("invalid environment %s=%q: %v, using default %q", name, envval, err, defval) } if err := parsefn(defval); err != nil { log.Error("invalid default %s=%q: %v", name, defval, err) } } // defaultOptions returns a new options instance, all initialized to defaults. func defaultOptions() interface{} { o := &options{} type param struct { defval string parsefn func(string) error } params := map[string]param{ "JAEGER_COLLECTOR": { defaultJaegerCollector, func(v string) error { o.JaegerCollector = v; return nil }, }, "JAEGER_AGENT": { defaultJaegerAgent, func(v string) error { o.JaegerAgent = v; return nil }, }, "HTTP_ENDPOINT": { defaultHTTPEndpoint, func(v string) error { o.HTTPEndpoint = v; return nil }, }, "PROMETHEUS_EXPORT": { defaultPrometheusExport, func(v string) error { enabled, err := utils.ParseEnabled(v) if err != nil { return err } o.PrometheusExport = enabled return nil }, }, "SAMPLING_FREQUENCY": { defaultSampling, func(v string) error { return o.Sampling.Parse(v) }, }, "REPORT_PERIOD": { defaultReportPeriod, func(v string) error { d, err := time.ParseDuration(v) if err != nil { return err } o.ReportPeriod = d return nil }, }, } for envvar, p := range params { parseEnv(envvar, p.defval, p.parsefn) } return o } // configNotify is our configuration udpate notification handler. 
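// Editor's sketch (not part of the original sources): how parseEnv seeds an
// option from the environment, falling back to the built-in default when the
// value does not parse. The environment value set here is hypothetical.
func exampleEnvSeeding() Sampling {
	_ = os.Setenv("SAMPLING_FREQUENCY", "production")
	var s Sampling
	parseEnv("SAMPLING_FREQUENCY", defaultSampling, func(v string) error {
		return s.Parse(v)
	})
	return s // Production (0.1); an unparsable value would fall back to defaultSampling
}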
func configNotify(_ config.Event, _ config.Source) error { log.Info("instrumentation configuration is now %v", opt) log.Info("reconfiguring...") if err := svc.reconfigure(); err != nil { log.Error("failed to restart instrumentation: %v", err) } return nil } // Register us for for configuration handling. func init() { config.Register("instrumentation", "Instrumentation for traces and metrics.", opt, defaultOptions, config.WithNotify(configNotify)) } ================================================ FILE: pkg/instrumentation/grpc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "google.golang.org/grpc" "go.opencensus.io/plugin/ocgrpc" "go.opencensus.io/stats/view" ) // InjectGrpcClientTrace injects gRPC dial options for instrumentation if necessary. func InjectGrpcClientTrace(opts ...grpc.DialOption) []grpc.DialOption { extra := grpc.WithStatsHandler(&ocgrpc.ClientHandler{}) if len(opts) > 0 { opts = append(opts, extra) } else { opts = []grpc.DialOption{extra} } return opts } // InjectGrpcServerTrace injects gRPC server options for instrumentation if necessary. func InjectGrpcServerTrace(opts ...grpc.ServerOption) []grpc.ServerOption { extra := grpc.StatsHandler(&ocgrpc.ServerHandler{}) if len(opts) > 0 { opts = append(opts, extra) } else { opts = []grpc.ServerOption{extra} } return opts } // registerGrpcViews registers default client and server trace views for gRPC. func registerGrpcViews() error { log.Debug("registering gRPC trace views...") if err := view.Register(ocgrpc.DefaultClientViews...); err != nil { return instrumentationError("failed to register default gRPC client views: %v", err) } if err := view.Register(ocgrpc.DefaultServerViews...); err != nil { return instrumentationError("failed to register default gRPC server views: %v", err) } return nil } // unregisterGrpcViews unregisters default client and server trace views for gRPC. func unregisterGrpcViews() { view.Unregister(ocgrpc.DefaultClientViews...) view.Unregister(ocgrpc.DefaultServerViews...) } ================================================ FILE: pkg/instrumentation/http/http.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package http import ( "context" "fmt" "net" "net/http" "sync" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // httpServer is used in log messages. httpServer = "HTTP server" ) // Our logger instance. var log = logger.NewLogger("http") // ServeMux is our HTTP request multiplexer with removable handlers. type ServeMux struct { sync.RWMutex handlers map[string]http.Handler mux *http.ServeMux } // NewServeMux create a new HTTP request multiplexer. func NewServeMux() *ServeMux { return &ServeMux{ handlers: make(map[string]http.Handler), mux: http.NewServeMux(), } } // Handle registers a handler for the given pattern. func (mux *ServeMux) Handle(pattern string, handler http.Handler) { mux.Lock() defer mux.Unlock() log.Debug("registering handler for %q...", pattern) if _, ok := mux.handlers[pattern]; ok { log.Error("can't register duplicate HTTP handler for %q", pattern) return } mux.handlers[pattern] = handler mux.mux.Handle(pattern, handler) } // HandleFunc registers a handler function for the given pattern. func (mux *ServeMux) HandleFunc(pattern string, fn func(http.ResponseWriter, *http.Request)) { mux.Lock() defer mux.Unlock() log.Debug("registering handler function for %q...", pattern) if _, ok := mux.handlers[pattern]; ok { log.Error("can't register duplicate HTTP handler function for '%s'", pattern) return } handler := http.HandlerFunc(fn) mux.handlers[pattern] = handler mux.mux.Handle(pattern, handler) } // Unregister unregister any handlers for the given pattern. func (mux *ServeMux) Unregister(pattern string) (http.Handler, bool) { mux.Lock() defer mux.Unlock() h, ok := mux.handlers[pattern] if !ok { return nil, false } log.Debug("unregistering handler for %q...", pattern) delete(mux.handlers, pattern) mux.mux = http.NewServeMux() for pattern, handler := range mux.handlers { mux.mux.Handle(pattern, handler) } return h, true } // ServeHTTP serves a HTTP request. func (mux *ServeMux) ServeHTTP(w http.ResponseWriter, r *http.Request) { mux.RLock() defer mux.RUnlock() log.Debug("serving %s...", r.URL) mux.mux.ServeHTTP(w, r) } // Server is our HTTP server, with support for unregistering handlers. type Server struct { sync.RWMutex server *http.Server mux *ServeMux } // NewServer creates a new server instance. func NewServer() *Server { return &Server{ mux: NewServeMux(), } } // GetMux returns the mux for this server. func (s *Server) GetMux() *ServeMux { return s.mux } // GetAddress returns the current server HTTP endpoint/address. func (s *Server) GetAddress() string { if s.server == nil { return "" } return s.server.Addr } // Start sets up the server to listen and serve on the given address. func (s *Server) Start(addr string) error { if addr == "" { log.Info("%s is disabled", httpServer) return nil } log.Info("starting %s...", httpServer) s.Lock() defer s.Unlock() s.server = &http.Server{Addr: addr, Handler: s} ln, err := net.Listen("tcp", s.server.Addr) if err != nil { return httpError("can't listen on HTTP TCP address '%s': %v", s.server.Addr, err) } // update address if port was autobound if ln.Addr().String() != s.server.Addr { s.server.Addr = ln.Addr().String() } go s.server.Serve(ln) return nil } // Stop Close()'s the server immediately. func (s *Server) Stop() { log.Info("stopping %s...", httpServer) s.Lock() defer s.Unlock() if s.server == nil { return } s.server.Close() s.server = nil } // Shutdown shuts down the server gracefully. 
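// Editor's sketch (not part of the original sources): starting the server on
// an autobound port and reading back the concrete address the kernel picked.
func exampleAutobind() (string, error) {
	srv := NewServer()
	if err := srv.Start("127.0.0.1:0"); err != nil {
		return "", err
	}
	addr := srv.GetAddress() // the actual address, e.g. "127.0.0.1:38231"
	srv.Stop()
	return addr, nil
}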
func (s *Server) Shutdown(wait bool) {
	var sync chan struct{}
	log.Info("shutting down %s...", httpServer)
	s.Lock()
	defer s.Unlock()
	if s.server == nil {
		return
	}
	if wait {
		sync = make(chan struct{})
		s.server.RegisterOnShutdown(func() { close(sync) })
	}
	s.server.Shutdown(context.Background())
	if wait {
		// only wait if the shutdown notification was registered above;
		// receiving from the nil channel would otherwise block forever
		<-sync
	}
	s.server = nil
}

// Reconfigure reconfigures the server.
func (s *Server) Reconfigure(addr string) error {
	log.Info("reconfiguring %s...", httpServer)
	if s.GetAddress() != addr {
		return s.Restart(addr)
	}
	return nil
}

// Restart restarts the server on the given address.
func (s *Server) Restart(addr string) error {
	log.Info("restarting %s...", httpServer)
	s.Stop()
	return s.Start(addr)
}

// ServeHTTP serves the given HTTP request.
func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	s.RLock()
	defer s.RUnlock()
	s.mux.ServeHTTP(w, r)
}

// httpError returns a formatted instrumentation/http-specific error.
func httpError(format string, args ...interface{}) error {
	return fmt.Errorf("instrumentation/http: "+format, args...)
}

================================================
FILE: pkg/instrumentation/http/http_test.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package http import ( "io" "net/http" "testing" ) func TestStartStop(t *testing.T) { srv := NewServer() if err := srv.Start(":0"); err != nil { t.Errorf("failed to start HTTP server: %v", err) } srv.Stop() if err := srv.Start(":0"); err != nil { t.Errorf("failed to start HTTP server: %v", err) } if err := srv.Restart(":0"); err != nil { t.Errorf("failed to restart HTTP server on different port: %v", err) } if err := srv.Reconfigure(srv.GetAddress()); err != nil { t.Errorf("failed to reconfigure HTTP server on same port: %v", err) } if err := srv.Reconfigure(":0"); err != nil { t.Errorf("failed to reconfigure HTTP server on different port: %v", err) } srv.Stop() } type urlTest struct { pattern string response string fallback string } func checkURL(t *testing.T, srv *Server, path, response string, status int) { url := "http://" + srv.GetAddress() + path res, err := http.Get(url) if err != nil { t.Errorf("http.Get(%s) failed: %v", url, err) } if res.StatusCode != status { t.Errorf("http.Get(%s) status %d, expected %d", url, res.StatusCode, status) } txt, err := io.ReadAll(res.Body) if err != nil { t.Errorf("http.Get(%s) failed to read response: %v", url, err) } if string(txt) != response { t.Errorf("http.Get(%s) unexpected response: %v, expected: %v", url, txt, response) } } type testHandler struct { response string } func (h *testHandler) ServeHTTP(w http.ResponseWriter, _ *http.Request) { _, _ = w.Write([]byte(h.response)) } func TestPatternsp(t *testing.T) { srv := NewServer() mux := srv.GetMux() if err := srv.Start(":0"); err != nil { t.Errorf("failed to start HTTP server: %v", err) } rh := &testHandler{"/"} ah := &testHandler{"a"} bh := &testHandler{"b"} ch := &testHandler{"c"} mux.Handle("/a", ah) checkURL(t, srv, "/a", "a", 200) mux.Handle("/b", bh) checkURL(t, srv, "/b", "b", 200) mux.Handle("/", rh) checkURL(t, srv, "/b", "b", 200) mux.Unregister("/b") checkURL(t, srv, "/b", "/", 200) mux.Handle("/b", ch) checkURL(t, srv, "/b", "c", 200) mux.Unregister("/a") checkURL(t, srv, "/a", "/", 200) srv.Stop() } ================================================ FILE: pkg/instrumentation/instrumentation.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "fmt" "github.com/intel/cri-resource-manager/pkg/instrumentation/http" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // ServiceName is our service name in external tracing and metrics services. ServiceName = "CRI-RM" ) // Our logger instance. var log = logger.NewLogger("instrumentation") // Our instrumentation service instance. var svc = newService() // GetHTTPMux returns our HTTP request mux for external services. func GetHTTPMux() *http.ServeMux { if svc == nil { return nil } return svc.http.GetMux() } // TracingEnabled returns true if the Jaeger tracing sampler is not disabled. 
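// Editor's sketch (not part of the original sources, and assuming a
// google.golang.org/grpc import in this file): instrumenting both ends of a
// gRPC connection with the OpenCensus stats handlers injected by grpc.go above.
func exampleInjectTrace() ([]grpc.DialOption, []grpc.ServerOption) {
	dialOpts := InjectGrpcClientTrace(grpc.WithBlock())
	servOpts := InjectGrpcServerTrace()
	return dialOpts, servOpts
}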
func TracingEnabled() bool { if svc == nil { return false } return svc.TracingEnabled() } // Start our internal instrumentation services. func Start() error { if svc == nil { return instrumentationError("cannot start, no instrumentation service instance") } return svc.Start() } // Stop stops our internal instrumentation services. func Stop() { if svc != nil { svc.Stop() } } // Restart restarts our internal instrumentation services. func Restart() error { if svc == nil { return instrumentationError("cannot restart, no instrumentation service instance") } return svc.Restart() } // instrumentationError produces a formatted instrumentation-specific error. func instrumentationError(format string, args ...interface{}) error { return fmt.Errorf("instrumentation: "+format, args...) } ================================================ FILE: pkg/instrumentation/instrumentation_test.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "io" "net/http" "strings" "testing" ) func TestSamplingIdempotency(t *testing.T) { tcases := []Sampling{ Disabled, Testing, Production, 0.2, 0.25, 0.5, 0.75, 0.8, } for _, tc := range tcases { var chk Sampling if err := chk.Parse(tc.String()); err != nil { t.Errorf("failed to parse Sampling.String() %q: %v", tc, err) } if chk != tc { t.Errorf("expected sampling value for %q: %v, got: %v", tc, tc, chk) } } } func TestPrometheusConfiguration(t *testing.T) { log.EnableDebug() if opt.HTTPEndpoint == "" { opt.HTTPEndpoint = ":0" } s := newService() s.Start() address := s.http.GetAddress() if strings.HasSuffix(opt.HTTPEndpoint, ":0") { opt.HTTPEndpoint = address } checkPrometheus(t, address, !opt.PrometheusExport) opt.PrometheusExport = !opt.PrometheusExport s.reconfigure() checkPrometheus(t, address, !opt.PrometheusExport) opt.PrometheusExport = !opt.PrometheusExport s.reconfigure() checkPrometheus(t, address, !opt.PrometheusExport) opt.PrometheusExport = !opt.PrometheusExport s.reconfigure() checkPrometheus(t, address, !opt.PrometheusExport) s.http.Shutdown(true) s.Stop() } func checkPrometheus(t *testing.T, server string, shouldFail bool) { rpl, err := http.Get("http://" + server + "/metrics") switch shouldFail { case false: if err != nil { t.Errorf("Prometheus HTTP GET failed: %v", err) return } if rpl.StatusCode != 200 { t.Errorf("Prometheus HTTP GET failed: %s", rpl.Status) return } _, err = io.ReadAll(rpl.Body) rpl.Body.Close() if err != nil { t.Errorf("failed to read Prometheus response: %v", err) } return case true: if err == nil && rpl.StatusCode == 200 { t.Errorf("Prometheus HTTP GET should have failed, but it didn't.") return } } } ================================================ FILE: pkg/instrumentation/jaeger.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "contrib.go.opencensus.io/exporter/jaeger" "go.opencensus.io/trace" ) const ( // jaegerExporter is used in log messages. jaegerExporter = "Jaeger trace exporter" ) // tracing encapsulates the state of our Jaeger exporter. type tracing struct { exporter *jaeger.Exporter agent string collector string sampling Sampling } // start starts our Jaeger exporter. func (t *tracing) start(agent, collector string, sampling Sampling) error { if agent == "" && collector == "" { log.Info("%s is disabled", jaegerExporter) return nil } log.Info("creating %s...", jaegerExporter) cfg := jaeger.Options{ ServiceName: ServiceName, CollectorEndpoint: collector, AgentEndpoint: agent, Process: jaeger.Process{ServiceName: ServiceName}, OnError: func(err error) { log.Error("jaeger error: %v", err) }, } exp, err := jaeger.NewExporter(cfg) if err != nil { return instrumentationError("failed to create %s: %v", jaegerExporter, err) } t.exporter = exp t.agent = agent t.collector = collector t.sampling = sampling trace.RegisterExporter(t.exporter) trace.ApplyConfig(trace.Config{DefaultSampler: t.sampling.Sampler()}) return nil } // stop stops our Jaeger exporter. func (t *tracing) stop() { if t.exporter == nil { return } log.Info("stopping Jaeger trace exporter...") trace.UnregisterExporter(t.exporter) *t = tracing{} } // reconfigure reconfigures our Jaeger exporter. func (t *tracing) reconfigure(agent, collector string, sampling Sampling) error { log.Info("reconfiguring %s...", jaegerExporter) if agent == "" && collector == "" { t.stop() return nil } if t.agent != agent || t.collector != collector { t.stop() } if t.exporter != nil { t.sampling = sampling trace.ApplyConfig(trace.Config{DefaultSampler: t.sampling.Sampler()}) return nil } return t.start(agent, collector, sampling) } ================================================ FILE: pkg/instrumentation/prometheus.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "strings" "sync" "time" "contrib.go.opencensus.io/exporter/prometheus" pclient "github.com/prometheus/client_golang/prometheus" model "github.com/prometheus/client_model/go" "go.opencensus.io/stats/view" "github.com/intel/cri-resource-manager/pkg/instrumentation/http" ) const ( // PrometheusMetricsPath is the URL path for exposing metrics to Prometheus. 
PrometheusMetricsPath = "/metrics" // prometheusExporter is used in log messages. prometheusExporter = "Prometheus metrics exporter" ) // metrics encapsulates the state of our Prometheus exporter. type metrics struct { exporter *prometheus.Exporter mux *http.ServeMux period time.Duration } // start starts our Prometheus exporter. func (m *metrics) start(mux *http.ServeMux, period time.Duration, enable bool) error { if !enable { log.Info("%s is disabled", prometheusExporter) return nil } log.Info("starting %s...", prometheusExporter) cfg := prometheus.Options{ Namespace: prometheusNamespace(ServiceName), Gatherer: pclient.Gatherers{dynamicGatherers}, OnError: func(err error) { log.Error("prometheus error: %v", err) }, } exp, err := prometheus.NewExporter(cfg) if err != nil { return instrumentationError("failed to create %s: %v", prometheusExporter, err) } m.exporter = exp m.mux = mux m.period = period m.mux.Handle(PrometheusMetricsPath, m.exporter) view.RegisterExporter(m.exporter) view.SetReportingPeriod(m.period) return nil } // stop stops our Prometheus exporter. func (m *metrics) stop() { if m.exporter == nil { return } log.Info("stopping %s...", prometheusExporter) view.UnregisterExporter(m.exporter) m.mux.Unregister(PrometheusMetricsPath) *m = metrics{} } // reconfigure reconfigures our Prometheus exporter. func (m *metrics) reconfigure(mux *http.ServeMux, period time.Duration, enable bool) error { log.Info("reconfiguring %s...", prometheusExporter) if !enable { m.stop() return nil } if m.exporter != nil { m.period = period view.SetReportingPeriod(m.period) return nil } return m.start(mux, period, enable) } // mutate service name into a valid Prometheus namespace name. func prometheusNamespace(service string) string { return strings.ReplaceAll(strings.ToLower(service), "-", "_") } // gatherers is a trivial wrapper around prometheus Gatherers. type gatherers struct { sync.RWMutex gatherers pclient.Gatherers } // Our dynamically registered Prometheus gatherers. var dynamicGatherers = &gatherers{gatherers: pclient.Gatherers{}} // Register registers a new gatherer. func (g *gatherers) Register(gatherer pclient.Gatherer) { g.Lock() defer g.Unlock() g.gatherers = append(g.gatherers, gatherer) } // Gather implements the pclient.Gatherer interface. func (g *gatherers) Gather() ([]*model.MetricFamily, error) { g.RLock() defer g.RUnlock() return g.gatherers.Gather() } // RegisterGatherer registers a new prometheus Gatherer. func RegisterGatherer(g pclient.Gatherer) { dynamicGatherers.Register(g) } ================================================ FILE: pkg/instrumentation/service.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "sync" "github.com/intel/cri-resource-manager/pkg/instrumentation/http" ) // service is the state of our instrumentation services: HTTP endpoint, trace/metrics exporters. 
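// Editor's sketch (not part of the original sources, and assuming a
// github.com/prometheus/client_golang/prometheus import aliased to pclient
// in this file): plugging an extra metrics registry into the dynamic
// gatherer list. The metric name is hypothetical.
func exampleRegisterGatherer() pclient.Counter {
	reg := pclient.NewRegistry()
	cnt := pclient.NewCounter(pclient.CounterOpts{
		Name: "example_events_total",
		Help: "Total number of example events.",
	})
	reg.MustRegister(cnt)
	RegisterGatherer(reg) // now gathered and served under /metrics
	return cnt
}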
type service struct { sync.RWMutex // we're RW-lockable http *http.Server // HTTP server tracing *tracing // tracing data exporter metrics *metrics // metrics data exporter } // newService creates an instance of our instrumentation services. func newService() *service { return &service{ http: http.NewServer(), tracing: &tracing{}, metrics: &metrics{}, } } // Start starts instrumentation services. func (s *service) Start() error { log.Info("starting instrumentation services...") s.Lock() defer s.Unlock() err := s.http.Start(opt.HTTPEndpoint) if err != nil { return instrumentationError("failed to start HTTP server: %v", err) } err = s.tracing.start(opt.JaegerAgent, opt.JaegerCollector, opt.Sampling) if err != nil { return instrumentationError("failed to start tracing: %v", err) } err = s.metrics.start(s.http.GetMux(), opt.ReportPeriod, opt.PrometheusExport) if err != nil { return instrumentationError("failed to start metrics: %v", err) } if err := registerGrpcViews(); err != nil { s.metrics.stop() s.tracing.stop() s.http.Stop() return err } return nil } // Stop stops instrumentation services. func (s *service) Stop() { s.Lock() defer s.Unlock() unregisterGrpcViews() s.metrics.stop() s.tracing.stop() s.http.Stop() } // reconfigure reconfigures instrumentation services. func (s *service) reconfigure() error { s.Lock() defer s.Unlock() err := s.http.Reconfigure(opt.HTTPEndpoint) if err != nil { return instrumentationError("failed to reconfigure HTTP server: %v", err) } err = s.tracing.reconfigure(opt.JaegerAgent, opt.JaegerCollector, opt.Sampling) if err != nil { return instrumentationError("failed to reconfigure tracing: %v", err) } err = s.metrics.reconfigure(s.http.GetMux(), opt.ReportPeriod, opt.PrometheusExport) if err != nil { return instrumentationError("failed to reconfigure metrics: %v", err) } return nil } // Restart restarts instrumentation services. func (s *service) Restart() error { s.Stop() return s.Start() } // TracingEnabled returns true if the Jaeger tracing sampler is not disabled. func (s *service) TracingEnabled() bool { s.RLock() defer s.RUnlock() return float64(opt.Sampling) > 0.0 } ================================================ FILE: pkg/log/default.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "os" "path/filepath" ) // our default logger var deflog = log.get(filepath.Base(filepath.Clean(os.Args[0]))) // Default returns the default Logger. func Default() Logger { return deflog } // Info formats and emits an informational message. func Info(format string, args ...interface{}) { deflog.Info(format, args...) } // Warn formats and emits a warning message. func Warn(format string, args ...interface{}) { deflog.Warn(format, args...) } // Error formats and emits an error message. func Error(format string, args ...interface{}) { deflog.Error(format, args...) } // Fatal formats and emits an error message and os.Exit()'s with status 1. 
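// Editor's sketch (not part of the original sources): the package-level
// helpers above all forward to the default logger, whose source is derived
// from the binary name.
func exampleDefaultLogger() {
	Info("default logger source is %q", Default().Source())
	Warn("this forwards to deflog.Warn")
}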
func Fatal(format string, args ...interface{}) { deflog.Fatal(format, args...) } // Panic formats and emits an error messages, and panics with the same. func Panic(format string, args ...interface{}) { deflog.Panic(format, args...) } // Debug formats and emits a debug message. func Debug(format string, args ...interface{}) { deflog.Debug(format, args...) } // InfoBlock formats and emits a multiline information message. func InfoBlock(prefix string, format string, args ...interface{}) { deflog.InfoBlock(prefix, format, args...) } // WarnBlock formats and emits a multiline warning message. func WarnBlock(prefix string, format string, args ...interface{}) { deflog.WarnBlock(prefix, format, args...) } // ErrorBlock formats and emits a multiline error message. func ErrorBlock(prefix string, format string, args ...interface{}) { deflog.ErrorBlock(prefix, format, args...) } // DebugBlock formats and emits a multiline debug message. func DebugBlock(prefix string, format string, args ...interface{}) { deflog.DebugBlock(prefix, format, args...) } func init() { binary := filepath.Clean(os.Args[0]) source := filepath.Base(binary) deflog = log.get(source) } ================================================ FILE: pkg/log/flags.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "encoding/json" "os" "strings" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/log/klogcontrol" "github.com/intel/cri-resource-manager/pkg/utils" ) const ( // DefaultLevel is the default logging severity level. DefaultLevel = LevelInfo // debugEnvVar is the environment variable used to seed debugging flags. debugEnvVar = "LOGGER_DEBUG" // configModule is our module name in the runtime configuration. configModule = "logger" ) // options capture our runtime configuration. type options struct { // Klog contains klog-specific options. Klog klogcontrol.Options // Debug defines which sources produce debug messages. Debug srcmap // LogSource determines if messages are prefixed with the logger source LogSource bool } // srcmap tracks debugging settings for sources. type srcmap map[string]bool var ( // Runtime logging configuration. opt *options // Default debugging configuration. defaultDebugFlags srcmap // Default klog configuration. defaultKlogFlags klogcontrol.Options // klog control klogctl *klogcontrol.Control ) // parse parses the given string and updates the srcmap accordingly. 
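// Editor's sketch (not part of the original sources): how a LOGGER_DEBUG-style
// value parses into per-source debug states. Source names are hypothetical.
func exampleSrcmapParse() srcmap {
	m := srcmap{}
	// 'off' sticks for subsequent bare entries; 'all' aliases the '*' wildcard:
	_ = m.parse("off:resource-manager,cache,on:policy,all")
	return m // resource-manager:false, cache:false, policy:true, *:true
}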
func (m *srcmap) parse(value string) error { if *m == nil { *m = make(srcmap) } if value = strings.TrimSpace(value); value == "" { return nil } prev, state, src := "", "", "" for _, entry := range strings.Split(value, ",") { if entry = strings.TrimSpace(entry); entry == "" { continue } statesrc := strings.Split(entry, ":") switch len(statesrc) { case 2: state, src = statesrc[0], strings.TrimSpace(statesrc[1]) case 1: state, src = "", strings.TrimSpace(statesrc[0]) default: return loggerError("invalid state spec '%s' in source map", entry) } if state != "" { prev = state } else { state = prev if state == "" { state = "on" } } if src == "all" { src = "*" } enabled, err := utils.ParseEnabled(state) if err != nil { return loggerError("invalid state '%s' in source map", state) } (*m)[src] = enabled } return nil } // String returns a string representation of the srcmap. func (m *srcmap) String() string { off := "" on := "" for src, state := range *m { if state { if on == "" { on = src } else { on += "," + src } } else { if off == "" { off = src } else { off += "," + src } } } switch { case on == "" && off == "": return "" case off == "": return "on:" + on case on == "": return "off:" + off } return "on:" + on + "," + "off:" + off } // MarshalJSON is the JSON marshaller for srcmap. func (m srcmap) MarshalJSON() ([]byte, error) { return json.Marshal(m.String()) } // UnmarshalJSON is the JSON unmarshaller for srcmap. func (m *srcmap) UnmarshalJSON(raw []byte) error { cfgstr := "" if err := json.Unmarshal(raw, &cfgstr); err != nil { return loggerError("failed to unmarshal source map '%s': %v", string(raw), err) } if err := m.parse(cfgstr); err != nil { return loggerError("failed to unmarshal source map '%s': %v", string(raw), err) } return nil } // cloneFrom state from another srcmap. func (m *srcmap) cloneFrom(o srcmap) { *m = make(srcmap) for src, state := range o { (*m)[src] = state } } // clone returns a copy of the srcmap. func (m srcmap) clone() srcmap { if m == nil { return nil } o := make(srcmap) for src, state := range m { o[src] = state } return o } // configNotify is the configuration change notification callback for options. func (o *options) configNotify(event pkgcfg.Event, _ pkgcfg.Source) error { deflog.Info("logger configuration %v", event) deflog.Info(" * debugging: %s", o.Debug.String()) deflog.Info(" * log source: %v", o.LogSource) deflog.InfoBlock(" * klog: ", "%s", o.Klog.String()) // On the first configuration update event, we record the current values // of klog flags as the runtime defaults. Effectively this allows one to // override the built-in defaults using klog command line options (or // environment variables as interpreted by klogcontrol). The recorded // defaults will also reflect any potential programmatic changes done by // (mis-)using flag.Set() but there's not much we can do about that. if defaultKlogFlags == nil { defaultKlogFlags = klogctl.CurrentOptions() } if o.Klog == nil { o.Klog = make(klogcontrol.Options) } // The behavior of the options.Klog map across updates is difficult // to understand. To make it more user friendly we fill in runtime // defaults for each unset entry (klog flags) here. for flag, value := range defaultKlogFlags { if _, ok := o.Klog[flag]; !ok { o.Klog[flag] = value } } return o.apply() } // apply applies the options to logging. 
func (o *options) apply() error { log.Lock() defer log.Unlock() prefix := o.LogSource if logToStderr, ok := o.Klog["logtostderr"]; ok && logToStderr.(bool) { if skipHeaders, ok := o.Klog["skip_headers"]; ok && skipHeaders.(bool) { prefix = true } } log.setDbgMap(o.Debug.clone()) log.setPrefix(prefix) return klogctl.Configure(o.Klog) } // defaultOptions returns our current default runtime options. func defaultOptions() interface{} { o := &options{} o.Debug.cloneFrom(defaultDebugFlags) if defaultKlogFlags != nil { o.Klog.CloneFrom(defaultKlogFlags) } else { o.Klog = klogctl.CurrentOptions() } return o } // Set up klog control, set pkg/config logger, register us for configuration handling. func init() { klogctl = klogcontrol.Get() opt = defaultOptions().(*options) opt.apply() cfglog := log.get("config") pkgcfg.SetLogger(pkgcfg.Logger{ DebugEnabled: cfglog.DebugEnabled, Debug: cfglog.Debug, Info: cfglog.Info, Warning: cfglog.Warn, Error: cfglog.Error, Fatal: cfglog.Fatal, Panic: cfglog.Panic, }) defaultDebugFlags = make(srcmap) if value, ok := os.LookupEnv(debugEnvVar); ok { if err := defaultDebugFlags.parse(value); err != nil { Default().Error("failed to parse %s %q: %v", debugEnvVar, value, err) } else { log.setDbgMap(defaultDebugFlags) Default().Info("seeded debug flags ($%s): %s", debugEnvVar, defaultDebugFlags.String()) } } pkgcfg.Register(configModule, "logging control", opt, defaultOptions, pkgcfg.WithNotify(opt.configNotify)) } ================================================ FILE: pkg/log/grpc-logger.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "fmt" "google.golang.org/grpc/grpclog" ) // SetGrpcLogger sets up a logger for (google.golang.org/)grpc. func SetGrpcLogger(source string, rate *Rate) { var l Logger if source == "" { l = Default() } else { l = log.get(source) } if rate != nil { l = RateLimit(l, *rate) } grpclog.SetLoggerV2(&grpclogger{Logger: l}) } // grpclogger implements grpclog.LoggerV2 interface for our logger. type grpclogger struct { Logger } func (g grpclogger) Info(args ...interface{}) { g.Logger.Debug("%s", fmt.Sprint(args...)) } func (g grpclogger) Infoln(args ...interface{}) { g.Logger.Debug("%s", fmt.Sprint(args...)) } func (g grpclogger) Infof(format string, args ...interface{}) { g.Logger.Debug(format, args...) } func (g grpclogger) Warning(args ...interface{}) { g.Logger.Warn("%s", fmt.Sprint(args...)) } func (g grpclogger) Warningln(args ...interface{}) { g.Logger.Warn("%s", fmt.Sprint(args...)) } func (g grpclogger) Warningf(format string, args ...interface{}) { g.Logger.Warn(format, args...) } func (g grpclogger) Error(args ...interface{}) { g.Logger.Error("%s", fmt.Sprint(args...)) } func (g grpclogger) Errorln(args ...interface{}) { g.Logger.Error("%s", fmt.Sprint(args...)) } func (g grpclogger) Errorf(format string, args ...interface{}) { g.Logger.Error(format, args...) 
} func (g grpclogger) Fatal(args ...interface{}) { g.Logger.Fatal("%s", fmt.Sprint(args...)) } func (g grpclogger) Fatalln(args ...interface{}) { g.Logger.Fatal("%s", fmt.Sprint(args...)) } func (g grpclogger) Fatalf(format string, args ...interface{}) { g.Logger.Fatal(format, args...) } func (g grpclogger) V(_ int) bool { return true } ================================================ FILE: pkg/log/klogcontrol/klogcontrol.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package klogcontrol import ( "flag" "fmt" "io" "os" "strings" "k8s.io/klog/v2" ) // Options captures runtime configuration for klog. type Options map[string]interface{} // Control implements runtime control for klog. type Control struct { flags *flag.FlagSet } // Our singleton klog Control instance. var ctl *Control // Get returns our singleton klog Control instance. func Get() *Control { return ctl } // CurrentOptions returns the current klog configuration as Options. func (c *Control) CurrentOptions() Options { o := make(Options) c.flags.VisitAll(func(f *flag.Flag) { o[f.Name] = flag.Lookup(f.Name).Value.(flag.Getter).Get() }) return o } // Configure reconfigures klog with the given Options. func (c *Control) Configure(options Options) error { for name, value := range options { if err := flag.Set(name, fmt.Sprintf("%v", value)); err != nil { return klogError("failed to set klog flag %q to %v: %v", name, value, err) } } return nil } // Set sets the value of the given klog flag. func (c *Control) Set(name, value string) error { return flag.Set(name, value) } // Get returns the current value of the given klog flag. func (c *Control) Get(name string) (interface{}, error) { if c.flags.Lookup(name) == nil { return nil, klogError("unknown klog flag %q", name) } return flag.Lookup(name).Value.(flag.Getter).Get(), nil } // CloneFrom clones src to o. func (o *Options) CloneFrom(src Options) { *o = make(Options) for name, value := range src { (*o)[name] = value } } // String returns a string representation of the Options. func (o *Options) String() string { if o == nil { return "" } str := "" sep := "" for name, value := range *o { str += sep + name + "=" + fmt.Sprintf("%v", value) sep = "\n" } return str } // klogflag wraps a klog flag for configuration. type klogflag struct { flag *flag.Flag } // Set implements flag.Value.Set() for wrapped klog flags. func (klogf *klogflag) Set(value string) error { if klogf.flag.Name == "stderrthreshold" { // klog expects thresholds in ALL CAPS value = strings.ToUpper(value) } if err := klogf.flag.Value.Set(value); err != nil { return err } return nil } // String implements flag.Value.String() for wrapped klog flags. func (klogf *klogflag) String() string { if klogf.flag == nil { // flag.isZeroValue() probing us... 
return "" } value := klogf.flag.Value.String() if klogf.flag.Name == "log_backtrace_at" && value == ":0" { value = "" } return value } // Get implements flag.Getter.Get() for wrapped klog flags. func (klogf *klogflag) Get() interface{} { if getter, ok := klogf.flag.Value.(flag.Getter); ok { if value := getter.Get(); value != nil { return value } } return klogf.String() } // boolFlag is identical to the unexported flag.boolFlag interface. type boolFlag interface { IsBoolFlag() bool } // IsBoolFlag implements flag.boolFlag.IsBoolFlag() for wrapped klog flags. func (klogf *klogflag) IsBoolFlag() bool { if klogf.flag == nil { return false } if boolf, ok := klogf.flag.Value.(boolFlag); ok { return boolf.IsBoolFlag() } return false } // getEnv returns a default value for the flag from the environment. func (klogf *klogflag) getEnv() (string, string, bool) { name := "LOGGER_" + strings.ToUpper(strings.ReplaceAll(klogf.flag.Name, "-", "_")) if value, ok := os.LookupEnv(name); ok { return name, value, true } return "", "", false } // klogError returns a package-specific formatted error. func klogError(format string, args ...interface{}) error { return fmt.Errorf("klogcontrol: "+format, args...) } // wrapKlogFlag wraps and registers the given klog flag. func wrapKlogFlag(f *flag.Flag) { klogf := &klogflag{flag: f} flag.Var(klogf, f.Name, f.Usage) if name, value, ok := klogf.getEnv(); ok { if err := klogf.Set(value); err != nil { klog.Errorf("klog flag %q: invalid environment default %s=%q: %v", f.Name, name, value, err) } } else { // Unless explicitly configured in the environment, by default // turn off headers (date, timestamp, etc.) when we're logging // to a journald stream. if f.Name == "skip_headers" { if value, _ := os.LookupEnv("JOURNAL_STREAM"); value != "" { klog.Infof("Logging to journald, forcing headers off...") klogf.Set("true") } } } } // init discovers klog flags and sets up dynamic control for them. func init() { ctl = &Control{flags: flag.NewFlagSet("klog flags", flag.ContinueOnError)} ctl.flags.SetOutput(io.Discard) klog.InitFlags(ctl.flags) ctl.flags.VisitAll(func(f *flag.Flag) { wrapKlogFlag(f) }) } ================================================ FILE: pkg/log/log.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "fmt" "strings" "sync" "k8s.io/klog/v2" ) // Level describes the severity of a log message. type Level int const ( // levelUnset denotes an unset level. levelUnset Level = iota // LevelDebug is the severity for debug messages. LevelDebug // LevelInfo is the severity for informational messages. LevelInfo // LevelWarn is the severity for warnings. LevelWarn // LevelError is the severity for errors. LevelError // LevelPanic is the severity for panic messages. LevelPanic // LevelFatal is the severity for fatal errors. LevelFatal ) // Per-level prefix tags. 
var levelTag = map[Level]string{ levelUnset: "?: ", LevelDebug: "D: ", LevelInfo: "I: ", LevelWarn: "W: ", LevelError: "E: ", LevelFatal: "F: ", LevelPanic: "P: ", } // Logger is the interface for producing log messages for/from a particular source. type Logger interface { // Standardized Logger interface functions so that this interface can be // used from goresctrl library. Debugf(format string, v ...interface{}) Infof(format string, v ...interface{}) Warnf(format string, v ...interface{}) Errorf(format string, v ...interface{}) Panicf(format string, v ...interface{}) Fatalf(format string, v ...interface{}) // Debug formats and emits a debug message. Debug(format string, args ...interface{}) // Info formats and emits an informational message. Info(format string, args ...interface{}) // Warn formats and emits a warning message. Warn(format string, args ...interface{}) // Error formats and emits an error message. Error(format string, args ...interface{}) // Panic formats and emits an error message then panics with the same. Panic(format string, args ...interface{}) // Fatal formats and emits an error message and os.Exit()'s with status 1. Fatal(format string, args ...interface{}) // DebugBlock formats and emits a multiline debug message. DebugBlock(prefix string, format string, args ...interface{}) // InfoBlock formats and emits a multiline information message. InfoBlock(prefix string, format string, args ...interface{}) // WarnBlock formats and emits a multiline warning message. WarnBlock(prefix string, format string, args ...interface{}) // ErrorBlock formats and emits a multiline error message. ErrorBlock(prefix string, format string, args ...interface{}) // EnableDebug enables debug messages for this Logger. EnableDebug() bool // DebugEnabled checks if debug messages are enabled for this Logger. DebugEnabled() bool // Source returns the source name of this Logger. Source() string } // logger implements Logger. type logger uint // logging encapsulates the full runtime state of logging. type logging struct { sync.RWMutex level Level // logging threshold for stderr dbgmap srcmap // debug configuration loggers map[string]logger // source to logger mapping sources map[logger]string // logger to source mapping debug map[logger]struct{} // loggers with debugging enabled maxlen int // max source length. forced bool // forced global debugging prefix bool // prefix messages with logger source aligned map[logger]string // logger sources aligned to maxlen } // log tracks our runtime state. var log = &logging{ level: DefaultLevel, loggers: make(map[string]logger), sources: make(map[logger]string), aligned: make(map[logger]string), debug: make(map[logger]struct{}), } // Get returns the named Logger. func Get(source string) Logger { log.Lock() defer log.Unlock() return log.get(source) } // NewLogger creates the named logger. func NewLogger(source string) Logger { return Get(source) } // EnableDebug enables debug logging for the source. func EnableDebug(source string) bool { log.Lock() defer log.Unlock() return log.setDebug(source, true) } // DisableDebug disables debug logging for the source. func DisableDebug(source string) bool { log.Lock() defer log.Unlock() return log.setDebug(source, false) } // DebugEnabled checks if debug logging is enabled for the source. func DebugEnabled(source string) bool { log.Lock() defer log.Unlock() return log.getDebug(source) } // SetLevel sets the logging severity level. 
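A minimal usage sketch of the Logger interface above, assuming a made-up source name "demo"; per-source debug filtering behaves as described by EnableDebug/DebugEnabled.

package main

import (
    logger "github.com/intel/cri-resource-manager/pkg/log"
)

func main() {
    // Each subsystem gets its own named Logger.
    log := logger.NewLogger("demo")
    log.Info("starting up")

    // Debug messages are dropped until debugging is enabled for the source.
    log.Debug("this is filtered out")
    logger.EnableDebug("demo")
    log.Debug("now visible, source %q", log.Source())
}
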
func SetLevel(level Level) {
    log.Lock()
    defer log.Unlock()
    log.setLevel(level)
}

// Flush flushes any pending log messages.
func Flush() {
    log.RLock()
    defer log.RUnlock()
    klog.Flush()
}

//
// logging
//

func (l Level) String() string {
    switch l {
    case LevelDebug:
        return "debug"
    case LevelInfo:
        return "info"
    case LevelWarn:
        return "warning"
    case LevelError:
        return "error"
    case LevelPanic:
        return "panic"
    case LevelFatal:
        return "fatal"
    }
    return "unknown"
}

// setLevel sets the logging severity level.
func (log *logging) setLevel(level Level) error {
    log.level = level
    threshold := ""
    switch level {
    case LevelDebug, LevelInfo:
        threshold = "INFO"
    case LevelWarn:
        threshold = "WARNING"
    case LevelError, LevelPanic, LevelFatal:
        threshold = "ERROR"
    }
    if err := klogctl.Set("stderrthreshold", threshold); err != nil {
        return loggerError("failed to set log level/threshold to %s: %v", threshold, err)
    }
    return nil
}

// setDebug sets the debug state for the given source and returns the previous one.
func (log *logging) setDebug(source string, enabled bool) bool {
    l := log.get(source)
    _, old := log.debug[l]
    if enabled {
        log.debug[l] = struct{}{}
    } else {
        delete(log.debug, l)
    }
    return old
}

// getDebug returns the current debug state of the given source.
func (log *logging) getDebug(source string) bool {
    if log.forced {
        return true
    }
    l := log.get(source)
    _, enabled := log.debug[l]
    return enabled
}

// setDbgMap updates the debug configuration of logging.
func (log *logging) setDbgMap(dbgmap srcmap) {
    log.dbgmap = dbgmap
    log.debug = make(map[logger]struct{})
    for source := range log.loggers {
        state, ok := log.dbgmap[source]
        if !ok {
            state = log.dbgmap["*"]
        }
        log.setDebug(source, state)
    }
}

// setPrefix sets the prefix (source) logging preference.
func (log *logging) setPrefix(prefix bool) {
    log.prefix = prefix
}

// align calculates and stores an aligned prefix for the given logger.
func (log *logging) align(l logger) {
    source := log.sources[l]
    srclen := len(source)
    if srclen > log.maxlen {
        log.realign(srclen)
        return
    }
    pad := log.maxlen - srclen
    pre := (pad + 1) / 2
    suf := pad - pre
    log.aligned[l] = "[" + fmt.Sprintf("%*s", pre, "") + source + fmt.Sprintf("%*s", suf, "") + "] "
}

// realign recalculates aligned prefixes for all loggers.
func (log *logging) realign(maxlen int) {
    if maxlen <= 0 {
        for _, source := range log.sources {
            if srclen := len(source); srclen > maxlen {
                maxlen = srclen
            }
        }
    }
    log.maxlen = maxlen
    log.aligned = make(map[logger]string)
    for l := range log.sources {
        log.align(l)
    }
}

//
// Logger
//

// get returns the logger for source, creating one if necessary.
func (log *logging) get(source string) logger {
    if l, ok := log.loggers[source]; ok {
        return l
    }
    l := logger(len(log.loggers))
    log.loggers[source] = l
    log.sources[l] = source
    log.align(l)
    state, ok := log.dbgmap[source]
    if !ok {
        state = log.dbgmap["*"]
    }
    log.setDebug(source, state)
    return l
}

func (l logger) EnableDebug() bool {
    log.Lock()
    defer log.Unlock()
    if _, ok := log.sources[l]; !ok {
        return false
    }
    _, old := log.debug[l]
    log.debug[l] = struct{}{}
    return old
}

func (l logger) DebugEnabled() bool {
    log.RLock()
    defer log.RUnlock()
    _, enabled := log.debug[l]
    return enabled || log.forced
}

func (l logger) Source() string {
    log.RLock()
    defer log.RUnlock()
    return log.sources[l]
}

func (l logger) Debug(format string, args ...interface{}) {
    log.RLock()
    defer log.RUnlock()
    if !log.forced {
        if _, ok := log.debug[l]; !ok {
            return
        }
    }
    msg := fmt.Sprintf(format, args...)
if log.prefix { klog.InfoDepth(1, levelTag[LevelDebug], log.aligned[l], msg) } else { klog.InfoDepth(1, msg) } } func (l logger) Info(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.InfoDepth(1, levelTag[LevelInfo], log.aligned[l], msg) } else { klog.InfoDepth(1, msg) } } func (l logger) Warn(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.WarningDepth(1, levelTag[LevelWarn], log.aligned[l], msg) } else { klog.WarningDepth(1, msg) } } func (l logger) Error(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.ErrorDepth(1, levelTag[LevelError], log.aligned[l], msg) } else { klog.ErrorDepth(1, msg) } } func (l logger) Fatal(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.ExitDepth(1, levelTag[LevelFatal], log.aligned[l], msg) } else { klog.ExitDepth(1, msg) } } func (l logger) Panic(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.ErrorDepth(1, levelTag[LevelPanic], log.aligned[l], msg) } else { klog.ErrorDepth(1, msg) } panic(msg) } func (l logger) DebugBlock(prefix string, format string, args ...interface{}) { if l.DebugEnabled() { l.block(LevelDebug, prefix, format, args...) } } func (l logger) InfoBlock(prefix string, format string, args ...interface{}) { l.block(LevelInfo, prefix, format, args...) } func (l logger) WarnBlock(prefix string, format string, args ...interface{}) { l.block(LevelWarn, prefix, format, args...) } func (l logger) ErrorBlock(prefix string, format string, args ...interface{}) { l.block(LevelError, prefix, format, args...) } func (l logger) block(level Level, prefix, format string, args ...interface{}) { log.Lock() defer log.Unlock() var logFn func(int, ...interface{}) switch level { case LevelDebug, LevelInfo: logFn = klog.InfoDepth case LevelWarn: logFn = klog.WarningDepth case LevelError: logFn = klog.ErrorDepth default: return } if log.prefix { src := log.aligned[l] for _, msg := range strings.Split(fmt.Sprintf(format, args...), "\n") { logFn(2, levelTag[level], src, prefix, msg) } } else { for _, msg := range strings.Split(fmt.Sprintf(format, args...), "\n") { logFn(2, prefix, msg) } } } // loggerError produces a formatted logger-specific error. func loggerError(format string, args ...interface{}) error { return fmt.Errorf("logger: "+format, args...) } func (l logger) Debugf(format string, args ...interface{}) { l.Debug(format, args...) } func (l logger) Infof(format string, args ...interface{}) { l.Info(format, args...) } func (l logger) Warnf(format string, args ...interface{}) { l.Warn(format, args...) } func (l logger) Errorf(format string, args ...interface{}) { l.Error(format, args...) } func (l logger) Panicf(format string, args ...interface{}) { l.Panic(format, args...) } func (l logger) Fatalf(format string, args ...interface{}) { l.Fatal(format, args...) } ================================================ FILE: pkg/log/ratelimit.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "fmt" "sync" "time" goxrate "golang.org/x/time/rate" ) // Rate specifies maximum per-message logging rate. type Rate struct { // rate limit Limit goxrate.Limit // allowed bursts Burst int // optional message window size Window int } // ratelimited implements rate-limited logging with a sliding window of unique messages. type ratelimited struct { Logger sync.Mutex rate Rate window []string limits map[string]*goxrate.Limiter } const ( // DefaultWindow is the default message window size for rate limiting. DefaultWindow = 256 // MinimumWindow is the smallest message window size for rate limiting. MinimumWindow = 32 ) // Every defines a rate limit for the given interval. func Every(interval time.Duration) goxrate.Limit { return goxrate.Every(interval) } // Interval returns a Rate for the given interval. func Interval(interval time.Duration) Rate { return Rate{Limit: Every(interval), Burst: 1} } // RateLimit returns a ratelimited version of the given logger. func RateLimit(log Logger, rate Rate) Logger { switch { case rate.Window == 0: rate.Window = DefaultWindow case rate.Window < MinimumWindow: rate.Window = MinimumWindow } if rate.Burst < 1 { rate.Burst = 1 } return &ratelimited{ Logger: log, rate: rate, window: make([]string, 0, rate.Window), limits: make(map[string]*goxrate.Limiter), } } func (rl *ratelimited) Debug(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if limit := rl.getMessageLimit(msg); limit.Allow() { rl.Logger.Debug(" %s", msg) } } func (rl *ratelimited) Info(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if limit := rl.getMessageLimit(msg); limit.Allow() { rl.Logger.Info(" %s", msg) } } func (rl *ratelimited) Warn(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if limit := rl.getMessageLimit(msg); limit.Allow() { rl.Logger.Warn(" %s", msg) } } func (rl *ratelimited) Error(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if limit := rl.getMessageLimit(msg); limit.Allow() { rl.Logger.Error(" %s", msg) } } // Get existing message limit or create a new one, shifting out the oldest if window is full. func (rl *ratelimited) getMessageLimit(msg string) *goxrate.Limiter { rl.Lock() defer rl.Unlock() limit, ok := rl.limits[msg] if ok { return limit } limit = goxrate.NewLimiter(rl.rate.Limit, rl.rate.Burst) if len(rl.limits) == rl.rate.Window { delete(rl.limits, rl.window[0]) rl.window = rl.window[1:] } rl.window = append(rl.window, msg) rl.limits[msg] = limit return limit } ================================================ FILE: pkg/log/ratelimit_test.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package log

import (
    "fmt"
    "testing"
    "time"

    goxrate "golang.org/x/time/rate"
)

func TestRateLimit(t *testing.T) {
    ratelimit := RateLimit(Default(), Rate{Window: MinimumWindow, Limit: Every(time.Second)})
    rl := ratelimit.(*ratelimited)
    limiters := make(map[string]*goxrate.Limiter)

    // fill message window, store limiters for checking
    messages := make([]string, 0, MinimumWindow)
    for idx := 0; idx < cap(messages); idx++ {
        msg := fmt.Sprintf("message #%d", idx)
        messages = append(messages, msg)
        limiters[msg] = rl.getMessageLimit(msg)
    }
    // check looked up vs. stored limiters
    for msg, limiter := range limiters {
        if rl.getMessageLimit(msg) != limiter {
            t.Errorf("unexpected new limiter for message %s", msg)
        }
    }
    // create more messages, store limiters for checking
    recent := make([]string, 0, MinimumWindow/5)
    for i := 0; i < cap(recent); i++ {
        msg := fmt.Sprintf("message #%d", len(messages)+i)
        recent = append(recent, msg)
        limiters[msg] = rl.getMessageLimit(msg)
    }
    // check looked up vs. stored limiters
    for _, msg := range recent {
        if rl.getMessageLimit(msg) != limiters[msg] {
            t.Errorf("unexpected new limiter for recent message %s", msg)
        }
    }
    // check in-window part of old messages
    for idx := len(recent); idx < len(messages); idx++ {
        msg := messages[idx]
        l := rl.getMessageLimit(msg)
        if l != limiters[msg] {
            t.Errorf("unexpected new limiter for old message %s", msg)
        }
    }
    // check shifted out part of old messages
    for idx := 0; idx < len(recent); idx++ {
        msg := messages[idx]
        l := rl.getMessageLimit(msg)
        if l == limiters[msg] {
            t.Errorf("unexpected old limiter for old message %s", msg)
        }
    }
}

================================================
FILE: pkg/log/signal.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package log

import (
    "os"
    "os/signal"
)

// signal notification channel
var signals chan os.Signal

// SetupDebugToggleSignal sets up a signal handler to toggle full debugging on/off.
func SetupDebugToggleSignal(sig os.Signal) {
    log.Lock()
    defer log.Unlock()
    clearDebugToggleSignal()
    signals = make(chan os.Signal, 1)
    signal.Notify(signals, sig)
    go func(sig <-chan os.Signal) {
        state := map[bool]string{false: "off", true: "on"}
        for {
            select {
            case _, ok := <-sig:
                if !ok {
                    return
                }
            }
            log.forced = !log.forced
            deflog.Warn("forced full debugging is now %s...", state[log.forced])
        }
    }(signals)
}

// ClearDebugToggleSignal removes any signal handlers for toggling debug on/off.
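A minimal sketch of the rate-limited wrapper exercised by the test above. Note that messages are formatted before limiting, so every distinct formatted string gets its own limiter within the sliding window of unique messages.

package main

import (
    "time"

    logger "github.com/intel/cri-resource-manager/pkg/log"
)

func main() {
    // Let each unique message through at most once per second.
    rl := logger.RateLimit(logger.NewLogger("flood"), logger.Interval(time.Second))

    for i := 0; i < 5; i++ {
        rl.Warn("link flapped")         // emitted once, then suppressed
        rl.Warn("attempt %d failed", i) // unique per i, so never suppressed here
    }
}
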
func ClearDebugToggleSignal() { log.Lock() defer log.Unlock() clearDebugToggleSignal() } func clearDebugToggleSignal() { if signals != nil { signal.Stop(signals) close(signals) signals = nil } } ================================================ FILE: pkg/log/stdlog-logger.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( stdlog "log" ) // stdlogger implements an io.Writer to redirect logging by the stock log package. type stdlogger struct { l Logger } // SetStdLogger sets up a logger for the standard log package. func SetStdLogger(source string) { var l Logger if source == "" { l = Default() } else { l = log.get(source) } stdlog.SetPrefix("") stdlog.SetFlags(0) stdlog.SetOutput(&stdlogger{l: l}) } // Write implements io.Writer for stdlogger. func (s *stdlogger) Write(p []byte) (int, error) { s.l.Debug("%s", string(p)) return len(p), nil } ================================================ FILE: pkg/metrics/metrics.go ================================================ package metrics import ( "fmt" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/prometheus/client_golang/prometheus" ) var ( builtInCollectors = make(map[string]InitCollector) registeredCollectors = []prometheus.Collector{} initializedCollectors = make(map[string]struct{}) log = logger.NewLogger("collectors") ) // InitCollector is the type for functions that initialize collectors. type InitCollector func() (prometheus.Collector, error) // RegisterCollector registers the named prometheus.Collector for metrics collection. func RegisterCollector(name string, init InitCollector) error { log.Info("registering collector %s...", name) if _, found := builtInCollectors[name]; found { return metricsError("Collector %s already registered", name) } builtInCollectors[name] = init return nil } // NewMetricGatherer creates a new prometheus.Gatherer with all registered collectors. func NewMetricGatherer() (prometheus.Gatherer, error) { reg := prometheus.NewPedanticRegistry() for name, cb := range builtInCollectors { if _, ok := initializedCollectors[name]; ok { continue } c, err := cb() if err != nil { log.Error("Failed to initialize collector '%s': %v. Skipping it.", name, err) continue } registeredCollectors = append(registeredCollectors, c) initializedCollectors[name] = struct{}{} } reg.MustRegister(registeredCollectors[:]...) return reg, nil } func metricsError(format string, args ...interface{}) error { return fmt.Errorf("metrics: "+format, args...) } ================================================ FILE: pkg/metrics/register/register_metrics.go ================================================ package register import ( // Pull in cgroup-based metric collector. 
_ "github.com/intel/cri-resource-manager/pkg/cgroupstats" ) ================================================ FILE: pkg/metrics/register/register_metrics_avx.go ================================================ //go:build !noavx // +build !noavx package register import ( // Pull in avx collector. _ "github.com/intel/cri-resource-manager/pkg/avx" ) ================================================ FILE: pkg/pidfile/pidfile.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pidfile import ( "fmt" "os" "path/filepath" "strconv" "strings" "syscall" "github.com/pkg/errors" ) var ( pidFilePath = defaultPath() pidFile *os.File ) // GetPath returns the current pidfile path. func GetPath() string { return pidFilePath } // SetPath sets the pidfile path to the given one. func SetPath(path string) { closePIDFile() pidFilePath = path } // Write opens the PID file and writes os.Getpid() to it. If the PID file already // exists Write() fails with an error. On successful completion, Write keeps the // PID file open. func Write() error { if pidFile != nil { return nil } err := os.MkdirAll(filepath.Dir(pidFilePath), 0755) if err != nil { return errors.Wrap(err, "failed to create PID file") } pidFile, err = os.OpenFile(pidFilePath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0644) if err != nil { return errors.Wrap(err, "failed to create PID file") } _, err = pidFile.Write([]byte(fmt.Sprintf("%d\n", os.Getpid()))) if err != nil { closePIDFile() return errors.Wrap(err, "failed to write PID file") } return nil } // Read reads the content of the PID file. It returns the process ID found // in the file. If opening the file or reading an integer process ID fails // Read() returns -1 and an error. func Read() (int, error) { var ( pid int buf []byte err error ) if buf, err = os.ReadFile(pidFilePath); err != nil { if os.IsNotExist(err) { return 0, nil } return -1, errors.Wrap(err, "failed to read PID file") } if pid, err = strconv.Atoi(strings.TrimRight(string(buf), "\n")); err != nil { return -1, errors.Wrapf(err, "invalid PID (%q) in PID file", string(buf)) } return pid, nil } // closePIDFile closes the PID file and truncates it to zero length. func closePIDFile() { if pidFile != nil { pidFile.Truncate(0) pidFile.Close() pidFile = nil } } // Remove removes the PID file for the process unconditionally, regardless if // the current process had created the PID file or not. func Remove() error { closePIDFile() err := os.Remove(pidFilePath) if err != nil { if os.IsNotExist(err) { return nil } } return err } // OwnerPid returns the ID of the process owning the PID file. 0 is returned // if it is known that no process owns the file. -1 and an error is returned // if the owner or its existence could not be determined. 
func OwnerPid() (int, error) { var ( pid int p *os.Process err error ) pid, err = Read() if err != nil { return -1, err } if pid == 0 { return 0, nil } p, err = os.FindProcess(pid) if err != nil { return -1, errors.Wrapf(err, "FindProcess() failed for PID %d", pid) } err = p.Signal(syscall.Signal(0)) if err == os.ErrProcessDone { return 0, nil } if err == nil { return pid, nil } return -1, errors.Wrapf(err, "failed to check process %d", pid) } // defaultPath returns the default pidfile path. func defaultPath() string { var path string if len(os.Args) > 0 { name := filepath.Base(os.Args[0]) if euid := os.Geteuid(); euid > 0 { path = filepath.Join("/tmp", name+".pid") } else { path = filepath.Join("/", "var", "run", name+".pid") } } return path } ================================================ FILE: pkg/pidfile/pidfile_test.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pidfile import ( "os" "path/filepath" "testing" "github.com/pkg/errors" "github.com/stretchr/testify/require" ) const ( testPidFile = "pidfile-test.pid" ) func prepare(t *testing.T) string { dir, err := mkTestDir(t) if err != nil { t.Errorf("failed to create test directory: %v", err) os.Exit(1) } SetPath(filepath.Join(dir, testPidFile)) return dir } func TestDefaults(t *testing.T) { t.Run("TestDefaults", func(t *testing.T) { var ( pid int err error ) Remove() err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) closePIDFile() err = Write() require.NotNil(t, err) Remove() err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) }) } func TestGetSetPath(t *testing.T) { t.Run("TestTestGetSetPath", func(t *testing.T) { var ( dir string path string ) dir = prepare(t) path = GetPath() require.Equal(t, path, filepath.Join(dir, testPidFile)) }) } func TestReadNonExisting(t *testing.T) { t.Run("TestReadNonExisting", func(t *testing.T) { var ( pid int err error ) prepare(t) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, 0) }) } func TestRemoveNonExisting(t *testing.T) { t.Run("TestRemoveNonExisting", func(t *testing.T) { prepare(t) err := Remove() require.Nil(t, err) }) } func TestRemoveExisting(t *testing.T) { t.Run("TestRemoveExisting", func(t *testing.T) { var ( err error ) prepare(t) err = Write() require.Nil(t, err) err = Remove() require.Nil(t, err) }) } func TestWrite(t *testing.T) { t.Run("TestWrite", func(t *testing.T) { var ( pid int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) }) } func TestReadClosed(t *testing.T) { t.Run("TestReadClosed", func(t *testing.T) { var ( pid int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) 
closePIDFile() pid, err = Read() require.NotNil(t, err) require.Equal(t, pid, -1) }) } func TestFailToOverwrite(t *testing.T) { t.Run("TestFailToOverwrite", func(t *testing.T) { var ( pid int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) closePIDFile() err = Write() require.NotNil(t, err) }) } func TestRemoveToOverwrite(t *testing.T) { t.Run("TestRemoveToOverwrite", func(t *testing.T) { var ( pid int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) err = Remove() require.Nil(t, err) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) }) } func TestOwnerPid(t *testing.T) { t.Run("TestOwnerPid", func(t *testing.T) { var ( pid int chk int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) chk, err = OwnerPid() require.Nil(t, err) require.Equal(t, pid, chk) }) } func mkTestDir(t *testing.T) (string, error) { tmp, err := os.MkdirTemp("", ".pidfile-test*") if err != nil { return "", errors.Wrapf(err, "failed to create test directory") } t.Cleanup(func() { os.RemoveAll(tmp) }) return tmp, nil } ================================================ FILE: pkg/policycollector/collector.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package policycollector import ( "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/metrics" "github.com/prometheus/client_golang/prometheus" ) type PolicyCollector struct { policy policy.Policy } func (c *PolicyCollector) SetPolicy(policy policy.Policy) { c.policy = policy } // HasPolicySpecificMetrics judges whether the policy defines the policy-specific metrics func (c *PolicyCollector) HasPolicySpecificMetrics() bool { if c.policy.DescribeMetrics() == nil { return false } return true } // Describe implements prometheus.Collector interface func (c *PolicyCollector) Describe(ch chan<- *prometheus.Desc) { for _, d := range c.policy.DescribeMetrics() { ch <- d } } // Collect implements prometheus.Collector interface func (c *PolicyCollector) Collect(ch chan<- prometheus.Metric) { prometheusMetrics, err := c.policy.CollectMetrics(c.policy.PollMetrics()) if err != nil { return } for _, m := range prometheusMetrics { ch <- m } } // RegisterPolicyMetricsCollector registers policy-specific collector func (c *PolicyCollector) RegisterPolicyMetricsCollector() error { return metrics.RegisterCollector("policyMetrics", func() (prometheus.Collector, error) { return c, nil }) } ================================================ FILE: pkg/procstats/procstats.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package procstats import ( "os" "strconv" "strings" "sync" "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/sysfs" ) // CPUTimeStat is used to calculate the CPU usage. type CPUTimeStat struct { sync.RWMutex PrevIdleTime []uint64 PrevTotalTime []uint64 CurIdleTime []uint64 CurTotalTime []uint64 DeltaIdleTime []uint64 DeltaTotalTime []uint64 CPUUsage []float64 IsGetCPUUsageBegin bool } var ( // procRoot is the mount point for the proc filesystem procRoot = "/proc" procStat = procRoot + "/stat" ) // GetCPUTimeStat calculates CPU usage by using the CPU time statistics from /proc/stat func (t *CPUTimeStat) GetCPUTimeStat() error { // /proc/stat looks like this: // cpuid: user, nice, system, idle, iowait, irq, softirq // cpu 130216 19944 162525 1491240 3784 24749 17773 0 0 0 // cpu0 40321 11452 49784 403099 2615 6076 6748 0 0 0 // cpu1 26585 2425 36639 151166 404 2533 3541 0 0 0 // ... stats, err := os.ReadFile(procStat) if err != nil { return err } t.Lock() defer t.Unlock() sys, err := sysfs.DiscoverSystem() if err != nil { return err } cpuCount := len(sys.CPUIDs()) for index, line := range strings.Split(string(stats), "\n") { if index > cpuCount { break } split := strings.Split(line, " ") if strings.HasPrefix(split[0], "cpu") && split[0] != "cpu" { i, err := strconv.Atoi(split[0][3:]) if err != nil { log.Error("Fail to get CPU index.") return err } t.CurIdleTime[i], err = strconv.ParseUint(split[4], 10, 64) if err != nil { log.Error("Fail to get idle time.") return err } totalTime := uint64(0) for _, s := range split[1:] { u, err := strconv.ParseUint(s, 10, 64) if err == nil { totalTime += u } } t.CurTotalTime[i] = totalTime t.CPUUsage[i] = 0.0 if t.IsGetCPUUsageBegin { t.DeltaIdleTime[i] = t.CurIdleTime[i] - t.PrevIdleTime[i] t.DeltaTotalTime[i] = t.CurTotalTime[i] - t.PrevTotalTime[i] if t.DeltaTotalTime[i] != 0 { t.CPUUsage[i] = (1.0 - float64(t.DeltaIdleTime[i])/float64(t.DeltaTotalTime[i])) * 100.0 } } t.PrevIdleTime[i] = t.CurIdleTime[i] t.PrevTotalTime[i] = t.CurTotalTime[i] } } for _, i := range sys.Offlined().List() { t.DeltaIdleTime[i] = 0.0 t.DeltaTotalTime[i] = 0.0 t.PrevIdleTime[i] = t.CurIdleTime[i] t.PrevTotalTime[i] = t.CurTotalTime[i] t.CPUUsage[i] = 0.0 } t.IsGetCPUUsageBegin = true return nil } ================================================ FILE: pkg/sysfs/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and
// limitations under the License.

package sysfs

import (
    "fmt"
)

func sysfsError(path, format string, args ...interface{}) error {
    return fmt.Errorf(path+": "+format, args...)
}

================================================
FILE: pkg/sysfs/parsers.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sysfs

import (
    "os"
    "strconv"
    "strings"
)

// unit multipliers
const (
    k = (int64(1) << 10)
    M = (int64(1) << 20)
    G = (int64(1) << 30)
    T = (int64(1) << 40)
)

// unit name to multiplier mapping
var units = map[string]int64{
    "k": k, "kB": k,
    "M": M, "MB": M,
    "G": G, "GB": G,
    "T": T, "TB": T,
}

// PickEntryFn picks a given input line apart into an entry of key and value.
type PickEntryFn func(string) (string, string, error)

// splitNumericAndUnit splits a string into a numeric and a unit part.
func splitNumericAndUnit(path string, value string) (string, int64, error) {
    fields := strings.Fields(value)
    switch len(fields) {
    case 1:
        return fields[0], 1, nil
    case 2:
        num := fields[0]
        unit, ok := units[fields[1]]
        if !ok {
            return "", -1, sysfsError(path, "failed to parse '%s': invalid unit '%s'", value, fields[1])
        }
        return num, unit, nil
    }
    return "", -1, sysfsError(path, "invalid numeric value %s", value)
}

// parseNumeric parses a numeric string into a numeric value of the right type and size.
func parseNumeric(path, value string, ptr interface{}) error {
    var numstr string
    var num, unit int64
    var f float64
    var err error

    if numstr, unit, err = splitNumericAndUnit(path, value); err != nil {
        return err
    }

    switch ptr.(type) {
    case *int:
        num, err = strconv.ParseInt(numstr, 0, strconv.IntSize)
        *ptr.(*int) = int(num * unit)
    case *int8:
        num, err = strconv.ParseInt(numstr, 0, 8)
        *ptr.(*int8) = int8(num * unit)
    case *int16:
        num, err = strconv.ParseInt(numstr, 0, 16)
        *ptr.(*int16) = int16(num * unit)
    case *int32:
        num, err = strconv.ParseInt(numstr, 0, 32)
        *ptr.(*int32) = int32(num * unit)
    case *int64:
        num, err = strconv.ParseInt(numstr, 0, 64)
        *ptr.(*int64) = int64(num * unit)
    case *uint:
        num, err = strconv.ParseInt(numstr, 0, strconv.IntSize)
        *ptr.(*uint) = uint(num * unit)
    case *uint8:
        num, err = strconv.ParseInt(numstr, 0, 8)
        *ptr.(*uint8) = uint8(num * unit)
    case *uint16:
        num, err = strconv.ParseInt(numstr, 0, 16)
        *ptr.(*uint16) = uint16(num * unit)
    case *uint32:
        num, err = strconv.ParseInt(numstr, 0, 32)
        *ptr.(*uint32) = uint32(num * unit)
    case *uint64:
        num, err = strconv.ParseInt(numstr, 0, 64)
        *ptr.(*uint64) = uint64(num * unit)
    case *float32:
        f, err = strconv.ParseFloat(numstr, 32)
        *ptr.(*float32) = float32(f) * float32(unit)
    case *float64:
        f, err = strconv.ParseFloat(numstr, 64)
        *ptr.(*float64) = f * float64(unit)
    default:
        err = sysfsError(path, "can't parse numeric value '%s' into type %T", value, ptr)
    }

    return err
}
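A minimal sketch of driving ParseFileEntries (defined next) with a custom PickEntryFn, assuming /proc/meminfo-style "Key: value kB" input; unit suffixes are scaled with the multiplier table above.

package main

import (
    "fmt"
    "strings"

    "github.com/intel/cri-resource-manager/pkg/sysfs"
)

func main() {
    var total, free uint64
    err := sysfs.ParseFileEntries("/proc/meminfo", map[string]interface{}{
        "MemTotal": &total,
        "MemFree":  &free,
    }, func(line string) (string, string, error) {
        // Split "MemTotal:       16384 kB" into "MemTotal" and "16384 kB".
        key, value, ok := strings.Cut(line, ":")
        if !ok {
            return "", "", nil // non-matching lines are simply skipped
        }
        return key, strings.TrimSpace(value), nil
    })
    if err != nil {
        fmt.Println("parse failed:", err)
        return
    }
    fmt.Printf("total %d, free %d bytes\n", total, free)
}
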
// ParseFileEntries parses a sysfs file for the given entries.
func ParseFileEntries(path string, values map[string]interface{}, pickFn PickEntryFn) error {
    var err error

    data, err := os.ReadFile(path)
    if err != nil {
        return sysfsError(path, "failed to read file: %v", err)
    }

    left := len(values)
    for _, line := range strings.Split(string(data), "\n") {
        key, value, err := pickFn(line)
        if err != nil {
            return err
        }
        ptr, ok := values[key]
        if !ok {
            continue
        }
        switch ptr.(type) {
        case *int, *int8, *int32, *int16, *int64, *uint, *uint8, *uint16, *uint32, *uint64:
            if err = parseNumeric(path, value, ptr); err != nil {
                return err
            }
        case *float32, *float64:
            if err = parseNumeric(path, value, ptr); err != nil {
                return err
            }
        case *string:
            *ptr.(*string) = value
        case *bool:
            *ptr.(*bool), err = strconv.ParseBool(value)
            if err != nil {
                return sysfsError(path, "failed to parse line %s, value '%s' for boolean key '%s'",
                    line, value, key)
            }
        default:
            return sysfsError(path, "don't know how to parse key '%s' of type %T", key, ptr)
        }
        left--
        if left == 0 {
            break
        }
    }
    return nil
}

================================================
FILE: pkg/sysfs/system.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sysfs

import (
    "fmt"
    "os"
    "path/filepath"
    "sort"
    "strconv"
    "strings"

    logger "github.com/intel/cri-resource-manager/pkg/log"
    "github.com/intel/cri-resource-manager/pkg/utils"
    "github.com/intel/cri-resource-manager/pkg/utils/cpuset"
    "github.com/intel/goresctrl/pkg/sst"
    idset "github.com/intel/goresctrl/pkg/utils"
)

var (
    // Parent directory under which host sysfs, etc. is mounted (if non-standard location).
sysRoot = "" ) const ( // sysfs devices/cpu subdirectory path sysfsCPUPath = "devices/system/cpu" // sysfs device/node subdirectory path sysfsNumaNodePath = "devices/system/node" ) // MemoryType is an enum for the Node memory type MemoryType int const ( // MemoryTypeDRAM means that the node has regular DRAM-type memory MemoryTypeDRAM MemoryType = iota // MemoryTypePMEM means that the node has persistent memory MemoryTypePMEM // MemoryTypeHBM means that the node has high bandwidth memory MemoryTypeHBM ) // System devices type System interface { Discover() error SetCpusOnline(online bool, cpus idset.IDSet) (idset.IDSet, error) SetCPUFrequencyLimits(min, max uint64, cpus idset.IDSet) error PackageIDs() []idset.ID NodeIDs() []idset.ID CPUIDs() []idset.ID PackageCount() int SocketCount() int CPUCount() int NUMANodeCount() int ThreadCount() int CPUSet() cpuset.CPUSet Package(id idset.ID) CPUPackage Node(id idset.ID) Node NodeDistance(from, to idset.ID) int CPU(id idset.ID) CPU Offlined() cpuset.CPUSet Isolated() cpuset.CPUSet } // System devices type system struct { logger.Logger // our logger instance path string // sysfs mount point packages map[idset.ID]*cpuPackage // physical packages nodes map[idset.ID]*node // NUMA nodes cpus map[idset.ID]*cpu // CPUs cache map[idset.ID]*Cache // Cache offline idset.IDSet // offlined CPUs isolated idset.IDSet // isolated CPUs threads int // hyperthreads per core } // CPUPackage is a physical package (a collection of CPUs). type CPUPackage interface { ID() idset.ID CPUSet() cpuset.CPUSet DieIDs() []idset.ID NodeIDs() []idset.ID DieNodeIDs(idset.ID) []idset.ID DieCPUSet(idset.ID) cpuset.CPUSet SstInfo() *sst.SstPackageInfo } type cpuPackage struct { id idset.ID // package id cpus idset.IDSet // CPUs in this package nodes idset.IDSet // nodes in this package dies idset.IDSet // dies in this package dieCPUs map[idset.ID]idset.IDSet // CPUs per die dieNodes map[idset.ID]idset.IDSet // NUMA nodes per die sstInfo *sst.SstPackageInfo // Speed Select Technology info } // Node represents a NUMA node. type Node interface { ID() idset.ID PackageID() idset.ID DieID() idset.ID CPUSet() cpuset.CPUSet Distance() []int DistanceFrom(id idset.ID) int MemoryInfo() (*MemInfo, error) GetMemoryType() MemoryType HasNormalMemory() bool } type node struct { path string // sysfs path id idset.ID // node id pkg idset.ID // package id die idset.ID // die id cpus idset.IDSet // cpus in this node memoryType MemoryType // node memory type normalMem bool // node has memory in a normal (kernel space allocatable) zone distance []int // distance/cost to other NUMA nodes } // CPU is a CPU core. 
type CPU interface { ID() idset.ID PackageID() idset.ID DieID() idset.ID NodeID() idset.ID CoreID() idset.ID ThreadCPUSet() cpuset.CPUSet BaseFrequency() uint64 FrequencyRange() CPUFreq EPP() EPP Online() bool Isolated() bool SetFrequencyLimits(min, max uint64) error SstClos() int } type cpu struct { path string // sysfs path id idset.ID // CPU id pkg idset.ID // package id die idset.ID // die id node idset.ID // node id core idset.ID // core id threads idset.IDSet // sibling/hyper-threads baseFreq uint64 // CPU base frequency freq CPUFreq // CPU frequencies epp EPP // Energy Performance Preference from cpufreq governor online bool // whether this CPU is online isolated bool // whether this CPU is isolated sstClos int // SST-CP CLOS the CPU is associated with } // CPUFreq is a CPU frequency scaling range type CPUFreq struct { min uint64 // minimum frequency (kHz) max uint64 // maximum frequency (kHz) all []uint64 // discrete set of frequencies if applicable/known } // EPP represents the value of a CPU energy performance profile type EPP int const ( EPPPerformance EPP = iota EPPBalancePerformance EPPBalancePower EPPPower EPPUnknown ) // MemInfo contains data read from a NUMA node meminfo file. type MemInfo struct { MemTotal uint64 MemFree uint64 MemUsed uint64 } // CPU cache. // Notes: cache-discovery is forced off now (by forcibly clearing the related discovery bit) // Can't seem to make sense of the cache information exposed under sysfs. The cache ids // do not seem to be unique, which IIUC is contrary to the documentation. // CacheType specifies a cache type. type CacheType string const ( // DataCache marks data cache. DataCache CacheType = "Data" // InstructionCache marks instruction cache. InstructionCache CacheType = "Instruction" // UnifiedCache marks a unified data/instruction cache. UnifiedCache CacheType = "Unified" ) // Cache has details about cache. type Cache struct { id idset.ID // cache id kind CacheType // cache type size uint64 // cache size level uint8 // cache level cpus idset.IDSet // CPUs sharing this cache } // SetSysRoot sets the sys root directory. func SetSysRoot(path string) { sysRoot = path } // SysRoot returns the sys root directory. func SysRoot() string { return sysRoot } // DiscoverSystem performs discovery of the running systems details. func DiscoverSystem() (System, error) { return DiscoverSystemAt(filepath.Join("/", sysRoot, "sys")) } // DiscoverSystemAt performs discovery of the running systems details from sysfs mounted at path. func DiscoverSystemAt(path string) (System, error) { sys := &system{ Logger: logger.NewLogger("sysfs"), path: path, offline: idset.NewIDSet(), } if err := sys.Discover(); err != nil { return nil, err } return sys, nil } // Discover performs system/hardware discovery. 
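A minimal sketch of discovering and querying the system topology through the interfaces above; "usable" CPUs are derived here by subtracting the offlined and isolated sets.

package main

import (
    "fmt"

    "github.com/intel/cri-resource-manager/pkg/sysfs"
)

func main() {
    sys, err := sysfs.DiscoverSystem()
    if err != nil {
        fmt.Println("discovery failed:", err)
        return
    }
    fmt.Printf("%d package(s), %d NUMA node(s), %d CPU(s), %d thread(s)/core\n",
        sys.PackageCount(), sys.NUMANodeCount(), sys.CPUCount(), sys.ThreadCount())
    for _, id := range sys.PackageIDs() {
        fmt.Printf("package #%d: cpus %s\n", id, sys.Package(id).CPUSet().String())
    }
    // CPUs generally usable for workloads: all CPUs minus offlined and isolated ones.
    usable := sys.CPUSet().Difference(sys.Offlined()).Difference(sys.Isolated())
    fmt.Println("usable CPUs:", usable.String())
}
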
func (sys *system) Discover() error { if err := sys.discoverCPUs(); err != nil { return err } if err := sys.discoverNodes(); err != nil { return err } if err := sys.discoverPackages(); err != nil { return err } if err := sys.discoverSst(); err != nil { // Just consider SST unsupported if our detection fails for some reason sys.Warn("%v", err) } if len(sys.nodes) > 0 { for _, pkg := range sys.packages { for _, nodeID := range pkg.nodes.SortedMembers() { if node, ok := sys.nodes[nodeID]; ok { node.pkg = pkg.id } else { return sysfsError("NUMA nodes", "can't find NUMA node for ID %d", nodeID) } } for _, dieID := range pkg.DieIDs() { for _, nodeID := range pkg.DieNodeIDs(dieID) { if node, ok := sys.nodes[nodeID]; ok { node.die = dieID } else { return sysfsError("NUMA nodes", "can't find NUMA node for ID %d", nodeID) } } } } } if sys.DebugEnabled() { for id, pkg := range sys.packages { sys.Info("package #%d:", id) sys.Debug(" cpus: %s", pkg.cpus) sys.Debug(" nodes: %s", pkg.nodes) sys.Debug(" dies: %s", pkg.dies) for die := range pkg.dies { sys.Debug(" die #%v nodes: %v", die, pkg.DieNodeIDs(die)) sys.Debug(" die #%v cpus: %s", die, pkg.DieCPUSet(die).String()) } } for id, node := range sys.nodes { sys.Debug("node #%d:", id) sys.Debug(" cpus: %s", node.cpus) sys.Debug(" distance: %v", node.distance) sys.Debug(" package: #%d", node.pkg) sys.Debug(" die: #%d", node.die) } for id, cpu := range sys.cpus { sys.Debug("CPU #%d:", id) sys.Debug(" pkg: %d", cpu.pkg) sys.Debug(" die: %d", cpu.die) sys.Debug(" node: %d", cpu.node) sys.Debug(" core: %d", cpu.core) sys.Debug(" threads: %s", cpu.threads) sys.Debug(" base freq: %d", cpu.baseFreq) sys.Debug(" freq: %d - %d", cpu.freq.min, cpu.freq.max) sys.Debug(" epp: %d", cpu.epp) } sys.Debug("offline CPUs: %s", sys.offline) sys.Debug("isolated CPUs: %s", sys.isolated) for id, cch := range sys.cache { sys.Debug("cache #%d:", id) sys.Debug(" type: %v", cch.kind) sys.Debug(" size: %d", cch.size) sys.Debug(" level: %d", cch.level) sys.Debug(" CPUs: %s", cch.cpus) } } return nil } // SetCpusOnline puts a set of CPUs online. Return the toggled set. Nil set implies all CPUs. func (sys *system) SetCpusOnline(online bool, cpus idset.IDSet) (idset.IDSet, error) { var entries []string if cpus == nil { entries, _ = filepath.Glob(filepath.Join(sys.path, sysfsCPUPath, "cpu[0-9]*")) } else { entries = make([]string, cpus.Size()) for idx, id := range cpus.Members() { entries[idx] = sys.path + "/" + sysfsCPUPath + "/cpu" + strconv.Itoa(int(id)) } } desired := map[bool]int{false: 0, true: 1}[online] changed := idset.NewIDSet() for _, entry := range entries { var current int id := getEnumeratedID(entry) if id <= 0 { continue } if _, err := writeSysfsEntry(entry, "online", desired, ¤t); err != nil { return nil, sysfsError(entry, "failed to set online to %d: %v", desired, err) } if desired != current { changed.Add(id) if cpu, found := sys.cpus[id]; found { cpu.online = online if online { sys.offline.Del(id) } else { sys.offline.Add(id) } } } } return changed, nil } // SetCPUFrequencyLimits sets the CPU frequency scaling limits. Nil set implies all CPUs. func (sys *system) SetCPUFrequencyLimits(min, max uint64, cpus idset.IDSet) error { if cpus == nil { cpus = idset.NewIDSet(sys.CPUIDs()...) } for _, id := range cpus.Members() { if cpu, ok := sys.cpus[id]; ok { if err := cpu.SetFrequencyLimits(min, max); err != nil { return err } } } return nil } // PackageIDs gets the ids of all packages present in the system. 
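A sketch of the two mutating operations above. Both write sysfs and need root; the CPU ids are made up, and the frequency unit (Hz) is an assumption inferred from the division by 1000 against the kHz-scale sysfs values in cpu.SetFrequencyLimits further below.

package main

import (
    "fmt"

    "github.com/intel/cri-resource-manager/pkg/sysfs"
    idset "github.com/intel/goresctrl/pkg/utils"
)

func main() {
    sys, err := sysfs.DiscoverSystem()
    if err != nil {
        fmt.Println("discovery failed:", err)
        return
    }
    // Take CPUs 2 and 3 offline; the returned set holds the CPUs actually toggled.
    toggled, err := sys.SetCpusOnline(false, idset.NewIDSet(2, 3))
    if err != nil {
        fmt.Println("offlining failed:", err)
        return
    }
    fmt.Println("toggled:", toggled)
    // Clamp the scaling range of all CPUs (nil set) to 1.2-2.4 GHz (values assumed in Hz).
    if err := sys.SetCPUFrequencyLimits(1200000000, 2400000000, nil); err != nil {
        fmt.Println("setting frequency limits failed:", err)
    }
}
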
func (sys *system) PackageIDs() []idset.ID {
    ids := make([]idset.ID, len(sys.packages))
    idx := 0
    for id := range sys.packages {
        ids[idx] = id
        idx++
    }
    sort.Slice(ids, func(i, j int) bool {
        return int(ids[i]) < int(ids[j])
    })
    return ids
}

// NodeIDs gets the ids of all NUMA nodes present in the system.
func (sys *system) NodeIDs() []idset.ID {
    ids := make([]idset.ID, len(sys.nodes))
    idx := 0
    for id := range sys.nodes {
        ids[idx] = id
        idx++
    }
    sort.Slice(ids, func(i, j int) bool {
        return int(ids[i]) < int(ids[j])
    })
    return ids
}

// CPUIDs gets the ids of all CPUs present in the system.
func (sys *system) CPUIDs() []idset.ID {
    ids := make([]idset.ID, len(sys.cpus))
    idx := 0
    for id := range sys.cpus {
        ids[idx] = id
        idx++
    }
    sort.Slice(ids, func(i, j int) bool {
        return int(ids[i]) < int(ids[j])
    })
    return ids
}

// PackageCount returns the number of discovered CPU packages (sockets).
func (sys *system) PackageCount() int {
    return len(sys.packages)
}

// SocketCount returns the number of discovered CPU packages (sockets).
func (sys *system) SocketCount() int {
    return len(sys.packages)
}

// CPUCount returns the number of discovered CPUs/cores.
func (sys *system) CPUCount() int {
    return len(sys.cpus)
}

// NUMANodeCount returns the number of discovered NUMA nodes.
func (sys *system) NUMANodeCount() int {
    cnt := len(sys.nodes)
    if cnt < 1 {
        cnt = 1
    }
    return cnt
}

// ThreadCount returns the number of threads per core discovered.
func (sys *system) ThreadCount() int {
    return sys.threads
}

// CPUSet gets the ids of all CPUs present in the system as a CPUSet.
func (sys *system) CPUSet() cpuset.CPUSet {
    return CPUSetFromIDSet(idset.NewIDSet(sys.CPUIDs()...))
}

// Package gets the package with a given package id.
func (sys *system) Package(id idset.ID) CPUPackage {
    return sys.packages[id]
}

// Node gets the node with a given node id.
func (sys *system) Node(id idset.ID) Node {
    return sys.nodes[id]
}

// NodeDistance gets the distance between two NUMA nodes.
func (sys *system) NodeDistance(from, to idset.ID) int {
    return sys.nodes[from].DistanceFrom(to)
}

// CPU gets the CPU with a given CPU id.
func (sys *system) CPU(id idset.ID) CPU {
    return sys.cpus[id]
}

// Offlined gets the set of offlined CPUs.
func (sys *system) Offlined() cpuset.CPUSet {
    return CPUSetFromIDSet(sys.offline)
}

// Isolated gets the set of isolated CPUs.
func (sys *system) Isolated() cpuset.CPUSet {
    return CPUSetFromIDSet(sys.isolated)
}

// Discover CPUs present in the system.
func (sys *system) discoverCPUs() error {
    if sys.cpus != nil {
        return nil
    }
    sys.cpus = make(map[idset.ID]*cpu)
    _, err := readSysfsEntry(sys.path, filepath.Join(sysfsCPUPath, "isolated"), &sys.isolated, ",")
    if err != nil {
        sys.Error("failed to get set of isolated cpus: %v", err)
    }
    entries, _ := filepath.Glob(filepath.Join(sys.path, sysfsCPUPath, "cpu[0-9]*"))
    for _, entry := range entries {
        if err := sys.discoverCPU(entry); err != nil {
            return fmt.Errorf("failed to discover cpu for entry %s: %v", entry, err)
        }
    }
    return nil
}

// Discover details of the given CPU.
func (sys *system) discoverCPU(path string) error { cpu := &cpu{path: path, id: getEnumeratedID(path), online: true, sstClos: -1} cpu.isolated = sys.isolated.Has(cpu.id) if online, err := readSysfsEntry(path, "online", nil); err == nil { cpu.online = (online != "" && online[0] != '0') } if cpu.online { if _, err := readSysfsEntry(path, "topology/physical_package_id", &cpu.pkg); err != nil { return err } readSysfsEntry(path, "topology/die_id", &cpu.die) if _, err := readSysfsEntry(path, "topology/core_id", &cpu.core); err != nil { return err } if _, err := readSysfsEntry(path, "topology/thread_siblings_list", &cpu.threads, ","); err != nil { return err } } else { sys.offline.Add(cpu.id) } if _, err := readSysfsEntry(path, "cpufreq/base_frequency", &cpu.baseFreq); err != nil { cpu.baseFreq = 0 } if _, err := readSysfsEntry(path, "cpufreq/cpuinfo_min_freq", &cpu.freq.min); err != nil { cpu.freq.min = 0 } if _, err := readSysfsEntry(path, "cpufreq/cpuinfo_max_freq", &cpu.freq.max); err != nil { cpu.freq.max = 0 } if _, err := readSysfsEntry(path, "cpufreq/energy_performance_preference", &cpu.epp); err != nil { cpu.epp = EPPUnknown } if node, _ := filepath.Glob(filepath.Join(path, "node[0-9]*")); len(node) == 1 { cpu.node = getEnumeratedID(node[0]) } else { return fmt.Errorf("exactly one node per cpu allowed") } if sys.threads < 1 { sys.threads = 1 } if cpu.threads.Size() > sys.threads { sys.threads = cpu.threads.Size() } sys.cpus[cpu.id] = cpu return nil } // ID returns the id of this CPU. func (c *cpu) ID() idset.ID { return c.id } // PackageID returns package id of this CPU. func (c *cpu) PackageID() idset.ID { return c.pkg } // DieID returns the die id of this CPU. func (c *cpu) DieID() idset.ID { return c.die } // NodeID returns the node id of this CPU. func (c *cpu) NodeID() idset.ID { return c.node } // CoreID returns the core id of this CPU (lowest CPU id of all thread siblings). func (c *cpu) CoreID() idset.ID { return c.core } // ThreadCPUSet returns the CPUSet for all threads in this core. func (c *cpu) ThreadCPUSet() cpuset.CPUSet { return CPUSetFromIDSet(c.threads) } // BaseFrequency returns the base frequency setting for this CPU. func (c *cpu) BaseFrequency() uint64 { return c.baseFreq } // FrequencyRange returns the frequency range for this CPU. func (c *cpu) FrequencyRange() CPUFreq { return c.freq } // EPP returns the energy performance profile of this CPU. func (c *cpu) EPP() EPP { return c.epp } // Online returns if this CPU is online. func (c *cpu) Online() bool { return c.online } // Isolated returns if this CPU is isolated. func (c *cpu) Isolated() bool { return c.isolated } // SstClos returns the Speed Select Core Power CLOS number assigned to the CPU // -1 implies that no SST prioritization is in effect func (c *cpu) SstClos() int { return c.sstClos } // SetFrequencyLimits sets the frequency scaling limits for this CPU. 
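A sketch of the per-CPU accessors above, dumping one line of details per discovered CPU; topology fields may be zero for offline CPUs, since their sysfs topology is not read.

package main

import (
    "fmt"

    "github.com/intel/cri-resource-manager/pkg/sysfs"
)

func main() {
    sys, err := sysfs.DiscoverSystem()
    if err != nil {
        fmt.Println("discovery failed:", err)
        return
    }
    for _, id := range sys.CPUIDs() {
        cpu := sys.CPU(id)
        fmt.Printf("cpu #%d: pkg %d, node %d, core %d, base %d kHz, epp %q, sst-cp clos %d\n",
            cpu.ID(), cpu.PackageID(), cpu.NodeID(), cpu.CoreID(),
            cpu.BaseFrequency(), cpu.EPP().String(), cpu.SstClos())
    }
}
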
func (c *cpu) SetFrequencyLimits(min, max uint64) error {
    if c.freq.min == 0 {
        return nil
    }
    min /= 1000
    max /= 1000
    if min < c.freq.min && min != 0 {
        min = c.freq.min
    }
    if min > c.freq.max {
        min = c.freq.max
    }
    if max < c.freq.min && max != 0 {
        max = c.freq.min
    }
    if max > c.freq.max {
        max = c.freq.max
    }
    if _, err := writeSysfsEntry(c.path, "cpufreq/scaling_min_freq", min, nil); err != nil {
        return err
    }
    if _, err := writeSysfsEntry(c.path, "cpufreq/scaling_max_freq", max, nil); err != nil {
        return err
    }
    return nil
}

func readCPUsetFile(base, entry string) (cpuset.CPUSet, error) {
    path := filepath.Join(base, entry)
    blob, err := os.ReadFile(path)
    if err != nil {
        return cpuset.New(), sysfsError(path, "failed to read sysfs entry: %v", err)
    }
    return cpuset.Parse(strings.Trim(string(blob), "\n"))
}

// Discover NUMA nodes present in the system.
func (sys *system) discoverNodes() error {
    if sys.nodes != nil {
        return nil
    }
    sysNodesPath := filepath.Join(sys.path, sysfsNumaNodePath)
    sys.nodes = make(map[idset.ID]*node)
    entries, _ := filepath.Glob(filepath.Join(sysNodesPath, "node[0-9]*"))
    for _, entry := range entries {
        if err := sys.discoverNode(entry); err != nil {
            return fmt.Errorf("failed to discover node for entry %s: %v", entry, err)
        }
    }
    normalMemNodeIDs, err := readSysfsEntry(sysNodesPath, "has_normal_memory", nil)
    if err != nil {
        return fmt.Errorf("failed to discover nodes with normal memory: %v", err)
    }
    normalMemNodes, err := cpuset.Parse(normalMemNodeIDs)
    if err != nil {
        return fmt.Errorf("failed to parse nodes with normal memory (%q): %v", normalMemNodeIDs, err)
    }
    memoryNodeIDs, err := readSysfsEntry(sysNodesPath, "has_memory", nil)
    if err != nil {
        return fmt.Errorf("failed to discover nodes with memory: %v", err)
    }
    memoryNodes, err := cpuset.Parse(memoryNodeIDs)
    if err != nil {
        return fmt.Errorf("failed to parse nodes with memory (%q): %v", memoryNodeIDs, err)
    }
    cpuNodesSlice := []int{}
    for id, node := range sys.nodes {
        if node.cpus.Size() > 0 {
            cpuNodesSlice = append(cpuNodesSlice, int(id))
        }
        if normalMemNodes.Contains(int(id)) {
            node.normalMem = true
        }
    }
    cpuNodes := cpuset.New(cpuNodesSlice...)
    sys.Logger.Info("NUMA nodes with CPUs: %s", cpuNodes.String())
    sys.Logger.Info("NUMA nodes with (any) memory: %s", memoryNodes.String())
    sys.Logger.Info("NUMA nodes with normal memory: %s", normalMemNodes.String())
    dramNodes := memoryNodes.Intersection(cpuNodes)
    pmemOrHbmNodes := memoryNodes.Difference(dramNodes)
    dramNodeIds := IDSetFromCPUSet(dramNodes)
    pmemOrHbmNodeIds := IDSetFromCPUSet(pmemOrHbmNodes)
    infos := make(map[idset.ID]*MemInfo)
    dramAvg := uint64(0)
    if len(pmemOrHbmNodeIds) > 0 && len(dramNodeIds) > 0 {
        // There is special memory present in the system.
        //
        // FIXME assumption: if a node only has memory (and no CPUs), it's PMEM or HBM,
        // otherwise it's DRAM. Also, we figure out whether the memory is HBM or PMEM
        // based on the amount: if the amount of memory is smaller than the average
        // amount of DRAM per node, it's HBM, otherwise PMEM.
        dramTotal := uint64(0)
        for _, node := range sys.nodes {
            info, err := node.MemoryInfo()
            if err != nil {
                return fmt.Errorf("failed to get memory info for node %v: %s", node, err)
            }
            infos[node.id] = info
            if _, ok := dramNodeIds[node.id]; ok {
                dramTotal += info.MemTotal
            }
        }
        dramAvg = dramTotal / uint64(len(dramNodeIds))
        if dramAvg == 0 {
            // FIXME: should be no reason to bail out when memory types are properly determined.
return fmt.Errorf("no dram in the system, cannot determine special memory types") } } for _, node := range sys.nodes { if _, ok := pmemOrHbmNodeIds[node.id]; ok { mem, ok := infos[node.id] if !ok { return fmt.Errorf("not able to determine system special memory types") } if mem.MemTotal < dramAvg { sys.Logger.Info("node %d has HBM memory", node.id) node.memoryType = MemoryTypeHBM } else { sys.Logger.Info("node %d has PMEM memory", node.id) node.memoryType = MemoryTypePMEM } } else if _, ok := dramNodeIds[node.id]; ok { sys.Logger.Info("node %d has DRAM memory", node.id) node.memoryType = MemoryTypeDRAM } else { return fmt.Errorf("Unknown memory type for node %v (pmem nodes: %s, dram nodes: %s)", node, pmemOrHbmNodes, dramNodes) } } return nil } // Discover details of the given NUMA node. func (sys *system) discoverNode(path string) error { node := &node{path: path, id: getEnumeratedID(path)} if _, err := readSysfsEntry(path, "cpulist", &node.cpus, ","); err != nil { return err } if _, err := readSysfsEntry(path, "distance", &node.distance); err != nil { return err } sys.nodes[node.id] = node return nil } // ID returns id of this node. func (n *node) ID() idset.ID { return n.id } // PackageID returns the package id for this node. func (n *node) PackageID() idset.ID { return n.pkg } // DieID returns the die id for this node. func (n *node) DieID() idset.ID { return n.die } // CPUSet returns the CPUSet for all cores/threads in this node. func (n *node) CPUSet() cpuset.CPUSet { return CPUSetFromIDSet(n.cpus) } // Distance returns the distance vector for this node. func (n *node) Distance() []int { return n.distance } // DistanceFrom returns the distance of this and a given node. func (n *node) DistanceFrom(id idset.ID) int { if int(id) < len(n.distance) { return n.distance[int(id)] } return -1 } // MemoryInfo memory info for the node (partial content from the meminfo sysfs entry). func (n *node) MemoryInfo() (*MemInfo, error) { meminfo := filepath.Join(n.path, "meminfo") buf := &MemInfo{} err := ParseFileEntries(meminfo, map[string]interface{}{ "MemTotal:": &buf.MemTotal, "MemFree:": &buf.MemFree, }, func(line string) (string, string, error) { fields := strings.Fields(strings.TrimSpace(line)) if len(fields) < 4 { return "", "", sysfsError(meminfo, "failed to parse entry: '%s'", line) } key := fields[2] val := fields[3] if len(fields) == 5 { val += " " + fields[4] } return key, val, nil }, ) if err != nil { return nil, err } // // On some HW and kernel combinations we've seen more free than total // memory being reported. This causes exorbitant usage of memory being // reported which later can cause failures in policies which trust and // rely on this information. // // Give here a clear(er) error about that. This should also prevent us // immediately from starting up. // if buf.MemFree > buf.MemTotal { return nil, sysfsError(meminfo, "System reports more free than total memory. "+ "This can be caused by a kernel bug. Please update your kernel.") } buf.MemUsed = buf.MemTotal - buf.MemFree return buf, nil } // GetMemoryType returns the memory type for this node. func (n *node) GetMemoryType() MemoryType { return n.memoryType } // HasNormalMemory returns true if the node has memory that belongs to a normal zone. func (n *node) HasNormalMemory() bool { return n.normalMem } // Discover physical packages (CPU sockets) present in the system. 
func (sys *system) discoverPackages() error { if sys.packages != nil { return nil } sys.packages = make(map[idset.ID]*cpuPackage) for _, cpu := range sys.cpus { pkg, found := sys.packages[cpu.pkg] if !found { pkg = &cpuPackage{ id: cpu.pkg, cpus: idset.NewIDSet(), nodes: idset.NewIDSet(), dies: idset.NewIDSet(), dieCPUs: make(map[idset.ID]idset.IDSet), dieNodes: make(map[idset.ID]idset.IDSet), } sys.packages[cpu.pkg] = pkg } pkg.cpus.Add(cpu.id) pkg.nodes.Add(cpu.node) pkg.dies.Add(cpu.die) if dieCPUs, ok := pkg.dieCPUs[cpu.die]; !ok { pkg.dieCPUs[cpu.die] = idset.NewIDSet(cpu.id) } else { dieCPUs.Add(cpu.id) } if dieNodes, ok := pkg.dieNodes[cpu.die]; !ok { pkg.dieNodes[cpu.die] = idset.NewIDSet(cpu.node) } else { dieNodes.Add(cpu.node) } } return nil } func (sys *system) discoverSst() error { if !sst.SstSupported() { sys.Info("Speed Select Technology (SST) support not detected") return nil } for _, pkg := range sys.packages { sstInfo, err := sst.GetPackageInfo(pkg.id) if err != nil { return fmt.Errorf("failed to get SST info for package %d: %v", pkg.id, err) } sys.DebugBlock("", "Speed Select Technology info detected for package %d:\n%s", pkg.id, utils.DumpJSON(sstInfo)) if sstInfo[pkg.id].CPEnabled { ids := pkg.cpus.SortedMembers() for _, id := range ids { clos, err := sst.GetCPUClosID(id) if err != nil { return fmt.Errorf("failed to get SST-CP clos id for cpu %d: %v", id, err) } sys.cpus[id].sstClos = clos } } pkg.sstInfo = sstInfo[pkg.id] } return nil } // ID returns the id of this package. func (p *cpuPackage) ID() idset.ID { return p.id } // CPUSet returns the CPUSet for all cores/threads in this package. func (p *cpuPackage) CPUSet() cpuset.CPUSet { return CPUSetFromIDSet(p.cpus) } // DieIDs returns the die ids for this package. func (p *cpuPackage) DieIDs() []idset.ID { return p.dies.SortedMembers() } // NodeIDs returns the NUMA node ids for this package. func (p *cpuPackage) NodeIDs() []idset.ID { return p.nodes.SortedMembers() } // DieNodeIDs returns the set of NUMA nodes in the given die of this package. func (p *cpuPackage) DieNodeIDs(id idset.ID) []idset.ID { if dieNodes, ok := p.dieNodes[id]; ok { return dieNodes.SortedMembers() } return []idset.ID{} } // DieCPUSet returns the set of CPUs in the given die of this package. func (p *cpuPackage) DieCPUSet(id idset.ID) cpuset.CPUSet { if dieCPUs, ok := p.dieCPUs[id]; ok { return CPUSetFromIDSet(dieCPUs) } return cpuset.New() } func (p *cpuPackage) SstInfo() *sst.SstPackageInfo { return p.sstInfo } // eppStrings initialized this way to better catch changes in the enum var eppStrings = func() [EPPUnknown]string { var e [EPPUnknown]string e[EPPPerformance] = "performance" e[EPPBalancePerformance] = "balance_performance" e[EPPBalancePower] = "balance_power" e[EPPPower] = "power" return e }() var eppValues = func() map[string]EPP { m := make(map[string]EPP, len(eppStrings)) for i, v := range eppStrings { m[v] = EPP(i) } return m }() // String returns EPP value as string func (e EPP) String() string { if int(e) < len(eppStrings) { return eppStrings[e] } return "" } // EPPFromString converts string to EPP value func EPPFromString(s string) EPP { if v, ok := eppValues[s]; ok { return v } return EPPUnknown } ================================================ FILE: pkg/sysfs/utils.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sysfs import ( "fmt" "os" "path/filepath" "strconv" "strings" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) // Get the trailing enumeration part of a name. func getEnumeratedID(name string) idset.ID { id := 0 base := 1 for idx := len(name) - 1; idx > 0; idx-- { d := name[idx] if '0' <= d && d <= '9' { id += base * (int(d) - '0') base *= 10 } else { if base > 1 { return idset.ID(id) } return idset.ID(-1) } } return idset.ID(-1) } // Read content of a sysfs entry and convert it according to the type of a given pointer. func readSysfsEntry(base, entry string, ptr interface{}, args ...interface{}) (string, error) { var buf string path := filepath.Join(base, entry) blob, err := os.ReadFile(path) if err != nil { return "", sysfsError(path, "failed to read sysfs entry: %v", err) } buf = strings.Trim(string(blob), "\n") if ptr == interface{}(nil) { return buf, nil } switch ptr.(type) { case *string, *int, *uint, *int8, *uint8, *int16, *uint16, *int32, *uint32, *int64, *uint64: err := parseValue(buf, ptr) if err != nil { return "", sysfsError(path, "%v", err) } return buf, nil case *idset.IDSet, *[]int, *[]uint, *[]int8, *[]uint8, *[]int16, *[]uint16, *[]int32, *[]uint32, *[]int64, *[]uint64: sep, err := getSeparator(" ", args) if err != nil { return "", sysfsError(path, "%v", err) } err = parseValueList(buf, sep, ptr) if err != nil { return "", sysfsError(path, "%v", err) } return buf, nil case *EPP: *ptr.(*EPP) = EPPFromString(buf) return buf, nil } return "", sysfsError(path, "unsupported sysfs entry type %T", ptr) } // Write a value to a sysfs entry. An optional item separator can be specified for slice values. func writeSysfsEntry(base, entry string, val, oldp interface{}, args ...interface{}) (string, error) { var buf, old string var err error if oldp != nil { if old, err = readSysfsEntry(base, entry, oldp, args...); err != nil { return "", err } } path := filepath.Join(base, entry) switch val.(type) { case string, int, uint, int8, uint8, int16, uint16, int32, uint32, int64, uint64: buf, err = formatValue(val) if err != nil { return "", sysfsError(path, "%v", err) } case idset.IDSet, []int, []uint, []int8, []uint8, []int16, []uint16, []int32, []uint32, []int64, []uint64: sep, err := getSeparator(" ", args) if err != nil { return "", sysfsError(path, "%v", err) } buf, err = formatValueList(sep, val) if err != nil { return "", sysfsError(path, "%v", err) } default: return "", sysfsError(path, "unsupported sysfs entry type %T", val) } f, err := os.OpenFile(path, os.O_WRONLY, 0) if err != nil { return "", sysfsError(path, "cannot open: %v", err) } defer f.Close() if _, err = f.Write([]byte(buf + "\n")); err != nil { return "", sysfsError(path, "cannot write: %v", err) } return old, nil } // Determine list separator string, given an optional separator variadic argument. 
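// // For example, a caller such as readSysfsEntry(path, "topology/thread_siblings_list", &cpu.threads, ",") above ends up here with args = [","], overriding the default " " separator.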
func getSeparator(defaultVal string, args []interface{}) (string, error) { switch len(args) { case 0: return defaultVal, nil case 1: if sep, ok := args[0].(string); ok { return sep, nil } } return "", fmt.Errorf("invalid separator option (%v), a single string expected", args) } // Parse a value from a string. func parseValue(str string, value interface{}) error { switch value.(type) { case *string: *value.(*string) = str case *int, *int8, *int16, *int32, *int64: v, err := strconv.ParseInt(str, 0, 0) if err != nil { return fmt.Errorf("invalid entry '%s': %v", str, err) } switch value.(type) { case *int: *value.(*int) = int(v) case *int8: *value.(*int8) = int8(v) case *int16: *value.(*int16) = int16(v) case *int32: *value.(*int32) = int32(v) case *int64: *value.(*int64) = v } case *uint, *uint8, *uint16, *uint32, *uint64: v, err := strconv.ParseUint(str, 0, 0) if err != nil { return fmt.Errorf("invalid entry '%s': %v", str, err) } switch value.(type) { case *uint: *value.(*uint) = uint(v) case *uint8: *value.(*uint8) = uint8(v) case *uint16: *value.(*uint16) = uint16(v) case *uint32: *value.(*uint32) = uint32(v) case *uint64: *value.(*uint64) = v } } return nil } // Parse a list of values from a string into a slice. func parseValueList(str, sep string, valuep interface{}) error { var value interface{} switch valuep.(type) { case *idset.IDSet: value = idset.NewIDSet() case *[]int: value = []int{} case *[]uint: value = []uint{} case *[]int8: value = []int8{} case *[]uint8: value = []uint8{} case *[]int16: value = []int16{} case *[]uint16: value = []uint16{} case *[]int32: value = []int32{} case *[]uint32: value = []uint32{} case *[]int64: value = []int64{} case *[]uint64: value = []uint64{} default: return fmt.Errorf("invalid slice value type: %T", valuep) } for _, s := range strings.Split(str, sep) { if s == "" { break } switch value.(type) { case idset.IDSet: if rng := strings.Split(s, "-"); len(rng) == 1 { id, err := strconv.Atoi(s) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } value.(idset.IDSet).Add(idset.ID(id)) } else { beg, err := strconv.Atoi(rng[0]) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } end, err := strconv.Atoi(rng[1]) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } for id := beg; id <= end; id++ { value.(idset.IDSet).Add(idset.ID(id)) } } case []int, []int8, []int16, []int32, []int64: v, err := strconv.ParseInt(s, 0, 0) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } switch value.(type) { case []int: value = append(value.([]int), int(v)) case []int8: value = append(value.([]int8), int8(v)) case []int16: value = append(value.([]int16), int16(v)) case []int32: value = append(value.([]int32), int32(v)) case []int64: value = append(value.([]int64), v) } case []uint, []uint8, []uint16, []uint32, []uint64: v, err := strconv.ParseUint(s, 0, 0) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } switch value.(type) { case []uint: value = append(value.([]uint), uint(v)) case []uint8: value = append(value.([]uint8), uint8(v)) case []uint16: value = append(value.([]uint16), uint16(v)) case []uint32: value = append(value.([]uint32), uint32(v)) case []uint64: value = append(value.([]uint64), v) } } } switch valuep.(type) { case *idset.IDSet: *valuep.(*idset.IDSet) = value.(idset.IDSet) case *[]int: *valuep.(*[]int) = value.([]int) case *[]uint: *valuep.(*[]uint) = value.([]uint) case *[]int8: *valuep.(*[]int8) = value.([]int8) case *[]uint8: *valuep.(*[]uint8) = value.([]uint8) case *[]int16:
*valuep.(*[]int16) = value.([]int16) case *[]uint16: *valuep.(*[]uint16) = value.([]uint16) case *[]int32: *valuep.(*[]int32) = value.([]int32) case *[]uint32: *valuep.(*[]uint32) = value.([]uint32) case *[]int64: *valuep.(*[]int64) = value.([]int64) case *[]uint64: *valuep.(*[]uint64) = value.([]uint64) } return nil } // Format a value into a string. func formatValue(value interface{}) (string, error) { switch value.(type) { case string: return value.(string), nil case int, uint, int8, uint8, int16, uint16, int32, uint32, int64, uint64: return fmt.Sprintf("%d", value), nil default: return "", fmt.Errorf("invalid value type %T", value) } } // Format a list of values from a slice into a string. func formatValueList(sep string, value interface{}) (string, error) { switch value.(type) { case idset.IDSet: return value.(idset.IDSet).StringWithSeparator(sep), nil case []int, []uint, []int8, []uint8, []int16, []uint16, []int32, []uint32, []int64, []uint64: // fmt renders numeric slices as "[1 2 3]"; strip the brackets and join the elements with the requested separator. str := strings.Trim(fmt.Sprintf("%d", value), "[]") return strings.ReplaceAll(str, " ", sep), nil } return "", fmt.Errorf("invalid value type %T", value) } // IDSetFromCPUSet returns an id set corresponding to a cpuset.CPUSet. func IDSetFromCPUSet(cset cpuset.CPUSet) idset.IDSet { return idset.NewIDSetFromIntSlice(cset.List()...) } // CPUSetFromIDSet returns a cpuset.CPUSet corresponding to an id set. func CPUSetFromIDSet(s idset.IDSet) cpuset.CPUSet { cpus := []int{} for id := range s { cpus = append(cpus, int(id)) } return cpuset.New(cpus...) } ================================================ FILE: pkg/testutils/verify.go ================================================ package testutils import ( "reflect" "strings" "testing" ) // VerifyDeepEqual checks that two values (including structures) are equal, or else it fails the test. func VerifyDeepEqual(t *testing.T, valueName string, expectedValue interface{}, seenValue interface{}) bool { if reflect.DeepEqual(expectedValue, seenValue) { return true } t.Errorf("expected %s value %+v, got %+v", valueName, expectedValue, seenValue) return false } // VerifyError checks that a (multi)error has expected properties, or else it fails the test.
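// // A small usage sketch (not from the repository): errors.Join from the standard library returns an error whose Unwrap() []error method exposes the individual errors, which is the shape this helper expects when expectedCount > 0: // // err := errors.Join(errors.New("no such pool"), errors.New("no such class")) // VerifyError(t, err, 2, []string{"pool", "class"})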
func VerifyError(t *testing.T, err error, expectedCount int, expectedSubstrings []string) bool { if expectedCount > 0 { if err == nil { t.Errorf("error expected, got nil") return false } if merr, ok := err.(interface{ Unwrap() []error }); !ok { t.Errorf("expected %d errors, but got %#v instead of multierror", expectedCount, err) return false } else if errs := merr.Unwrap(); len(errs) != expectedCount { t.Errorf("expected %d errors, but got %d: %v", expectedCount, len(errs), merr) return false } } else if expectedCount == 0 { if err != nil { t.Errorf("expected 0 errors, but got %v", err) return false } } for _, substring := range expectedSubstrings { if err == nil { t.Errorf("expected error with substring %#v, got nil", substring) return false } if !strings.Contains(err.Error(), substring) { t.Errorf("expected error with substring %#v, got \"%v\"", substring, err) } } return true } ================================================ FILE: pkg/topology/go.mod ================================================ module github.com/intel/cri-resource-manager/pkg/topology go 1.22.0 require ( github.com/pkg/errors v0.9.1 golang.org/x/sys v0.18.0 ) ================================================ FILE: pkg/topology/test-cleanup.sh ================================================ rm -fr testdata ================================================ FILE: pkg/topology/test-setup.sh ================================================ tar -xvzf test-data.tar.gz ================================================ FILE: pkg/topology/topology.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topology import ( "fmt" "os" "path/filepath" "strings" "syscall" "github.com/pkg/errors" "golang.org/x/sys/unix" ) // to mock in tests var ( sysRoot = "" ) const ( // ProviderKubelet is a constant to distinguish that topology hint comes // from parameters passed to CRI create/update requests from Kubelet ProviderKubelet = "kubelet" ) // Hint represents various hints that can be detected from sysfs for the device type Hint struct { Provider string CPUs string NUMAs string Sockets string } // Hints represents set of hints collected from multiple providers type Hints map[string]Hint // SetSysRoot sets the sysfs root directory to use.
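// // Typically only used by tests to point discovery at a fake sysfs tree, as topology_test.go does: // // SetSysRoot("./testdata") // defer SetSysRoot("")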
func SetSysRoot(root string) { sysRoot = root } func getDevicesFromVirtual(realDevPath string) (devs []string, err error) { if !filepath.HasPrefix(realDevPath, "/sys/devices/virtual") { return nil, fmt.Errorf("%s is not a virtual device", realDevPath) } relPath, _ := filepath.Rel("/sys/devices/virtual", realDevPath) dir, file := filepath.Split(relPath) switch dir { case "vfio/": iommuGroup := filepath.Join(sysRoot, "/sys/kernel/iommu_groups", file, "devices") files, err := os.ReadDir(iommuGroup) if err != nil { return nil, errors.Wrapf(err, "failed to read IOMMU group %s", iommuGroup) } for _, file := range files { realDev, err := filepath.EvalSymlinks(filepath.Join(iommuGroup, file.Name())) if err != nil { return nil, errors.Wrapf(err, "failed to get real path for %s", file.Name()) } devs = append(devs, realDev) } return devs, nil default: return nil, nil } } func getTopologyHint(sysFSPath string) (*Hint, error) { hint := Hint{Provider: sysFSPath} fileMap := map[string]*string{ "local_cpulist": &hint.CPUs, "numa_node": &hint.NUMAs, } if err := readFilesInDirectory(fileMap, sysFSPath); err != nil { return nil, err } // Workarounds for broken information provided by the kernel if hint.NUMAs == "-1" { // non-NUMA aware device or system, ignore it hint.NUMAs = "" } if hint.NUMAs != "" && hint.CPUs == "" { // Broken topology hint: BIOS reports the socket id as the NUMA node. // First, try to get hints from the parent device or bus. parentHints, er := NewTopologyHints(filepath.Dir(sysFSPath)) if er == nil { cpulist := map[string]bool{} numalist := map[string]bool{} for _, h := range parentHints { if h.CPUs != "" { cpulist[h.CPUs] = true } if h.NUMAs != "" { numalist[h.NUMAs] = true } } if cpus := strings.Join(mapKeys(cpulist), ","); cpus != "" { hint.CPUs = cpus } if numas := strings.Join(mapKeys(numalist), ","); numas != "" { hint.NUMAs = numas } } // If we still don't have CPU hints after checking the parents, use the numa hint as sockets. if hint.CPUs == "" && hint.NUMAs != "" { hint.Sockets = hint.NUMAs hint.NUMAs = "" } } return &hint, nil } // NewTopologyHints returns an array of hints for the device and its slaves (e.g. RAID). func NewTopologyHints(devPath string) (hints Hints, err error) { hints = make(Hints) realDevPath, err := filepath.EvalSymlinks(devPath) if err != nil { return nil, errors.Wrapf(err, "failed to get realpath for %s", devPath) } for p := realDevPath; strings.HasPrefix(p, sysRoot+"/sys/devices/"); p = filepath.Dir(p) { hint, err := getTopologyHint(p) if err != nil { return nil, err } if hint.CPUs != "" || hint.NUMAs != "" || hint.Sockets != "" { hints[hint.Provider] = *hint break } } fromVirtual, _ := getDevicesFromVirtual(realDevPath) slaves, _ := filepath.Glob(filepath.Join(realDevPath, "slaves/*")) for _, device := range append(slaves, fromVirtual...) { deviceHints, er := NewTopologyHints(device) if er != nil { return nil, er } hints = MergeTopologyHints(hints, deviceHints) } return } // MergeTopologyHints combines org and hints. func MergeTopologyHints(org, hints Hints) (res Hints) { if org != nil { res = org } else { res = make(Hints) } for k, v := range hints { if _, ok := res[k]; ok { continue } res[k] = v } return } // String returns the hints as a string.
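// // For example, a hint with CPUs "0-7" and NUMAs "1" renders as "CPUs:0-7, NUMAs:1".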
func (h *Hint) String() string { cpus, nodes, sockets, sep := "", "", "", "" if h.CPUs != "" { cpus = "CPUs:" + h.CPUs sep = ", " } if h.NUMAs != "" { nodes = sep + "NUMAs:" + h.NUMAs sep = ", " } if h.Sockets != "" { sockets = sep + "sockets:" + h.Sockets } return cpus + nodes + sockets } // FindSysFsDevice returns the physical device that the given argument is linked to. // For device nodes it returns the path of the device itself. For regular files or directories // it returns the physical device where the inode resides (the storage device). // If the resulting device is a virtual one (e.g. tmpfs), an error is returned. // For a non-existing path, no error is returned and the result is empty. func FindSysFsDevice(dev string) (string, error) { fi, err := os.Stat(dev) if err != nil { if os.IsNotExist(err) { return "", nil } return "", errors.Wrapf(err, "unable to get stat for %s", dev) } devType := "block" rdev := fi.Sys().(*syscall.Stat_t).Dev if mode := fi.Mode(); mode&os.ModeDevice != 0 { rdev = fi.Sys().(*syscall.Stat_t).Rdev if mode&os.ModeCharDevice != 0 { devType = "char" } } major := unix.Major(rdev) minor := unix.Minor(rdev) if major == 0 { return "", errors.Errorf("%s is a virtual device node", dev) } devPath := fmt.Sprintf("/sys/dev/%s/%d:%d", devType, major, minor) realDevPath, err := filepath.EvalSymlinks(devPath) if err != nil { return "", errors.Wrapf(err, "failed to get realpath for %s", devPath) } return realDevPath, nil } // readFilesInDirectory is a small helper to fill a struct with content from sysfs entries func readFilesInDirectory(fileMap map[string]*string, dir string) error { for k, v := range fileMap { b, err := os.ReadFile(filepath.Join(dir, k)) if err != nil { if os.IsNotExist(err) { continue } return errors.Wrapf(err, "%s: unable to read file %q", dir, k) } *v = strings.TrimSpace(string(b)) } return nil } // mapKeys is a small helper that returns the slice of keys of a given map func mapKeys(m map[string]bool) []string { ret := make([]string, len(m)) i := 0 for k := range m { ret[i] = k i++ } return ret } ================================================ FILE: pkg/topology/topology_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
package topology import ( "os" "path/filepath" "reflect" "sort" "testing" ) func setupTestEnv(t *testing.T) func() { pwd, err := os.Getwd() if err != nil { t.Fatal("unable to get current directory") } if path, err := filepath.EvalSymlinks(pwd); err == nil { pwd = path } SetSysRoot(pwd + "/testdata") teardown := func() { SetSysRoot("") } return teardown } func TestMapKeys(t *testing.T) { cases := []struct { name string input map[string]bool output []string }{ { name: "empty", input: map[string]bool{}, output: []string{}, }, { name: "one", input: map[string]bool{"a": false}, output: []string{"a"}, }, { name: "multiple", input: map[string]bool{"a": false, "b": true, "c": false}, output: []string{"a", "b", "c"}, }, } for _, tc := range cases { test := tc t.Run(test.name, func(t *testing.T) { t.Parallel() output := mapKeys(test.input) sort.Strings(output) if !reflect.DeepEqual(output, test.output) { t.Fatalf("expected output: %+v got: %+v", test.output, output) } }) } } func TestFindSysFsDevice(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } teardown := setupTestEnv(t) defer teardown() cases := []struct { name string input string output string expectedErr bool }{ { name: "empty", input: "", output: "", expectedErr: false, }, { name: "null", input: "/dev/null", output: "/sys/devices/virtual/mem/null", expectedErr: false, }, { name: "proc", input: "/proc/self", output: "", expectedErr: true, }, } for _, tc := range cases { test := tc t.Run(test.name, func(t *testing.T) { t.Parallel() output, err := FindSysFsDevice(test.input) switch { case err != nil && !test.expectedErr: t.Fatalf("unexpected error returned: %+v", err) case err == nil && test.expectedErr: t.Fatalf("unexpected success: %+v", output) case output != test.output: t.Fatalf("expected: %q got: %q", test.output, output) } }) } } func TestReadFilesInDirectory(t *testing.T) { var file, empty string fname := "test-a" content := []byte(" something\n") expectedContent := "something" fileMap := map[string]*string{ fname: &file, "non_existing": &empty, } dir, err := os.MkdirTemp("", "readFilesInDirectory") if err != nil { t.Fatalf("unable to create test directory: %+v", err) } defer os.RemoveAll(dir) os.WriteFile(filepath.Join(dir, fname), content, 0644) if err = readFilesInDirectory(fileMap, dir); err != nil { t.Fatalf("unexpected failure: %v", err) } if empty != "" { t.Fatalf("unexpected content: %q", empty) } if file != expectedContent { t.Fatalf("unexpected content: %q expected: %q", file, expectedContent) } } func TestGetDevicesFromVirtual(t *testing.T) { teardown := setupTestEnv(t) defer teardown() cases := []struct { name string input string output []string expectedErr bool }{ { name: "vfio", input: "/sys/devices/virtual/vfio/42", output: []string{sysRoot + "/sys/devices/pci0000:00/0000:00:02.0"}, expectedErr: false, }, { name: "misc", input: "/sys/devices/virtual/misc/vfio", output: nil, expectedErr: false, }, { name: "missing-iommu-group", input: "/sys/devices/virtual/vfio/84", output: nil, expectedErr: true, }, { name: "non-virtual", input: "/sys/devices/pci0000:00/0000:00:02.0", output: nil, expectedErr: true, }, } for _, tc := range cases { test := tc t.Run(test.name, func(t *testing.T) { output, err := getDevicesFromVirtual(test.input) switch { case err != nil && !test.expectedErr: t.Fatalf("unexpected error returned: %+v", err) case err == nil && test.expectedErr: t.Fatalf("unexpected success: %+v", output) case len(output) != len(test.output): t.Fatalf("expected: %q got: %q", len(test.output), 
len(output)) } for i, p := range test.output { if test.output[i] != p { t.Fatalf("expected: %q got: %q", test.output[i], p) } } }) } } func TestMergeTopologyHints(t *testing.T) { cases := []struct { name string inputA Hints inputB Hints expectedOutput Hints expectedErr bool }{ { name: "empty", inputA: nil, inputB: nil, expectedOutput: Hints{}, }, { name: "one,nil", inputA: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, inputB: nil, expectedOutput: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, }, { name: "nil, one", inputA: nil, inputB: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, expectedOutput: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, }, { name: "duplicate", inputA: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, inputB: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, expectedOutput: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, }, { name: "two", inputA: Hints{"test1": Hint{Provider: "test1", CPUs: "0"}}, inputB: Hints{"test2": Hint{Provider: "test2", CPUs: "1"}}, expectedOutput: Hints{ "test1": Hint{Provider: "test1", CPUs: "0"}, "test2": Hint{Provider: "test2", CPUs: "1"}, }, }, } for _, tc := range cases { test := tc t.Run(test.name, func(t *testing.T) { t.Parallel() output := MergeTopologyHints(test.inputA, test.inputB) if !reflect.DeepEqual(output, test.expectedOutput) { t.Fatalf("expected output: %+v got: %+v", test.expectedOutput, output) } }) } } func TestNewTopologyHints(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } teardown := setupTestEnv(t) defer teardown() cases := []struct { name string input string output Hints expectedErr bool }{ { name: "empty", input: "non-existing", output: nil, expectedErr: true, }, { name: "pci card1", input: sysRoot + "/sys/devices/pci0000:00/0000:00:02.0/drm/card1", output: Hints{ sysRoot + "/sys/devices/pci0000:00/0000:00:02.0": Hint{ Provider: sysRoot + "/sys/devices/pci0000:00/0000:00:02.0", CPUs: "0-7", NUMAs: "", Sockets: ""}, }, expectedErr: false, }, } for _, test := range cases { t.Run(test.name, func(t *testing.T) { output, err := NewTopologyHints(test.input) switch { case err != nil && !test.expectedErr: t.Fatalf("unexpected error returned: %+v", err) case err == nil && test.expectedErr: t.Fatalf("unexpected success: %+v", output) case !reflect.DeepEqual(output, test.output): t.Fatalf("expected: %q got: %q", test.output, output) } }) } } ================================================ FILE: pkg/utils/cpuset/cpuset.go ================================================ // Copyright The NRI Plugins Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cpuset import ( "fmt" "strconv" "strings" "k8s.io/utils/cpuset" ) // CPUSet is an alias for k8s.io/utils/cpuset.CPUSet. type CPUSet = cpuset.CPUSet var ( // New is an alias for cpuset.New. New = cpuset.New // Parse is an alias for cpuset.Parse. Parse = cpuset.Parse ) // MustParse panics if parsing the given cpuset string fails. 
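// // Meant for inputs that are known to be valid ahead of time, e.g. constants in tests or configuration defaults: // // reserved := MustParse("0-1")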
func MustParse(s string) cpuset.CPUSet { cset, err := cpuset.Parse(s) if err != nil { panic(fmt.Errorf("failed to parse CPUSet %s: %w", s, err)) } return cset } // ShortCPUSet prints the cpuset as a string, trying to further shorten compared to .String(). func ShortCPUSet(cset cpuset.CPUSet) string { str, sep := "", "" beg, end, step := -1, -1, -1 for _, cpu := range strings.Split(cset.String(), ",") { if strings.Contains(cpu, "-") { // Flush any pending run before passing a native range through as-is. if beg >= 0 { str += sep + mkRange(beg, end, step) sep = "," beg, end, step = -1, -1, -1 } str += sep + cpu sep = "," continue } i, err := strconv.ParseInt(cpu, 10, 0) if err != nil { return cset.String() } id := int(i) if beg < 0 { beg, end = id, id continue } if step < 0 { end = id step = end - beg continue } if id-end == step { end = id continue } str += sep + mkRange(beg, end, step) sep = "," beg, end = id, id step = -1 } if beg >= 0 { str += sep + mkRange(beg, end, step) } return str } func mkRange(beg, end, step int) string { if beg < 0 { return "" } if beg == end { return strconv.FormatInt(int64(beg), 10) } b, e := strconv.FormatInt(int64(beg), 10), strconv.FormatInt(int64(end), 10) if step == 1 { return b + "-" + e } if beg+step == end { return b + "," + e } s := strconv.FormatInt(int64(step), 10) return b + "-" + e + ":" + s } ================================================ FILE: pkg/utils/cpuset/cpuset_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cpuset import ( "testing" ) func TestShortCPUSet(t *testing.T) { tcases := []struct { source string native string short string }{ {source: "", native: "", short: ""}, {source: "1", native: "1", short: "1"}, {source: "1,2", native: "1-2", short: "1-2"}, {source: "1,2,3,4,5,6,7", native: "1-7", short: "1-7"}, {source: "1,3,5,7,9,11", native: "1,3,5,7,9,11", short: "1-11:2"}, {source: "1,3,5,7,8,10,12,14,16", native: "1,3,5,7-8,10,12,14,16", short: "1-5:2,7-8,10-16:2"}, { source: "0,2,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110", native: "0,2,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110", short: "0,2,8-58:2,64-110:2", }, } for _, tc := range tcases { cset := MustParse(tc.source) native := cset.String() if native != tc.native { t.Errorf("incorrect native CPUSet for %q, expected %q, got %q", tc.source, tc.native, native) } short := ShortCPUSet(cset) if short != tc.short { t.Errorf("incorrect shortened CPUSet for %q, expected %q, got %q", tc.source, tc.short, short) } } } ================================================ FILE: pkg/utils/json.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package utils import ( "fmt" "sigs.k8s.io/yaml" ) // DumpJSON dumps a json-compatible struct in human-readable form func DumpJSON(r interface{}) string { out, err := yaml.Marshal(r) if err != nil { return fmt.Sprintf("!!!!!\nUnable to stringify %T: %v\n!!!!!", r, err) } return string(out) } ================================================ FILE: pkg/utils/net.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils import ( "errors" "fmt" "net" "os" "syscall" "time" "google.golang.org/grpc" ) // WaitForServer waits for a gRPC server to start accepting connections on a socket. func WaitForServer(socket string, timeout time.Duration, opts ...interface{}) error { var errChecker []func(error) bool var dialOpts []grpc.DialOption var connp **grpc.ClientConn for _, o := range opts { switch o.(type) { case func(error) bool: errChecker = append(errChecker, o.(func(error) bool)) case grpc.DialOption: dialOpts = append(dialOpts, o.(grpc.DialOption)) case []grpc.DialOption: dialOpts = append(dialOpts, o.([]grpc.DialOption)...) case **grpc.ClientConn: if connp != nil { return fmt.Errorf("WaitForServer: multiple net.Conn pointer options given") } connp = o.(**grpc.ClientConn) default: return fmt.Errorf("WaitForServer: invalid option of type %T", o) } } if len(errChecker) < 1 { errChecker = []func(error) bool{isFatalDialError} } if len(dialOpts) == 0 { dialOpts = []grpc.DialOption{ grpc.WithInsecure(), grpc.WithBlock(), grpc.FailOnNonTempDialError(true), grpc.WithTimeout(timeout), grpc.WithDialer(func(socket string, timeout time.Duration) (net.Conn, error) { conn, err := net.Dial("unix", socket) return conn, err }), } } start := time.Now() for { conn, err := grpc.Dial(socket, dialOpts...) if err == nil { if connp != nil { *connp = conn } else { conn.Close() } return nil } for _, f := range errChecker { if f(err) { return err } } switch { case timeout >= 0 && start.Add(timeout).Before(time.Now()): return err case timeout < 0 || timeout > time.Second: time.Sleep(time.Second) default: time.Sleep(timeout / 2) } } } // IsListeningSocket returns true if connections are accepted on the socket. func IsListeningSocket(socket string) (bool, error) { conn, err := net.Dial("unix", socket) if err == nil { conn.Close() return true, nil } if errors.Is(err, syscall.ECONNREFUSED) || os.IsNotExist(err) { return false, nil } return false, err } // Check if a socket connection error looks fatal. // // Notes: // Hmm... I wonder if it is really so difficult or I am just doing // it wrong ? 
We would like to find out if a connection attempt to // a unix-domain socket fails with a fatal error, in which case we // don't want to stick around retrying it later. // // We treat errors which the originating layer considers a timeout // or a temporary error as non-fatal. Otherwise, we single out // a few special errors: // - EPERM: fatal error // - EACCES: fatal error // - ENOENT: non-fatal, server might still come around // - ECONNREFUSED: fatal, the socket exists but nothing is accepting connections on it // type temporary interface { Temporary() bool } type timeout interface { Timeout() bool } type origin interface { Origin() error } func isFatalDialError(err error) bool { for { if e, ok := err.(temporary); ok { if e.Temporary() { return false } } if e, ok := err.(timeout); ok { if e.Timeout() { return false } } switch err.(type) { case *net.OpError: err = err.(*net.OpError).Err continue case *os.SyscallError: ne := err.(*os.SyscallError) switch { case os.IsPermission(ne): return true case os.IsNotExist(ne): return false case ne.Err == syscall.ECONNREFUSED: return true default: // Unwrap the underlying error to avoid looping on the same *os.SyscallError. err = ne.Err continue } default: if oe, ok := err.(origin); ok { err = oe.Origin() continue } } return true } } ================================================ FILE: pkg/utils/parse.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils import ( "fmt" "strings" ) // ParseEnabled returns whether the given string represents an 'enabled' state. func ParseEnabled(value string) (bool, error) { switch strings.ToLower(value) { case "true", "on", "enable", "enabled", "1": return true, nil case "false", "off", "disable", "disabled", "0": return false, nil default: return false, fmt.Errorf("ParseEnabled: invalid string %q", value) } } ================================================ FILE: pkg/utils/sort.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils import ( "sort" ) // SortUint64s sorts a slice of uint64 in increasing order. func SortUint64s(a []uint64) { sort.Sort(Uint64Slice(a)) } // Uint64Slice implements sort.Interface for a slice of uint64.
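// // Sorting in place via the SortUint64s helper above: // // a := []uint64{3, 1, 2} // SortUint64s(a) // a is now [1 2 3]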
type Uint64Slice []uint64 // Len returns the length of a Uint64Slice func (s Uint64Slice) Len() int { return len(s) } // Less returns true if element at 'i' is less than the element at 'j' func (s Uint64Slice) Less(i, j int) bool { return s[i] < s[j] } // Swap swaps the values of two elements func (s Uint64Slice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } ================================================ FILE: pkg/utils/tar.go ================================================ // Copyright 2019-2021 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils import ( "archive/tar" "compress/bzip2" "fmt" "io" "os" "path" "strings" ) // UncompressTbz2 extracts a bzip2-compressed tar archive into the given directory. func UncompressTbz2(archive string, dir string) error { file, err := os.Open(archive) if err != nil { return err } defer file.Close() data := bzip2.NewReader(file) tr := tar.NewReader(data) for { header, err := tr.Next() if err != nil { if err == io.EOF { return nil } return err } // Reject entries whose names would escape the target directory ("zip slip"). target := path.Join(dir, header.Name) if target != path.Clean(dir) && !strings.HasPrefix(target, path.Clean(dir)+"/") { return fmt.Errorf("archive entry %q escapes target directory %q", header.Name, dir) } if header.Typeflag == tar.TypeDir { // Create a directory. err = os.MkdirAll(target, 0755) if err != nil { return err } } else if header.Typeflag == tar.TypeReg { // Create a regular file. targetFile, err := os.Create(target) if err != nil { return err } _, err = io.Copy(targetFile, tr) targetFile.Close() if err != nil { return err } } else if header.Typeflag == tar.TypeSymlink { // Create a symlink and all the directories it needs. err = os.MkdirAll(path.Dir(target), 0755) if err != nil { return err } err := os.Symlink(header.Linkname, target) if err != nil { return err } } } } ================================================ FILE: pkg/version/version.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // This module lets one tag built binaries with version metadata. // // Currently two pieces of metadata are tracked/provided: // - Version: version number, by convention one provided by 'git describe' // - Build: build id, by convention the git SHA1 the binary has been built from.
// // To enable automatic versioning metadata for your binary, you need to // // 1) import this package // 2) add the linker flags to override the dummy package variables, for instance: // LDFLAGS=-ldflags \ // "-X=github.com/intel/cri-resource-manager/pkg/version.Version= \ // -X=github.com/intel/cri-resource-manager/pkg/version.Build=" // // Note that further metadata can be trivially added in a similar fashion: // // 1) add the corresponding variables to this module // 2) arrange the default values to be correctly overridden during linking // 3) add printing of the new metadata to PrintVersionInfo() // package version import ( "flag" "fmt" "os" "path/filepath" "strconv" ) // Default values of variables we'll override with the linker. var ( // Version is our version as given by 'git describe'. Version = "" // Build is the SHA1 of the repository we've been built from. Build = "" ) // PrintVersionInfo prints version information about this binary. func PrintVersionInfo() { fmt.Printf("%s version information:\n", filepath.Base(os.Args[0])) fmt.Printf(" - version: %s\n", Version) fmt.Printf(" - build: %s\n", Build) } // Dummy struct used to hook into flag.Value.Set of -version during command-line parsing. type version struct{} // IsBoolFlag tells flag that we only have optional arguments. func (version) IsBoolFlag() bool { return true } // Set is our dummy flag.Value setter. func (version) Set(value string) error { printVersion, err := strconv.ParseBool(value) if err != nil { return err } if printVersion { PrintVersionInfo() os.Exit(0) } return nil } // String is our dummy flag.Value stringification function. func (*version) String() string { return "false" } // Put in place a '--version' command line option for us. func init() { flag.Var(&version{}, "version", "Print version information about "+filepath.Base(os.Args[0])) } ================================================ FILE: runtime-deps.csv ================================================ Go,https://github.com/golang/go,9051 fsnotify,https://github.com/fsnotify/fsnotify,8402 yaml,https://github.com/ghodss/yaml,9746 grpc-go,https://github.com/grpc/grpc-go,7283 kubernetes,https://github.com/kubernetes/kubernetes,9641 ================================================ FILE: sample-configs/balloons-policy.cfg ================================================ policy: Active: balloons # Use only 15 CPUs in total, leave cpu0 for processes other than # Kubernetes. AvailableResources: CPU: cpuset:1-15 # Reserve one of our CPUs (cpu15) for kube-system tasks. ReservedResources: CPU: cpuset:15 balloons: # PinCPU: allow containers to use only the CPUs in their balloons. PinCPU: true # PinMemory: allow containers to use only the closest memory to # the CPUs in their balloons. PinMemory: true # IdleCPUClass: how to configure CPUs that are not included in any # of the balloons. IdleCPUClass: idle BalloonTypes: - Name: "full-core-turbo" # MinCPUs: minimum number of logical cores in every balloon # instance of this type. # The default is 0. MinCPUs: 2 # MaxCPUs: maximum number of logical cores in every balloon # instance of this type. # The default is 0 (unlimited). MaxCPUs: 2 # CPUClass: how to configure CPUs of these balloons. # The default is "". CPUClass: "turbo" # Namespaces: assign pods in listed namespaces to these # balloons, even if there is no explicit annotation: # balloon.balloons.cri-resource-manager.intel.com: full-core-turbo # The default is to assign only annotated pods.
Namespaces: - "highperf" # AllocatorPriority: CPU allocator priority (0: High, 1: # Normal, 2: Low, 3: None). Affects the performance/type of # CPUs that are selected into the balloon. CPUs for static # balloon instances (MinBalloons > 0) with highest # AllocatorPriority are reserved first. # The default is 0. AllocatorPriority: 2 # MinBalloons: how many balloon instances of this type are always # kept in the system, even if there are no workloads for them. # The default is 0. MinBalloons: 2 # PreferNewBalloons: prefer creating a new balloon for # separate pods, even if their CPU requirements would allow # putting them in the same balloon. # The default is false. PreferNewBalloons: true # PreferPerNamespaceBalloon: if true, containers in the same # namespace are preferably placed in the same balloon, and # containers in different namespaces in different # balloons. The default is false: namespaces have no effect on # placement. PreferPerNamespaceBalloon: false # PreferSpreadingPods: if true, containers of a single pod can # be assigned to different balloons, based on which balloons # have most free CPU resources. # The default is false: prefer running containers of the same # pod in the same balloon(s). PreferSpreadingPods: false - Name: "socket-size" MaxCPUs: 8 AllocatorPriority: 2 Namespaces: - "default" CPUClass: "normal" # CPU controller configuration specifies CPU class properties. CPUs of # each balloon are configured based on its CPUClass. If a balloon has # no CPUClass, the properties of the default class are applied. cpu: classes: default: minFreq: 800 maxFreq: 1600 turbo: minFreq: 3300 maxFreq: 3600 normal: minFreq: 800 maxFreq: 2400 instrumentation: # The balloons policy exports containers running in each balloon, # and cpusets of balloons. Accessible in command line: # curl --silent http://localhost:8891/metrics HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy ================================================ FILE: sample-configs/blockio.cfg ================================================ # This configuration demonstrates how to configure cgroups block io # controller for pods. # # The configuration defines block device parameters for three blockio # classes (LowPrioThrottled, HighPrioFullSpeed and Default, feel free # to choose any names here). Finally resource-manager.blockio maps QoS # classes BestEffort, Burstable (via wildcard), and Guaranteed to # these classes. # # Try with: cri-resmgr -force-config blockio.cfg policy: Active: none logger: Debug: blockio,cgroupblkio blockio: Classes: # LowPrioThrottled and HighPrioFullSpeed are user-defined blockio classes # in this example. Pods and containers can be assigned to these classes using Pod # metadata annotations. For example in Pod yaml: # ... # metadata: # annotations: # # Default blockio class for containers in the pod: # blockioclass.cri-resource-manager.intel.com/pod: LowPrioThrottled # # Special blockio class for a container in the pod: # blockioclass.cri-resource-manager.intel.com/container.mycontainer: HighPrioFullSpeed LowPrioThrottled: # Default io-scheduler weight for all devices that are not # explicitly mentioned in the following items. - Weight: 80 # will be written to cgroups(.bfq).weight # Configuration for all virtio and scsi block devices.
- Devices: - /dev/vd* - /dev/sd* ThrottleReadBps: 50M # max read bytes per second ThrottleWriteBps: 10M # max write bytes per second ThrottleReadIOPS: 10k # max read io operations per second ThrottleWriteIOPS: 5k # max write io operations per second Weight: 50 # io-scheduler (cfq/bfq) weight for these devices, # will be written to cgroups(.bfq).weight_device # Configuration for SSD devices. # This overrides above configuration for those /dev/sd* devices # whose disk id contains "SSD" - Devices: - /dev/disk/by-id/*SSD* ThrottleReadBps: 100M ThrottleWriteBps: 40M # Not mentioning Throttle*IOPS means no io operations throttling for matching devices. Weight: 50 HighPrioFullSpeed: - Weight: 400 # When Pod annotations do not define blockio class, QoS class # names (BestEffort, Burstable, Guaranteed) are used as blockio # class names for the pod. By default no blockio configuration # takes place for them, but here we define I/O scheduler weight # difference: BestEffort: - Weight: 90 Guaranteed: - Weight: 200 ================================================ FILE: sample-configs/cri-full-message-dump.cfg ================================================ # run with no-op policy policy: Active: none # enable full dumps of all messages dump: Config: full:.* ================================================ FILE: sample-configs/cri-resmgr-configmap.example.yaml ================================================ # # This example creates 3 ConfigMaps: # - cri-resmgr-config.default: the default configuration # - cri-resmgr-config.group.foo: the configuration for nodes in group foo # - cri-resmgr-config.node.cl0-slave1: the configuration for node cl0-slave1 # # You can assign nodes to group foo using the command # kubectl label --overwrite node $NODE_NAME cri-resource-manager.intel.com/group=foo # # You can remove nodes from group foo using the command # kubectl label node $NODE_NAME cri-resource-manager.intel.com/group- # apiVersion: v1 kind: ConfigMap metadata: name: cri-resmgr-config.default namespace: kube-system data: policy: |+ Active: topology-aware AvailableResources: cpu: cpuset:0-63 ReservedResources: cpu: cpuset:0-1 topology-aware: PinCPU: true PinMemory: true PreferIsolatedCPUs: true PreferSharedCPUs: false static: RelaxedIsolation: true static-pools: # Filesystem path to legacy configuration directory structure ConfDirPath: "/etc/cmk" # Filesystem path to legacy configuration file ConfFilePath: "" # Whether to create CMK node label LabelNode: false # Whether to create CMK node taint TaintNode: false # Pool configuration. # The imaginary example system below consists of 4 sockets, 4 cores, 2 # threads each. 
pools: exclusive: # 6 exclusive cores, 3 on sockets 1, 2 and 3 each cpuLists: - Cpuset: 8,9 Socket: 1 - Cpuset: 10,11 Socket: 1 - Cpuset: 16,17 Socket: 2 - Cpuset: 18,19 Socket: 2 - Cpuset: 24,25 Socket: 3 - Cpuset: 26,27 Socket: 3 exclusive: true shared: # 2 cores in shared pool, all on socket 1 cpuLists: - Cpuset: 12,13,14,15 Socket: 1 exclusive: false infra: # Rest of cores designated to infra pool cpuLists: - Cpuset: 0,1,2,3,4,5,6,7 Socket: 0 - Cpuset: 20,21,22,23 Socket: 2 - Cpuset: 28,29,30,31 Socket: 3 exclusive: false rdt: |+ # Common options options: # One of Full, Discovery or Disabled mode: Full # Set to true to disable creation of monitoring groups monitoringDisabled: false l3: # Make this false if L3 CAT must be available optional: true mb: # Make this false if MBA must be available optional: true # Configuration of classes partitions: exclusive: # Allocate 60% of all L3 cache to the "exclusive" partition l3Allocation: "60%" mbAllocation: ["100%"] classes: Guaranteed: # Allocate all of the partitions cache lines to "Guaranteed" l3Allocation: "100%" shared: # Allocate 40% L3 cache IDs to the "shared" partition # These will NOT overlap with the cache lines allocated for "exclusive" partition l3Allocation: "40%" mbAllocation: ["50%"] classes: Burstable: # Allow "Burstable" to use all cache lines of the "shared" partition l3Allocation: "100%" BestEffort: # Allow "Besteffort" to use only half of the L3 cache # lines of the "shared" partition. # These will overlap with those used by "Burstable" l3Allocation: "50%" --- apiVersion: v1 kind: ConfigMap metadata: name: cri-resmgr-config.group.foo namespace: kube-system data: policy: |+ Active: topology-aware AvailableResources: cpu: cpuset:0-63 ReservedResources: cpu: cpuset:0-1 topology-aware: PinCPU: true PinMemory: false PreferIsolatedCPUs: false PreferSharedCPUs: false static: RelaxedIsolation: true static-pools: # This is an example configuration for static-pools policy. # The imaginary example system here consists of 4 sockets, 4 cores, 2 threads each. pools: exclusive: # 6 exclusive cores, 3 on sockets 1, 2 and 3 each cpuLists: - Cpuset: 8,9 Socket: 1 - Cpuset: 10,11 Socket: 1 - Cpuset: 16,17 Socket: 2 - Cpuset: 18,19 Socket: 2 - Cpuset: 24,25 Socket: 3 - Cpuset: 26,27 Socket: 3 exclusive: true shared: # 2 cores in shared pool, all on socket 1 cpuLists: - Cpuset: 12,13,14,15 Socket: 1 exclusive: false infra: # Rest of cores designated to infra pool cpuLists: - Cpuset: 0,1,2,3,4,5,6,7 Socket: 0 - Cpuset: 20,21,22,23 Socket: 2 - Cpuset: 28,29,30,31 Socket: 3 exclusive: false rdt: |+ # Common options options: # One of Full, Discovery or Disabled mode: Full # Set to true to disable creation of monitoring groups monitoringDisabled: false l3: # Make this false if L3 CAT must be available optional: true mb: # Make this false if MBA must be available optional: true # Configuration of classes partitions: exclusive: # Allocate 60% of all L3 cache to the "exclusive" partition l3Allocation: "60%" mbAllocation: ["100%"] classes: Guaranteed: # Allocate all of the partitions cache lines to "Guaranteed" l3Allocation: "100%" shared: # Allocate 40% L3 cache IDs to the "shared" partition # These will NOT overlap with the cache lines allocated for "exclusive" partition l3Allocation: "40%" mbAllocation: ["50%"] classes: Burstable: # Allow "Burstable" to use all cache lines of the "shared" partition l3Allocation: "100%" BestEffort: # Allow "Besteffort" to use only half of the L3 cache # lines of the "shared" partition. 
            # These will overlap with those used by "Burstable"
            l3Allocation: "50%"
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: cri-resmgr-config.node.cl0-slave1
  namespace: kube-system
data:
  policy: |+
    Active: topology-aware
    AvailableResources:
      cpu: cpuset:0-63
    ReservedResources:
      cpu: cpuset:0-1
    topology-aware:
      PinCPU: false
      PinMemory: true
      PreferIsolatedCPUs: false
      PreferSharedCPUs: false
    static:
      RelaxedIsolation: true
    static-pools:
      # This is an example configuration for static-pools policy.
      # The imaginary example system here consists of 4 sockets, 4 cores, 2 threads each.
      pools:
        exclusive:
          # 6 exclusive cores, 3 on sockets 1, 2 and 3 each
          cpuLists:
            - Cpuset: 8,9
              Socket: 1
            - Cpuset: 10,11
              Socket: 1
            - Cpuset: 16,17
              Socket: 2
            - Cpuset: 18,19
              Socket: 2
            - Cpuset: 24,25
              Socket: 3
            - Cpuset: 26,27
              Socket: 3
          exclusive: true
        shared:
          # 2 cores in shared pool, all on socket 1
          cpuLists:
            - Cpuset: 12,13,14,15
              Socket: 1
          exclusive: false
        infra:
          # Rest of cores designated to infra pool
          cpuLists:
            - Cpuset: 0,1,2,3,4,5,6,7
              Socket: 0
            - Cpuset: 20,21,22,23
              Socket: 2
            - Cpuset: 28,29,30,31
              Socket: 3
          exclusive: false
  rdt: |+
    # Common options
    options:
      # One of Full, Discovery or Disabled
      mode: Full
      # Set to true to disable creation of monitoring groups
      monitoringDisabled: false
      l3:
        # Make this false if L3 CAT must be available
        optional: true
      mb:
        # Make this false if MBA must be available
        optional: true
    # Configuration of classes
    partitions:
      exclusive:
        # Allocate 60% of all L3 cache to the "exclusive" partition
        l3Allocation: "60%"
        mbAllocation: ["100%"]
        classes:
          Guaranteed:
            # Allocate all of the partition's cache lines to "Guaranteed"
            l3Allocation: "100%"
      shared:
        # Allocate 40% L3 cache IDs to the "shared" partition
        # These will NOT overlap with the cache lines allocated for "exclusive" partition
        l3Allocation: "40%"
        mbAllocation: ["50%"]
        classes:
          Burstable:
            # Allow "Burstable" to use all cache lines of the "shared" partition
            l3Allocation: "100%"
          BestEffort:
            # Allow "BestEffort" to use only half of the L3 cache
            # lines of the "shared" partition.
            # These will overlap with those used by "Burstable"
            l3Allocation: "50%"
  dump: |+
    Config: full:.*,short:.*Stop.*,off:.*List.*
    File: /tmp/cri-selective-debug.dump
  logger: |+
    Debug: resource-manager,cache


================================================
FILE: sample-configs/external-adjustment.yaml
================================================
apiVersion: criresmgr.intel.com/v1alpha1
kind: Adjustment
metadata:
  name: external-adjustment
  namespace: kube-system
spec:
  scope:
    - nodes: [ node-1 ]
      containers:
        - key: ":,:pod/name,name"
          operator: Matches
          values: [ "*:container" ]
    - nodes: [ node-2 ]
      containers:
        - key: ":,:pod/name,name"
          operator: Matches
          values: [ "pod:*" ]
    - nodes: [ node-3, node-4 ]
      containers:
        - key: ":,:pod/name,name"
          operator: Equals
          values: [ "anotherpod:container" ]
  resources:
    requests:
      cpu: 750m
      memory: 500Mi
    limits:
      cpu: 1500m
      memory: 750Mi
    toptierLimit: 500Mi
  classes:
    rdt: rdt-class-1
    blockio: blockio-class-1


================================================
FILE: sample-configs/podpools-policy.cfg
================================================
# This example demonstrates pod-based CPU and memory pinning.
# All containers of a pod run in the same CPU/memory pool.
# The capacity of a pool is defined as a number of pods it can
# contain.
#
# The two steps for running a pod in a pod pool are:
#
# 1. Annotate the pod:
#
#    metadata:
#      annotations:
#        pool.podpools.cri-resource-manager.intel.com: POOLNAME
#
# 2. Make sure that total CPU resources required by the containers
#    in the pod match the CPUs per pod in the pod pool.
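#
# As a sketch (a hypothetical pod, not part of this configuration): a pod
# annotated to run in the "singlecpu" pool defined below (1 CPU per
# instance, at most 2 pods per instance) should request CPU/MaxPods = 500m
# in total:
#
#    metadata:
#      annotations:
#        pool.podpools.cri-resource-manager.intel.com: singlecpu
#    spec:
#      containers:
#      - name: c0
#        resources:
#          requests:
#            cpu: 500m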
policy:
  # pod-based CPU and memory pinning is implemented in the podpools policy.
  Active: podpools
  # AvailableResources specifies CPUs that active policy is allowed to
  # use: containers will not run outside AvailableResources
  # CPUs. Other CPUs are considered reserved for system. Corresponding
  # kubelet parameter: --system-reserved. By default
  # AvailableResources contains all CPUs.
  AvailableResources:
    # "CPU" can be the number of CPUs or explicitly defined set of
    # CPUs. In this example we use 14 CPUs, excluding CPUs #0 and #1
    # (hyperthreads of core 0).
    CPU: cpuset:2-15
  # ReservedResources specifies CPU(s) that active policy dedicates
  # for running kube-system pods. Corresponding kubelet parameter:
  # --kube-reserved.
  ReservedResources:
    # Here we dedicate CPU #15 for these pods.
    # This leaves 13 out of 14 available CPUs unallocated.
    CPU: cpuset:15
  # podpools-specific configuration specifies the following.
  # 1. Pod pool definitions ("Pools").
  #    The policy creates one or more pool instances from a definition.
  # 2. Resources (CPUs) needed by each pod pool definition in total.
  #    This can be given as one of the following:
  #    1. a number of pool instances: "Instances: <N>"
  #    2. a number of CPUs: "Instances: <N> CPUs"
  #    3. percentage of non-reserved CPUs: "Instances: <N> %"
  #    In case 1, CPUs needed by the definition is <N> * CPUs per pool.
  # 3. How many CPUs each pool instance gets from the CPUs allocated
  #    to its definition in total.
  # 4. Capacity of each pool instance.
  #    This is the maximum number of pods in a single pool instance.
  podpools:
    # By default podpools pins both CPU and memory of all containers.
    # Pinning either of them can be disabled with:
    # pinCPU: false
    # pinMemory: false
    Pools:
      # Define the "singlecpu" pod pool type:
      - Name: singlecpu
        # Take 3 out of 13 AvailableResources CPUs to be used by
        # all "singlecpu" pod pool instances in total.
        # This leaves 10 CPUs unallocated for other pools.
        Instances: 3 CPUs
        # Every "singlecpu" pod pool instance has 1 CPU to run all
        # pods assigned to the instance.
        # As the definition can use 3 CPUs in total, there will be 3
        # "singlecpu" pool instances.
        CPU: 1
        # Every "singlecpu" pod pool instance holds at most 2 pods.
        MaxPods: 2
        # Note that every pod that is annotated to run on a singlecpu
        # pool is assumed to consume CPU/MaxPods = 500m CPU. Therefore
        # the sum of request.cpu's of all containers in this kind of
        # pod should be 500m. Otherwise kube-scheduler may overload or
        # underload the node.
      # Define the "dualcpu" pod pool type:
      - Name: dualcpu
        # FillOrder specifies the order in which the capacity of pod
        # pool instances of this pool type is filled with pods. The
        # default is Balanced: new pod is assigned to a pool instance
        # with most free capacity. The opposite is Packed: new pod is
        # assigned to a pool instance with least free capacity.
        FillOrder: Packed
        # Take at most 50 % of non-reserved CPUs (50 % * 13 = 6.5)
        # to be used by all "dualcpu" pool instances in total.
        Instances: 50 %
        # Every "dualcpu" pool instance has 2 CPUs.
        # That is, floor(6.5 / 2) = 3 pool instances of this type will
        # be created, and therefore 6 CPUs actually consumed by this
        # pool type.
        # This leaves 4 CPUs unallocated.
        CPU: 2
        # Every "dualcpu" pool instance holds at most 3 pods.
        MaxPods: 3
    # In addition to user-defined pools, there are two built-in
    # pools:
    #
    # - "reserved" contains the ReservedResources CPUs and runs all
    #   kube-system pods.
    #
    # - "default" contains CPUs that are neither reserved nor
    #   allocated to any user-defined pools. It runs all pods that
    #   are not kube-system and are not assigned to any user-defined
    #   pool. The number of CPUs in the default pool can be
    #   overridden by defining "default" pool like other pools. If
    #   CPUs were not left over for the default pool, it will use
    #   the same CPUs as the reserved pool.

logger:
  Debug: policy


================================================
FILE: sample-configs/static-policy.cfg
================================================
policy:
  Active: static
  ReservedResources:
    CPU: 1000m
logger:
  Debug: policy,static
dump:
  Config: off:.*,full:((Create)|(Remove)|(Run)|(Update)|(Start)|(Stop)).*


================================================
FILE: sample-configs/static-pools-policy.conf.example
================================================
# This is an example configuration file for the builtin cmk policy
# The imaginary example system here consists of 4 sockets, 4 cores (8
# multithreaded CPUs)
#
# NOTE: only pools configuration may be specified in this file. Other
# configuration options must be set through the dynamic configuration system
pools:
  exclusive:
    # 6 exclusive cores, 3 on sockets 1, 2 and 3 each
    cpuLists:
      - Cpuset: 8,9
        Socket: 1
      - Cpuset: 10,11
        Socket: 1
      - Cpuset: 16,17
        Socket: 2
      - Cpuset: 18,19
        Socket: 2
      - Cpuset: 24,25
        Socket: 3
      - Cpuset: 26,27
        Socket: 3
    exclusive: true
  shared:
    # 2 cores in shared pool, all on socket 1
    cpuLists:
      - Cpuset: 12,13,14,15
        Socket: 1
    exclusive: false
  infra:
    # Rest of cores designated to infra pool
    cpuLists:
      - Cpuset: 0,1,2,3,4,5,6,7
        Socket: 0
      - Cpuset: 20,21,22,23
        Socket: 2
      - Cpuset: 28,29,30,31
        Socket: 3
    exclusive: false


================================================
FILE: sample-configs/topology-aware-policy.cfg
================================================
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
logger:
  Debug: cri-resmgr,resource-manager,cache
dump:
  Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*


================================================
FILE: scripts/build/docker-build-image
================================================
#!/bin/bash

IMAGE=$1
DOCKERFILE=dockerfiles/cross-build/Dockerfile.${IMAGE%-build}
shift 1

echo "* Building docker images with"
echo "  - Dockerfile: $DOCKERFILE"
echo "  - image name: $IMAGE"
echo "  - options   : $@"

docker build . \
    -f "$DOCKERFILE" -t "$IMAGE" \
    --build-arg "CREATE_USER=$USER" \
    --build-arg USER_UID="$(id -u)" \
    "$@" || exit 1


================================================
FILE: scripts/build/get-buildid
================================================
#!/bin/bash
#
# Script to determine a version string, a buildid as well as related RPM
# and debian package versions. These are determined using the following
# sources in decreasing order of preference:
#
# 1. git metadata:
#    - version: git describe --tags --long --dirty
#    - buildid: git rev-parse --short HEAD
# 2. stored git metadata:
#    - version: git-version
#    - buildid: git-buildid
# 3. directory name:
#    - version: cri-resource-manager-(.*):
#    - buildid: unknown
# 4. date:
#    - version: 0.0.0-$(date +%Y%m%d%H%M)
#    - buildid: unknown
#

PARENT_DIRNAME=cri-resource-manager
VERSION_FILE=version
BUILDID_FILE=buildid

VERSION=""
BUILDID=""
RPM=""
DEB=""

fail() {
    echo "$*" 1>&2
    exit 1
}

log() {
    echo "$*" 1>&2
}

print_usage() {
    local _status=0
    if [ -n "$*" ]; then
        echo "$*"
        _status=1
    fi
    echo "usage $0 [--store[=<DIR>]] [--version] [--buildid] [--rpm] [--deb] [--tar] [--all]"
    exit $_status
}

dotgit_hasrepo() {
    git status >& /dev/null
}

dotgit_version() {
    local _v _id _dirty _count
    if [ -z "$TEST_DESCRIBE" ]; then
        if ! dotgit_hasrepo; then
            return 1
        fi
        _id=$(git rev-parse --short HEAD)
        _dirty=$(git diff --quiet -- ':!go.mod' ':!go.sum' || echo '-dirty')
        _v=$(git describe --tags --long --dirty 2>/dev/null)
    else
        _v="$TEST_DESCRIBE"
        _id="$TEST_REV"
        _dirty=""
    fi
    case "$_v" in
        v*) _v="${_v#v}" ;;
        *)
            _count=$(git rev-list --count HEAD)
            _v="0.0.0-$_count-g$_id$_dirty"
            ;;
    esac
    VERSION="$_v"
    BUILDID="$_id$_dirty"
}

stored_hasdata() {
    if [ ! -f "$OUTDIR/$VERSION_FILE" ] || [ ! -f "$OUTDIR/$BUILDID_FILE" ]; then
        return 1
    fi
    STORED_VERSION=$(cat "$OUTDIR/$VERSION_FILE") && \
        STORED_BUILDID=$(cat "$OUTDIR/$BUILDID_FILE")
}

stored_version() {
    if ! stored_hasdata; then
        return 1
    fi
    VERSION="$STORED_VERSION"
    BUILDID="$STORED_BUILDID"
}

stored_update() {
    if stored_hasdata; then
        if [ "$STORED_VERSION" = "$VERSION" ] && [ "$STORED_BUILDID" = "$BUILDID" ]; then
            return 0
        fi
    fi
    mkdir -p "$OUTDIR" || fail "failed to create $OUTDIR"
    echo "$VERSION" > "$OUTDIR/$VERSION_FILE"
    echo "$BUILDID" > "$OUTDIR/$BUILDID_FILE"
}

parent_version() {
    local _dir
    _dir=$(basename "$(realpath .)")
    case "$_dir" in
        "${PARENT_DIRNAME}"-*)
            VERSION="${_dir##${PARENT_DIRNAME}-}"
            BUILDID=unknown
            return 0
            ;;
    esac
    return 1
}

unknown_version() {
    VERSION="0.0.0-$(date +%Y%m%d%H%M)"
    BUILDID=unknown
}

package_versions() {
    case "$VERSION" in
        [0-9.]**-g[0-9a-f]*)
            local _full="$VERSION"
            local _numeric=${_full%%-*}
            local _cntsha1=${_full#*-}
            local _clean=${_cntsha1%-dirty}
            local _dirty=${_cntsha1#$_clean}; _cntsha1="$_clean"
            local _sha1=${_cntsha1##*-g}
            local _cnt=${_cntsha1%-g*}
            VERSION=$_numeric
            if [ -n "$_cnt" ] && [ "$_cnt" != "0" ]; then
                VERSION="$VERSION-$_cnt-g$_sha1"
            fi
            VERSION=$VERSION$_dirty
            RPM=$(echo "$VERSION" | tr '+-' '_')
            DEB=$VERSION
            ;;
        [0-9.]*)
            RPM=$VERSION
            DEB=$VERSION
            ;;
        *)
            fail "can't parse version $VERSION"
            ;;
    esac
}

print_variables() {
    local _what _var _val
    for _what in $PRINT; do
        case $_what in
            version) [ -n "$SHVAR" ] && _var='gitversion='
                     _val="$VERSION" ;;
            buildid) [ -n "$SHVAR" ] && _var='gitbuildid='
                     _val="$BUILDID" ;;
            rpm)     [ -n "$SHVAR" ] && _var='rpmversion='
                     _val="$RPM" ;;
            deb)     [ -n "$SHVAR" ] && _var='debversion='
                     _val="$DEB" ;;
            tar)     [ -n "$SHVAR" ] && _var='tarversion='
                     _val="$VERSION" ;;
            *)
                print_usage "unknown version/buildid-related tag \"$_what\""
                ;;
        esac
        echo "$_var$_val"
    done
}

#########################
# main script
#
OUTDIR="."
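# Defaults; each of these can be overridden or filled in by the
# command-line options parsed in the loop below.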
STORE="" PRINT="" SHVAR=y TEST_DESCRIBE="" TEST_REV="" while [ "$#" != "0" ]; do case $1 in --help|-h) print_usage ;; --debug) set -x ;; --store=*|-s*) STORE=y out="${1##*=}" if [ "$out" != "$1" ]; then OUTDIR="$out" fi ;; --version|-v) PRINT="$PRINT version" ;; --buildid|-b) PRINT="$PRINT buildid" ;; --rpm) PRINT="$PRINT rpm" ;; --deb) PRINT="$PRINT deb" ;; --tar) PRINT="$PRINT tar" ;; --all) PRINT="version buildid rpm deb tar" ;; --shell*|--sh-syntax*) val="${1##*=}" if [ "$val" != "$1" ]; then case $val in y*|t*) SHVAR=y;; n*|f*) SHVAR="";; esac else SHVAR=y fi ;; --no-shell|--no-sh-syntax) SHVAR="" ;; --test) TEST_DESCRIBE="$2" TEST_REV="$3" shift 2 ;; *) print_usage "unknown option \"$1\"" ;; esac shift done if ! dotgit_version; then if ! stored_version; then if ! parent_version; then unknown_version fi fi fi if [ -z "$STORE" ] && [ -z "$PRINT" ]; then PRINT="version buildid" fi package_versions print_variables if [ -n "$STORE" ]; then stored_update fi ================================================ FILE: scripts/build/update-gh-pages.sh ================================================ #!/bin/bash -e set -o pipefail script=`basename $0` usage () { cat << EOF Usage: $script [-h] [-a] [BUILD_SUBDIR] Options: -h show this help and exit -a amend (with --reset-author) instead of creating a new commit EOF } # Helper function for detecting available versions from the current directory create_versions_js() { _baseurl="/cri-resource-manager" echo -e "function getVersionsMenuItems() {\n return [" # 'stable' is a symlink pointing to the latest version [ -f stable ] && echo " { name: 'stable', url: '$_baseurl/stable' }," for f in `ls -d */ | tr -d / | sed s'/releases//'`; do echo " { name: '$f', url: '$_baseurl/$f' }," done echo -e " ];\n}" } # Helper function for detecting archived releases from the current directory create_releases_js() { echo -e "function getReleaseListItems() {\n return [" for f in `ls -d v*/ | tr -d /`; do echo " { name: '$f', url: '$f' }," done echo -e " ];\n}" } # # Argument parsing # while [ "${1#-}" != "$1" -a -n "$1" ]; do case "$1" in -a|--amend) amend="--amend --reset-author" ;; -h|--help) usage exit 0 ;; *) usage exit 1 ;; esac shift done build_subdir="$1" # Check that no extra args were provided if [ $# -gt 1 ]; then echo "ERROR: unknown arguments: $@" usage exit 1 fi # # Build the documentation # build_dir="_build" echo "Creating new Git worktree at $build_dir" git worktree add "$build_dir" gh-pages # Drop worktree on exit trap "echo 'Removing Git worktree $build_dir'; git worktree remove --force '$build_dir'" EXIT # Parse subdir name from GITHUB_REF release_tag= if [ -z "$build_subdir" ]; then case "$GITHUB_REF" in refs/tags/*) _base_ref=${GITHUB_REF#refs/tags/} release_tag=$_base_ref ;; refs/heads/*) _base_ref=${GITHUB_REF#refs/heads/} ;; *) _base_ref= esac echo "Parsed baseref: '$_base_ref'" case "$GITHUB_REF" in refs/tags/v*) _version=${GITHUB_REF#refs/tags/v} ;; refs/heads/release-*) _version=${GITHUB_REF#refs/heads/release-} ;; *) _version= esac echo "Detected version: '$_version'" _version=`echo -n $_version | sed -nE s'!^([0-9]+\.[0-9]+).*$!\1!p'` # Use version as the subdir build_subdir=${_version:+v$_version} # Fallback to base-ref i.e. 
    if [ -z "$build_subdir" ]; then
        # For master branch we use the name 'devel'
        [ "$_base_ref" = "master" ] && build_subdir=devel || build_subdir=$_base_ref
    fi
fi
# Default to 'devel' if no subdir was given and we couldn't parse
# it
build_subdir=${build_subdir:-devel}

echo "Updating site version subdir: '$build_subdir'"

export SITE_BUILDDIR="$build_dir/$build_subdir"
export VERSIONS_MENU=1
export VERSIONS_MENU_THIS_VERSION=$build_subdir
make html

# Update releases/ subdir
if [ "$release_tag" ]; then
    echo "Building archived docs for release $release_tag"
    export SITE_BUILDDIR="$build_dir/releases/$release_tag"
    make html
fi

# Only update the releases "site" from master
if [ "$GITHUB_REF" = "refs/heads/master" ]; then
    echo "Building releases/"
    sphinx-build docs/releases "$build_dir"/releases
fi

#
# Update gh-pages branch
#
commit_hash=`git describe --dirty --always`

# Switch to work in the gh-pages worktree
pushd "$build_dir"

# Add "const" files we need in root dir
touch .nojekyll

_stable=`(ls -d1 v*/ || :) | sort -n | tail -n1`
if [ -n "$_stable" ]; then
    ln -sfT "$_stable" stable
    redirect_to="stable"
else
    redirect_to=$build_subdir
fi

# Detect existing versions from the gh-pages branch
create_versions_js > versions.js

# Update releases directory
mkdir -p releases
cp versions.js releases/
pushd releases
create_releases_js > releases.js
popd

cat > index.html << EOF
EOF

if [ -z "`git status --short`" ]; then
    echo "No new content, gh-pages branch already up-to-date"
    exit 0
fi

# Create a new commit
commit_msg=`echo -e "Update documentation for $build_subdir\n\nAuto-generated from $commit_hash by '$script'"`

echo "Committing changes..."
# Exclude doctrees dir
git add -- ":!$build_subdir/.doctrees"
git commit $amend -m "$commit_msg"

popd

echo "gh-pages branch successfully updated"


================================================
FILE: scripts/code-generator/boilerplate.go.txt
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


================================================
FILE: scripts/code-generator/generate-groups.sh
================================================
#!/bin/bash

# This is a helper for running the identically named code-generator script from
# https://github.com/kubernetes/code-generator.

REPO=https://github.com/kubernetes/code-generator
SCRIPT="$(realpath "$0")"
HEADER="${SCRIPT%/*}"/boilerplate.go.txt
TOPDIR=${SCRIPT%/scripts/*}
MODDIR=$TOPDIR
MODURL=$(grep ^module "$TOPDIR"/go.mod | sed 's/^module *//g')
MODULES=${MODULES:-pkg/topology}

fail() {
    echo "error: $*"
    exit 1
}

# Parse $* for --output-base, set $gendir and $repo accordingly.
pick-gen-dir() {
    local _save="" _a
    gendir=$TOPDIR/generate
    for _a in "$@"; do
        case $_a in
            --output-base)
                _save=y;;
            *)
                if [ -n "$_save" ]; then
                    gendir=$_a
                    _save=""
                fi
                ;;
        esac
    done
    repo=$gendir/${REPO##*/}
}

# Set $tag to correspond to $KUBERNETES_VERSION.
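# For example, KUBERNETES_VERSION=v1.25.0 maps to code-generator tag
# v0.25.0 via the v1 -> v0 prefix substitution below.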
pick-git-tag() {
    if [ -z "$KUBERNETES_VERSION" ]; then
        fail "KUBERNETES_VERSION not set, please set it to the desired version to match/use."
    fi
    case $KUBERNETES_VERSION in
        v1.[0-9.]*)
            tag=${KUBERNETES_VERSION/#v1/v0};;
        *)
            fail "Don't know how to convert KUBERNETES_VERSION $KUBERNETES_VERSION to tag."
            ;;
    esac
}

# Clone $REPO as $repo.
git-clone() {
    if [ ! -d "$repo"/.git ]; then
        mkdir -p "$gendir" || fail "failed to clone git repo"
        (cd "$gendir" && git clone $REPO) || fail "failed to clone git repo $REPO"
    else
        (cd "$repo" && git fetch -q origin) || fail "failed to update/fetch git repo $REPO"
    fi
}

# Check out the $tag corresponding to $KUBERNETES_VERSION.
git-switch() {
    (set -e
     cd "$repo"
     git reset -q --hard HEAD 2> /dev/null
     git checkout -q "$tag"
    ) || fail "failed to checkout git tag $tag"
}

# Patch $repo/go.mod with replacement rules from $TOPDIR and add replacement rules for $TOPDIR.
go-mod-patch() {
    (set -e
     cd "$repo"
     grep -A 640 '^replace ' "$TOPDIR"/go.mod | grep -v pkg/topology >> go.mod
     go mod edit -replace="$MODURL=$MODDIR"
     for mod in $MODULES; do
         go mod edit -replace="$MODURL/$mod=$MODDIR/$mod"
     done
    ) || fail "failed to patch go.mod"
}

# Check any previously generated files for $MODURL, bail out if they exist.
check-existing() {
    local _pkg=${3%:*} _ver=${3#*:} _dir
    for _dir in "$gendir/$1" "$gendir/$2/$_pkg/$_ver"; do
        if [ -d "$_dir" ]; then
            fail "$_dir already exists, refusing to overwrite it"
        fi
    done
}

# Run generate
run-generator() {
    (set -e
     cd "$repo"
     ./generate-groups.sh "$@" --go-header-file "$HEADER"
    ) || fail "code generation failed"
}

pick-gen-dir "$@"
pick-git-tag
git-clone
git-switch
go-mod-patch
check-existing "$2" "$3" "$4"
run-generator "$@"


================================================
FILE: scripts/hack/create-webhook-secrets.sh
================================================
#!/bin/sh -e

this=$(realpath "$0")
this_dir=$(dirname "$this")
template_dir=$(realpath "$this_dir/../../cmd/cri-resmgr-webhook/")

outdir="deploy/cri-resmgr-webhook"
outdir_abs="$(pwd)/$outdir"

cat << EOF
***
***
*** WARNING: NOT FOR PRODUCTION USE ***
***
***
EOF

info () {
    echo "[INFO] $1"
}

info "Generating x509 keys..."

mkdir -p "$outdir"

# Create temp workdir and remove it on exit
tmpdir=$(mktemp -d --suffix=.cri-resmgr)
trap 'rm -rf $tmpdir' EXIT
cd "$tmpdir"

# Create a self-signed CA certificate
openssl req -batch -new -newkey rsa:2048 -x509 -sha256 -nodes -days 30 -out ca.crt -keyout ca.key

export cn=cri-resmgr-webhook.cri-resmgr.svc
openssl req -batch -newkey rsa:2048 -nodes -keyout svc.key -out $cn.csr -subj "/CN=cri-resmgr-webhook.cri-resmgr.svc"
openssl x509 -req -in $cn.csr -CA ca.crt -CAkey ca.key -CAcreateserial -sha256 -out svc.crt -days 3650

# Copy artifacts to outdir
cp ca.crt svc.crt svc.key "$outdir_abs"
info "Done"
info "Sample cert and key files successfully generated under '$outdir'"

info "Creating MutatingWebhookConfiguration template"
sed "s/CA_BUNDLE_PLACEHOLDER/$(base64 -w0 < ca.crt)/" "$template_dir/mutating-webhook-config.yaml" > "$outdir_abs/mutating-webhook-config.yaml"

# Print instructions
cat << EOF

Instructions for example deployment
===================================

0. Create cri-resmgr namespace, if it does not exist:

   kubectl create ns cri-resmgr

1. Create Kubernetes secrets with:

   kubectl -n cri-resmgr create secret generic cri-resmgr-webhook-secret \\
       --from-file=$outdir/svc.crt --from-file=$outdir/svc.key

2. Build and publish webhook container:
   make image-webhook IMAGE_REPO=my-image-repo IMAGE_TAG=my-version

   And deploy it:

   sed s'!IMAGE_PLACEHOLDER!my-image-repo/cri-resmgr-webhook:my-version!' cmd/cri-resmgr-webhook/webhook-deployment.yaml | kubectl apply -f -

3. Create MutatingWebhookConfiguration with:

   kubectl apply -f $outdir/mutating-webhook-config.yaml
EOF


================================================
FILE: scripts/hack/go-mod-replace-helper.sh
================================================
#!/bin/bash -e
set -o pipefail

this=`basename $0`

usage () {
    cat << EOF
USAGE: $this REPO_CACHE_DIR VERSION MODULE...

OPTIONS
  -h    show this help and exit

EXAMPLES
  Print replace directives for all k8s.io/* updated to v0.19.4:

  $ sed -n '/replace/,$p' go.mod | grep k8s.io | awk '{print $1}' | \\
      xargs ./scripts/hack/go-mod-replace-helper.sh ../k8s-cache/ v0.19.4
EOF
}

update_cache() {
    local module_base=`basename "$1"`
    local module_cache_dir="$cache_dir/$module_base"

    if [ ! -e "$module_cache_dir" ]; then
        module_repo="https://github.com/kubernetes/$module_base"
        echo "Cloning $module_repo to $module_cache_dir"
        git clone -q --depth=1 "$module_repo" "$module_cache_dir"
    fi

    echo "Updating $1 at $module_cache_dir"
    cd "$module_cache_dir"
    git fetch -q --tags --depth=1
    cd - >/dev/null
}

gomodrev() {
    local module_base=`basename "$1"`
    local module_cache_dir="$cache_dir/$module_base"

    cd "$module_cache_dir"
    # Resolve to a commit
    sha=`git rev-parse "$2"~0`
    short_sha=`git rev-parse --short=12 $sha`
    unix_ts=`git show $sha --format=%ct --date=unix | head -n1`
    gomod_ts=`date -u --date=@$unix_ts +'%Y%m%d%H%M%S'`
    echo "v0.0.0-$gomod_ts-$short_sha"
    cd - >/dev/null
}

while [ "${1#-}" != "$1" -a -n "$1" ]; do
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        *)
            usage
            exit 1
            ;;
    esac
    shift
done

if [ $# -lt 3 ]; then
    usage
    exit 1
fi

cache_dir="$1"
shift
module_version="$1"
shift
module_names="$@"

cat << EOF

UPDATING CACHE
==============
EOF
for m in $@; do
    update_cache $m
done

cat << EOF

GO.MOD REPLACE
==============
EOF
for m in $@; do
    r=`gomodrev $m $module_version`
    echo -e "\t$m v0.0.0 => $m $r"
done


================================================
FILE: scripts/hack/go-mod-tree
================================================
#!/usr/bin/env python3

"""go-mod-tree - inspect go module import hierarchy

Usage: go mod graph | go-mod-tree [options]

Options:
  -h, --help                  print help.

Input:
  -g, --graph FILE            read graph from FILE instead of stdin.

Dependency tree selection: (MODULEs are regular expressions)
  -r, --reverse               print reverse tree: from importees to importers.
  -f, --from MODULE           print tree starting from matching MODULEs.
  -t, --to MODULE             print tree with only branches that end to matching MODULEs.
  -x, --exclude MODULE        exclude matching MODULEs from the graph.
  -s, --shortest-path MODULE  print only a shortest path to matching MODULEs.
  -d, --depth DEPTH           limit printed tree to DEPTH.
Output format:
  -H        hide from line format:
            L: line number
            D: depth
            I: indentation
            R: reference to already printed line
  -I STRING indentation by repeating STRING

Examples:
  - Print full import graph as a tree:
    go mod graph | go-mod-tree
  - Print which of the direct dependencies lead to importing x/net:
    go mod graph | go-mod-tree --to golang.org/x/net --depth 1
  - Print modules directly imported by different versions of x/net:
    go mod graph | go-mod-tree --from golang.org/x/net --depth 1
  - Print modules that directly depend on any version of x/net:
    go mod graph | go-mod-tree --reverse --from golang.org/x/net --depth 1
  - Print shortest import paths to 2010-2019 versions of x/net:
    go mod graph | go-mod-tree --shortest-path .*/x/net@.*201[0-9].*
  - Print full reverse import tree from a specific x/net version:
    go mod graph | go-mod-tree --reverse --from .*20190311183353-d8887717615a
"""

import getopt
import re
import sys

g_command = "go-mod-tree"

opt_fmt = "%(prefix)s%(indent)s%(node)s %(ref)s\n"
opt_indent = ": "
opt_reverse = False
opt_graph = "-"
opt_shortest_path = None
opt_from = None
opt_to = None
opt_exclude = None
opt_depth = float("inf")
opt_hide = ""

def error(msg, exit_status=1):
    """print error message and exit"""
    if msg:
        sys.stderr.write("%s: %s\n" % (g_command, msg))
    if exit_status != None:
        sys.exit(exit_status)

def output(msg):
    try:
        sys.stdout.write(msg)
    except:
        error("broken pipe")

def read_graph(s):
    """read go mod graph output from a string"""
    deps = {}  # {importer: set(importee, ...)}
    for line in s.splitlines():
        if not line:
            continue
        if not " " in line:
            continue
        importer, importee = line.split(" ", 1)
        if not importer in deps:
            deps[importer] = set()
        deps[importer].add(importee)
    return deps

g_lineno = 0

def dump_tree(graph, module, depth=0, already_seen={}, max_depth=opt_depth):
    def dump_line(depth, node):
        global g_lineno
        g_lineno += 1
        if "D" not in opt_hide:
            pp_depth = "D%d" % (depth,)
        else:
            pp_depth = ""
        if "L" not in opt_hide:
            pp_lineno = "L%d" % (g_lineno,)
        else:
            pp_lineno = ""
        if "D" in opt_hide and "L" in opt_hide:
            pp_lineprefix = ""
        else:
            pp_lineprefix = "%-8s" % ((pp_lineno + pp_depth),)
        if "I" in opt_hide:
            pp_indent = ""
        else:
            pp_indent = opt_indent * depth
        pp_ref = ""
        if node in already_seen and "R" not in opt_hide:
            pp_ref = " (see L%(line)sD%(depth)s...)" % already_seen[node]
        output((opt_fmt % {
            'prefix': pp_lineprefix,
            'indent': pp_indent,
            'node': node,
            'ref': pp_ref}))
    if depth > max_depth:
        return
    dump_line(depth, module)
    if module in already_seen:
        return
    already_seen[module] = {"line": g_lineno, "depth": depth}
    for child in sorted(graph.get(module, set())):
        dump_tree(graph, child, depth+1, already_seen, max_depth=max_depth)

def graph_clear(graph):
    """return graph without node keys that have no outgoing edges"""
    new_graph = {}
    for node in graph:
        if graph[node]:
            new_graph[node] = set(graph[node])
    return new_graph

def graph_exclude(graph, exclude_nodes):
    """return graph without nodes in the exclude_nodes set"""
    new_graph = {}
    for node in graph:
        if node not in exclude_nodes:
            new_graph[node] = graph[node] - exclude_nodes
    return graph_clear(new_graph)

def graph_reverse(graph):
    """return reversed graph"""
    new_graph = {}
    for from_node, to_nodes in graph.items():
        for to_node in to_nodes:
            if not to_node in new_graph:
                new_graph[to_node] = set()
            new_graph[to_node].add(from_node)
    return new_graph

def graph_reachable_part(graph, from_nodes):
    """return the part of the graph that is reachable from a set of nodes"""
    new_graph = {}
    stack = list(set(graph.keys()).intersection(from_nodes))
    while stack:
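        # Iterative DFS: pop a node, copy its outgoing edges into the
        # new graph, and queue its children for visiting.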
        node = stack.pop()
        if node in new_graph:
            continue
        new_graph[node] = set()
        for child in graph.get(node, set()):
            new_graph[node].add(child)
            stack.append(child)
    return graph_clear(new_graph)

def graph_from_to(graph, from_nodes, to_nodes):
    """return graph between from_nodes and to_nodes"""
    new_graph = graph
    new_graph = graph_reverse(new_graph)
    new_graph = graph_reachable_part(new_graph, to_nodes)
    new_graph = graph_reverse(new_graph)
    new_graph = graph_reachable_part(new_graph, from_nodes)
    return new_graph

def shortest_path(graph, from_node, to_node):
    """return new graph that contains only a shortest path between nodes"""
    shortest_path = None
    bfs_queue = [(child, [from_node]) for child in sorted(graph.get(from_node, set()))]
    seen = set(from_node)
    while bfs_queue:
        node, history = bfs_queue.pop(0)
        seen.add(node)
        if node == to_node:
            shortest_path = history + [node]
            break
        for child in sorted(graph.get(node, set())):
            if child in seen:
                continue
            bfs_queue.append((child, history + [node]))
    return shortest_path

def graph_add_path(graph, path):
    """add a path to current graph"""
    for n, node in enumerate(path):
        if not node in graph:
            graph[node] = set()
        if n > 0:
            graph[path[n-1]].add(node)
    return graph

def matching_nodes(graph, node_regexp):
    matching = set()
    nodes = set.union(set(graph.keys()), set.union(*graph.values()))
    for node in nodes:
        if re.match(node_regexp, node):
            matching.add(node)
    return sorted(matching)

def root_nodes(graph):
    dest_nodes = set.union(*graph.values())
    src_nodes = set(graph.keys())
    roots = src_nodes - dest_nodes
    return sorted(roots)

if __name__ == "__main__":
    try:
        opts, remainder = getopt.gnu_getopt(
            sys.argv[1:], 'd:f:g:hrs:t:x:H:I:',
            ['depth=', 'exclude=', 'from=', 'graph=', 'help',
             'reverse', 'shortest-path=', 'to='])
    except getopt.GetoptError as e:
        error(str(e))
    for opt, arg in opts:
        if opt in ["-h", "--help"]:
            print(__doc__)
            error(None, exit_status=0)
        elif opt in ["-d", "--depth"]:
            try:
                opt_depth = int(arg)
                if opt_depth <= 0:
                    raise Exception("depth <= 0")
            except:
                error('invalid --depth=%r, positive integer expected' % (arg,))
        elif opt in ["-f", "--from"]:
            opt_from = arg
        elif opt in ["-g", "--graph"]:
            opt_graph = arg
        elif opt in ["-r", "--reverse"]:
            opt_reverse = True
        elif opt in ["-s", "--shortest-path"]:
            opt_shortest_path = arg
        elif opt in ["-t", "--to"]:
            opt_to = arg
        elif opt in ["-H"]:
            opt_hide = arg
        elif opt in ["-I"]:
            opt_indent = arg
        elif opt in ["-x", "--exclude"]:
            opt_exclude = arg
        else:
            error('internal error: option "%s" not handled' % (opt,))
    if len(remainder) > 0:
        error('too many parameters')
    if opt_graph == "-":
        graph_string = sys.stdin.read()
    else:
        try:
            graph_string = open(opt_graph).read()
        except Exception as err:
            error('failed to read graph from file "%s": %s' % (opt_graph, err))
    graph = read_graph(graph_string)
    if opt_exclude:
        exclude_modules = matching_nodes(graph, opt_exclude)
        if not exclude_modules:
            error('no modules matching regular expression --exclude %r' % (opt_exclude,))
        graph = graph_exclude(graph, set(exclude_modules))
    if opt_reverse:
        graph = graph_reverse(graph)
    if opt_from:
        from_modules = matching_nodes(graph, opt_from)
        if not from_modules:
            error('no modules matching regular expression --from %r' % (opt_from,))
    else:
        from_modules = root_nodes(graph)
    if opt_to:
        to_modules = matching_nodes(graph, opt_to)
        if not to_modules:
            error('no modules matching regular expression --to %r' % (opt_to,))
        graph = graph_from_to(graph, set(from_modules), set(to_modules))
        from_modules = set(from_modules).intersection(
            set.union(set(graph.keys()), set.union(*graph.values())))
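    # With --shortest-path, replace the graph with one that contains only
    # a BFS-shortest path from each from-module to each matching module.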
    if opt_shortest_path:
        new_graph = {}
        to_modules = matching_nodes(graph, opt_shortest_path)
        if not to_modules:
            error('no modules matching regular expression --shortest-path %r' % (opt_shortest_path,))
        for from_node in from_modules:
            for to_node in to_modules:
                path = shortest_path(graph, from_node, to_node)
                if path:
                    graph_add_path(new_graph, path)
        graph = new_graph
    for from_node in from_modules:
        dump_tree(graph, from_node, max_depth=opt_depth)


================================================
FILE: scripts/hack/install-protobuf
================================================
#!/usr/bin/env bash

# Copyright The containerd Authors.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# Downloads and installs protobuf
#
set -eu -o pipefail

PROTOBUF_VERSION=3.20.1
GOARCH=$(go env GOARCH)
GOOS=$(go env GOOS)
PROTOBUF_DIR=$(mktemp -d)

case $GOARCH in
arm64)
    wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protoc-$PROTOBUF_VERSION-linux-aarch_64.zip"
    unzip "$PROTOBUF_DIR/protobuf" -d /usr/local
    ;;
amd64|386)
    if [ "$GOOS" = windows ]; then
        wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protoc-$PROTOBUF_VERSION-win32.zip"
    elif [ "$GOOS" = linux ]; then
        wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protoc-$PROTOBUF_VERSION-linux-x86_64.zip"
    fi
    unzip "$PROTOBUF_DIR/protobuf" -d /usr/local
    ;;
ppc64le)
    wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protoc-$PROTOBUF_VERSION-linux-ppcle_64.zip"
    unzip "$PROTOBUF_DIR/protobuf" -d /usr/local
    ;;
*)
    wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protobuf-cpp-$PROTOBUF_VERSION.zip"
    unzip "$PROTOBUF_DIR/protobuf" -d /usr/src/protobuf
    cd "/usr/src/protobuf/protobuf-$PROTOBUF_VERSION"
    ./autogen.sh
    ./configure --disable-shared
    make
    make check
    make install
    ldconfig
    ;;
esac
rm -rf "$PROTOBUF_DIR"

# Download status.proto. grpc repos' one seems copied from
# https://github.com/googleapis/googleapis/blob/master/google/rpc/status.proto,
# but we use grpc's since the repos has tags/releases.
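# (The protoc release zips above unpack their include/ tree into
# /usr/local/include; status.proto is dropped next to those well-known
# types so protoc can resolve the "google/rpc/status.proto" import.)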
mkdir -p /usr/local/include/google/rpc
curl \
    -L https://raw.githubusercontent.com/grpc/grpc/v1.45.2/src/proto/grpc/status/status.proto \
    -o /usr/local/include/google/rpc/status.proto


================================================
FILE: scripts/testing/crictl
================================================
#!/bin/sh

RELAY_SOCKET=unix:///var/run/cri-relay.sock

if [ -z "$CRICTL" ]; then
    CRICTL=crictl
fi

sudo $CRICTL -i $RELAY_SOCKET -r $RELAY_SOCKET "$@"


================================================
FILE: scripts/testing/jaeger
================================================
#!/bin/sh

ENVVARS="-e COLLECTOR_ZIPKIN_HTTP_PORT=9411"
PORTS="-p 5775:5775/udp \
       -p 6831:6831/udp \
       -p 6832:6832/udp \
       -p 5778:5778 \
       -p 16686:16686 \
       -p 14268:14268 \
       -p 9411:9411"

if [ "$1" = "--permanent" ]; then
    storage=/tmp/jaeger-trace
    data=$storage/data
    key=$storage/key
    echo "Using $data and $key to store (badger) traces..."
    mkdir -p $storage
    STORAGE="-e SPAN_STORAGE_TYPE=badger \
             -e BADGER_EPHEMERAL=false \
             -e BADGER_DIRECTORY_VALUE=$data \
             -e BADGER_DIRECTORY_KEY=$key \
             -v $storage:$storage"
fi

cmd="docker run $ENVVARS $PORTS $STORAGE jaegertracing/all-in-one:latest"

echo "Running command $cmd..."
$cmd


================================================
FILE: scripts/testing/kube-cgroups
================================================
#!/bin/bash

usage() {
    cat << EOF
Usage: kube-cgroups [options]

Options:
  -h              print this help
  -g CGROUP_DIR   cgroup controller directory, the default is /sys/fs/cgroup
  -E              include empty cgroup files in the output
  -F              print full filenames
  -n NS_REGEXP    print only namespaces matching NS_REGEXP, the default is "default"
  -p POD_REGEXP   print only pods matching POD_REGEXP, the default is "." (any pod)
  -c CNTR_REGEXP  print only containers matching CNTR_REGEXP, the default is "."
  -f FILE_REGEXP  print only cgroup files matching FILE_REGEXP, the default is
                  "cpuset.cpus|cpuset.mems|blkio.throttle.*_device"
EOF
}

error() {
    echo "error: $1" >&2
    exit 1
}

full_filename=0
empty_files=0
ns_regexp="default"  # regexp matching namespaces
pod_regexp="."       # regexp matching any pod name
cntr_regexp="."      # regexp matching any container line
cgfile_regexp="cpuset.cpus|cpuset.mems|blkio.throttle.*_device" # regexp matching any cgroup file
cg_controller_dir=/sys/fs/cgroup

while getopts "hg:EFn:p:c:f:" OPTION; do
    case $OPTION in
        h)
            usage
            exit 0
            ;;
        g)
            cg_controller_dir="$OPTARG"
            ;;
        E)
            empty_files=1
            ;;
        F)
            full_filename=1
            ;;
        n)
            ns_regexp="$OPTARG"
            ;;
        p)
            pod_regexp="$OPTARG"
            ;;
        c)
            cntr_regexp="$OPTARG"
            ;;
        f)
            cgfile_regexp="$OPTARG"
            ;;
        *)
            error "invalid option $OPTION"
            ;;
    esac
done

if [ ! -d "$cg_controller_dir" ]; then
    error "cgroup directory '$cg_controller_dir' does not exist"
fi

kubectl get pods -A | grep -E "$pod_regexp" | while read -r namespace podname rest; do
    [ "$namespace" == "NAMESPACE" ] && continue
    grep -q -E "$ns_regexp" <<< "$namespace" || continue
    kubectl describe pod -n "$namespace" "$podname" | grep -B1 'Container ID:' | while read -r container _ containerid; do
        if [[ "$container" != "Container" ]] && [[ "$container" != "--" ]]; then
            containername="${container%%:*}"
            continue
        fi
        containerID=${containerid#*://}
        if [[ -z "$containerID" ]]; then
            continue
        fi
        grep -q -E "$cntr_regexp" <<< "$containername" || continue
        while read -r cgroupdir; do
            if [[ "$cgroupdir" == *crio-conmon* ]]; then
                continue
            fi
            for filename in "$cgroupdir"/*; do
                if [[ ! -f "$filename" ]]; then
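                    # Skip anything that is not a regular file.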
                    continue
                fi
                filename_nodir="${filename##*/}"
                grep -q -E "$cgfile_regexp" <<< "$filename_nodir" || continue
                if [[ -n "$podname" ]]; then
                    echo "$namespace/$podname:"
                    unset podname
                fi
                [[ -n "$containername" ]] && {
                    echo " $containername:"
                    unset containername
                }
                linecount="$(wc -l < "$filename")"
                if [[ "$linecount" == "0" ]] && [[ "$empty_files" == "0" ]]; then
                    continue
                fi
                if [[ "$full_filename" == "1" ]]; then
                    print_filename="$filename"
                else
                    print_filename="$filename_nodir"
                fi
                if (( "$linecount" <= 1 )); then
                    # print contents of a single-line file after filename
                    echo " $print_filename: $(< "$filename")"
                else
                    # print contents of a multiline file indented
                    echo " $print_filename:"
                    sed "s/^/ /g" < "$filename"
                fi
            done
        done <<< "$(find "$cg_controller_dir" -name "*${containerID}*")"
    done
done


================================================
FILE: scripts/testing/pairwise
================================================
#!/usr/bin/env python3

"""pairwise - print var-value combinations that cover all value pairs

Usage: pairwise VAR=VALUE [VAR=VALUE...]

Example:
  $ pairwise \\
        distro={debian-sid,opensuse,fedora} \\
        k8scni={cilium,weavenet,flannel} \\
        k8scri={crio,containerd} \\
        k8s={1.22.0,1.23.0}
"""

import sys

def error(msg, exit_status=1):
    sys.stderr.write('pairwise: %s\n' % (msg,))
    if exit_status is not None:
        sys.exit(exit_status)

def output(msg):
    sys.stdout.write(msg)

# This program prints an optimized set of value combinations
# that covers all value pairs.

def all_combinations(var_values):
    combinations = [{}]
    for var in var_values:
        new_combinations = []
        for d in combinations:
            for value in var_values[var]:
                new_comb = dict(d)
                new_comb[var] = value
                new_combinations.append(new_comb)
        combinations = new_combinations
    return combinations

def combination_to_triplets(d):
    triplets = set()
    keys = sorted(d.keys())
    for key1_index, key1 in enumerate(keys):
        val1 = d[key1]
        for key2_index, key2 in enumerate(keys[key1_index+1:]):
            val2 = d[key2]
            for key3 in keys[key1_index + key2_index + 2:]:
                val3 = d[key3]
                triplets.add(frozenset(((key1, val1), (key2, val2), (key3, val3))))
    return triplets

def combination_to_pairs(d):
    pairs = set()
    keys = sorted(d.keys())
    for key1_index, key1 in enumerate(keys):
        val1 = d[key1]
        for key2 in keys[key1_index+1:]:
            val2 = d[key2]
            pairs.add(frozenset(((key1, val1), (key2, val2))))
    return pairs

def combination_to_singles(d):
    singles = set()
    for key1 in d.keys():
        val1 = d[key1]
        singles.add(frozenset((key1, val1)))
    return singles

def cover_pairwise(var_values):
    chosen_combinations = []
    covered_pairs = set()
    combination_pairs = {}
    all_triplets = set()
    all_pairs = set()
    all_singles = set()
    combinations = all_combinations(var_values)
    for c in combinations:
        all_triplets = all_triplets.union(combination_to_triplets(c))
        all_pairs = all_pairs.union(combination_to_pairs(c))
        all_singles = all_singles.union(combination_to_singles(c))
    uncovered_triplets = set(all_triplets)
    number_of_triplets = len(uncovered_triplets)
    uncovered_pairs = set(all_pairs)
    uncovered_singles = set(all_singles)
    while uncovered_pairs:
        combination_score = []
        for c in combinations:
            covers_triplets = combination_to_triplets(c)
            covers_pairs = combination_to_pairs(c)
            covers_singles = combination_to_singles(c)
            combination_score.append(
                (len(uncovered_pairs.intersection(covers_pairs)) +
                 len(uncovered_singles.intersection(covers_singles)) +
                 len(uncovered_triplets.intersection(covers_triplets)) / number_of_triplets,
                 c, covers_pairs, covers_singles, covers_triplets))
        best_score, best_comb, best_pairs, best_singles, best_triplets = \
            sorted(combination_score, key=lambda comb_score: comb_score[0])[-1]
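        # Greedy step: keep the highest-scoring combination and remove
        # everything it covers from the uncovered sets.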
        chosen_combinations.append(best_comb)
        uncovered_triplets = uncovered_triplets - best_triplets
        uncovered_pairs = uncovered_pairs - best_pairs
        uncovered_singles = uncovered_singles - best_singles
    return chosen_combinations

if __name__ == "__main__":
    if len(sys.argv) < 2 or "-h" in sys.argv or "--help" in sys.argv:
        output(__doc__)
        error('missing VAR=VALUE...', exit_status=0)
    # construct var_values from command line arguments
    var_values = {}  # {var: list-of-values}
    for var_value in sys.argv[1:]:
        try:
            var, value = var_value.split("=", 1)
        except:
            error('bad argument %r, VAR=VALUE expected' % (var_value,))
        if var not in var_values:
            var_values[var] = []
        var_values[var].append(value)
    for comb in cover_pairwise(var_values):
        var_value_row = []
        for var in sorted(comb.keys()):
            var_value_row.append('%s="%s"' % (var, comb[var]))
        output(" ".join(var_value_row) + "\n")


================================================
FILE: scripts/testing/prometheus
================================================
#!/bin/sh

dir=$(dirname "$0")
cfg=$dir/prometheus.yaml
cmd="docker run -p 9090:9090 \
     -v $cfg:/etc/prometheus/prometheus.yml \
     prom/prometheus --config.file=/etc/prometheus/prometheus.yml $*"

echo "Running command $cmd..."
$cmd


================================================
FILE: scripts/testing/prometheus.yaml
================================================
global:
  scrape_interval: 10s
  external_labels:
    monitor: 'CRI-RM'

scrape_configs:
  - job_name: 'CRI-RM'
    scrape_interval: 10s
    static_configs:
      - targets: ['10.0.0.2:8888']


================================================
FILE: scripts/testing/set-path
================================================
#!/bin/sh
# set -x

dirpart=packages/src/github.com/intel/cri-interceptor

case $(pwd) in
    */$dirpart*)
        ;;
    *)
        echo "Don't know how: I don't see $dirpart in $(pwd)..."
        return 1
        ;;
esac

dir=$(pwd)
kubedir=${dir%%/github.com*}/k8s.io/kubernetes
kubebin=$kubedir/_output/local/bin/linux/amd64

if [ ! -d "$kubebin" ]; then
    echo "*** You don't seem to have a $kubebin directory."
    return 1
fi

if [ ! -x "$kubebin"/kubelet ]; then
    echo "*** You don't seem to have kubelet in $kubebin (done a make WHAT=cmd/kubelet ?)"
    ls -ls "$kubebin"
    return 1
fi

export PATH="$kubebin:$PATH"


================================================
FILE: test/critest/run.sh
================================================
#!/bin/bash

TEST_TITLE="CRI validation tests with critest"

PV='pv -qL'

SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
DEMO_LIB_DIR=$(realpath "$SCRIPT_DIR/../../demo/lib")
BIN_DIR=$(realpath "$SCRIPT_DIR/../../bin")
OUTPUT_DIR=${outdir-$SCRIPT_DIR/output}
COMMAND_OUTPUT_DIR=$OUTPUT_DIR/commands

# shellcheck disable=SC1091
# shellcheck source=../../demo/lib/command.bash
source "$DEMO_LIB_DIR"/command.bash
# shellcheck disable=SC1091
# shellcheck source=../../demo/lib/host.bash
source "$DEMO_LIB_DIR"/host.bash
# shellcheck disable=SC1091
# shellcheck source=../../demo/lib/vm.bash
source "$DEMO_LIB_DIR"/vm.bash

usage() {
    echo "$TEST_TITLE"
    echo "Usage: [VAR=VALUE] ./run.sh MODE"
    echo "  MODE:  \"play\" plays the test as a demo."
    echo "         \"test\" runs fast, reports pass or fail."
    echo "  VARs:"
    echo "    tests: space-separated list of cri-resmgr configurations."
    echo "           The default is all *.cfg files in $SCRIPT_DIR."
    echo "    vm: govm virtual machine name."
    echo "        The default is \"crirm-test-critest\"."
    echo "    speed: Demo play speed."
    echo "           The default is 10 (keypresses per second)."
    echo "    cleanup: Level of cleanup after a test run:"
    echo "             0: leave VM running (the default)"
    echo "             1: delete VM"
    echo "             2: stop VM, but do not delete it."
    echo "    outdir: Save output under given directory."
    echo "            The default is \"${SCRIPT_DIR}/output\"."
}

error() {
    (echo ""; echo "error: $1" ) >&2
    exit 1
}

out() {
    if [ -n "$PV" ]; then
        speed=${speed-10}
        echo "$1" | $PV "$speed"
    else
        echo "$1"
    fi
    echo ""
}

screen-create-vm() {
    speed=60 out "### Running the test in VM \"$vm\"."
    host-create-vm "$vm" "$topology"
    if [ -z "$VM_IP" ]; then
        error "creating VM failed"
    fi
    vm-networking
}

screen-install-containerd() {
    speed=60 out "### Installing Containerd to the VM."
    vm-install-cri
    vm-install-containernetworking
}

screen-copy-cri-resmgr() {
    prefix=/usr/local
    host-command "scp \"$BIN_DIR/cri-resmgr\" \"$SCRIPT_DIR/tsl\" $VM_SSH_USER@$VM_IP:" || {
        command-error "copying cri-resmgr failed"
    }
    vm-command "mv cri-resmgr tsl $prefix/bin/" || {
        command-error "installing cri-resmgr to $prefix/bin failed"
    }
    PV="" vm-command "command -v cri-resmgr" >/dev/null
    ( echo "$COMMAND_OUTPUT" | grep -q $prefix/bin/cri-resmgr ) || {
        command-error "\"cri-resmgr\" does not execute $prefix/bin/cri-resmgr on VM"
    }
}

screen-install-critest() {
    speed=60 out "### Installing critest to VM."
    vm-command "apt update && apt install -y golang make socat"
    vm-command "go get -d github.com/kubernetes-sigs/cri-tools"
    CRI_TOOLS_SOURCE_DIR=$(awk '/package.*cri-tools/{print $NF}' <<< "$COMMAND_OUTPUT")
    [ -n "$CRI_TOOLS_SOURCE_DIR" ] || {
        command-error "downloading cri-tools failed"
    }
    vm-command "pushd \"$CRI_TOOLS_SOURCE_DIR\" && make && make install && popd" || {
        command-error "building and installing cri-tools failed"
    }
}

screen-critest-crirm-config() {
    config_file=$1
    cri_endpoint=/var/run/containerd/containerd.sock
    cri_resmgr_endpoint=/var/run/cri-resmgr/cri-resmgr.sock
    host-command "scp $config_file $VM_SSH_USER@$VM_IP:"
    vm-command "rm -rf *.tsl; killall cri-resmgr; systemctl stop containerd; sleep 1; systemctl start containerd; sleep 1; rm -rf /var/lib/cri-resmgr"
    vm-command "cri-resmgr -force-config $config_file -runtime-socket $cri_endpoint -relay-socket $cri_resmgr_endpoint 2>&1 | tsl -uU -F \"%(ts)s cri-resmgr: %(line)s\" -o cri-resmgr.output.tsl" bg
    sleep 5
    vm-command "critest -runtime-endpoint unix://$cri_resmgr_endpoint 2>&1 | tsl -uU -F \"%(ts)s critest: %(line)s\" -o critest.output.tsl"
    vm-command "killall cri-resmgr"
    vm-command-q "cat *.tsl | sort -n | awk '{if (t_start==0) t_start=\$1; \$1=sprintf(\"%.6fs\", \$1-t_start); print;}'" > "$OUTPUT_DIR/test-$config_file.log"
}

screen-critest-containerd() {
    cri_endpoint=/var/run/containerd/containerd.sock
    vm-command "rm -rf *.tsl; critest -runtime-endpoint unix://$cri_endpoint 2>&1 | tsl -uU -F \"%(ts)s critest: %(line)s\" -o critest.output.tsl"
    vm-command-q "cat *.tsl | sort -n | awk '{if (t_start==0) t_start=\$1; \$1=sprintf(\"%.6fs\", \$1-t_start); print;}'" > "$OUTPUT_DIR/test-containerd.log"
}

require_cmd() {
    cmd=$1
    if ! command -v "$cmd" >/dev/null ; then
        error "required command missing \"${cmd}\", make sure it is in PATH"
    fi
}

# Validate parameters
mode=$1

topology=${topology:='[{"cores": 2, "mem": "8G"}]'}
distro=${distro:="ubuntu-20.04"}
cri=${cri:="containerd"}
vm=${vm:="critest-$distro-$cri"}
cleanup=${cleanup-0}

host-set-vm-config "$vm" "$distro" "$cri"

cd "${SCRIPT_DIR}" || error "failed to cd to \"${SCRIPT_DIR}\""

tests=${tests-*.cfg}

if [ "$mode" == "test" ]; then
    PV=
elif [ "$mode" == "play" ] ; then
    speed=${speed-10}
else
    usage
    error "invalid MODE"
fi

# Prepare for test/demo
mkdir -p "$OUTPUT_DIR"
mkdir -p "$COMMAND_OUTPUT_DIR"
rm -f "$COMMAND_OUTPUT_DIR"/0*
( echo x > "$OUTPUT_DIR/x" && rm -f "$OUTPUT_DIR/x" ) || {
    error "output directory outdir=$OUTPUT_DIR is not writable"
}

if [ -z "$VM_IP" ] || [ -z "$VM_SSH_USER" ] || [ -z "$VM_NAME" ]; then
    screen-create-vm
fi

# always copy new version of the binary to VM
screen-copy-cri-resmgr

if ! vm-command-q "dpkg -l | grep -q containerd"; then
    screen-install-containerd
fi

if ! vm-command-q "command -v critest | grep -q critest"; then
    screen-install-critest
fi

# Run test/demo
# 1. Run critest on cri-resmgr with each config file.
for config_file in $tests; do
    screen-critest-crirm-config "$config_file"
done
# 2. Run critest without cri-resmgr for reference.
screen-critest-containerd

# Cleanup
if [ "$cleanup" == "0" ]; then
    echo "The VM with critest, cri-resmgr and containerd is left running. Next steps:"
    vm-print-usage
elif [ "$cleanup" == "1" ]; then
    host-stop-vm $vm
    host-delete-vm $vm
elif [ "$cleanup" == "2" ]; then
    host-stop-vm $vm
fi

# Summarize results
SUMMARY_FILE="$OUTPUT_DIR/summary.txt"
echo -n "" > "$SUMMARY_FILE" || error "cannot write summary to \"$SUMMARY_FILE\""

for testlog in "$OUTPUT_DIR"/test-*.log; do
    {
        echo -n "$(basename "$testlog") "
        awk 'BEGIN{s=0;e=0}/critest: /{if(s==0)s=$1;e=$1}END{printf "(runtime %.2f s): ",e-s}' < "$testlog"
        # remove ansi colors from critest output in the summary
        grep Pending "$testlog" | grep critest: | tail -n 1 | sed -r -e "s/[[:cntrl:]]\[[0-9]+m//g" -e "s/^.* -- //g"
    } >> "$SUMMARY_FILE"
done

exit_status=0

# Declare verdict in test mode
if [ "$mode" == "test" ]; then
    echo "" >> "$SUMMARY_FILE"
    # Test is passed if all critest executions had the same passrate,
    # no matter which cri-resmgr configuration was used.
    if [ "$(awk -F: '/Passed/{print $2}' < "$SUMMARY_FILE" | sort -u | wc -l)" == "1" ]; then
        echo "All critest results are the same." >> "$SUMMARY_FILE"
        echo "Test verdict: PASS" >> "$SUMMARY_FILE"
    else
        echo "Error: critest results are not the same in all configurations." >> "$SUMMARY_FILE"
        echo "Test verdict: FAIL" >> "$SUMMARY_FILE"
        exit_status=1
    fi
fi

echo ""
echo "Summary:"
cat "$SUMMARY_FILE"

exit "$exit_status"


================================================
FILE: test/critest/topology-aware-policy.cfg
================================================
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
logger:
  Debug: cri-resmgr,resource-manager,cache,dump,instrumentation,policy
dump:
  Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*
instrumentation:
  Sampling: disabled


================================================
FILE: test/critest/tsl
================================================
#!/usr/bin/python3
#
# Copyright 2020 Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""tsl - timestamp lines

Usage: tsl [options]

Options:
  -h, --help     print help.
  -f TIMEFORMAT  use TIMEFORMAT as output timeformat (man strftime).
                 The default format is "%s.%f".
  -F LINEFORMAT  use LINEFORMAT as line output format:
                 - %(ts)s: timestamp
                 - %(line)s: original line
                 The default is "%(ts)s %(line)s".
  -o OUTFILE     write output lines to OUTFILE. Supports many -o's.
                 Special outfiles:
                 - stdout: standard output
                 - stderr: standard error
  -u             unbuffered input: more accurate timestamps, slower throughput.
  -U             unbuffered output: flush after every line, slower throughput.

Examples:
  cmd1 | tsl -u -F "%(ts)s cmd1: %(line)s" > cmd1.tsl &
  cmd2 | tsl -u -F "%(ts)s cmd2: %(line)s" > cmd2.tsl &
  wait
  cat cmd1.tsl cmd2.tsl | sort -n > cmd1_cmd2.output
"""

import getopt
import sys
import datetime

def unbuffered_xreadlines(fileobj):
    """like fileobj.xreadlines() but unbuffered"""
    ln = []
    while True:
        c = fileobj.read(1)
        if not c:
            if ln:
                yield "".join(ln)
            break
        ln.append(c)
        if c == "\n":
            yield "".join(ln)
            ln = []

if __name__ == "__main__":
    opt_timeformat = "%s.%f"  # "%Y-%m-%d %H:%M:%S"
    opt_lineformat = "%(ts)s %(line)s"
    opt_unbuffered_in = False
    opt_unbuffered_out = False
    opt_outfiles = []

    opts, remainder = getopt.gnu_getopt(
        sys.argv[1:], 'hf:F:o:uU', ['help', 'format='])
    for opt, arg in opts:
        if opt in ["-h", "--help"]:
            print(__doc__)
            sys.exit(0)
        elif opt in ["-f", "--format"]:
            opt_timeformat = arg
        elif opt in ["-F"]:
            opt_lineformat = arg
        elif opt in ["-o"]:
            if arg == "stdout":
                opt_outfiles.append(sys.stdout)
            elif arg == "stderr":
                opt_outfiles.append(sys.stderr)
            else:
                opt_outfiles.append(open(arg, "w"))
        elif opt in ["-u"]:
            opt_unbuffered_in = True
        elif opt in ["-U"]:
            opt_unbuffered_out = True

    if not opt_outfiles:
        opt_outfiles.append(sys.stdout)

    if opt_unbuffered_in:
        line_iter = unbuffered_xreadlines(sys.stdin)
    else:
        line_iter = sys.stdin

    for line in line_iter:
        ts = datetime.datetime.now().strftime(opt_timeformat)
        out_line = opt_lineformat % {'ts': ts, 'line': line}
        for outfile in opt_outfiles:
            outfile.write(out_line)
            if opt_unbuffered_out:
                outfile.flush()


================================================
FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/cri-resmgr.cfg
================================================
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
logger:
  Debug: cri-resmgr,resource-manager,cache,policy


================================================
FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/memtier-benchmark-02.yaml.in
================================================
apiVersion: batch/v1
kind: Job
metadata:
  name: memtier-benchmark
spec:
  template:
    metadata:
      annotations:
        cri-resource-manager.intel.com/${AFFINITY}: |+
          memtier-benchmark:
          - scope:
              key: pod/name
              operator: Matches
              values:
              - redis-*
            match:
              key: name
              operator: Equals
              values:
              - redis
            weight: 10
    spec:
      containers:
$(for contnum in $(seq 1 ${CONTCOUNT}); do echo "
      - name: ${NAME}c$(( contnum - 1 ))
        image: redislabs/memtier_benchmark:edge
        imagePullPolicy: IfNotPresent
        args: ['${ARGS// /\', \'}']
        resources:
          requests:
            cpu: ${CPU}
            memory: '${MEM}'
          limits:
            cpu: ${CPULIM}
            memory: '${MEMLIM}'
"; done )
      restartPolicy: Never
================================================
FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/memtier-benchmark.yaml.in
================================================
apiVersion: batch/v1
kind: Job
metadata:
  name: memtier-benchmark
spec:
  template:
    metadata:
      annotations:
        cri-resource-manager.intel.com/${AFFINITY}: |+
          memtier-benchmark:
          - scope:
              key: pod/name
              operator: Matches
              values:
              - redis-*
            match:
              key: name
              operator: Equals
              values:
              - redis
            weight: 10
    spec:
      containers:
      - name: memtier-benchmark
        image: redislabs/memtier_benchmark:edge
        imagePullPolicy: IfNotPresent
        args: ['${ARGS// /\', \'}']
$(if [ "$CPU" != "0" ]; then echo "
        resources:
          requests:
            cpu: ${CPU}
            memory: '${MEM}'
          limits:
            cpu: ${CPULIM}
            memory: '${MEMLIM}'
"; fi)
      restartPolicy: Never


================================================
FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/n4c16/test01-memtier-stress-ng/code.var.sh
================================================
# Redis parameters
REDIS_PASS=abc123xyz

# Background load parameters
STRESS_NG_CPUS=16 # workers per container
STRESS_NG_CONTS=8 # number of containers per pod
STRESS_NG_PODS=2  # number of pods

# BG_* are background loads
# CPU turbo licence level 2 (causes big drop on GHz) cannot be reached with stress-ng, but could be implemented with
# 1. ["avx-turbo", "--test=avx512_vlzcnt_t", "--min-threads=1", "--max-threads=1", "--iters=0"]
# 2. ["avx-turbo", "--test=avx512_vlzcnt_t", "--min-threads=1", "--max-threads=1", "--iters=0"]
# License level observed with:
# sudo perf stat --pid $(pidof avx-turbo) -e core_power.lvl0_turbo_license,core_power.lvl1_turbo_license,core_power.lvl2_turbo_license -- sleep 1
# In the following: "IPC" == Instructions Per Cycle
BG_NOLOAD=""
# BG_AVX_LL0="stress-ng --ipsec-mb $STRESS_NG_CPUS --ipsec-mb-feature avx512" # AVX, causing CPU turbo license level 0
# BG_AVX_LL1="stress-ng --ipsec-mb $STRESS_NG_CPUS --ipsec-mb-feature avx512" # AVX, causing CPU turbo license level 1
BG_SHM="stress-ng --shm $STRESS_NG_CPUS"       # shared memory, memory bound (not causing 100% CPU load), IPC ~0.01-0.19
BG_MEMCPY="stress-ng --memcpy $STRESS_NG_CPUS" # memory bound; IPC =~ 0.15
BG_STREAM="stress-ng --stream $STRESS_NG_CPUS" # IPC =~ 0.49
BG_CPUJMP="stress-ng --cpu $STRESS_NG_CPUS --cpu-method jmp" # IPC ~3.6
BG_CPUALL="stress-ng --cpu $STRESS_NG_CPUS"    # IPC ~1.8

# BM_* are benchmarks
bm_stress_ng_iters=10000
BM_MEMTIER="memtier-benchmark --server=redis-service --authenticate=$REDIS_PASS" # this is special case
# BM_MEMCPY="stress-ng --memcpy 1 --memcpy-ops $bm_stress_ng_iters" # IPC ~0.15
# BM_STREAM="stress-ng --stream 1 --stream-ops $bm_stress_ng_iters" # IPC ~0.49
# BM_JMP="stress-ng --cpu 1 --cpu-method jmp --cpu-ops $bm_stress_ng_iters" # IPC ~3.6
# BM_FFT="stress-ng --cpu 1 --cpu-method fft --cpu-ops $bm_stress_ng_iters" # IPC ~2.3
# BM_AVX_LL0="stress-ng --ipsec-mb 1 --ipsec-mb-feature avx2 --ipsec-mb-ops $bm_stress_ng_iters"
# BM_AVX_LL2="stress-ng --ipsec-mb 1 --ipsec-mb-feature avx512 --ipsec-mb-ops $bm_stress_ng_iters"

# Clean up
vm-command "kubectl delete jobs --all --now; kubectl delete deployment redis; kubectl delete service redis-service; kubectl delete secret redis; kubectl delete pods --all --now; true"

# Setup Redis
wait="" create redis-secret
CPU=4 MEM=32G CPULIM=8 MEMLIM=64G NAME=redis wait="Available" create redis
NAME=redis-service wait="" create redis-service

for bg_cmd in "${!BG_@}"; do
    # Reset counters in order to keep creating pod0...
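    # ("reset counters" is assumed to be a helper of the e2e test framework
    # that restarts pod/container numbering for every background-load round.)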
reset counters benchmark_output_dir="$OUTPUT_DIR/benchmark/$bg_cmd" mkdir -p "$benchmark_output_dir" # Start background noise if [[ "${!bg_cmd}" == "stress-ng "* ]]; then n="$STRESS_NG_PODS" ARGS="${!bg_cmd#stress-ng }" CONTCOUNT="$STRESS_NG_CONTS" CPU=50m MEM=50M CPULIM=$STRESS_NG_CPUS MEMLIM=1G wait_t=240s create stress-ng # Stabilize ( vm-run-until --timeout 60 "sh -c 'uptime; exit 1'" ) || echo "expected timeout" fi for bm_cmd in "${!BM_@}"; do for CPU in 4; do # Run benchmark if [[ "${!bm_cmd}" == "memtier-benchmark "* ]]; then AFFINITY=affinity CPU="$CPU" MEM="16G" CPULIM="$CPU" MEMLIM="24G" NAME=memtier-benchmark ARGS="${!bm_cmd#memtier-benchmark }" wait="Complete" wait_t="10m" create memtier-benchmark memtier_benchmark_pod="$(kubectl get pods | awk '/memtier-benchmark-/{print $1}')" kubectl logs "$memtier_benchmark_pod" | grep -A7 'ALL STATS' | tee "$benchmark_output_dir/$bm_cmd-affinity-cpu-$CPU.txt" kubectl delete jobs --all --now # AFFINITY=anti-affinity CPU="$CPU" MEM="16G" NAME=memtier-benchmark ARGS="${!bm_cmd#memtier-benchmark }" wait="Complete" wait_t="10m" create memtier-benchmark # memtier_benchmark_pod="$(kubectl get pods | awk '/memtier-benchmark-/{print $1}')" # kubectl logs "$memtier_benchmark_pod" | grep -A7 'ALL STATS' | tee "$benchmark_output_dir/$bm_cmd-antiaffinity-cpu-$CPU.txt" # kubectl delete jobs --all --now elif [[ "${!bm_cmd}" == "stress-ng "* ]]; then CPU="$CPU" MEM="200M" CPULIM="$STRESS_NG_CPUS" MEM="400M" NAME=stress-ng-benchmark ARGS="${!bm_cmd#stress-ng }" wait="Complete" wait_t="10m" create stress-ng-benchmark stress_ng_benchmark_pod="$(kubectl get pods | awk '/stress-ng-benchmark-/{print $1}')" kubectl logs "$stress_ng_benchmark_pod" | tee "$benchmark_output_dir/$bm_cmd-cpu-$CPU.txt" kubectl delete jobs --all --now fi done done # Stop background noise ( kubectl delete pods -l e2erole=bgload --now ) done ================================================ FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/n4c16/test01-memtier-stress-ng/post-process.sh ================================================ #!/bin/bash # Usage: VAR=VALUE post-process.sh output-CRICONFIGNAME1 output-CRICONFIGNAME2... # VARs: # normalize=1.. # normalizes plotted values so that the smallest is 1.00 # normalize=0..1 # normalizes plotted to values between 0.0 and 1.0 # # if normalize="", values are not normalized # maxy=MAXY # maximum value on the Y axis # ytrans=log2 # logarithmic Y axis, the default ytrans is 'identity' # save=PREFIX # create PREFIX.svg and PREFIX.csv. The default is 'plot'. normalize="${normalize:-}" maxy="${maxy:-}" ytrans="${ytrans:-identity}" save="${save:-plot}" ( for out_path in "$@"; do ( benchmark_dir=$out_path/benchmark out_dir="$(basename "$out_path")" cd "$benchmark_dir" || exit for bgload in *; do ( cd "$bgload" || exit for memtier_results in BM_MEMTIER-*; do p50latency=\$6 p99latency=\$7 p999latency=\$8 awk "/Totals/{print \"$out_dir $bgload $memtier_results \"$p50latency\" \"$p99latency\" \"$p999latency}" < "$memtier_results" done ); done ); done ) > total-latencies.txt sed -e 's/output-//g' -e 's/BG_//g' -e 's/BM_MEMTIER-//g' -e 's/-cpu-[0-9]*.txt//g' < total-latencies.txt | awk '{print $1" "$2" "$3" "$4" "$5" "$6}' | grep -v ' antiaffinity' > data.txt cat > plot.R < /sys/block/$blkdev/queue/scheduler" vm-command-q "grep '[[]bfq[]]' /sys/block/$blkdev/queue/scheduler" || { error "failed to switch using bfq on /dev/$blkdev" } fi done if [[ "$k8scri" == *"containerd"* ]]; then # Start importing configurations from /etc/containerd/config.d/*.toml. 
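# The commands below are intended to be idempotent: the first sed
# prepends an imports line only when the file has none, and the second
# appends the config.d glob to an already existing imports list.
# Sketch of the intended transformation (illustrative file content):
#
#   imports = ["/a.toml"]   ->   imports = ["/a.toml", "/etc/containerd/config.d/*.toml"]
#
# Note that the second grep below lacks a file argument, so its sed
# fallback likely runs unconditionally.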
vm-command-q "[ -f /etc/containerd/config.toml ] || echo "" > /etc/containerd/config.toml" vm-command-q "grep '^imports' /etc/containerd/config.toml || sed -i '1iimports = [\"/etc/containerd/config.d/*.toml\"]' /etc/containerd/config.toml" vm-command-q "grep -E '^imports.*/etc/containerd/config.d/' || sed -i 's:^\(imports.*\)\]:\1, \"/etc/containerd/config.d/*.toml\"\]:' /etc/containerd/config.toml" # e2e-specific config: tasks-service plugin loads blockio_config_file. vm-pipe-to-file /etc/containerd/config.d/e2e.toml < nsballoon[3] CPUREQ="2" MEMREQ="100M" CPULIM="2" MEMLIM="100M" namespace="e2e-b" CONTCOUNT=2 create balloons-busybox report allowed verify 'cpus["pod4c0"] == cpus["pod4c1"]' \ 'len(cpus["pod4c0"]) == 4' \ 'disjoint_sets(cpus["pod4c0"], cpus["pod3c0"], cpus["pod2c0"], cpus["pod1c0"])' # pod5: new namespace => nsballoon[5] CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace="e2e-c" CONTCOUNT=2 create balloons-busybox report allowed verify 'cpus["pod5c0"] == cpus["pod5c1"]' \ 'len(cpus["pod5c0"]) == 2' \ 'disjoint_sets(cpus["pod5c0"], cpus["pod4c0"], cpus["pod3c0"], cpus["pod2c0"], cpus["pod1c0"])' # pod6: new namespace, but nsballoon[6] cannot be created because all # CPUs are already allocated to balloons. Cannot honor the preference # of spreading different namespaces to different balloon instances # anymore, should fallback to balanced assignment. CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace="e2e-d" CONTCOUNT=2 create balloons-busybox report allowed verify 'cpus["pod6c0"] == cpus["pod6c1"]' cleanup terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test06-update-configmap/code.var.sh ================================================ # This test verifies that configuration updates via cri-resmgr-agent # are handled properly in the balloons policy. testns=e2e-balloons-test06 cleanup() { vm-command "kubectl delete pods --all --now --wait; \ kubectl delete namespace $testns --now --wait --ignore-not-found || :; \ kubectl delete namespace btype1ns0 --now --wait --ignore-not-found || :" terminate cri-resmgr terminate cri-resmgr-agent vm-command "cri-resmgr -reset-policy; cri-resmgr -reset-config" } apply-configmap() { vm-put-file $(instantiate balloons-configmap.yaml) balloons-configmap.yaml vm-command "cat balloons-configmap.yaml" kubectl apply -f balloons-configmap.yaml } cleanup cri_resmgr_extra_args="-metrics-interval 1s" cri_resmgr_config=fallback launch cri-resmgr launch cri-resmgr-agent kubectl create namespace $testns kubectl create namespace btype1ns0 AVAILABLE_CPU="cpuset:0,4-15" BTYPE2_NAMESPACE0='"*"' BTYPE1_MAXCPUS='0' apply-configmap sleep 3 # pod0 in btype0, annotation CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: btype0" create balloons-busybox # pod1 in btype1, namespace CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" namespace="btype1ns0" create balloons-busybox # pod2 in btype2, wildcard namespace CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" namespace="e2e-balloons-test06" create balloons-busybox vm-command "curl -s $verify_metrics_url" verify-metrics-has-line 'btype0\[0\].*containers=".*pod0:pod0c0' verify-metrics-has-line 'btype1\[0\].*containers=".*pod1:pod1c0' verify-metrics-has-line 'btype2\[0\].*containers=".*pod2:pod2c0' # Remove first two balloon types, change btype2 to match all # namespaces. 
BTYPE0_SKIP=1 BTYPE1_SKIP=1 BTYPE2_NAMESPACE0='"*"' apply-configmap # Note: # pod0 was successfully assigned to and running in balloon of btype0. # Now btype0 was completely removed from the node. # Currently this behavior is undefined. # Possible behaviors: evict pod0, continue assign chain, refuse config... # For now, skip pod0c0 balloon validation: # verify-metrics-has-line '"btype2\[0\]".*pod0:pod0c0' verify-metrics-has-line '"btype2\[0\]".*pod1:pod1c0' verify-metrics-has-line '"btype2\[0\]".*pod2:pod2c0' # Bring back btype0 where pod0 belongs to by annotation. BTYPE1_SKIP=1 BTYPE2_NAMESPACE0='"*"' apply-configmap verify-metrics-has-line '"btype0\[0\]".*pod0:pod0c0' verify-metrics-has-line '"btype2\[0\]".*pod1:pod1c0' verify-metrics-has-line '"btype2\[0\]".*pod2:pod2c0' # Change only CPU classes, no reassigning. verify-metrics-has-line 'btype0\[0\].*pod0:pod0c0.*cpu_class="classA"' verify-metrics-has-line 'btype2\[0\].*pod1:pod1c0.*cpu_class="classC"' verify-metrics-has-line 'btype2\[0\].*pod2:pod2c0.*cpu_class="classC"' BTYPE0_CPUCLASS="classC" BTYPE1_SKIP=1 BTYPE2_CPUCLASS="classB" BTYPE2_NAMESPACE0='"*"' apply-configmap verify-metrics-has-line 'btype0\[0\].*pod0:pod0c0.*cpu_class="classC"' verify-metrics-has-line 'btype2\[0\].*pod1:pod1c0.*cpu_class="classB"' verify-metrics-has-line 'btype2\[0\].*pod2:pod2c0.*cpu_class="classB"' cleanup launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test07-maxballoons/balloons-maxballoons-impossible.cfg ================================================ policy: Active: balloons ReservedResources: CPU: 1 balloons: PinCPU: true PinMemory: true BalloonTypes: - Name: singleton MinCPUs: 2 MaxCPUs: 2 MinBalloons: 1 MaxBalloons: 1 - Name: impossible MinBalloons: 2 MaxBalloons: 1 logger: Debug: policy ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test07-maxballoons/balloons-maxballoons.cfg ================================================ policy: Active: balloons ReservedResources: CPU: 1 balloons: PinCPU: true PinMemory: true BalloonTypes: - Name: singleton MinCPUs: 2 MaxCPUs: 2 MinBalloons: 1 MaxBalloons: 1 - Name: dynamictwo MaxCPUs: 1 MaxBalloons: 2 PreferNewBalloon: true logger: Debug: policy ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test07-maxballoons/code.var.sh ================================================ cleanup() { vm-command "kubectl delete pods --all --now --wait" return 0 } cleanup terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/balloons-maxballoons.cfg launch cri-resmgr # pod0: allocate 1500/2000 mCPUs of the singleton balloon CPUREQ="1500m" CPULIM="1500m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: singleton" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 2' # pod1: allocate the rest 500/2000 mCPUs of the singleton balloon CPUREQ="500m" CPULIM="500m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: singleton" CONTCOUNT=1 create balloons-busybox report allowed verify 'cpus["pod0c0"] == cpus["pod1c0"]' # pod2: try to fit in the already full singleton balloon CPUREQ="100m" CPULIM="100m" ( POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: singleton" CONTCOUNT=1 wait_t=5s create balloons-busybox ) && { error "creating pod2 succeeded but was expected to fail with balloon allocation error" } echo "pod2 creation failed with an error as expected" vm-command "kubectl describe pod pod2" if 
! grep -q 'no suitable balloon instance available' <<< "$COMMAND_OUTPUT"; then
    error "could not find 'no suitable balloon instance available' in pod2 description"
fi
vm-command "kubectl delete pod pod2 --now --wait --ignore-not-found"

# pod2: create dynamically the first dynamictwo balloon
CPUREQ="800m" CPULIM="800m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamictwo" CONTCOUNT=1 create balloons-busybox
report allowed
verify 'len(cpus["pod2c0"]) == 1'

# pod3: create dynamically the second dynamictwo balloon
CPUREQ="600m" CPULIM="600m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamictwo" CONTCOUNT=1 create balloons-busybox
report allowed
verify 'disjoint_sets(cpus["pod2c0"], cpus["pod3c0"])'

# pod4: preferring a new balloon fails, but this fits in the second dynamictwo balloon
CPUREQ="300m" CPULIM="300m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamictwo" CONTCOUNT=1 create balloons-busybox
report allowed
verify 'cpus["pod4c0"] == cpus["pod3c0"]'

# pod5: preferring a new balloon fails, and fitting into the existing dynamictwo balloons fails
CPUREQ="300m" CPULIM="300m"
( POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamictwo" CONTCOUNT=1 wait_t=5s create balloons-busybox ) && {
    error "creating pod5 succeeded but was expected to fail with balloon allocation error"
}
vm-command "kubectl describe pod pod5"
if ! grep -q 'no suitable balloon instance available' <<< "$COMMAND_OUTPUT"; then
    error "could not find 'no suitable balloon instance available' in pod5 description"
fi
vm-command "kubectl delete pod pod5 --now --wait --ignore-not-found"

cleanup

# Try starting cri-resmgr with a configuration where MinBalloons and
# MaxBalloons of the same balloon type contradict.
terminate cri-resmgr
( cri_resmgr_cfg=${TEST_DIR}/balloons-maxballoons-impossible.cfg launch cri-resmgr ) && {
    error "starting cri-resmgr succeeded, but was expected to fail due to impossible static balloons"
}
echo "starting cri-resmgr with impossible static balloons configuration failed as expected"
terminate cri-resmgr
launch cri-resmgr

================================================
FILE: test/e2e/policies.test-suite/balloons/n4c16/test08-numa/balloons-numa.cfg
================================================
policy:
  Active: balloons
  AvailableResources:
    CPU: cpuset:0-15
  # Reserve one of our CPUs (cpu15) for kube-system tasks.
  ReservedResources:
    CPU: 1
balloons:
  PinCPU: true
  PinMemory: true
  BalloonTypes:
    - Name: fit-in-numa
      # All (non-system) containers are assigned to this balloon type
      Namespaces:
        - "*"
      # Prevent a balloon from being inflated larger than a NUMA node
      MinCPUs: 0
      MaxCPUs: 4
      AllocatorPriority: 0
      PreferNewBalloons: false

================================================
FILE: test/e2e/policies.test-suite/balloons/n4c16/test08-numa/code.var.sh
================================================
terminate cri-resmgr
cri_resmgr_cfg=${TEST_DIR}/balloons-numa.cfg launch cri-resmgr

# pod0: besteffort, make sure it still gets at least 1 CPU
CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=1 create balloons-busybox
report allowed
verify 'len(cpus["pod0c0"]) == 1'

# pod1: guaranteed, make sure it gets the CPU it requested.
# The configuration does not prefer creating new balloons,
# so pod0 and pod1 should be placed in the same balloon.
# The sum of their CPU requests is 1, so they should actually
# run on the same CPU.
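# A note on the checks used throughout these scripts: "report allowed"
# records each container's allowed cpuset from the VM, and "verify"
# evaluates its quoted arguments as Python expressions over that data,
# where cpus["pod1c0"] is the set of CPUs container pod1c0 may run on.
# A failing expression fails the test. Hypothetical check (names for
# illustration only):
#
#   verify 'len(cpus["pod1c0"]) == 1' \
#          'cpus["pod0c0"] == cpus["pod1c0"]'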
CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod1c0"]) == 1' \ 'cpus["pod0c0"] == cpus["pod1c0"]' # pod2: guaranteed, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 2' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cpus["pod2c0"]) == 2' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"]' # pod3: guaranteed, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 3' \ 'len(cpus["pod1c0"]) == 3' \ 'len(cpus["pod2c0"]) == 3' \ 'len(cpus["pod3c0"]) == 3' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"]' # pod4: guaranteed, fill up a balloon to the MaxCPU CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 4' \ 'len(cpus["pod1c0"]) == 4' \ 'len(cpus["pod2c0"]) == 4' \ 'len(cpus["pod3c0"]) == 4' \ 'len(cpus["pod4c0"]) == 4' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"]' # pod5: besteffort, no CPU request, should fit into the full balloon CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 4' \ 'len(cpus["pod1c0"]) == 4' \ 'len(cpus["pod2c0"]) == 4' \ 'len(cpus["pod3c0"]) == 4' \ 'len(cpus["pod4c0"]) == 4' \ 'len(cpus["pod5c0"]) == 4' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"] == cpus["pod5c0"]' # pod6: guaranteed, start filling new balloon CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 4' \ 'len(cpus["pod1c0"]) == 4' \ 'len(cpus["pod2c0"]) == 4' \ 'len(cpus["pod3c0"]) == 4' \ 'len(cpus["pod4c0"]) == 4' \ 'len(cpus["pod5c0"]) == 4' \ 'len(cpus["pod6c0"]) == 1' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod6c0"])' # Leave only one guaranteed container to the first balloon. kubectl delete pods pod1 pod2 pod3 --now --wait --ignore-not-found report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod4c0"]) == 1' \ 'len(cpus["pod5c0"]) == 1' \ 'len(cpus["pod6c0"]) == 1' \ 'cpus["pod0c0"] == cpus["pod4c0"] == cpus["pod5c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod6c0"])' # Leave only bestefforts to the first balloon. Make sure they still # have a CPU. 
kubectl delete pods pod4 --now --wait --ignore-not-found report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod5c0"]) == 1' \ 'len(cpus["pod6c0"]) == 1' \ 'cpus["pod0c0"] == cpus["pod5c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod6c0"])' terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test09-isolated/balloons-isolated.cfg ================================================ policy: Active: balloons ReservedResources: CPU: cpuset:0 balloons: BalloonTypes: - Name: isolated-pods MinCPUs: 0 MaxCPUs: 2 CPUClass: turbo MinBalloons: 2 PreferNewBalloons: true PreferSpreadingPods: false - Name: isolated-ctrs MinCPUs: 1 MaxCPUs: 1 CPUClass: turbo MinBalloons: 2 PreferNewBalloons: true PreferSpreadingPods: true instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test09-isolated/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/balloons-isolated.cfg cri_resmgr_extra_args="-metrics-interval 4s" launch cri-resmgr verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[2\]"' # pod0: besteffort CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: isolated-pods" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod0c1"]) == 1' \ 'cpus["pod0c0"] == cpus["pod0c1"]' # Even if the isolated balloon type has PreferNewBalloons=1, adding # this pod0 or pod1 must not create a new balloon because existing # empty balloons should be filled first. 
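# verify-metrics-has-line / verify-metrics-has-no-line (defined in
# balloons/verify.source.sh) poll the Prometheus endpoint with an
# extended regex until it matches (or is absent) or a 10 s timeout
# expires. The same check can be made by hand against the endpoint
# opened by the HTTPEndpoint/PrometheusExport settings in the cfg above:
#
#   curl --silent http://localhost:8891/metrics | grep -E 'balloon="isolated-pods\[2\]"'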
verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[2\]"' # pod1: guaranteed CPUREQ="600m" CPULIM="600m" MEMREQ="100M" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: isolated-pods" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod0c1"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cpus["pod1c1"]) == 2' \ 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'cpus["pod1c0"] == cpus["pod1c1"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"])' verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[2\]"' # pod2: burstable CPUREQ="100m" CPULIM="200m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: isolated-pods" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod0c1"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cpus["pod1c1"]) == 2' \ 'len(cpus["pod2c0"]) == 1' \ 'len(cpus["pod2c1"]) == 1' \ 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'cpus["pod1c0"] == cpus["pod1c1"]' \ 'cpus["pod2c0"] == cpus["pod2c1"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])' verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-line 'balloon="isolated-pods\[2\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[3\]"' # pod3: isolated containers CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: isolated-ctrs" CONTCOUNT=4 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod0c1"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cpus["pod1c1"]) == 2' \ 'len(cpus["pod2c0"]) == 1' \ 'len(cpus["pod2c1"]) == 1' \ 'len(cpus["pod3c0"]) == 1' \ 'len(cpus["pod3c1"]) == 1' \ 'len(cpus["pod3c2"]) == 1' \ 'len(cpus["pod3c3"]) == 1' \ 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'cpus["pod1c0"] == cpus["pod1c1"]' \ 'cpus["pod2c0"] == cpus["pod2c1"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])' \ 'disjoint_sets(cpus["pod3c0"], cpus["pod3c1"], cpus["pod3c2"], cpus["pod3c3"])' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"], cpus["pod3c0"], cpus["pod3c1"], cpus["pod3c2"], cpus["pod3c3"])' verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-line 'balloon="isolated-pods\[2\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[3\]"' verify-metrics-has-line 'balloon="isolated-ctrs\[0\]"' verify-metrics-has-line 'balloon="isolated-ctrs\[1\]"' verify-metrics-has-line 'balloon="isolated-ctrs\[2\]"' verify-metrics-has-line 'balloon="isolated-ctrs\[3\]"' terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg ================================================ policy: Active: balloons ReservedResources: CPU: 1 balloons: AllocatorTopologyBalancing: true PreferSpreadOnPhysicalCores: true BalloonTypes: - Name: policydefaults MinCPUs: 2 MinBalloons: 2 - Name: topo1cores0 MinCPUs: 2 MinBalloons: 2 PreferSpreadOnPhysicalCores: false - Name: topo0cores1 AllocatorTopologyBalancing: false PreferSpreadOnPhysicalCores: true - Name: topo0cores0 AllocatorTopologyBalancing: false PreferSpreadOnPhysicalCores: false 
- Name: topo1cores1 AllocatorTopologyBalancing: true PreferSpreadOnPhysicalCores: true instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh ================================================ cleanup() { vm-command "kubectl delete pods --all --now --wait" return 0 } cleanup # Launch cri-resmgr with wanted metrics update interval and a # configuration that opens the instrumentation http server. terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/balloons-allocator-opts.cfg launch cri-resmgr # pod0 in a 2-CPU balloon CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: policydefaults" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cores["pod0c0"]) == 2' \ 'len(cpus["pod0c0"]) == 2' # pod1 in a 2-CPU balloon CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: topo1cores0" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cores["pod1c0"]) == 1' \ 'len(cpus["pod1c0"]) == 2' # pod2: container 0 resizes first from 0 to 1, container 2 from 1 to 2 CPUs, # use more cores CPUREQ="1" MEMREQ="100M" CPULIM="1" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: topo1cores1" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cores["pod2c0"]) == 2' \ 'len(cpus["pod2c0"]) == 2' \ 'cpus["pod2c0"] == cpus["pod2c1"]' # pod3: container 0 resizes first from 0 to 1, container 2 from 1 to 2 CPUs, # pack tightly CPUREQ="1" MEMREQ="100M" CPULIM="1" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: topo0cores0" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cores["pod3c0"]) == 1' \ 'len(cpus["pod3c0"]) == 2' \ 'cpus["pod3c0"] == cpus["pod3c1"]' cleanup ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2} ] ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c32/test01-dynamic-baloons/balloons-dynamic.cfg ================================================ policy: Active: balloons ReservedResources: cpu: cpuset:31 balloons: AllocatorTopologyBalancing: true BalloonTypes: - Name: dynamic MaxCPUs: 32 MaxBalloons: 8 PreferNewBalloons: true ShareIdleCpusInSame: numa instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c32/test01-dynamic-baloons/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/balloons-dynamic.cfg cri_resmgr_extra_args="-metrics-interval 4s" launch cri-resmgr # pod0-pod7: create 8 balloons, where each lands on a different NUMA node. # Each balloon (except one that lands on the NUMA node with reserved CPUs) # has 1 shared CPU at the most since a NUMA node has 4 CPUs and a pod is # requesting 1 CPU. Only one of the balloon that using NUMA node with #reserved CPU has 0 shared CPUs. 
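# CPUREQLIM and INITCPUREQLIM hold one "request-limit" entry per
# container; multicontainerpod.yaml.in (below) splits each entry with
# bash parameter expansion. A minimal sketch of the pattern:
#
#   reqlim="100m-200m"
#   echo "${reqlim/-*/}"   # -> 100m (the request)
#   echo "${reqlim/*-}"    # -> 200m (the limit)
#
# A bare entry such as "3" matches neither pattern and is therefore
# used as both request and limit.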
CPUREQLIM="3" INITCPUREQLIM="100m-100m 100m-100m 100m-100m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamic" n=8 create multicontainerpod verify-metrics-has-line 'balloon="dynamic\[0\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[1\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[2\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[3\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[4\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[5\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[6\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[7\]".*cpus_count="3"*' verify-metrics-has-no-line 'cpus_count="4"' verify-metrics-has-line 'sharedidlecpus_count="1"' verify-metrics-has-line 'cpus_allowed_count="4"' verify-metrics-has-line 'sharedidlecpus_count="0"' verify-metrics-has-line 'cpus_allowed_count="3"' verify-metrics-has-no-line 'sharedidlecpus_count="2"' verify-metrics-has-no-line 'cpus_allowed_count="5"' verify 'disjoint_sets(nodes["pod0c0"], nodes["pod1c0"], nodes["pod2c0"], nodes["pod3c0"], nodes["pod4c0"], nodes["pod5c0"], nodes["pod6c0"], nodes["pod7c0"])' \ 'len(nodes["pod0c0"]) == len(nodes["pod1c0"]) == len(nodes["pod2c0"]) == \ len(nodes["pod3c0"]) == len(nodes["pod4c0"]) == len(nodes["pod5c0"]) == \ len(nodes["pod6c0"]) == len(nodes["pod7c0"]) == 1' # pod8: Add one more pod with 2 CPUs to inflate over NUMAs nodes, which should cross # the NUMA node boundaries but not the die boundaries. Because two NUMA nodes can offer # 2 CPUs in total. CPUREQLIM="2" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamic" create multicontainerpod verify-metrics-has-line 'cpus_count="5"' verify-metrics-has-line 'sharedidlecpus="",sharedidlecpus_count="0"' verify-metrics-has-line 'sharedidlecpus_count="1"' verify 'len(nodes["pod8c0"])==2' \ 'len(dies["pod8c0"])==1' \ 'len(packages["pod8c0"])==1' kubectl delete pod pod8 --now --wait --ignore-not-found verify-metrics-has-no-line 'cpus_count="5"' # pod9: Add one more pod with 4 CPUs to inflate over dies, which should cross # the NUMA node boundaries as well as dies boundaries. Since 2 dies under the # same package can offer 4 CPUs, we should not cross the package boundaries. CPUREQLIM="4" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamic" create multicontainerpod verify 'len(nodes["pod9c0"])==4' \ 'len(dies["pod9c0"])==2' \ 'len(packages["pod9c0"])==1' kubectl delete pod pod9 --now --wait --ignore-not-found verify 'disjoint_sets(nodes["pod0c0"], nodes["pod1c0"], nodes["pod2c0"], nodes["pod3c0"], nodes["pod4c0"], nodes["pod5c0"], nodes["pod6c0"], nodes["pod7c0"])' \ # pod9: Add one more pod with 7 CPUs to inflate over packages, which should cross # NUMA node, dies and package boundaries. At this point, there is no free CPUs # left on the host, so no shared CPUs. CPUREQLIM="6 1" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamic" create multicontainerpod verify 'len(nodes["pod10c0"])==7' \ 'len(dies["pod10c0"])==4' \ 'len(packages["pod10c0"])==2' verify-metrics-has-line 'sharedidlecpus="",sharedidlecpus_count="0"' verify-metrics-has-no-line 'sharedidlecpus_count="1"' # pod0, pod9 deflate. This should free up 10 CPUs that will cause having # shared CPUs available again. 
kubectl delete pod pod10 --now --wait --ignore-not-found kubectl delete pod pod0 --now --wait --ignore-not-found verify-metrics-has-line 'sharedidlecpus_count="1"' ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c32/test01-dynamic-baloons/multicontainerpod.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "$POD_ANNOTATION" ]; then echo " annotations: $POD_ANNOTATION "; fi) labels: app: ${NAME} spec: containers: $(contnum=0; for reqlim in ${CPUREQLIM}; do echo " - name: ${NAME}c${contnum} image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - ${WORK}echo ${NAME}c${contnum} \$(sleep inf) $(if [ -n "${reqlim}" ]; then echo " resources: $(if [ -n "${reqlim/-*}" ]; then echo " requests: cpu: ${reqlim/-*/} "; fi) $(if [ -n "${reqlim/*-/}" ]; then echo " limits: cpu: ${reqlim/*-} "; fi) "; fi) "; contnum=$((contnum + 1)); done ) $(if [ -n "$INITCPUREQLIM" ]; then echo " initContainers: $(contnum=0; for initreqlim in ${INITCPUREQLIM}; do echo " - name: ${NAME}c${contnum}-init image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - ${WORK}echo ${NAME}c${contnum}-init \$(sleep 1) $(if [ -n "${initreqlim}" ]; then echo " resources: $(if [ -n "${initreqlim/-*}" ]; then echo " requests: cpu: ${initreqlim/-*/} "; fi) $(if [ -n "${initreqlim/*-/}" ]; then echo " limits: cpu: ${initreqlim/*-} "; fi) "; fi) "; contnum=$((contnum + 1)); done ) "; fi) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c32/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "dies": 2, "packages": 2} ] ================================================ FILE: test/e2e/policies.test-suite/balloons/verify.source.sh ================================================ # Utilities to verify data from metrics verify_metrics_url="http://localhost:8891/metrics" verify-metrics-has-line() { local expected_line="$1" vm-run-until --timeout 10 "echo 'waiting for metrics line: $expected_line' >&2; curl --silent $verify_metrics_url | grep -E '$expected_line'" || { command-error "expected line '$1' missing from the output" } } verify-metrics-has-no-line() { local unexpected_line="$1" vm-run-until --timeout 10 "echo 'checking absense of metrics line: $unexpected_line' >&2; ! curl --silent $verify_metrics_url | grep -Eq '$unexpected_line'" || { command-error "unexpected line '$1' found from the output" } } ================================================ FILE: test/e2e/policies.test-suite/check-correct-policy.source.sh ================================================ # This script does a policy check before the real test code is started. cache_policy="$(vm-command-q "cat /var/lib/cri-resmgr/cache" | jq -r .PolicyName)" cfg_policy=$(awk '/Active:/{print $2}' < "$cri_resmgr_cfg") if [ -n "$cache_policy" ] && [ -n "$cfg_policy" ] && [ "$cache_policy" != "$cfg_policy" ]; then echo "cri-resmgr is been started with policy \"$cache_policy\", switching to \"$cfg_policy\"" terminate cri-resmgr echo "destroying cri-resmgr cache with previous policy" vm-command "rm -rf /var/lib/cri-resmgr" launch cri-resmgr fi ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/cri-resmgr.cfg ================================================ policy: Active: dynamic-pools # Use only 15 CPUs in total, leave cpu0 for other than Kubernetes # processes. 
AvailableResources: CPU: cpuset:1-15 # Reserve one of our CPUs for kube-system tasks. ReservedResources: CPU: 1 dynamic-pools: PinCPU: true PinMemory: true DynamicPoolTypes: - Name: "pool1" Namespaces: - "pool1" CPUClass: "pool1-cpuclass" - Name: "pool2" Namespaces: - "pool2" CPUClass: "pool2-cpuclass" instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true cpu: classes: default: minFreq: 800 maxFreq: 2800 pool1-cpuclass: minFreq: 900 maxFreq: 2900 pool2-cpuclass: minFreq: 1000 maxFreq: 3000 ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/dyp-busybox.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "$POD_ANNOTATION" ]; then echo " annotations: $POD_ANNOTATION "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - ${WORK}echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) $(if [ -n "${CPUREQ}" ]; then echo " resources: requests: cpu: ${CPUREQ} $(if [ -n "${MEMREQ}" ]; then echo " memory: '${MEMREQ}' "; fi) $(if [ -n "${CPULIM}" ]; then echo " limits: cpu: ${CPULIM} $(if [ -n "$MEMLIM" ]; then echo " memory: '${MEMLIM}' "; fi) "; fi) "; fi) "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/dyp-configmap.yaml.in ================================================ apiVersion: v1 kind: ConfigMap metadata: name: cri-resmgr-config.default namespace: kube-system data: policy: |+ Active: dynamic-pools AvailableResources: CPU: ${AVAILABLE_CPU:-cpuset:0-15} ReservedResources: CPU: ${RESERVED_CPU:-1} dynamic-pools: PinCPU: ${PINCPU:-true} PinMemory: ${PINMEMORY:-true} DynamicPoolTypes: $([ -n "$DYPTYPE0_SKIP" ] || echo " - Name: dyptype0 AllocatorPriority: ${DYPTYPE0_ALLOCATORPRIORITY:-0} CPUClass: ${DYPTYPE0_CPUCLASS:-classA} ") $([ -n "$DYPTYPE1_SKIP" ] || echo " - Name: dyptype1 Namespaces: - ${DYPTYPE1_NAMESPACE0:-dyptype1ns0} AllocatorPriority: ${DYPTYPE1_ALLOCATORPRIORITY:-1} CPUClass: ${DYPTYPE1_CPUCLASS:-classB} ") $([ -n "$DYPTYPE2_SKIP" ] || echo " - Name: dyptype2 Namespaces: - ${DYPTYPE2_NAMESPACE0:-dyptype2ns0} - ${DYPTYPE2_NAMESPACE1:-dyptype2ns1} AllocatorPriority: ${DYPTYPE2_ALLOCATORPRIORITY:-2} CPUClass: ${DYPTYPE2_CPUCLASS:-classC} ") instrumentation: |+ HTTPEndpoint: :8891 PrometheusExport: true logger: |+ Debug: policy cpu: |+ classes: default: minFreq: ${CPU_DEFAULT_MIN:-800} maxFreq: ${CPU_DEFAULT_MAX:-2800} classA: minFreq: ${CPU_CLASSA_MIN:-900} maxFreq: ${CPU_CLASSA_MAX:-2900} classB: minFreq: ${CPU_CLASSB_MIN:-1000} maxFreq: ${CPU_CLASSB_MAX:-3000} classC: minFreq: ${CPU_CLASSC_MIN:-1100} maxFreq: ${CPU_CLASSC_MAX:-3100} energyPerformancePreference: ${CPU_CLASSC_EPP:-1} ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test01-basic-placement/code.var.sh ================================================ # Test placing containers with and without annotations to correct dynamic pools # reserved and shared CPUs. cleanup() { vm-command "kubectl delete pods pod0 -n kube-system; kubectl delete pods -n pool1 --all --now; kubectl delete pods --all --now; kubectl delete namespace pool1" return 0 } cleanup terminate cri-resmgr launch cri-resmgr # pod0: run on reserved CPUs. 
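# create instantiates the dyp-busybox.yaml.in template above with the
# variables set on the same command line: CONTCOUNT chooses the number
# of containers and CPUREQ/MEMREQ/CPULIM/MEMLIM fill in the resources
# section; pods are named pod0, pod1, ... in creation order. For
# example, a guaranteed two-container pod:
#
#   CPUREQ="500m" MEMREQ="100M" CPULIM="500m" MEMLIM="100M" CONTCOUNT=2 create dyp-busybox
#
# With AvailableResources cpuset:1-15 and one CPU reserved, the shared
# pool plus all dynamic pools always total 15 - 1 = 14 CPUs, which the
# "== 14" checks below rely on.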
namespace=kube-system CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'len(cpus["pod0c0"]) == 1' # pod1: run in shared dynamic pool. # We do not add annotations to this pod, and we do not set any # namespace, so this pod is expected to be created to the shared pool. create dyp-busybox report allowed verify 'len(cpus["pod1c0"]) == 14' # The size of each dynamic pool is obtained by adding the requests of the containers in this pool and the CPUs allocated based on cpu utilization, # so the size of each dynamic pool is greater than or equal to the sum of the requests of the containers in the pool. # pod2: run in the pool1. CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool1" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod2c0"]) >= 1' \ 'len(cpus["pod1c0"]) + len(cpus["pod2c0"]) == 14' \ 'disjoint_sets(cpus["pod2c0"], cpus["pod1c0"])' # pod3: run in the pool1. CPUREQ="1500m" MEMREQ="100M" CPULIM="1500m" MEMLIM="100M" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool1" CONTCOUNT=1 create dyp-busybox report allowed verify 'cpus["pod2c0"] == cpus["pod3c0"]' \ 'len(cpus["pod3c0"]) >= 2' \ 'len(cpus["pod1c0"]) + len(cpus["pod3c0"]) == 14' \ 'disjoint_sets(cpus["pod1c0"], cpus["pod3c0"])' # pod4: run in the pool2. CPUREQ="1500m" MEMREQ="100M" CPULIM="1500m" MEMLIM="100M" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool2" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod4c0"] == cpus["pod4c1"]' \ 'len(cpus["pod4c0"]) >= 3' \ 'len(cpus["pod3c0"]) >= 2' \ 'len(cpus["pod1c0"]) + len(cpus["pod3c0"]) + len(cpus["pod4c0"]) == 14' \ 'disjoint_sets(cpus["pod4c0"], cpus["pod3c0"], cpus["pod1c0"])' # pod5: run in the pool1. CPUREQ="1500m" MEMREQ="100M" CPULIM="1500m" MEMLIM="100M" kubectl create namespace "pool1" namespace="pool1" CONTCOUNT=1 create dyp-busybox report allowed verify 'cpus["pod5c0"] == cpus["pod2c0"]'\ 'len(cpus["pod5c0"]) >= 4' \ 'len(cpus["pod4c0"]) >= 3' \ 'len(cpus["pod1c0"]) + len(cpus["pod3c0"]) + len(cpus["pod4c0"]) == 14' cleanup ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test02-prometheus-metrics/code.var.sh ================================================ # This test verifies prometheus metrics from the dynamic-pools policy. cleanup() { vm-command "kubectl delete pods --all --now" terminate cri-resmgr terminate cri-resmgr-agent vm-command "cri-resmgr -reset-policy; cri-resmgr -reset-config" return 0 } cleanup # Launch cri-resmgr with wanted metrics update interval and a # configuration that opens the instrumentation http server. cri_resmgr_cfg=${TEST_DIR}/dyp-metrics.cfg cri_resmgr_extra_args="-metrics-interval 1s" launch cri-resmgr sleep 10 verify-metrics-has-line 'dynamicPool="shared"' verify-metrics-has-line 'dynamicPool="reserved"' verify-metrics-has-line 'dynamicPool="full-core"' verify-metrics-has-line 'dynamicPool="flex"' verify-metrics-has-line 'dynamicPool="fast-dualcore"' # pod0: run in shared dynamic pool. 
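# The DynamicPools gauge encodes the pool composition in its labels and
# the pool's current CPU count in its value. The check below matches a
# metrics line of the form (abbreviated):
#
#   DynamicPools{containers="pod0:pod0c0,pod0:pod0c1",dynamicPool="shared",...,tot_req_millicpu="200"} 15
#
# i.e. both containers of pod0 are in the shared pool, which currently
# spans 15 CPUs.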
CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" CONTCOUNT=2 create dyp-busybox report allowed verify-metrics-has-line 'dynamicPool="reserved"' verify-metrics-has-line 'dynamicPool="full-core"' verify-metrics-has-line 'dynamicPool="flex"' verify-metrics-has-line 'dynamicPool="fast-dualcore"' verify-metrics-has-line 'DynamicPools{containers="pod0:pod0c0,pod0:pod0c1",cpu_class="",cpus=".*",dynamicPool="shared",dynamicPool_type="shared",mems=".*",tot_limit_millicpu="200",tot_req_millicpu="200"} 15' # pod1: run in fast-dualcore dynamic pool. CPUREQ="200m" MEMREQ="" CPULIM="200m" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: fast-dualcore" CONTCOUNT=1 create dyp-busybox report allowed verify-metrics-has-line 'containers="pod1:pod1c0".*dynamicPool="fast-dualcore",dynamicPool_type="fast-dualcore".*tot_req_millicpu="(199|200)"' verify 'len(cpus["pod1c0"]) >= 1' # pod2: run in flex dynamic pool. CPUREQ="3500m" MEMREQ="" CPULIM="3500m" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: flex" CONTCOUNT=1 create dyp-busybox report allowed verify-metrics-has-line 'containers="pod2:pod2c0".*dynamicPool="flex",dynamicPool_type="flex"' verify 'len(cpus["pod2c0"]) >= 4' # pod3: run in flex dynamic pool. CPUREQ="1200m" MEMREQ="" CPULIM="1200m" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: flex" CONTCOUNT=1 create dyp-busybox report allowed verify-metrics-has-line 'containers="pod2:pod2c0,pod3:pod3c0".*dynamicPool="flex",dynamicPool_type="flex"' verify 'len(cpus["pod2c0"]) >= 5' # Resize flex dynamic pool in metrics. kubectl delete pods --now pod3 verify-metrics-has-line 'containers="pod2:pod2c0".*dynamicPool="flex",dynamicPool_type="flex"' verify 'len(cpus["pod2c0"]) >= 4' kubectl delete pods --now pod2 sleep 5 verify-metrics-has-line 'containers="".*dynamicPool="flex",dynamicPool_type="flex".*0' # Delete all pods in shared dynamic pool. kubectl delete pods --now pod0 # pod4: run in fast-dualcore dynamic pool, all CPUs are allocated to fast-dualcore dynamic pool. CPUREQ="14" MEMREQ="" CPULIM="14" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: fast-dualcore" CONTCOUNT=1 create dyp-busybox report allowed verify-metrics-has-line 'containers="pod1:pod1c0,pod4:pod4c0".*dynamicPool="fast-dualcore",dynamicPool_type="fast-dualcore".*15' verify 'len(cpus["pod1c0"]) == 15' cleanup ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test02-prometheus-metrics/dyp-metrics.cfg ================================================ policy: Active: dynamic-pools AvailableResources: CPU: cpuset:0-15 # Reserve one of our CPUs for kube-system tasks. ReservedResources: CPU: cpuset:0 dynamic-pools: DynamicPoolTypes: - Name: full-core CPUClass: normal - Name: fast-dualcore CPUClass: turbo - Name: flex CPUClass: slow instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test03-rebalancing/code.var.sh ================================================ # Re-launch cri-resmgr with the rebalancing parameter in order to # enable rebalancing calls. (See help of the "launch" function for # more options.) 
cleanup() {
    vm-command "kubectl delete pods --all --now"
    return 0
}
cleanup

terminate cri-resmgr
cri_resmgr_extra_args="-metrics-interval 1s -rebalance-interval 2s" launch cri-resmgr
sleep 10

# Create three pods:
# - pod0 to "shared"
# - pod1 to "pool1"
# - pod2 to "pool2"
create dyp-busybox
POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool1" create dyp-busybox
POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool2" create dyp-busybox

# Print initial CPU pinning.
report allowed

# Wait at least one rebalancing round.
sleep 3
verify 'len(cpus["pod0c0"]) >= 1'
verify 'len(cpus["pod1c0"]) >= 1'
verify 'len(cpus["pod2c0"]) >= 1'
verify-metrics-has-line 'containers="pod0:pod0c0".*dynamicPool="shared",dynamicPool_type="shared"'
verify-metrics-has-line 'containers="pod1:pod1c0".*dynamicPool="pool1",dynamicPool_type="pool1"'
verify-metrics-has-line 'containers="pod2:pod2c0".*dynamicPool="pool2",dynamicPool_type="pool2"'

# Increase CPU usage of pod1 to 200% (each gzip invocation compressing
# an endless stream burns one full CPU; redirections reconstructed, the
# extraction stripped the original ones).
vm-command "nohup kubectl exec pod1 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"
vm-command "nohup kubectl exec pod1 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"

# Wait at least one rebalancing round and print CPU pinning.
sleep 10
report allowed

# Now "pool1" has 200% CPU load, "shared" and "pool2" have 0%.
# Verify that the number of CPUs in pool1 is the largest.
verify 'len(cpus["pod1c0"]) > len(cpus["pod0c0"])'
verify 'len(cpus["pod1c0"]) > len(cpus["pod2c0"])'
verify 'len(cpus["pod0c0"]) + len(cpus["pod1c0"]) + len(cpus["pod2c0"]) == 14'

# Remove CPU load from pool1 and put 100% CPU load to pool2.
vm-command "pkill gzip"
vm-command "nohup kubectl exec pod2 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"

# Wait at least one rebalancing round and print CPU pinning.
sleep 10
report allowed

# Verify that the number of CPUs in pool2 is the largest.
verify 'len(cpus["pod2c0"]) > len(cpus["pod0c0"])'
verify 'len(cpus["pod2c0"]) > len(cpus["pod1c0"])'
verify 'len(cpus["pod0c0"]) + len(cpus["pod1c0"]) + len(cpus["pod2c0"]) == 14'

# Remove CPU load from pool2 and put 100% CPU load to both pool1 and pool2.
vm-command "pkill gzip"
vm-command "nohup kubectl exec pod1 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"
vm-command "nohup kubectl exec pod2 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"

# It takes time to reach a balanced state.
sleep 10
report allowed

# Verify that the number of CPUs in pool1 is greater than or equal to 6 and less than or equal to 8.
# Verify that the number of CPUs in pool2 is greater than or equal to 6 and less than or equal to 8.
verify 'len(cpus["pod0c0"]) == 1' verify 'len(cpus["pod1c0"]) >= 6' verify 'len(cpus["pod1c0"]) <= 8' verify 'len(cpus["pod2c0"]) >= 6' verify 'len(cpus["pod2c0"]) <= 8' # Remove CPU load from pool1 and pool2 vm-command "pkill gzip" cleanup ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test04-reserved/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/dyp-reserved.cfg launch cri-resmgr cleanup() { vm-command \ "kubectl delete pod -n kube-system --now pod0 kubectl delete pod -n monitor-mypods --now pod1 kubectl delete pod -n system-logs --now pod2 kubectl delete pod -n kube-system --now pod3 kubectl delete pods --now pod4 pod5 pod6 kubectl delete pod -n kube-system --now pod7 kubectl delete namespace monitor-mypods kubectl delete namespace system-logs kubectl delete namespace my-exact-name" return 0 } cleanup kubectl create namespace monitor-mypods kubectl create namespace system-logs kubectl create namespace my-exact-name # pod0: kube-system CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace=kube-system create dyp-busybox report allowed verify 'cpus["pod0c0"] == {"cpu00", "cpu01", "cpu02"}' # pod1: match first ReservedPoolNamespaces glob, multicontainer CPUREQ="1" MEMREQ="" CPULIM="1" MEMLIM="" namespace=monitor-mypods CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod1c0"] == cpus["pod0c0"]' \ 'cpus["pod1c1"] == cpus["pod0c0"]' # pod2: match last ReservedPoolNamespaces glob, slightly overbook reserved CPU CPUREQ="1" MEMREQ="" CPULIM="1" MEMLIM="" namespace=system-logs create dyp-busybox report allowed verify 'cpus["pod2c0"] == cpus["pod0c0"]' # pod3: force a kube-system pod to full-core dynamic pool using an annotation CPUREQ="2" MEMREQ="" CPULIM="2" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: full-core" namespace=kube-system create dyp-busybox report allowed verify 'len(cpus["pod3c0"]) >= 2' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod3c0"])' # pod4: run in shared dynamic pool CPUREQ="2500m" MEMREQ="" CPULIM="2500m" MEMLIM="" create dyp-busybox report allowed verify 'len(cpus["pod4c0"]) >= 3' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod3c0"], cpus["pod4c0"])' # pod5: annotate otherwise a default pod to the reserved CPUs, # severely overbook reserved CPUs CPUREQ="2500m" MEMREQ="" CPULIM="2500m" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: reserved" create dyp-busybox report allowed verify 'cpus["pod5c0"] == {"cpu00", "cpu01", "cpu02"}' \ 'disjoint_sets(cpus["pod5c0"], cpus["pod3c0"], cpus["pod4c0"])' cleanup # Now that all pods are deleted, make sure that cpus of reserved and # default dynamic pools are as expected. 
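# ReservedPoolNamespaces in dyp-reserved.cfg (below) lists glob
# patterns; pods in any matching namespace land on the reserved CPUs,
# as pod1 and pod2 above demonstrated. The matching is equivalent to
# shell globbing:
#
#   case monitor-mypods in monitor-*) echo match;; esac   # -> match
#   case system-logs    in *-log*)    echo match;; esac   # -> match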
# pod6: run in shared dynamic pool CPUREQ="999m" MEMREQ="" CPULIM="999m" MEMLIM="" create dyp-busybox report allowed verify 'len(cpus["pod6c0"]) >= 1' # pod7: kube-system CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace=kube-system create dyp-busybox report allowed verify 'cpus["pod7c0"] == {"cpu00", "cpu01", "cpu02"}' cleanup terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test04-reserved/dyp-reserved.cfg ================================================ policy: Active: dynamic-pools ReservedResources: CPU: cpuset:0-2 dynamic-pools: PinCPU: true PinMemory: true ReservedPoolNamespaces: - "monitor-*" - "*-log*" DynamicPoolTypes: - Name: reserved Namespaces: - my-exact-name CPUClass: reserved-class - Name: default - Name: full-core CPUClass: turbo logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test05-namespace/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/dyp-namespace.cfg launch cri-resmgr cleanup() { vm-command \ "kubectl delete pods -n e2e-a --all --now kubectl delete pods -n e2e-b --all --now kubectl delete pods -n e2e-c --all --now kubectl delete pods -n e2e-d --all --now kubectl delete pods --all --now kubectl delete namespace e2e-a kubectl delete namespace e2e-b kubectl delete namespace e2e-c kubectl delete namespace e2e-d" return 0 } cleanup kubectl create namespace e2e-a kubectl create namespace e2e-b kubectl create namespace e2e-c kubectl create namespace e2e-d # pod0: create in the default namespace, CPUREQ is nil, both containers go to shared dynamic pool. CPUREQ="" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'len(cpus["pod0c0"]) == 15' # pod1: create in the e2e-a namespace, CPUREQ is nil, both containers go to shared dynamic pool. CPUREQ="" namespace="e2e-a" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod1c0"] == cpus["pod1c1"] == cpus["pod0c0"]' \ 'len(cpus["pod1c0"]) == 15' \ # pod2: create in the default namespace, CPUREQ is 2*2, both containers go to nsdyp dynamic pool. CPUREQ="2" MEMREQ="100M" CPULIM="2" MEMLIM="100M" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod2c0"] == cpus["pod2c1"]' \ 'len(cpus["pod2c0"]) >= 4' \ 'disjoint_sets(cpus["pod2c0"], cpus["pod1c0"])' \ 'disjoint_sets(cpus["pod2c0"], cpus["pod0c0"])' # pod3: create again in the default namespace, CPUREQ is 200m*2, both containers go to nsdyp dynamic pool. CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod3c0"] == cpus["pod3c1"] == cpus["pod2c0"]' \ 'len(cpus["pod3c0"]) >= 5' # pod4: create in the e2e-b namespace, CPUREQ is 2*2, both containers go to nsdyp dynamic pool. CPUREQ="2" MEMREQ="100M" CPULIM="2" MEMLIM="100M" namespace="e2e-b" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod4c0"] == cpus["pod4c1"] == cpus["pod3c0"] == cpus["pod2c0"]' \ 'len(cpus["pod4c0"]) >= 9' # pod5: create in the e2e-c namespace, CPUREQ is 100m*2, both containers go to nsdyp dynamic pool. 
CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace="e2e-c" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod5c0"] == cpus["pod5c1"] == cpus["pod4c0"] == cpus["pod3c0"] == cpus["pod2c0"]' \ 'len(cpus["pod5c0"]) >= 9' cleanup terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test05-namespace/dyp-namespace.cfg ================================================ policy: Active: dynamic-pools ReservedResources: CPU: 1 dynamic-pools: PinCPU: true PinMemory: true DynamicPoolTypes: - Name: nsdyp Namespaces: - "*" logger: Debug: policy ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test06-update-configmap/code.var.sh ================================================ # This test verifies that configuration updates via cri-resmgr-agent # are handled properly in the dynamic-pools policy. testns=e2e-dyp-test06 cleanup() { vm-command "kubectl delete pods --all --now; \ kubectl delete pods -n $testns --all --now; \ kubectl delete pods -n dyptype1ns0 --all --now; \ kubectl delete namespace $testns || :; \ kubectl delete namespace dyptype1ns0 || :" terminate cri-resmgr terminate cri-resmgr-agent vm-command "cri-resmgr -reset-policy; cri-resmgr -reset-config" } apply-configmap() { vm-put-file $(instantiate dyp-configmap.yaml) dyp-configmap.yaml vm-command "cat dyp-configmap.yaml" kubectl apply -f dyp-configmap.yaml } cleanup cri_resmgr_extra_args="-metrics-interval 1s" cri_resmgr_config=fallback launch cri-resmgr launch cri-resmgr-agent kubectl create namespace $testns kubectl create namespace dyptype1ns0 AVAILABLE_CPU="cpuset:1,4-15" DYPTYPE2_NAMESPACE0='"*"' apply-configmap sleep 3 # pod0 run in dyptype0, annotation CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: dyptype0" create dyp-busybox # pod1 run in dyptype1, namespace CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" namespace="dyptype1ns0" create dyp-busybox # pod2 run in dyptype2, wildcard namespace CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" namespace="e2e-dyp-test06" create dyp-busybox sleep 3 vm-command "curl -s $verify_metrics_url" verify-metrics-has-line 'pod0:pod0c0.*"dyptype0"' verify-metrics-has-line 'pod1:pod1c0.*"dyptype1"' verify-metrics-has-line 'pod2:pod2c0.*"dyptype2"' # Remove first two dynamic pool types, change dyptype2 to match all # namespaces. DYPTYPE0_SKIP=1 DYPTYPE1_SKIP=1 DYPTYPE2_NAMESPACE0='"*"' apply-configmap # Note: # pod0 was successfully assigned to and running in dyptype0 dynamic pool. # Now dyptype0 was completely removed from the node. # Currently this behavior is undefined. # Possible behaviors: evict pod0, continue assign chain, refuse config... # For now, skip pod0c0 dynamic pool validation: # verify-metrics-has-line '"dyptype2".*pod0:pod0c0' verify-metrics-has-line 'pod1:pod1c0.*"dyptype2"' verify-metrics-has-line 'pod2:pod2c0.*"dyptype2"' # Bring back dyptype0 where pod0 belongs to by annotation. DYPTYPE1_SKIP=1 DYPTYPE2_NAMESPACE0='"*"' apply-configmap verify-metrics-has-line 'pod0:pod0c0.*"dyptype0"' verify-metrics-has-line 'pod1:pod1c0.*"dyptype2"' verify-metrics-has-line 'pod2:pod2c0.*"dyptype2"' # Change only CPU classes, no reassigning. 
verify-metrics-has-line 'pod0:pod0c0.*cpu_class="classA".*"dyptype0"' verify-metrics-has-line 'pod1:pod1c0.*cpu_class="classC".*"dyptype2"' verify-metrics-has-line 'pod2:pod2c0.*cpu_class="classC".*"dyptype2"' DYPTYPE0_CPUCLASS="classC" DYPTYPE1_SKIP=1 DYPTYPE2_CPUCLASS="classB" DYPTYPE2_NAMESPACE0='"*"' apply-configmap verify-metrics-has-line 'pod0:pod0c0.*cpu_class="classC".*"dyptype0"' verify-metrics-has-line 'pod1:pod1c0.*cpu_class="classB".*"dyptype2"' verify-metrics-has-line 'pod2:pod2c0.*cpu_class="classB".*"dyptype2"' cleanup launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test07-numa/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/dyp-numa.cfg launch cri-resmgr # pod0: besteffort, go to shared dynamic pool, make sure it still gets at least 1 CPU. CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) == 15' # pod1: guaranteed, go to fit-in-numa dynamic pool, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 1' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"])' # pod2: guaranteed, go to fit-in-numa dynamic pool, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 2' \ 'len(cpus["pod2c0"]) >= 2' \ 'cpus["pod1c0"] == cpus["pod2c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod2c0"])' # pod3: guaranteed, go to fit-in-numa dynamic pool, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 3' \ 'len(cpus["pod2c0"]) >= 3' \ 'len(cpus["pod3c0"]) >= 3' \ 'cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod3c0"])' # pod4: guaranteed, go to fit-in-numa dynamic pool, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 4' \ 'len(cpus["pod2c0"]) >= 4' \ 'len(cpus["pod3c0"]) >= 4' \ 'len(cpus["pod4c0"]) >= 4' \ 'cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod4c0"])' # pod5: besteffort, no CPU request, should fit into the shared dynamic pool. CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 4' \ 'len(cpus["pod2c0"]) >= 4' \ 'len(cpus["pod3c0"]) >= 4' \ 'len(cpus["pod4c0"]) >= 4' \ 'len(cpus["pod5c0"]) >= 1' \ 'cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"]' \ 'cpus["pod0c0"] == cpus["pod5c0"]' # Leave only one guaranteed container to the fit-in-numa dynamic pool. kubectl delete pods pod1 pod2 pod3 --now report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod4c0"]) >= 1' \ 'len(cpus["pod5c0"]) >= 1' \ 'cpus["pod0c0"] == cpus["pod5c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod4c0"])' # Leave only bestefforts to the dynamic pool. 
kubectl delete pods pod4 --now
report allowed
verify 'len(cpus["pod0c0"]) >= 1' \
       'len(cpus["pod5c0"]) >= 1' \
       'cpus["pod0c0"] == cpus["pod5c0"]'

terminate cri-resmgr
launch cri-resmgr

================================================
FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test07-numa/dyp-numa.cfg
================================================
policy:
  Active: dynamic-pools
  AvailableResources:
    CPU: cpuset:0-15
  # Reserve one of our CPUs (cpu15) for kube-system tasks.
  ReservedResources:
    CPU: 1
dynamic-pools:
  PinCPU: true
  PinMemory: true
  DynamicPoolTypes:
    - Name: fit-in-numa
      # All (non-system) containers are assigned to this dynamic pool
      # type
      Namespaces:
        - "*"

================================================
FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/topology.var.json
================================================
[
    {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2}
]

================================================
FILE: test/e2e/policies.test-suite/dynamic-pools/verify.source.sh
================================================
# Utilities to verify data from metrics

verify_metrics_url="http://localhost:8891/metrics"

verify-metrics-has-line() {
    local expected_line="$1"
    vm-run-until --timeout 10 "echo 'waiting for metrics line: $expected_line' >&2; curl --silent $verify_metrics_url | grep -E '$expected_line'" || {
        command-error "expected line '$1' missing from the output"
    }
}

verify-metrics-has-no-line() {
    local unexpected_line="$1"
    vm-run-until --timeout 10 "echo 'checking absence of metrics line: $unexpected_line' >&2; ! curl --silent $verify_metrics_url | grep -Eq '$unexpected_line'" || {
        command-error "unexpected line '$1' found in the output"
    }
}

================================================
FILE: test/e2e/policies.test-suite/podpools/cri-resmgr.cfg
================================================
policy:
  Active: podpools
  # Use 14 CPUs in total.
  AvailableResources:
    CPU: cpuset:2-15
  # One CPU is dedicated for reserved tasks, 13 CPUs left.
  ReservedResources:
    CPU: cpuset:15
podpools:
  PinCPU: true
  PinMemory: true
  Pools:
    # Take 3 CPUs for "singlecpu" podpools, 10 CPUs left.
    - Name: singlecpu
      CPU: 1
      MaxPods: 2
      Instances: 3 CPUs
      # Not defining the pool fill order equals the default:
      # fillOrder: Balanced.
    # Take at most ~6.5 CPUs (= 50% * 13) for "dualcpu" pools.
    # Allocating 2 CPUs per pool allows instantiating 3 pools,
    # that is, 6 CPUs are actually taken.
    # 4 CPUs left.
    # Leftover CPUs will be shared among pods and containers not in
    # pools.
    - Name: dualcpu
      CPU: 2
      MaxPods: 3
      Instances: 50 %
      FillOrder: Packed
logger:
  Debug: cri-resmgr,resource-manager,cache,policy
  Klog:
    skip_headers: true

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/podpools-configmap.yaml.in
================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: cri-resmgr-config.default
  namespace: kube-system
data:
  policy: |+
    Active: podpools
    ReservedResources:
      CPU: 1
    podpools:
      Pools:
        - Name: $NAME
          Instances: $INSTANCES
          CPU: $CPU
          MaxPods: $MAXPODS
  logger: |+
    Debug: resource-manager,cache,policy,memory

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/py_consts.var.py
================================================
# This file captures expected CPU allocator behavior when the podpools
# policy is started with the test default cri-resmgr configuration on
# the n4c16 topology.

# cri-resmgr output on constructed pools.
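# Note: on the n4c16 test topology (4 NUMA nodes with 4 CPUs each) CPU N
# belongs to memory node N//4, which is what _add_expected_pool below
# relies on when deriving expected.mems from expected.cpus. For example,
# the pool line "singlecpu[2]{cpus:4, ...}" parses into
#   expected.cpus.singlecpu[2] == {"cpu04"}
#   expected.mems.singlecpu[2] == {"node1"}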
expected_podpools_output = """ podpools policy pools: - pool 0: reserved[0]{cpus:15, mems:3, pods:0/0, containers:0} - pool 1: default[0]{cpus:5,12-14, mems:1,3, pods:0/0, containers:0} - pool 2: singlecpu[0]{cpus:2, mems:0, pods:0/2, containers:0} - pool 3: singlecpu[1]{cpus:3, mems:0, pods:0/2, containers:0} - pool 4: singlecpu[2]{cpus:4, mems:1, pods:0/2, containers:0} - pool 5: dualcpu[0]{cpus:6-7, mems:1, pods:0/3, containers:0} - pool 6: dualcpu[1]{cpus:8-9, mems:2, pods:0/3, containers:0} - pool 7: dualcpu[2]{cpus:10-11, mems:2, pods:0/3, containers:0} """ # 1. Parse expected_podpools_output into # expected.cpus.POOLNAME[INSTANCE] = {"cpuNN", ...} # 2. Calculate memory nodes based on expected.cpus into # expected.mems.POOLNAME[INSTANCE] = {"nodeN", ...} # (do not read these from output in order to verify its correctness) # # As the result: # expected.cpus.singlecpu == [{"cpu02"}, {"cpu03"}, {"cpu04"}] # expected.mems.singlecpu == [{"node0"}, {"node0"}, {"node1"}] import re class expected: class cpus: pass class mems: pass def _add_expected_pool(poolname, poolindex, cpuset): cpus = [] for cpurange in cpuset.split(","): lower_upper = [int(n) for n in cpurange.split("-")] if len(lower_upper) == 1: cpus.append(lower_upper[0]) else: cpus.extend([i for i in range(lower_upper[0], lower_upper[1]+1)]) if not hasattr(expected.cpus, poolname): setattr(expected.cpus, poolname, []) setattr(expected.mems, poolname, []) getattr(expected.cpus, poolname).append(set('cpu%s' % (str(cpu).zfill(2),) for cpu in cpus)) getattr(expected.mems, poolname).append(set("node%s" % (cpu//4,) for cpu in cpus)) for poolname, poolindex, cpuset in re.findall(r': ([a-z]+)\[([0-9]+)\]\{cpus:([0-9,-]+), ', expected_podpools_output): _add_expected_pool(poolname, poolindex, cpuset) ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/test01-basic-placement/code.var.sh ================================================ # Test placing containers with and without annotations to correct pools # reserved and shared CPUs. ( kubectl delete pods pod3 -n kube-system --now --wait --ignore-not-found ) || true # pod0: singlecpu out "" out "### Multicontainer pod, all containers run on single CPU" # singlecpu pool has capacity for two pods => 500 mCPU/pod # test with 3 containers per pod => 167 mCPU/container CPUREQ="167m" MEMREQ="" CPULIM="" MEMLIM="" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" CONTCOUNT=3 create podpools-busybox report allowed verify 'cpus["pod0c0"] == cpus["pod0c1"] == cpus["pod0c2"]' \ 'cpus["pod0c0"] == expected.cpus.singlecpu[0]' \ 'mems["pod0c0"] == expected.mems.singlecpu[0]' # pod1: dualcpu out "" out "### Multicontainer pod, all containers run on two CPUs." POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CONTCOUNT=3 create podpools-busybox report allowed verify 'cpus["pod1c0"] == cpus["pod1c1"] == cpus["pod1c2"]' \ 'cpus["pod1c0"] == expected.cpus.dualcpu[0]' \ 'mems["pod1c1"] == expected.mems.dualcpu[0]' # pod2: default out "" out "### Multicontainer pod, no annotations. Runs on shared CPUs." CONTCOUNT=3 create podpools-busybox report allowed verify 'cpus["pod2c0"] == cpus["pod2c1"] == cpus["pod2c2"]' \ 'cpus["pod2c0"] == expected.cpus.default[0]' \ 'mems["pod2c2"] == expected.mems.default[0]' # pod3: reserved out "" out "### Multicontainer pod in kube-system namespace. Runs on reserved CPUs." 
namespace=kube-system CONTCOUNT=3 create podpools-busybox
report allowed
verify 'cpus["pod3c0"] == cpus["pod3c1"] == cpus["pod3c2"]' \
       'cpus["pod3c0"] == expected.cpus.reserved[0]' \
       'mems["pod3c0"] == expected.mems.reserved[0]'
kubectl delete pods pod3 -n kube-system --now --wait --ignore-not-found

# pod4: bad pool name
out ""
out "### Single container pod, fallback to the default pool."
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: non-existing-pool" create podpools-busybox
report allowed
verify 'cpus["pod4c0"] == expected.cpus.default[0]' \
       'mems["pod4c0"] == expected.mems.default[0]'

kubectl delete pods pod0 pod1 pod2 --now --wait --ignore-not-found

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test02-fill-order/code.var.sh
================================================
# Test filling pools with pods in the correct order
# Test only BestEffort containers
CPUREQ="" MEMREQ="" CPULIM="" MEMLIM=""

# pod0..2: balanced filling, every singlecpu pool should have one pod
out "### Filling singlecpu pool in Balanced fill order"
n=3 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" CONTCOUNT=2 create podpools-busybox
report allowed
verify 'cpus["pod0c0"] == cpus["pod0c1"]' \
       'cpus["pod1c0"] == cpus["pod1c1"]' \
       'cpus["pod2c0"] == cpus["pod2c1"]' \
       'len(cpus["pod0c0"]) == 1' \
       'len(cpus["pod1c0"]) == 1' \
       'len(cpus["pod2c0"]) == 1' \
       'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])'

# pod3..5: balanced filling up to max, every singlecpu pool should have two pods
n=3 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" CONTCOUNT=2 create podpools-busybox
report allowed
verify 'cpus["pod0c0"] == cpus["pod0c1"]' \
       'cpus["pod1c0"] == cpus["pod1c1"]' \
       'cpus["pod2c0"] == cpus["pod2c1"]' \
       'len(cpus["pod0c0"]) == 1' \
       'len(cpus["pod1c0"]) == 1' \
       'len(cpus["pod2c0"]) == 1' \
       'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])' \
       'cpus["pod3c0"] == cpus["pod3c1"]' \
       'cpus["pod4c0"] == cpus["pod4c1"]' \
       'cpus["pod5c0"] == cpus["pod5c1"]' \
       'len(cpus["pod3c0"]) == 1' \
       'len(cpus["pod4c0"]) == 1' \
       'len(cpus["pod5c0"]) == 1' \
       'disjoint_sets(cpus["pod3c0"], cpus["pod4c0"], cpus["pod5c0"])' \
       'cpus["pod5c0"] == cpus["pod2c0"]' # the last pool should have been filled by pods 2 and 5

# make a little room in the first pool and clear the last pool
kubectl delete pods pod0 pod2 pod5 --now --wait --ignore-not-found
# pod6: Balanced fill order should place this pod in the last pool (it has maximal free space)
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" CONTCOUNT=1 create podpools-busybox
report allowed
verify 'disjoint_sets(cpus["pod6c0"], set.union(cpus["pod1c0"], cpus["pod3c0"], cpus["pod4c0"]))'

kubectl delete pods --all --now --wait
reset counters

out "### Filling dualcpu pool in Packed fill order"
# pod0..2: should go to the first pool
n=3 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CONTCOUNT=1 create podpools-busybox
report allowed
verify 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"]'
# pod3..5: should go to the second pool
n=3 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CONTCOUNT=1 create podpools-busybox
report allowed
verify 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"]' \
       'cpus["pod3c0"] == cpus["pod4c0"] == cpus["pod5c0"]' \
       'disjoint_sets(cpus["pod0c0"], cpus["pod3c0"])'

# Deleting two pods from the first pool, one from the last.
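# The deletion below leaves dualcpu[0] with one pod (pod2, two free
# slots) and dualcpu[1] with two pods (pod3 and pod4, one free slot),
# so Packed fill order should reuse dualcpu[1] for pod6, as verified
# below.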
kubectl delete pods pod0 pod1 pod5 --now --wait --ignore-not-found
# pod6: Packed fill order should place this in the last pool (it has minimal free space)
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CONTCOUNT=1 create podpools-busybox
report allowed
verify 'cpus["pod3c0"] == cpus["pod4c0"] == cpus["pod6c0"]' \
       'disjoint_sets(cpus["pod2c0"], cpus["pod6c0"])'

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test03-qos/code.var.sh
================================================
# Test pods of all QoS classes in a pool, on reserved and on shared CPUs.
# Verify that CFS CPU shares are set correctly in all cases.

vm-put-file "$HOST_PROJECT_DIR/scripts/testing/kube-cgroups" "/usr/local/bin/kube-cgroups"

verify-cpushare() {
    podXcY=$1
    expected_cgv1=$2
    expected_cgv2=$3
    vm-command "kube-cgroups -n . -c $podXcY -f 'cpu.(shares|weight)\$'"
    CPU_SHARES_WEIGHT=$(echo "$COMMAND_OUTPUT" | awk '/cpu.*:/{print $2}')
    if [ "$CPU_SHARES_WEIGHT" = "$expected_cgv1" ]; then
        echo "verified cpu.shares of $podXcY == $expected_cgv1"
    elif [ "$CPU_SHARES_WEIGHT" = "$expected_cgv2" ]; then
        echo "verified cpu.weight of $podXcY == $expected_cgv2"
    else
        echo "assertion failed when verifying $podXcY: got '$COMMAND_OUTPUT' expected 'cpu.shares=$expected_cgv1' or 'cpu.weight=$expected_cgv2'"
        exit 1
    fi
}

CPUREQ="" MEMREQ="" CPULIM="" MEMLIM="" POD_ANNOTATION=""

out "### Assigning BestEffort, Burstable and Guaranteed pods to the same (dualcpu) pool"
# pod0c0: besteffort
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" create podpools-busybox
# pod1c0: burstable
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=500m create podpools-busybox
# pod2c0: guaranteed
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=1 CPULIM=1 MEMREQ=100M MEMLIM=100M create podpools-busybox
report allowed
verify-cpushare pod0c0 2 1
verify-cpushare pod1c0 512 20
verify-cpushare pod2c0 1024 39
kubectl delete pods --all --now --wait
reset counters

out "### Assigning BestEffort, Burstable and Guaranteed pods to shared CPUs"
# pod0c0: besteffort
create podpools-busybox
# pod1c0: burstable
CPUREQ=500m create podpools-busybox
# pod2c0: guaranteed
CPUREQ=1 CPULIM=1 MEMREQ=100M MEMLIM=100M create podpools-busybox
report allowed
verify-cpushare pod0c0 2 1
verify-cpushare pod1c0 512 20
verify-cpushare pod2c0 1024 39
kubectl delete pods --all --now --wait
reset counters

out "### Assigning BestEffort, Burstable and Guaranteed pods to reserved CPUs"
# pod0c0: besteffort
namespace=kube-system create podpools-busybox
# pod1c0: burstable
namespace=kube-system CPUREQ=500m create podpools-busybox
# pod2c0: guaranteed
namespace=kube-system CPUREQ=1 CPULIM=1 MEMREQ=100M MEMLIM=100M create podpools-busybox
report allowed
verify-cpushare pod0c0 2 1
verify-cpushare pod1c0 512 20
verify-cpushare pod2c0 1024 39
kubectl delete pods pod0 pod1 pod2 -n kube-system --now --wait --ignore-not-found
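# Background for the expected verify-cpushare values above -- an
# informal sketch assuming the usual kubelet/runc conversions, not
# something this test itself defines:
#   cgroup v1: cpu.shares = max(2, milliCPU * 1024 / 1000)
#              besteffort -> 2, 500m -> 512, 1000m -> 1024
#   cgroup v2: cpu.weight = 1 + ((cpu.shares - 2) * 9999) / 262142
#              2 -> 1, 512 -> 20, 1024 -> 39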
================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test04-overbook-cpus/code.var.sh
================================================
# Test CPU request warnings and errors:
# - Overbooked CPU sets
# - Bad CPU requests: mismatch between pool CPUs per pod and container CPU requests

CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt"

# pod0: overbook with single burstable pod and container
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=2900m CPULIM="" MEMREQ="" MEMLIM="" create podpools-busybox
report allowed
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*overbooked.*(2899|2900)m'" || error "missing overbook warning"
kubectl delete pods --all --now --wait

# pod1: overbook with single burstable pod with two containers
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=1050m CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=2 create podpools-busybox
report allowed
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*overbooked.*2100m'" || error "missing overbook warning"
kubectl delete pods --all --now --wait

# pod2, pod3: overbook with two guaranteed pods, one container in each pod
n=2 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=1001m MEMREQ=100M CPULIM=1001m MEMLIM=100M create podpools-busybox
report allowed
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*overbooked.*2002m'" || error "missing overbook warning"
kubectl delete pods --all --now --wait

# pod4, pod5: no overbooking with exact CPUs guaranteed + besteffort pod
terminate cri-resmgr # restart to clear log
launch cri-resmgr
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=1000m CPULIM=1000m MEMREQ=100M MEMLIM=100M CONTCOUNT=2 create podpools-busybox
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" create podpools-busybox
report allowed
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*overbooked'" && error "overbook warning with maximum allowed load"
kubectl delete pods --all --now --wait
# podpools logs misaligned CPU requests after pod deletion
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*bad CPU requests:.*pod4.* requested 2000 mCPUs.* 666 mCPUs'" || error "bad CPU request from pod4 expected but not found"
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*bad CPU requests:.*pod5.* requested 0 mCPUs.* 666 mCPUs'" || error "bad CPU request from pod5 expected but not found"

# pod6: request 4 * 167 mCPU, which is almost the required 666 mCPU.
# This should not trigger a bad CPU request error.
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=167m CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=4 create podpools-busybox
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*bad CPU requests:.*pod6'" && error "pod6 CPU request was ok, but 'bad CPU request' error found"
kubectl delete pods --all --now --wait

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test05-agent-updates-config/code.var.sh
================================================
# Relaunch cri-resmgr so that it will listen to cri-resmgr-agent
cleanup() {
    vm-command "kubectl delete pod -n kube-system pod0 --now --wait --ignore-not-found; kubectl delete pods --all --now --wait; kubectl delete cm -n kube-system cri-resmgr-config.default"
    terminate cri-resmgr
    terminate cri-resmgr-agent
    vm-command "cri-resmgr -reset-policy; cri-resmgr -reset-config"
}
cleanup
cri_resmgr_config=fallback launch cri-resmgr
launch cri-resmgr-agent

# Create a pod in every pod pool in the default config:
# reserved, shared, singlecpu, dualcpu
# pod0: reserved
CPUREQ="" namespace=kube-system create podpools-busybox
# pod1: default
CPUREQ="" create podpools-busybox
# pod2: singlecpu
CPUREQ="1" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" create podpools-busybox
# pod3, pod4, pod5, pod6: dualcpu (dualcpu 3 pods/pool, packed)
n=4 CPUREQ="1" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" create podpools-busybox
report allowed
verify "cpus['pod0c0'] == expected.cpus.reserved[0]" \
       "cpus['pod1c0'] == expected.cpus.default[0]" \
       "cpus['pod2c0'] == expected.cpus.singlecpu[0]" \
       "cpus['pod3c0'] == expected.cpus.dualcpu[0]" \
       "cpus['pod4c0'] == expected.cpus.dualcpu[0]" \
       "cpus['pod5c0'] == expected.cpus.dualcpu[0]" \
       "cpus['pod6c0'] == expected.cpus.dualcpu[1]"

echo "Switch to new configuration without singlecpu pools"
vm-put-file $(NAME=dualcpu CPU=2 MAXPODS=2 INSTANCES="100 %" instantiate podpools-configmap.yaml) podpools-dualcpu-configmap.yaml
kubectl apply -f podpools-dualcpu-configmap.yaml
sleep 5
report allowed
verify "cpus['pod0c0'] == expected.cpus.reserved[0]" `# reserved remains the same` \
       "len(cpus['pod1c0']) == 1" `# the default pool has only one CPU` \
       "cpus['pod2c0'] == cpus['pod1c0']" `# no singlecpu pool -> assign to default` \
       `# there are many dualcpu pools (1 out of 2 pods/pool, balanced)` \
       "len(cpus['pod3c0']) == 2" \
       "len(cpus['pod4c0']) == 2" \
       "len(cpus['pod5c0']) == 2" \
       "len(cpus['pod6c0']) == 2" \
       "disjoint_sets(cpus['pod3c0'], cpus['pod4c0'], cpus['pod5c0'], cpus['pod6c0'])"

echo "Negative test: try switching to an invalid configuration, check that assignments have not changed"
vm-put-file $(NAME=borked CPU=130 MAXPODS=2 INSTANCES=1 instantiate podpools-configmap.yaml) podpools-borked-configmap.yaml
kubectl apply -f podpools-borked-configmap.yaml
sleep 5
report allowed
verify "cpus['pod0c0'] == {'cpu15'}" \
       "cpus['pod1c0'] == cpus['pod2c0']" \
       "disjoint_sets(cpus['pod3c0'], cpus['pod4c0'], cpus['pod5c0'], cpus['pod6c0'])"

echo "After the broken reconfiguration attempt, switch to a valid configuration without dualcpu pools"
# This configuration leaves no left-over CPUs for the default pool
# => the default pool will use the same CPUs as the reserved pool.
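# Arithmetic behind the comment above, assuming the 16-CPU n4c16 VM:
# ReservedResources CPU:1 takes one CPU, and "Instances: 100 %" of the
# remaining 15 CPUs at CPU:1 per pool creates 15 singlecpu pools, so no
# CPUs are left over for the default pool.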
vm-put-file $(NAME=singlecpu CPU=1 MAXPODS=1 INSTANCES="100 %" instantiate podpools-configmap.yaml) podpools-dualcpu-configmap.yaml
kubectl apply -f podpools-dualcpu-configmap.yaml
sleep 5
report allowed
verify "cpus['pod0c0'] == expected.cpus.reserved[0]" `# reserved remains the same` \
       "cpus['pod1c0'] == expected.cpus.reserved[0]" `# the default pool equals the reserved pool` \
       "len(cpus['pod2c0']) == 1" `# pod2 in singlecpu[0]` \
       "disjoint_sets(cpus['pod2c0'], expected.cpus.reserved[0])" \
       `# all dualcpu pods end up in the default pool` \
       "cpus['pod3c0'] == cpus['pod4c0'] == cpus['pod5c0'] == cpus['pod6c0']" \
       "cpus['pod3c0'] == expected.cpus.reserved[0]"

echo "Not enough dualcpu pools for all running dualcpu pods, the rest fall back to the default pool"
vm-put-file $(NAME=dualcpu CPU=2 MAXPODS=1 INSTANCES="2" instantiate podpools-configmap.yaml) podpools-dualcpu-configmap.yaml
kubectl apply -f podpools-dualcpu-configmap.yaml
sleep 5
report allowed
pp cpus
verify "cpus['pod0c0'] == expected.cpus.reserved[0]" `# reserved remains the same` \
       "len(cpus['pod1c0']) == 9" `# the default pool` \
       "cpus['pod2c0'] == cpus['pod1c0']" `# no singlecpu pool -> assign to default` \
       `# two dualcpu pods go to dualcpu pools, two to the default pool` \
       "len([c for c in ['pod3c0', 'pod4c0', 'pod5c0', 'pod6c0'] if len(cpus[c])==2]) == 2" \
       "len([c for c in ['pod3c0', 'pod4c0', 'pod5c0', 'pod6c0'] if len(cpus[c])==9]) == 2"

# Clean up agent-delivered configuration setup as it might break tests
# that by default rely on forced configurations.
cleanup
launch cri-resmgr
launch cri-resmgr-agent

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test06-prometheus-metrics/code.var.sh
================================================
# Test reporting Prometheus metrics from podpools

cleanup() {
    vm-command "kubectl get pods -A | grep -E ' pod[0-9]' | while read namespace pod rest; do kubectl -n \$namespace delete pod \$pod --now --wait --ignore-not-found; done"
}

parse-commandoutput-log_pool_cpuset() {
    log_pool_cpuset=$(awk -F 'cpus:|, ' "{print \$2}" <<< "$COMMAND_OUTPUT")
    out "parsed: log_pool_cpuset=$log_pool_cpuset"
}

parse-commandoutput-log_pool_name() {
    log_pool_name=$(awk -F"[ {]*" "{print \$10}" <<< "$COMMAND_OUTPUT")
    out "parsed: log_pool_name=$log_pool_name"
}

verify-log-vs-metrics() {
    local podXcY="$1"
    local cpuUsageMin="$2" # optional
    local cpuUsageMax="$3" # optional
    vm-command "grep 'assigning container $podXcY to pool' cri-resmgr.output.txt"
    parse-commandoutput-log_pool_cpuset
    parse-commandoutput-log_pool_name
    local usageCmd="curl --silent $metrics_url | grep $log_pool_cpuset | grep $podXcY"
    vm-run-until --timeout 10 "$usageCmd" || {
        error "cannot find pod:container $1 and cpuset $log_pool_cpuset in the report"
    }
    if [ -n "$cpuUsageMax" ]; then
        echo "verifying CPU usage $cpuUsageMin < X < $cpuUsageMax"
        vm-run-until --timeout 20 "X=\"\$($usageCmd)\"; echo \"\$X\"; X=\${X##* }; X=\${X%%.*}; echo $cpuUsageMin \< \$X \< $cpuUsageMax; (( $cpuUsageMin < \$X )) && (( \$X < $cpuUsageMax ))"
    fi
}

verify-metrics-has-line() {
    local expected_line="$1"
    out "verifying metrics line syntax..."
    vm-run-until --timeout 10 "echo ' waiting for metrics line: $expected_line' >&2; curl --silent $metrics_url | grep -E '$expected_line'" || {
        command-error "expected line '$1' missing from the output"
    }
}

# Delete left-over test pods from the kube-system namespace
for podX in $(kubectl get pods -n kube-system | awk '/^pod[0-9]/{print $1}'); do
    kubectl delete pods $podX -n kube-system --now --wait --ignore-not-found
done

metrics_url="http://localhost:8891/metrics"

# Launch cri-resmgr with the desired metrics update interval
# and a configuration that opens the instrumentation http server.
terminate cri-resmgr
cri_resmgr_cfg=${TEST_DIR}/podpools-metrics.cfg cri_resmgr_extra_args="-metrics-interval 4s" launch cri-resmgr

# pod0: single container, reserve 400m CPU, but do not use it.
out ""
out "### Idle single-container pod"
CPUREQ="400m" MEMREQ="" CPULIM="400m" MEMLIM="" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: 400mCPU" CONTCOUNT=1 create podpools-busybox
report allowed
verify-log-vs-metrics pod0:pod0c0 0 20

# pod1: single container, reserve 400m CPU and use it.
# "yes" should show up in top with 40 % CPU consumption.
out ""
out "### Busy single-container pod"
CPUREQ="400m" MEMREQ="" CPULIM="400m" MEMLIM="" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: 400mCPU" CONTCOUNT=1 WORK='yes>/dev/null & ' create podpools-busybox
report allowed
verify-log-vs-metrics pod1:pod1c0 30 50

out ""
out "### Idle four-container pod"
CPUREQ="100m" CPULIM="100m" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: 400mCPU" CONTCOUNT=4 create podpools-busybox
report allowed
verify-metrics-has-line 'pool_cpu_usage{CPUs="[0-9]-[0-9]",container_name="pod2:pod2c0,pod2:pod2c1,pod2:pod2c2,pod2:pod2c3",def_name="400mCPU",memory="1",pod_name="pod2",policy="podpools",pool_size="2000",pretty_name="400mCPU\[[0-9]\]"}'
verify-log-vs-metrics pod2:pod2c3 0 20

out ""
out "### Busy four-container pod"
CPUREQ="100m" CPULIM="100m" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: 400mCPU" CONTCOUNT=4 WORK='yes>/dev/null & ' create podpools-busybox
report allowed
verify-log-vs-metrics pod3:pod3c3 30 50

out ""
out "### Multicontainer pod, no annotations. Runs on shared CPUs."
CPUREQ="" CPULIM="" CONTCOUNT=2 create podpools-busybox
report allowed
vm-command "curl --silent $metrics_url | grep -v ^cgroup_"
verify-log-vs-metrics pod4:pod4c1 0 20

out ""
out "### Multicontainer pod in kube-system namespace. Runs on reserved CPUs."
CPUREQ="" CPULIM="" namespace=kube-system CONTCOUNT=3 create podpools-busybox
report allowed
vm-command "curl --silent $metrics_url | grep -v ^cgroup_"
# There should be kube-apiserver, etcd etc. running on reserved CPUs as well,
# therefore allow a lot of CPU usage even though pod5 itself is not doing anything.
verify-log-vs-metrics pod5:pod5c1 0 100 cleanup ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/test06-prometheus-metrics/podpools-metrics.cfg ================================================ policy: Active: podpools ReservedResources: CPU: 1 podpools: Pools: - Name: 400mCPU Instances: 90 % CPU: 2 MaxPods: 5 # (2000m CPUs/pool) / (5 pods/pool) = 400m CPUs/pod instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: resource-manager,cache,policy,memory Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/test07-custom-default-pool/code.var.sh ================================================ # Launch cri-resmgr with a custom default pool and many highperf # pools. The CPUs in the custom default pool are disjoint from CPUs in # the reserved pool. 100 % of remaining CPUs are allocated to highperf # pools. terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/podpools-custom-default.cfg launch cri-resmgr cleanup() { ( kubectl delete pods --all --now --wait ) ( kubectl delete pod -n kube-system pod0c-mysystem --now --wait --ignore-not-found ) ( kubectl delete namespace daemons --now --wait --ignore-not-found ) } cleanup namespace=kube-system NAME=pod0c-mysystem CONTCOUNT=2 create podpools-busybox kubectl create namespace daemons namespace=daemons NAME=pod0c-mydaemon CONTCOUNT=2 create podpools-busybox report allowed verify 'len(cpus["pod0c-mysystemc0"]) == 1' \ 'len(cpus["pod0c-mydaemonc0"]) == 3' \ 'disjoint_sets(cpus["pod0c-mysystemc0"], cpus["pod0c-mydaemonc0"])' NAME=pod1c-highperf POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: highperf" CPUREQ=2 CPULIM=2 MEMREQ="" MEMLIM="" create podpools-busybox NAME=pod2c-highperf POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: highperf" CPUREQ=2 CPULIM=2 MEMREQ="" MEMLIM="" create podpools-busybox NAME=pod3c-highperf POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: highperf" CPUREQ=2 CPULIM=2 MEMREQ="" MEMLIM="" create podpools-busybox NAME=pod4c-highperf POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: highperf" CPUREQ=2 CPULIM=2 MEMREQ="" MEMLIM="" create podpools-busybox report allowed verify 'len(cpus["pod1c-highperfc0"]) == 2' \ 'len(cpus["pod2c-highperfc0"]) == 2' \ 'len(cpus["pod3c-highperfc0"]) == 2' \ 'len(cpus["pod4c-highperfc0"]) == 2' \ 'disjoint_sets(cpus["pod1c-highperfc0"], cpus["pod2c-highperfc0"], cpus["pod3c-highperfc0"], cpus["pod4c-highperfc0"])' cleanup vm-command "cat < cri-resmgr.output.txt > cri-resmgr-podpools-single-pool.output.txt" terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/test07-custom-default-pool/podpools-custom-default.cfg ================================================ policy: Active: podpools ReservedResources: CPU: cpuset:0 podpools: Pools: - Name: default CPU: 3 - Name: highperf Instances: 100% CPU: 2 MaxPods: 1 logger: Debug: resource-manager,cache,policy,memory Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2} ] ================================================ FILE: test/e2e/policies.test-suite/podpools/podpools-busybox.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n 
"$POD_ANNOTATION" ]; then echo " annotations: $POD_ANNOTATION "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - ${WORK}echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) $(if [ -n "${CPUREQ}" ]; then echo " resources: requests: cpu: ${CPUREQ} $(if [ -n "${MEMREQ}" ]; then echo " memory: '${MEMREQ}' "; fi) $(if [ -n "${CPULIM}" ]; then echo " limits: cpu: ${CPULIM} $(if [ -n "$MEMLIM" ]; then echo " memory: '${MEMLIM}' "; fi) "; fi) "; fi) "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/static-pools/README.txt ================================================ # E2E static-pools policy test ## Requirements This test requires containerd v1.4 or later on the VM. Earlier containerd versions fail to mount container images built on top of Clear Linux base image. That includes mounting cri-resmgr-webhook. `cri-resmgr-webhook` image must be present on the host (`make images`). The latest image in `docker images cri-resmgr-webhook` list will be installed and tested on the VM. ================================================ FILE: test/e2e/policies.test-suite/static-pools/cmk-exclusive.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} labels: app: ${NAME} spec: terminationGracePeriodSeconds: 1 tolerations: - key: 'cmk' operator: 'Equal' value: 'true' effect: 'NoSchedule' containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent env: $([ -z $STP_POOL ] || echo " - name: STP_POOL value: '${STP_POOL}'") $([ -z $STP_SOCKET_ID ] || echo " - name: STP_SOCKET_ID value: '${STP_SOCKET_ID}'") command: ['sh', '-c'] args: - 'while :; do echo ${NAME}c0 CMK_CPUS_ASSIGNED=\"\$CMK_CPUS_ASSIGNED\"; sleep 1; done' resources: requests: cpu: ${CPU} $([ "$EXCLCORES" = "omit" ] || echo " cmk.intel.com/exclusive-cores: '${EXCLCORES}'") limits: cpu: ${CPU} $([ "$EXCLCORES" = "omit" ] || echo " cmk.intel.com/exclusive-cores: '${EXCLCORES}'") ================================================ FILE: test/e2e/policies.test-suite/static-pools/cmk-isolate.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} labels: app: ${NAME} spec: terminationGracePeriodSeconds: 1 tolerations: - key: 'cmk' operator: 'Equal' value: 'true' effect: 'NoSchedule' containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent env: $([ -z $STP_POOL ] || echo " - name: STP_POOL value: '${STP_POOL}'") $([ -z $STP_SOCKET_ID ] || echo " - name: STP_SOCKET_ID value: '${STP_SOCKET_ID}'") $([ "$CMDSPLIT" = "command_all" ] && echo " command: ['cmk', 'isolate' $CMK_ISOLATE, 'sh', '-c', 'while :; do echo ${NAME}c0 ${ECHO_VARS}; sleep 1; done']" [ "$CMDSPLIT" = "command_cmk_sh" ] && echo " command: ['cmk', 'isolate' $CMK_ISOLATE, 'sh', '-c'] args: ['while :; do echo ${NAME}c0 ${ECHO_VARS}; sleep 1; done']" [ "$CMDSPLIT" = "command_cmk" ] && echo " command: ['cmk', 'isolate' $CMK_ISOLATE] args: ['sh', '-c', 'while :; do echo ${NAME}c0 ${ECHO_VARS}; sleep 1; done']") resources: requests: cpu: ${CPU} $([ -z $EXCLCORES ] || echo "cmk.intel.com/exclusive-cores: '${EXCLCORES}'") limits: cpu: ${CPU} $([ -z $EXCLCORES ] || echo "cmk.intel.com/exclusive-cores: '${EXCLCORES}'") ================================================ FILE: test/e2e/policies.test-suite/static-pools/cmk-tolerating-guaranteed.yaml.in 
================================================
apiVersion: v1
kind: Pod
metadata:
  name: ${NAME}
  labels:
    app: ${NAME}
spec:
  tolerations:
    - {'key': 'cmk', 'operator': 'Equal', 'value': 'true', 'effect': 'NoSchedule'}
  containers:
$(for contnum in $(seq 1 ${CONTCOUNT}); do echo "
    - name: ${NAME}c$(( contnum - 1 ))
      image: busybox
      imagePullPolicy: IfNotPresent
      command:
        - sh
        - -c
        - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf)
      resources:
        requests:
          cpu: ${CPU}
          memory: '${MEM}'
        limits:
          cpu: ${CPU}
          memory: '${MEM}'
"; done )
  terminationGracePeriodSeconds: 1

================================================
FILE: test/e2e/policies.test-suite/static-pools/cri-resmgr.cfg
================================================
policy:
  Active: static-pools
  ReservedResources:
    CPU: 750m
static-pools:
  pools:
    shared:
      cpuLists:
        - Cpuset: 0-7
          Socket: 0
        - Cpuset: 8-15
          Socket: 1
      exclusive: false
logger:
  Debug: cri-resmgr,resource-manager,cache,policy,stp
  Klog:
    skip_headers: true

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/cri-resmgr-static-pools.cfg
================================================
policy:
  Active: static-pools
  ReservedResources:
    CPU: 750m
static-pools:
  ConfFilePath: "/etc/cmk/pools.conf"
  LabelNode: true
  TaintNode: true
logger:
  Debug: cri-resmgr,resource-manager,cache,policy,stp
  Klog:
    skip_headers: true

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/py_consts.var.py
================================================
exclusive_cores={'node0/core0', 'node0/core1', 'node2/core0'}
shared_cores={'node1/core2', 'node1/core3'}
infra_cores={'node2/core1', 'node3/core2', 'node3/core3'}

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test00-node-status/code.var.sh
================================================
# Test that the static-pools policy
# 1. labels the node with cmk.intel.com/cmk-node
# 2. advertises the correct number of exclusive-cores resources
# 3. taints the node

# shellcheck disable=SC2148
cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

out ""
out "### Verifying that the node has the cmk-node label"
vm-run-until 'kubectl get nodes -o jsonpath="{.items[*].metadata.labels}" | grep \"cmk.intel.com/cmk-node\"\:\"true\"' || error "cmk.intel.com/cmk-node label missing"

out ""
out "### Verifying that the number of exclusive cores on the node matches /etc/cmk/pools.conf"
vm-run-until 'kubectl get nodes -o jsonpath="{.items[*].status.allocatable}" | grep -q \"cmk.intel.com/exclusive-cores\"\:\"3\"' || error "expected 3 allocatable cmk.intel.com/exclusive-cores"

out ""
out "### Creating a pod that should not be scheduled due to node taint"
( wait_t=2s create besteffort ) || {
    echo "failed as expected due to node taint"
}

out ""
out "### Verifying that scheduling a normal pod failed"
vm-command 'kubectl describe pods/pod0 | grep -E "FailedScheduling .*cmk: true"' || {
    error "FailedScheduling expected but not found"
}

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test01-exclusive-pods/code.var.sh
================================================
# Test that exclusive-cores containers
# 1. run on exclusive cores
# 2. are pinned according to STP_POOL and STP_SOCKET_ID
#    when "cmk isolate" is not used.
# 3. all exclusive cores can be consumed with and without
#    specifying STP_SOCKET_ID.
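# For reference: in the test pools.conf (see
# static-pools/n4c16/vm-files/etc/cmk/pools.conf below), the exclusive
# pool has two CPU lists on socket 0 (cpus 0,1 and 2,3) and one on
# socket 1 (cpus 8,9), i.e. the three exclusive cores listed in
# py_consts.var.py above.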
# shellcheck disable=SC2148
cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

export STP_POOL=exclusive

out ""
out "### Creating exclusive CMK pod with 1 exclusive core"
CPU=1000m STP_SOCKET_ID=1 EXCLCORES=1 create cmk-exclusive
report allowed
verify 'len(cores["pod0c0"]) == 1' \
       'packages["pod0c0"] == {"package1"}'

out ""
out "### Deleting exclusive CMK pod"
kubectl delete pods --all --now --wait

out ""
out "### Creating exclusive CMK pod with 2 exclusive cores"
CPU=1000m STP_SOCKET_ID=0 EXCLCORES=2 create cmk-exclusive
report allowed
verify 'len(cores["pod1c0"]) == 2' \
       'packages["pod1c0"] == {"package0"}'

out ""
out "### Deleting exclusive CMK pod"
kubectl delete pods --all --now --wait

out ""
out "### Creating two exclusive CMK pods with 1 exclusive core each"
n=2 CPU=1000m STP_SOCKET_ID=0 EXCLCORES=1 create cmk-exclusive
report allowed
verify 'len(cores["pod2c0"]) == 1' \
       'len(cores["pod3c0"]) == 1' \
       'disjoint_sets(cores["pod2c0"], cores["pod3c0"])' \
       'packages["pod2c0"] == packages["pod3c0"] == {"package0"}'

out ""
out "### Creating one more exclusive CMK pod, consuming all exclusive cores"
CPU=1000m STP_SOCKET_ID=1 EXCLCORES=1 create cmk-exclusive
report allowed
verify 'len(cores["pod2c0"]) == 1' \
       'len(cores["pod3c0"]) == 1' \
       'len(cores["pod4c0"]) == 1' \
       'disjoint_sets(cores["pod2c0"], cores["pod3c0"], cores["pod4c0"])' \
       'set.union(cores["pod2c0"], cores["pod3c0"], cores["pod4c0"]) == exclusive_cores'

kubectl delete pods --all --now --wait

out ""
out "### Test consuming all exclusive cores without specifying STP_SOCKET_ID"
n=3 CPU=1000m STP_SOCKET_ID="" EXCLCORES=1 create cmk-exclusive
verify 'len(cores["pod5c0"]) == 1' \
       'len(cores["pod6c0"]) == 1' \
       'len(cores["pod7c0"]) == 1' \
       'disjoint_sets(cores["pod5c0"], cores["pod6c0"], cores["pod7c0"])' \
       'set.union(cores["pod5c0"], cores["pod6c0"], cores["pod7c0"]) == exclusive_cores'

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test02-pods-without-cmk/code.var.sh
================================================
# Test that normal pods/containers scheduled on a CMK node
# are running in the shared pool, even when the pool does not
# have as many CPUs as requested.

cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

out ""
out "### Creating a guaranteed pod, 1 CPU, goes to the shared pool"
CPU=1 create cmk-tolerating-guaranteed
report allowed
verify 'cores["pod0c0"].issubset(shared_cores)'

out ""
out "### Creating next guaranteed pod, 2 CPUs, goes to the shared pool"
CPU=2 create cmk-tolerating-guaranteed
report allowed
verify 'cores["pod0c0"].issubset(shared_cores)' \
       'cores["pod1c0"].issubset(shared_cores)'

out ""
out "### Creating next guaranteed pod, 4 CPUs, goes to the shared pool"
CPU=4 create cmk-tolerating-guaranteed
report allowed
verify 'cores["pod0c0"].issubset(shared_cores)' \
       'cores["pod1c0"].issubset(shared_cores)' \
       'cores["pod2c0"].issubset(shared_cores)'

out ""
out "### Creating next guaranteed pod, 6 CPUs, goes to the shared pool"
CPU=6 create cmk-tolerating-guaranteed
report allowed
verify 'cores["pod0c0"].issubset(shared_cores)' \
       'cores["pod1c0"].issubset(shared_cores)' \
       'cores["pod2c0"].issubset(shared_cores)' \
       'cores["pod3c0"].issubset(shared_cores)'

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test03-cmk-isolate/code.var.sh
================================================
# Test that legacy exclusive-cores containers
# 1. run on exclusive cores
# 2. are pinned according to "cmk isolate" command
#    parameters
# 3. run without "cmk" existing on the image
# 4. all exclusive cores can be consumed

# shellcheck disable=SC2148
cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

export STP_POOL="" STP_SOCKET_ID=""

export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=exclusive', '--socket-id=1'"
out ""
out "### Creating pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES=1 CMDSPLIT="command_all" create cmk-isolate
report allowed
verify 'len(cores["pod0c0"]) == 1' \
       'cores["pod0c0"].issubset(exclusive_cores)' \
       'packages["pod0c0"] == {"package1"}'

export CMK_ISOLATE=", '--socket-id=0', '--pool=exclusive'"
out ""
out "### Creating pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=2000m EXCLCORES=2 CMDSPLIT="command_cmk_sh" create cmk-isolate
report allowed
verify 'len(cores["pod1c0"]) == 2' \
       'cores["pod1c0"].issubset(exclusive_cores)' \
       'packages["pod1c0"] == {"package0"}'

export CMK_ISOLATE=", '--pool=shared'"
out ""
out "### Creating pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES="" CMDSPLIT="command_cmk" create cmk-isolate
report allowed
verify 'cores["pod2c0"] == shared_cores'

export CMDSPLIT="command_cmk"
export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=infra'"
out ""
out "### Creating pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES="" create cmk-isolate
report allowed
verify 'cores["pod3c0"] == infra_cores'

out ""
out "### Deleting only exclusive CMK pods, leaving shared/infra running"
kubectl delete pods/pod0 pods/pod1 --now --wait --ignore-not-found

export CMK_ISOLATE=", '--pool=exclusive'"
out ""
out "### Creating 3 exclusive pods 'cmk', 'isolate'$CMK_ISOLATE..."
n=3 CPU=1000m EXCLCORES=1 create cmk-isolate
report allowed
verify 'len(cores["pod4c0"]) == 1' \
       'len(cores["pod5c0"]) == 1' \
       'len(cores["pod6c0"]) == 1' \
       'disjoint_sets(cores["pod4c0"], cores["pod5c0"], cores["pod6c0"])' \
       'cores["pod4c0"].issubset(exclusive_cores)' \
       'cores["pod5c0"].issubset(exclusive_cores)' \
       'cores["pod6c0"].issubset(exclusive_cores)'

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test04-cmk-isolate-noaffinity/code.var.sh
================================================
# Test that cmk isolate --no-affinity is effective on every pool
# with and without STP_POOL / STP_SOCKET_ID env vars.
# Test that all exclusive cores can be consumed with --no-affinity.

# shellcheck disable=SC2148
cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

export STP_POOL="" STP_SOCKET_ID="" CMDSPLIT="command_all"
export ECHO_VARS='CMK_CPUS_ASSIGNED="$CMK_CPUS_ASSIGNED" CMK_CPUS_SHARED="$CMK_CPUS_SHARED" CMK_CPUS_INFRA="$CMK_CPUS_INFRA"'

export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=exclusive', '--socket-id=0', '--no-affinity'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES=1 create cmk-isolate
report allowed
verify 'len(cores["pod0c0"]) == 8'
cpus_assigned="$(kubectl logs pod0 | tail -n 1 | awk '{print $2}')"
cpus_shared="$(kubectl logs pod0 | tail -n 1 | awk '{print $3}')"
cpus_infra="$(kubectl logs pod0 | tail -n 1 | awk '{print $4}')"
[ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=0,1" ] || error "expected CMK_CPUS_ASSIGNED=0,1, got $cpus_assigned"
[ "$cpus_shared" == "CMK_CPUS_SHARED=4-6,7" ] || error "expected CMK_CPUS_SHARED=4-6,7, got $cpus_shared"
[ "$cpus_infra" == "CMK_CPUS_INFRA=10-15" ] || error "expected CMK_CPUS_INFRA=10-15, got $cpus_infra"

export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=exclusive', '--socket-id=1', '--no-affinity'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES=1 STP_POOL="exclusive" STP_SOCKET_ID="1" create cmk-isolate
report allowed
verify 'len(cores["pod1c0"]) == 8'
cpus_assigned="$(kubectl logs pod1 | tail -n 1 | awk '{print $2}')"
[ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=8,9" ] || error "expected CMK_CPUS_ASSIGNED=8,9, got $cpus_assigned"

export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=exclusive', '--no-affinity'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES=1 STP_POOL="exclusive" create cmk-isolate
report allowed
verify 'len(cores["pod2c0"]) == 8'
cpus_assigned="$(kubectl logs pod2 | tail -n 1 | awk '{print $2}')"
[ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=2,3" ] || error "expected CMK_CPUS_ASSIGNED=2,3, got $cpus_assigned"

export CMK_ISOLATE=", '--no-affinity', '--pool=shared'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES="" create cmk-isolate
report allowed
verify 'len(cores["pod3c0"]) == 8'
cpus_assigned="$(kubectl logs pod3 | tail -n 1 | awk '{print $2}')"
[ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=4-6,7" ] || error "expected CMK_CPUS_ASSIGNED=4-6,7, got $cpus_assigned"

export CMK_ISOLATE=", '--pool=infra', '--no-affinity'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES="" create cmk-isolate report allowed verify 'len(cores["pod4c0"]) == 8' cpus_assigned="$(kubectl logs pod4 | tail -n 1 | awk '{print $2}')" [ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=10-15" ] || error "expected CMK_CPUS_ASSIGNED=10-15 got $cpus_assigned" ================================================ FILE: test/e2e/policies.test-suite/static-pools/n4c16/test05-negative-tests/code.var.sh ================================================ # shellcheck disable=SC2148 cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg" static-pools-relaunch-cri-resmgr export STP_POOL=exclusive errmsg_zero_cores="static-pools: exclusive pool specified but the number of exclusive CPUs requested is 0" errmsg_non_existing_pool="static-pools: non-existent pool" errmsg_not_enough_exclcores="static-pools: not enough free cpu lists" out "" out "### Request cores from non-existing pool" ( CPU=1000m STP_SOCKET_ID=0 EXCLCORES=1 STP_POOL=elusive wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod launched with cores from non-existing pool" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_non_existing_pool'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request cores from non-existing socket" ( CPU=1000m STP_SOCKET_ID=2 EXCLCORES=1 wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod launched with cores from non-existing socket" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_not_enough_exclcores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request exclusive pool but do not mention exclusive-cores" ( CPU=1000m STP_SOCKET_ID=0 EXCLCORES='omit' wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod launched without mentioning exclusive cores from the exclusive pool" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_zero_cores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request 0 cores from exclusive pool" ( CPU=1000m STP_SOCKET_ID=0 EXCLCORES=0 wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod launched with 0 cores from the exclusive pool" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_zero_cores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request more cores from socket 0 than available" ( CPU=3000m STP_SOCKET_ID=0 EXCLCORES=3 wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod got too many cores successfully" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_not_enough_exclcores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request more cores from socket 1 than available" ( CPU=1000m STP_SOCKET_ID=1 EXCLCORES=2 wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod got too many cores successfully" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_not_enough_exclcores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all 
--now --wait || error "failed to delete pods" ================================================ FILE: test/e2e/policies.test-suite/static-pools/n4c16/test99-cleanup/code.var.sh ================================================ # This test cleans up static-pools test suite configurations from vm. # Other policy tests can be run after this test on the same vm without # recreating the vm from scratch. static-pools-cleanup ================================================ FILE: test/e2e/policies.test-suite/static-pools/n4c16/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2} ] ================================================ FILE: test/e2e/policies.test-suite/static-pools/n4c16/vm-files/etc/cmk/pools.conf ================================================ pools: exclusive: cpuLists: - Cpuset: 8,9 Socket: 1 - Cpuset: 0,1 Socket: 0 - Cpuset: 2,3 Socket: 0 exclusive: true shared: cpuLists: - Cpuset: 4-6,7 Socket: 0 exclusive: false infra: cpuLists: - Cpuset: 10-15 Socket: 1 exclusive: false ================================================ FILE: test/e2e/policies.test-suite/static-pools/static-pools-lib.source.sh ================================================ # shellcheck disable=SC2148 static-pools-relaunch-cri-resmgr() { local webhook_running=0 out "# Relaunching cri-resmgr and agent, launch webhook if not already running" vm-command-q "kubectl get mutatingwebhookconfiguration/cri-resmgr" >& /dev/null && { webhook_running=1 } # cleanup terminate cri-resmgr terminate cri-resmgr-agent vm-command "rm -rf /var/lib/cri-resmgr" extended-resources remove cmk.intel.com/exclusive-cpus >/dev/null # launch again launch cri-resmgr-agent launch cri-resmgr vm-run-until "! kubectl get node | grep NotReady" || error "kubectl node is NotReady after launching cri-resmgr-agent and cri-resmgr" if [ "$webhook_running" == 0 ]; then vm-command-q "[ -f webhook/webhook-deployment.yaml ]" || install cri-resmgr-webhook launch cri-resmgr-webhook fi } static-pools-cleanup() { ( terminate cri-resmgr-agent ) ( uninstall cri-resmgr-webhook ) ( extended-resources remove cmk.intel.com/exclusive-cpus >/dev/null ) ( terminate cri-resmgr ) vm-command 'kubectl taint node $(hostname) cmk=true:NoSchedule-' || true } ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test01-pmem-node-assigning/code.var.sh ================================================ # Test that CPU-less PMEM nodes are assigned to closest nodes with CPU. # Restart cri-resmgr in order to clear logs and make sure assignment # is successful with installed cri-resmgr. 
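# (Hypothetical manual check: on this c4pmem4 topology "numactl -H" on
# the VM should show empty "cpus" lines for nodes 4-7; those CPU-less
# nodes are the PMEM nodes whose assignment is verified below.)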
terminate cri-resmgr launch cri-resmgr CRI_RESMGR_OUTPUT_COMMAND="cat cri-resmgr.output.txt" echo "Verify PMEM node assignment to CPU-ful nodes" for expected_output in \ "PMEM node #4 assigned to .*#2" \ "PMEM node #5 assigned to .*#3" \ "PMEM node #6 assigned to .*#0" \ "PMEM node #7 assigned to .*#1"; do vm-command "$CRI_RESMGR_OUTPUT_COMMAND | grep -E '$expected_output'" || command-error "expected PMEM assignment not found" done ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test02-annotation-memory-type/code.var.sh ================================================ # Test that container memory is pinned according to memory-type annotation # pod0c0 runs on node 1, uses only dram # pod0c1 runs on node 2, uses only pmem # pod0c2 runs on node 3, uses dram+pmem # pod0c9 runs on root node (all non-reserved CPUs), # no memory-type restrictions (=> use all memory nodes) MEM=250M MEMTYPEC0=dram MEMTYPEC1=pmem MEMTYPEC2=pmem,dram create memtype-guaranteed report allowed verify 'cpus["pod0c0"] == {"cpu1"}' \ 'mems["pod0c0"] == {"node1"}' \ 'cpus["pod0c1"] == {"cpu2"}' \ 'mems["pod0c1"] == {"node4"}' \ 'cpus["pod0c2"] == {"cpu3"}' \ 'mems["pod0c2"] == {"node3", "node5"}' \ 'cpus["pod0c9"] == {"cpu1", "cpu2", "cpu3"}' \ 'mems["pod0c9"] == {"node0", "node1", "node2", "node3", "node4", "node5", "node6", "node7"}' ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test02-annotation-memory-type/memtype-guaranteed.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: memory-type.cri-resource-manager.intel.com/container.${NAME}c0: ${MEMTYPEC0} memory-type.cri-resource-manager.intel.com/container.${NAME}c1: ${MEMTYPEC1} memory-type.cri-resource-manager.intel.com/container.${NAME}c2: ${MEMTYPEC2} spec: containers: $(for CONT in 0 1 2; do echo " - name: ${NAME}c${CONT} image: busybox imagePullPolicy: IfNotPresent command: ['sh', '-c', 'echo ${NAME}c${CONT} \$(sleep inf)'] resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} "; done) - name: ${NAME}c9 image: busybox imagePullPolicy: IfNotPresent command: ['sh', '-c', 'echo ${NAME}c9 \$(sleep inf)'] resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test02-annotation-memory-type-deprecated-syntax/code.var.sh ================================================ # Test that container memory is pinned according to memory-type annotation # pod0c0 runs on node 1, uses only dram # pod0c1 runs on node 2, uses only pmem # pod0c2 runs on node 3, uses dram+pmem # pod0c9 runs on root node (all non-reserved CPUs), # no memory-type restrictions (=> use all memory nodes) MEM=250M MEMTYPEC0=dram MEMTYPEC1=pmem MEMTYPEC2=pmem,dram create memtype-guaranteed report allowed verify 'cpus["pod0c0"] == {"cpu1"}' \ 'mems["pod0c0"] == {"node1"}' \ 'cpus["pod0c1"] == {"cpu2"}' \ 'mems["pod0c1"] == {"node4"}' \ 'cpus["pod0c2"] == {"cpu3"}' \ 'mems["pod0c2"] == {"node3", "node5"}' \ 'cpus["pod0c9"] == {"cpu1", "cpu2", "cpu3"}' \ 'mems["pod0c9"] == {"node0", "node1", "node2", "node3", "node4", "node5", "node6", "node7"}' ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test02-annotation-memory-type-deprecated-syntax/memtype-guaranteed.yaml.in ================================================ 
apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: cri-resource-manager.intel.com/memory-type: | ${NAME}c0: ${MEMTYPEC0} ${NAME}c1: ${MEMTYPEC1} ${NAME}c2: ${MEMTYPEC2} spec: containers: $(for CONT in 0 1 2; do echo " - name: ${NAME}c${CONT} image: busybox imagePullPolicy: IfNotPresent command: ['sh', '-c', 'echo ${NAME}c${CONT} \$(sleep inf)'] resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} "; done) - name: ${NAME}c9 image: busybox imagePullPolicy: IfNotPresent command: ['sh', '-c', 'echo ${NAME}c9 \$(sleep inf)'] resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart/bb-coldstart.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: memory-type.cri-resource-manager.intel.com/container.${NAME}c0: dram,pmem cold-start.cri-resource-manager.intel.com/container.${NAME}c0: | duration: ${DURATION} spec: containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - 'cold_alloc=\$(dd if=/dev/zero bs=${COLD_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\"); sh -c \"paused after cold_alloc \\\$(sleep inf)\"; warm_alloc=\$(dd if=/dev/zero bs=${WARM_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\"); sh -c \"paused after warm_alloc \\\$(sleep inf)\"; echo ${NAME}c0 \$(sleep inf); # needed for pod resource discovery' resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart/code.var.sh ================================================ # Test that a cold-started pod... # 1. is allowed to allocate memory only from PMEM nodes # during cold period (of length $DURATION). # 2. is restricted from the very beginning of pod execution: # immediately allocated memory blob consumes PMEM from expected node. # 3. is allowed to allocate memory from both PMEM and DRAM after # the cold period. # 4. is no more restricted after $DURATION + 1s has passed in pod: # warm-allocated memory is not taken from PMEM nodes. PMEM_NODES='{"node4", "node5", "node6", "node7"}' # pmem-used returns total MemUsed (allocated) memory on PMEM nodes pmem-used() { local pmem_nodes_shell=${PMEM_NODES//[\" ]/} vm-command "cat /sys/devices/system/node/$pmem_nodes_shell/meminfo | awk '/MemUsed:/{mem+=\$4}END{print mem}'" >/dev/null || command-error "cannot read PMEM usage from node $node" echo "$COMMAND_OUTPUT" } CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt" PMEM_USED_BEFORE_POD0="$(pmem-used)" DURATION=10s COLD_ALLOC_KB=$((50 * 1024)) WARM_ALLOC_KB=$((100 * 1024)) MEM=1G create bb-coldstart echo "Wait that coldstart period is started for the pod" vm-run-until "$CRI_RESMGR_OUTPUT | grep 'coldstart: triggering coldstart for pod0:pod0c0'" || error "cri-resmgr did not report triggering coldstart period" verify 'cores["pod0c0"] == {"node1/core0"}' \ "mems['pod0c0'] == {'node7'}" echo "Wait that the pod has finished memory allocation during cold period." vm-run-until "pgrep -f '^sh -c paused after cold_alloc'" >/dev/null || error "cold memory allocation timed out" echo "Verify PMEM consumption during cold period." 
# meminfo MemUsed vs dd bytes error margin, use 10% PMEM_ERROR_MARGIN=$((COLD_ALLOC_KB / 10)) sleep 1 PMEM_USED_COLD_POD0="$(pmem-used)" PMEM_COLD_CONSUMED=$(( $PMEM_USED_COLD_POD0 - $PMEM_USED_BEFORE_POD0 )) if (( $PMEM_COLD_CONSUMED + $PMEM_ERROR_MARGIN < $COLD_ALLOC_KB )); then error "pod0 did not allocate ${COLD_ALLOC_KB}kB from PMEM. MemUsed PMEM delta: $PMEM_COLD_CONSUMED" else echo "### Verified: PMEM memory consumed during cold period: $PMEM_COLD_CONSUMED kB, pod script allocated: $COLD_ALLOC_KB kB" fi coldstarts=$(vm-command-q "$CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l") echo "Wait that cri-resmgr finishes coldstart period within 5s + $DURATION." sleep 5s vm-run-until --timeout ${DURATION%s} "[ \$($CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l) -gt $coldstarts ]" || error "cri-resmgr did not report finishing coldstart period within $DURATION" vm-command "$CRI_RESMGR_OUTPUT | grep 'pinning to memory 1,7'" || error "cri-resmgr did not report pinning to expected memory nodes" verify 'cores["pod0c0"] == {"node1/core0"}' \ 'mems["pod0c0"] == {"node1", "node7"}' echo "Let the pod continue from cold_alloc to warm_alloc." vm-command 'kill -9 $(pgrep -f "^sh -c paused after cold_alloc")' echo "Make sure that bb-coldstart finishes allocating memory in warm mode." vm-run-until "pgrep -f '^sh -c paused after warm_alloc'" || error "warm memory allocation timed out" echo "Verify (soft): PMEM consumption after cold period." sleep 1 PMEM_USED_WARM_POD0="$(pmem-used)" PMEM_WARM_CONSUMED=$(( $PMEM_USED_WARM_POD0 - $PMEM_USED_COLD_POD0 )) if (( $PMEM_WARM_CONSUMED > 0 )); then echo "### Verify (soft) failed: pod0 allocated $WARM_ALLOC_KB kB from PMEM. Should have been taken from DRAM." else echo "### Verified (soft): PMEM memory consumption delta during warm period: $PMEM_WARM_CONSUMED kB, pod script allocated: $WARM_ALLOC_KB kB" fi ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart-deprecated-syntax/bb-coldstart.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: cri-resource-manager.intel.com/memory-type: | ${NAME}c0: dram,pmem cri-resource-manager.intel.com/cold-start: | ${NAME}c0: duration: ${DURATION_S}s spec: containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - 'cold_alloc=\$(dd if=/dev/zero bs=${COLD_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\"); sh -c \"paused after cold_alloc \\\$(sleep inf)\"; warm_alloc=\$(dd if=/dev/zero bs=${WARM_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\"); sh -c \"paused after warm_alloc \\\$(sleep inf)\"; echo ${NAME}c0 \$(sleep inf); # needed for pod resource discovery' resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart-deprecated-syntax/code.var.sh ================================================ # Test that a cold-started pod... # 1. is allowed to allocate memory only from PMEM nodes # during cold period (of length $DURATION_S). # 2. is restricted from the very beginning of pod execution: # immediately allocated memory blob consumes PMEM from expected node. # 3. is allowed to allocate memory from both PMEM and DRAM after # the cold period. # 4. is no more restricted after $DURATION_S + 1s has passed in pod: # warm-allocated memory is not taken from PMEM nodes. 
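# (For reference, this variant drives the same scenario as test03-coldstart
# but through the deprecated pod-level annotation syntax shown in
# bb-coldstart.yaml.in above, e.g.
#   cri-resource-manager.intel.com/cold-start: |
#     pod0c0:
#       duration: 10s
# instead of the per-container key
#   cold-start.cri-resource-manager.intel.com/container.pod0c0.)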
PMEM_NODES='{"node4", "node5", "node6", "node7"}' # pmem-used returns total MemUsed (allocated) memory on PMEM nodes pmem-used() { local pmem_nodes_shell=${PMEM_NODES//[\" ]/} vm-command "cat /sys/devices/system/node/$pmem_nodes_shell/meminfo | awk '/MemUsed:/{mem+=\$4}END{print mem}'" >/dev/null || command-error "cannot read PMEM usage from node $node" echo "$COMMAND_OUTPUT" } CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt" PMEM_USED_BEFORE_POD0="$(pmem-used)" DURATION_S=10 COLD_ALLOC_KB=$((50 * 1024)) WARM_ALLOC_KB=$((100 * 1024)) MEM=1G create bb-coldstart echo "Wait that coldstart period is started for the pod" vm-run-until "$CRI_RESMGR_OUTPUT | grep 'coldstart: triggering coldstart for pod0:pod0c0'" || error "cri-resmgr did not report triggering coldstart period" verify 'cores["pod0c0"] == {"node1/core0"}' \ "mems['pod0c0'] == {'node7'}" echo "Wait that the pod has finished memory allocation during cold period." vm-run-until "pgrep -f '^sh -c paused after cold_alloc'" >/dev/null || error "cold memory allocation timed out" echo "Verify PMEM consumption during cold period." # meminfo MemUsed vs dd bytes error margin, use 10% PMEM_ERROR_MARGIN=$((COLD_ALLOC_KB / 10)) sleep 1 PMEM_USED_COLD_POD0="$(pmem-used)" PMEM_COLD_CONSUMED=$(( $PMEM_USED_COLD_POD0 - $PMEM_USED_BEFORE_POD0 )) if (( $PMEM_COLD_CONSUMED + $PMEM_ERROR_MARGIN < $COLD_ALLOC_KB )); then error "pod0 did not allocate ${COLD_ALLOC_KB}kB from PMEM. MemUsed PMEM delta: $PMEM_COLD_CONSUMED" else echo "### Verified: PMEM memory consumed during cold period: $PMEM_COLD_CONSUMED kB, pod script allocated: $COLD_ALLOC_KB kB" fi coldstarts=$(vm-command-q "$CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l") echo "Wait that cri-resmgr finishes coldstart period within $(($DURATION_S + 10)) seconds." vm-run-until --timeout $((DURATION_S + 10)) "[ \$($CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l) -gt $coldstarts ]" || error "cri-resmgr did not report finishing coldstart period within $DURATION_S seconds" vm-command "$CRI_RESMGR_OUTPUT | grep 'pinning to memory 1,7'" || error "cri-resmgr did not report pinning to expected memory nodes" verify 'cores["pod0c0"] == {"node1/core0"}' \ 'mems["pod0c0"] == {"node1", "node7"}' echo "Let the pod continue from cold_alloc to warm_alloc." vm-command 'kill -9 $(pgrep -f "^sh -c paused after cold_alloc")' echo "Make sure that bb-coldstart finishes allocating memory in warm mode." vm-run-until "pgrep -f '^sh -c paused after warm_alloc'" || error "warm memory allocation timed out" echo "Verify (soft): PMEM consumption after cold period." sleep 1 PMEM_USED_WARM_POD0="$(pmem-used)" PMEM_WARM_CONSUMED=$(( $PMEM_USED_WARM_POD0 - $PMEM_USED_COLD_POD0 )) if (( $PMEM_WARM_CONSUMED > 0 )); then echo "### Verify (soft) failed: pod0 allocated $WARM_ALLOC_KB kB from PMEM. Should have been taken from DRAM." 
else echo "### Verified (soft): PMEM memory consumption delta during warm period: $PMEM_WARM_CONSUMED kB, pod script allocated: $WARM_ALLOC_KB kB" fi ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion/bb-memload.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: memory-type.cri-resource-manager.intel.com/pod: dram,pmem spec: containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c0; done | awk '{r+=1;if(r<${WORN%M}*1024*1024/$BSIZE){worn[r]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WORN%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WORN%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c1 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c1; done | awk '{r+=1;wmrn[r%(${WMRN%M}*1024*1024/$BSIZE)]=\$1;wr+=1;if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WMRN%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WMRN%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c2 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c2; done | awk '{r+=1;if (worm[r%(${WORM%M}*1024*1024/$BSIZE)]!=\$1){worm[r%(${WORM%M}*1024*1024/$BSIZE)]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WORM%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WORM%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c3 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c3; done | awk '{r+=1;if (wmrm[r%(${WMRM%M}*1024*1024/$BSIZE)]!=\$1 || length(\$1) > 0){wmrm[r%(${WMRM%M}*1024*1024/$BSIZE)]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WMRM%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WMRM%M} * 1024 * 1024 / $BSIZE + 100000 ))k ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion/code.var.sh ================================================ # Test migrating memory pages from DRAM to PMEM. # - Memory pages that are written once and never read # must be migrated to PMEM and must stay there. # - Memory pages that are actively written and read # must not be migrated to PMEM. # - Migration speed is as configured. vm-command "echo 0 > /proc/sys/kernel/numa_balancing || true" # Relaunch cri-resmgr with dynamic page demotion configuration. cri_resmgr_cfg=$TEST_DIR/cri-resmgr-dynamic-page-demotion.cfg terminate cri-resmgr launch cri-resmgr # Different memory usage profiles are implemented with awk # in order to manage with the same busybox image as other tests. # Memory size parameters for the busybox memory load pod: # - BSIZE: Block size in bytes (length of each stored string) # The larger the block the faster the awk goes through its memory. 
# If too large, memory for strings is no more allocated from heap # which makes page tracking harder and breaks this test. # - WORN: Write Once Read Never # - WORM: Write Once Read Many # - WMRN: Write Many Read Never # - WMRM: Write Many Read Many PRINT_WRBYTES_IF="wr%1000==0 && wr<10000" CPU=500m BSIZE=4096 awkmem=2M WORN=$awkmem WORM=$awkmem WMRN=$awkmem WMRM=$awkmem create bb-memload # Calculate page migration speed from cri-resmgr configuration. pages_per_second_per_process="$(awk ' /MaxPageMoveCount:/{mpmc=$2} /PageMoveInterval:/{gsub(/[^0-9]/, "", $2); pmi=$2} END{print mpmc/pmi} ' < "$cri_resmgr_cfg")" # After how many rounds (seconds) first migrations should be visible. first_migrations_visible="$(awk ' /PageScanInterval:/{gsub(/[^0-9]/, "", $2); print $2+8} ' < "$cri_resmgr_cfg")" # Expected migrated number of pages when fully migrated. pages_error_margin=100 fully_migrated_threshold=$(( ${awkmem%M} * 1024 * 1024 / 4096 - pages_error_margin )) # Maximum number of pages in PMEM when not migrated. not_migrated_threshold=$pages_error_margin # Watch memory page locations and validate results. memload_stats="$OUTPUT_DIR/memload-stats.txt" echo -n "" > "$memload_stats" max_rounds=30 round=0 declare -A pmem_pages_prev # number of pages in PMEM in previous round for wxrx in wmrm wmrn worm worn; do pmem_pages_prev[$wxrx]=0 done while (( round < max_rounds )); do vm-command-q ' cat /sys/devices/system/node/node[0-7]/meminfo | awk "/Active:/{a[\$2]=(\$4/1024)}END{s=\"active mem\";for(n=0;n<8;n++){s=sprintf(\"%s N%d=%.0fM\",s,n,a[n])}print s}" for p in $(pidof awk); do awkinfo=$(grep -a -o -E w[om]r[nm] /proc/$p/cmdline | head -n 1) rss=$(awk "/VmRSS:/{print \$2}" < /proc/$p/status); pages=$(echo $(grep -v file= /proc/$p/numa_maps | tr " " "\n" | awk -F= "/N([0-9])/{s[\$1]+=\$2}END{for(n=0;n<8;n++)if (s[\"N\"n]>0)print \"N\"n\"=\"s[\"N\"n]}")) echo "$awkinfo" pid "$p" VmRSS "$rss" kB, "pages:" "$pages" done' | while read line; do echo "round $round $line"; done | tee -a "$memload_stats" echo "validating..." # Check that at least something has migrated after scan period. if (( round > first_migrations_visible )); then grep -q -E 'pages:.*N[4-7]' "$memload_stats" || error "any of the awk processes was not migrated to PMEM in time" fi # Validate PMEM page migration speed. # Allow double the configured speed because stats polling interval > 1s. for wxrx in wmrm wmrn worm worn; do pmem_pages_now="$(grep "round $round $wxrx .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( pmem_pages_now - pmem_pages_prev[$wxrx] > 2 * pages_per_second_per_process )); then error "number of PMEM pages of $wxrx grew too quickly on this round" fi pmem_pages_prev[$wxrx]=$pmem_pages_now done # Check that write-once-read-never (worn) has migrated and stays in PMEM. if (( round > 20 )); then worn_pmem_pages="$(grep "round $round worn .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( worn_pmem_pages < fully_migrated_threshold )); then error "write-once-read-never was expected to end up and stay in PMEM, but only $worn_pmem_pages pages in PMEM." fi fi # Check that write-many-read-many and -read-never (wmrm and wmrn) stay in DRAM. 
for wmrx in wmrm wmrn; do wmrx_pmem_pages="$(grep "round $round $wmrx .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( wmrx_pmem_pages > not_migrated_threshold )); then error "$wmrx was expected to stay in DRAM, but $wmrx_pmem_pages pages migrated to PMEM." fi done sleep 1 >/dev/null round=$(( round + 1 )) done echo "All rounds were good." kubectl delete pods --all --now --wait ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion/cri-resmgr-dynamic-page-demotion.cfg ================================================ policy: Active: topology-aware ReservedResources: CPU: 250m resource-manager: control: page-migration: PageScanInterval: 10s PageMoveInterval: 1s MaxPageMoveCount: 100 logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion-deprecated-syntax/bb-memload.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: cri-resource-manager.intel.com/memory-type: | pod0c0: dram,pmem pod0c1: dram,pmem pod0c2: dram,pmem pod0c3: dram,pmem spec: containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c0; done | awk '{r+=1;if(r<${WORN%M}*1024*1024/$BSIZE){worn[r]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WORN%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WORN%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c1 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c1; done | awk '{r+=1;wmrn[r%(${WMRN%M}*1024*1024/$BSIZE)]=\$1;wr+=1;if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WMRN%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WMRN%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c2 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c2; done | awk '{r+=1;if (worm[r%(${WORM%M}*1024*1024/$BSIZE)]!=\$1){worm[r%(${WORM%M}*1024*1024/$BSIZE)]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WORM%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WORM%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c3 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c3; done | awk '{r+=1;if (wmrm[r%(${WMRM%M}*1024*1024/$BSIZE)]!=\$1 || length(\$1) > 0){wmrm[r%(${WMRM%M}*1024*1024/$BSIZE)]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WMRM%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WMRM%M} * 1024 * 1024 / $BSIZE + 100000 ))k ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion-deprecated-syntax/code.var.sh ================================================ # Test migrating memory pages from DRAM to PMEM. 
# - Memory pages that are written once and never read # must be migrated to PMEM and must stay there. # - Memory pages that are actively written and read # must not be migrated to PMEM. # - Migration speed is as configured. # Relaunch cri-resmgr with dynamic page demotion configuration. cri_resmgr_cfg=$TEST_DIR/cri-resmgr-dynamic-page-demotion.cfg terminate cri-resmgr launch cri-resmgr # Different memory usage profiles are implemented with awk # in order to manage with the same busybox image as other tests. # Memory size parameters for the busybox memory load pod: # - BSIZE: Block size in bytes (length of each stored string) # The larger the block the faster the awk goes through its memory. # If too large, memory for strings is no more allocated from heap # which makes page tracking harder and breaks this test. # - WORN: Write Once Read Never # - WORM: Write Once Read Many # - WMRN: Write Many Read Never # - WMRM: Write Many Read Many PRINT_WRBYTES_IF="wr%1000==0 && wr<10000" CPU=500m BSIZE=4096 awkmem=2M WORN=$awkmem WORM=$awkmem WMRN=$awkmem WMRM=$awkmem create bb-memload # Calculate page migration speed from cri-resmgr configuration. pages_per_second_per_process="$(awk ' /MaxPageMoveCount:/{mpmc=$2} /PageMoveInterval:/{gsub(/[^0-9]/, "", $2); pmi=$2} END{print mpmc/pmi} ' < "$cri_resmgr_cfg")" # After how many rounds (seconds) first migrations should be visible. first_migrations_visible="$(awk ' /PageScanInterval:/{gsub(/[^0-9]/, "", $2); print $2+8} ' < "$cri_resmgr_cfg")" # Expected migrated number of pages when fully migrated. pages_error_margin=100 fully_migrated_threshold=$(( ${awkmem%M} * 1024 * 1024 / 4096 - pages_error_margin )) # Maximum number of pages in PMEM when not migrated. not_migrated_threshold=$pages_error_margin # Watch memory page locations and validate results. memload_stats="$OUTPUT_DIR/memload-stats.txt" echo -n "" > "$memload_stats" max_rounds=30 round=0 declare -A pmem_pages_prev # number of pages in PMEM in previous round for wxrx in wmrm wmrn worm worn; do pmem_pages_prev[$wxrx]=0 done while (( round < max_rounds )); do vm-command-q ' cat /sys/devices/system/node/node[0-7]/meminfo | awk "/Active:/{a[\$2]=(\$4/1024)}END{s=\"active mem\";for(n=0;n<8;n++){s=sprintf(\"%s N%d=%.0fM\",s,n,a[n])}print s}" for p in $(pidof awk); do awkinfo=$(grep -a -o -E w[om]r[nm] /proc/$p/cmdline | head -n 1) rss=$(awk "/VmRSS:/{print \$2}" < /proc/$p/status); pages=$(echo $(grep -v file= /proc/$p/numa_maps | tr " " "\n" | awk -F= "/N([0-9])/{s[\$1]+=\$2}END{for(n=0;n<8;n++)if (s[\"N\"n]>0)print \"N\"n\"=\"s[\"N\"n]}")) echo "$awkinfo" pid "$p" VmRSS "$rss" kB, "pages:" "$pages" done' | while read line; do echo "round $round $line"; done | tee -a "$memload_stats" echo "validating..." # Check that at least something has migrated after scan period. if (( round > first_migrations_visible )); then grep -q -E 'pages:.*N[4-7]' "$memload_stats" || error "any of the awk processes was not migrated to PMEM in time" fi # Validate PMEM page migration speed. # Allow double the configured speed because stats polling interval > 1s. for wxrx in wmrm wmrn worm worn; do pmem_pages_now="$(grep "round $round $wxrx .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( pmem_pages_now - pmem_pages_prev[$wxrx] > 2 * pages_per_second_per_process )); then error "number of PMEM pages of $wxrx grew too quickly on this round" fi pmem_pages_prev[$wxrx]=$pmem_pages_now done # Check that write-once-read-never (worn) has migrated and stays in PMEM. 
if (( round > 20 )); then worn_pmem_pages="$(grep "round $round worn .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( worn_pmem_pages < fully_migrated_threshold )); then error "write-once-read-never was expected to end up and stay in PMEM, but only $worn_pmem_pages pages in PMEM." fi fi # Check that write-many-read-many and -read-never (wmrm and wmrn) stay in DRAM. for wmrx in wmrm wmrn; do wmrx_pmem_pages="$(grep "round $round $wmrx .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( wmrx_pmem_pages > not_migrated_threshold )); then error "$wmrx was expected to stay in DRAM, but $wmrx_pmem_pages pages migrated to PMEM." fi done sleep 1 >/dev/null round=$(( round + 1 )) done echo "All rounds were good." kubectl delete pods --all --now --wait ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion-deprecated-syntax/cri-resmgr-dynamic-page-demotion.cfg ================================================ policy: Active: topology-aware ReservedResources: CPU: 250m resource-manager: control: page-migration: PageScanInterval: 10s PageMoveInterval: 1s MaxPageMoveCount: 100 logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test05-guarantee-memory/code.var.sh ================================================ CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt | tr -d '\0'" CRI_RESMGR_ROTATE="echo > cri-resmgr.output.txt" podno=0 kubectl delete pod --all --now --wait # account for being done with test for the current pod nextpod () { podno=$((podno+1)) } # print current pod name pod () { echo pod$podno } # print current container name, by default for current pod container () { local _p _c case $# in 0) _p=${podno}; _c=0;; 1) _p=${podno}; _c=$1;; 2) _p=$1; _c=$2;; *) _c=pod${1}c${2}; shift 2 echo ${_c}_INVALID_WITH_EXTRA_${#}_ARGS_$(echo $* | tr -s ' ' '_') return 1 ;; esac case $_p in +*|-*) _p=$((${podno}$_p));; esac echo pod${_p}c${_c} } # rotate cri-resmgr logs rotate_log () { vm-command "$CRI_RESMGR_ROTATE" } ########################################################################### # test #1: squeeze multiple containers in every NUMA node # # We squeeze an increasing number of containers in all NUMA node pools # in a loop. For every iteration we calculate the usable amount of CPU # and memory based on the available number of NUMA nodes and the amount # of CPU and memory per NUMA node. We use a conservative estimate for # the amount of memory available per NUMA node because some of them will # have a sizeable allocation by the kernel. # rotate_log # use conservative estimate for available memory per node PER_NODE_MEM=$((1500+4000)) PER_NODE_CPU=1000 PER_NODE_PMEM=1 NODE_COUNT_TOTAL=4 # All nodes have only a single CPU. Thus, with any (< 1000m) CPU reservation # we'll have one node (#0) fully reserved for kube-system containers. Hence, # our (usable) node count is one less than the total one. NODE_COUNT=$((NODE_COUNT_TOTAL - 1)) for pernode in 2 3 4; do cpu=$(echo "scale=3;0.75*$PER_NODE_CPU/$pernode" | bc | cut -d '.' -f1) mem=$(echo "scale=3;0.75*$PER_NODE_MEM/$pernode" | bc | cut -d '.' 
-f1)
    CPU=${cpu}m MEM=${mem}Mi CONTCOUNT=$((pernode*NODE_COUNT)) create guaranteed
    echo "Verify that any pod's containers were not raised to guarantee memory"
    echo ""
    vm-command "$CRI_RESMGR_OUTPUT | grep upward" && {
        pp mems
        error "Unexpected memset upward expansion detected!"
    }
    echo "Verify that all containers are pinned to a single NUMA node"
    echo ""
    c=0; while [ "$c" -lt "$((pernode*NODE_COUNT))" ]; do
        verify "len(mems['$(container $c)']) == $((1+PER_NODE_PMEM))"
        c=$((c+1))
    done
    kubectl delete pod --all --now --wait
    nextpod
done

###########################################################################
# test #2: negative test for lifting containers upwards.
#
# This test first creates a pod that fits into a single NUMA node. Then
# it creates a pod that allocates a negligible amount of memory from the
# root node (by asking for more CPU than a single NUMA node can provide).
# The allocation of this pod must not cause lifting pod0 containers'
# memory assignment upwards in the pool tree.
#
rotate_log
CPU=200m MEM=100M create guaranteed
report allowed
verify "len(mems['$(container 0)']) == 2"
nextpod
CPU=1200m MEM=100M create guaranteed
report allowed
verify "len(mems['$(container -1 0)']) == 2" \
       "len(mems['$(container 0)']) == 8"
echo "Verify that $(pod)'s containers were not raised to guarantee memory"
echo ""
vm-command "$CRI_RESMGR_OUTPUT | grep upward" && {
    pp mems
    error "Unexpected memset upward expansion detected!"
}
kubectl delete pod $(pod) --now --wait --ignore-not-found
nextpod

###########################################################################
# test #3: positive test for lifting containers upwards.
#
# This test creates two containers which both get their own socket and
# take > 50 % of their socket's mem. Then it reserves a lot of memory
# from the root node to force lifting one of the containers. Every socket
# has 6G PMEM+DRAM, one pod's containers take 5G and the other's take 2G.
# => pessimistic max 7G will not fit to any socket
# => no memory grants can be given to any socket alone.
#
CPU=200m MEM=5G CONTCOUNT=2 create guaranteed
report allowed
verify "len(mems['$(container 0)']) == 2" \
       "len(mems['$(container 1)']) == 2" \
       "mems['$(container 0)'] != mems['$(container 1)']"
nextpod
CPU=1200m MEM=2G create guaranteed
echo "Verify that $(pod)'s containers were raised to guarantee memory"
echo ""
vm-command "$CRI_RESMGR_OUTPUT | grep upward" || {
    error "Expected memset upward expansion not found!"
}
report allowed
pp mems
verify "len(mems['$(container 0)']) == 8" \
       "len(mems['$(container -1 0)']) == 8" \
       "len(mems['$(container -1 1)']) == 8"


================================================
FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/topology.var.json
================================================
[
    {"mem": "2G", "threads": 1, "cores": 1, "packages": 4},
    {"mem": "4G", "node-dist": {"2": 17}},
    {"mem": "4G", "node-dist": {"3": 17}},
    {"mem": "4G", "node-dist": {"0": 17}},
    {"mem": "4G", "node-dist": {"1": 17}}
]


================================================
FILE: test/e2e/policies.test-suite/topology-aware/cri-resmgr.cfg
================================================
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
logger:
  Debug: cri-resmgr,resource-manager,cache,policy
  Klog:
    skip_headers: true
dump:
  Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.*


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
================================================
# pod0: Test that 4 guaranteed containers eligible for isolated CPU allocation
# get evenly spread over NUMA nodes.
CONTCOUNT=4 CPU=1 create guaranteed
report allowed
verify \
    'len(cpus["pod0c0"]) == 1' \
    'len(cpus["pod0c1"]) == 1' \
    'len(cpus["pod0c2"]) == 1' \
    'len(cpus["pod0c3"]) == 1' \
    'disjoint_sets(cpus["pod0c0"], cpus["pod0c1"], cpus["pod0c2"], cpus["pod0c3"])' \
    'disjoint_sets(nodes["pod0c0"], nodes["pod0c1"], nodes["pod0c2"], nodes["pod0c3"])'
kubectl delete pods --all --now --wait

# pod1: Test that 4 guaranteed containers not eligible for isolated CPU allocation
# get evenly spread over NUMA nodes.
CONTCOUNT=4 CPU=3 create guaranteed
report allowed
verify \
    'len(cpus["pod1c0"]) == 3' \
    'len(cpus["pod1c1"]) == 3' \
    'len(cpus["pod1c2"]) == 3' \
    'len(cpus["pod1c3"]) == 3' \
    'disjoint_sets(cpus["pod1c0"], cpus["pod1c1"], cpus["pod1c2"], cpus["pod1c3"])' \
    'disjoint_sets(nodes["pod1c0"], nodes["pod1c1"], nodes["pod1c2"], nodes["pod1c3"])'
kubectl delete pods --all --now --wait

# pod2: Test that 4 burstable containers not eligible for isolated/exclusive CPU allocation
# get evenly spread over NUMA nodes.
CONTCOUNT=4 CPUREQ=2 CPULIM=4 create burstable
report allowed
verify \
    'disjoint_sets(cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"])' \
    'disjoint_sets(nodes["pod2c0"], nodes["pod2c1"], nodes["pod2c2"], nodes["pod2c3"])'
kubectl delete pods --all --now --wait

# pod3: Test that initContainer resources are freed before launching
# containers: instantiate 5 init containers, each requiring 5 CPUs. If
# the resources of an init container weren't freed before the next init
# container is launched, not all of them could be launched, and the
# real containers could not fit on the node.
ICONTCOUNT=5 ICONTSLEEP=1 CONTCOUNT=2 CPU=5 MEM=100M create guaranteed
report allowed
verify \
    'disjoint_sets(cpus["pod3c0"], cpus["pod3c1"])' \
    'disjoint_sets(nodes["pod3c0"], nodes["pod3c1"])' \
    'disjoint_sets(packages["pod3c0"], packages["pod3c1"])'
kubectl delete pods --all --now --wait

# pod4: Test that with pod colocation enabled containers within a pod get
# colocated (assigned topologically close to each other) as opposed to being
# evenly spread out.
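# (Sketch of the templating below: "instantiate" renders cri-resmgr.cfg.in,
# shown after this script, with the current environment, so setting
# COLOCATE_PODS=true turns the rendered default
#   topology-aware:
#     ColocatePods: false
# into ColocatePods: true before cri-resmgr is relaunched.)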
terminate cri-resmgr cri_resmgr_cfg=$(COLOCATE_PODS=true instantiate cri-resmgr.cfg) launch cri-resmgr CONTCOUNT=4 CPU=100m create guaranteed report allowed verify \ 'cpus["pod4c1"] == cpus["pod4c0"]' \ 'cpus["pod4c2"] == cpus["pod4c0"]' \ 'cpus["pod4c3"] == cpus["pod4c0"]' kubectl delete pods --all --now --wait # pod{5,6,7}: Test that with namespace colocation enabled containers of pods # in the same namespace get colocated (assigned topologically close to each # other) as opposed to being evenly spread out. terminate cri-resmgr cri_resmgr_cfg=$(COLOCATE_NAMESPACES=true instantiate cri-resmgr.cfg) launch cri-resmgr kubectl create namespace test-ns CONTCOUNT=1 CPU=100m namespace=test-ns create guaranteed CONTCOUNT=1 CPU=100m namespace=test-ns create guaranteed CONTCOUNT=2 CPU=100m namespace=test-ns create guaranteed report allowed verify \ 'cpus["pod6c0"] == cpus["pod5c0"]' \ 'cpus["pod7c0"] == cpus["pod5c0"]' \ 'cpus["pod7c1"] == cpus["pod5c0"]' kubectl delete namespace test-ns --now --wait --ignore-not-found # Restore default test configuration, restart cri-resmgr. terminate cri-resmgr cri_resmgr_cfg=$(instantiate cri-resmgr.cfg) launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/cri-resmgr.cfg.in ================================================ policy: Active: topology-aware ReservedResources: CPU: 750m topology-aware: ColocatePods: $(echo ${COLOCATE_PODS:-false}) ColocateNamespaces: $(echo ${COLOCATE_NAMESPACES:-false}) logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true dump: Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.* ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test01-always-fits/code.var.sh ================================================ # Test that guaranteed and burstable pods get the CPUs they require # when there are enough CPUs available. # pod0, fits in a core CPU=1 create guaranteed report allowed verify \ 'node_ids(nodes["pod0c0"]) == {1}' \ 'cpu_ids(cpus["pod0c0"]) == {4}' # pod1, takes full core - from a different node than pod0 CPU=2 create guaranteed report allowed verify \ 'cpu_ids(cpus["pod0c0"]) == {4}' \ 'node_ids(nodes["pod1c0"]) == {2}' \ 'cpu_ids(cpus["pod1c0"]) == {8, 9}' # pod2, does not fit in a core but fits in a node CPU=3 create guaranteed report allowed verify \ 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cores["pod1c0"]) == 1' \ 'len(cpus["pod2c0"]) == 3' \ 'len(cores["pod2c0"]) == 2' \ 'len(nodes["pod2c0"]) == 1' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])' # pod3, tries to fully exhaust the shared subset of a (NUMA node) pool # Currently topology-aware refuses to exhaust even idle shared CPU subsets of # a pool. Therefore such attempts will try to squeeze the container to # another pool at the same level or, if none found, push the container # one level up to the parent pool. # # There is a pending commit to change this behavior to allow exhausting # fully idle subsets (no active shared grants). Once that lands, update # this test accordingly as well. 
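# With the current behavior, the 4-CPU request below thus cannot be satisfied
# from any single (4-CPU) NUMA node pool, and the verify that follows expects
# pod3c0 to end up one level higher in the pool tree, spanning two nodes
# (len(nodes["pod3c0"]) == 2).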
CPU=4 create guaranteed report allowed verify \ 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cores["pod1c0"]) == 1' \ 'len(cpus["pod2c0"]) == 3' \ 'len(cores["pod2c0"]) == 2' \ 'len(nodes["pod2c0"]) == 1' \ 'len(cpus["pod3c0"]) == 4' \ 'len(cores["pod3c0"]) == 2' \ 'len(nodes["pod3c0"]) == 2' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"], cpus["pod3c0"])' kubectl delete pods --all --now --wait # pod4, fits in a die/package CPU=5 create guaranteed report allowed verify \ 'len(cpus["pod4c0"]) == 5' \ 'len(cores["pod4c0"]) == 3' \ 'len(nodes["pod4c0"]) == 2' \ 'len(dies["pod4c0"]) == 1' # pod5, takes a full die/package # cpu0 is reserved, so allocating 7 CPUs is expected to fill package0/die0 CPU=7 create guaranteed report allowed verify \ 'len(cpus["pod4c0"]) == 5' \ 'len(cores["pod4c0"]) == 3' \ 'len(nodes["pod4c0"]) == 2' \ 'len(dies["pod4c0"]) == 1' \ 'len(cpus["pod5c0"]) == 7' \ 'len(cores["pod5c0"]) == 4' \ 'len(dies["pod5c0"]) == 1' \ 'disjoint_sets(cpus["pod4c0"], cpus["pod5c0"])' kubectl delete pods --all --now --wait # pod6, doesn't fit in a die/package, needs virtual root CPU=9 create guaranteed report allowed verify \ 'len(cpus["pod6c0"]) == 9' \ 'len(packages["pod6c0"]) == 2' kubectl delete pods --all --now --wait reset counters # pod0, burstable containers must get at least the cores they require CPUREQ=3 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod0c0"]) >= 2' # pod1 CPUREQ=4 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod0c0"]) >= 2' \ 'len(cpus["pod1c0"]) >= 4' # pod2 CPUREQ=5 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod0c0"]) >= 2' \ 'len(cpus["pod1c0"]) >= 4' \ 'len(cpus["pod2c0"]) >= 5' kubectl delete pods pod0 pod1 --now --wait --ignore-not-found # pod3 CPUREQ=8 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod2c0"]) >= 5' \ 'len(cpus["pod3c0"]) >= 8' kubectl delete pods pod3 --now --wait --ignore-not-found # pod4, pod5 (and existing pod2) take 5 and 4 CPUs. As there are 8 # CPUs/node, pod2 and pod4 have consumed free node # pairs/dies/packages. pod5 will be spread across nodes. CPUREQ=5 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed CPUREQ=4 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod2c0"]) >= 5' \ 'len(cpus["pod4c0"]) >= 5' \ 'len(cpus["pod5c0"]) >= 4' ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test02-shrink-and-grow-shared/code.var.sh ================================================ # pod0: require 10 out of 16 CPUs with two containers. # Both containers should fit in their own die. (8 CPUs per die.) CPU=5 CONTCOUNT=2 create guaranteed report allowed verify \ 'len(cpus["pod0c0"]) == 5' \ 'len(cpus["pod0c1"]) == 5' \ 'len(nodes["pod0c0"]) == len(nodes["pod0c1"]) == 2' \ 'len(dies["pod0c0"]) == len(dies["pod0c1"]) == 1' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod0c1"])' # pod1: two containers in a besteffort pod. CONTCOUNT=2 create besteffort report allowed verify \ 'len(cpus["pod0c0"]) == 5' \ 'len(cpus["pod0c1"]) == 5' \ 'disjoint_sets(set.union(cpus["pod0c0"], cpus["pod0c1"]))' \ 'len(cpus["pod1c0"]) > 0' \ 'len(cpus["pod1c1"]) > 0' \ 'disjoint_sets( set.union(cpus["pod0c0"], cpus["pod0c1"]), set.union(cpus["pod1c0"], cpus["pod1c1"]))' # Delete pod0 delete pods/pod0 --now report allowed # Next squeeze the besteffort containers to the minimum. 
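# (A rough capacity sketch for the squeeze below, assuming the n4c16
# topology of this test directory, 16 CPUs in 4 nodes, and the 750m
# ReservedResources from cri-resmgr.cfg: cpu0 is reserved, pod2 takes
# 4 x 3 = 12 exclusive CPUs and pod3 one more, which leaves only
# 16 - 1 - 12 - 1 = 2 CPUs in shared pools for pod1's besteffort
# containers.)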
# pod2: 4 guaranteed containers, each requiring 3 CPUs.
CPU=3 CONTCOUNT=4 create guaranteed
report allowed
verify \
    'len(cpus["pod2c0"]) == len(cpus["pod2c1"]) == len(cpus["pod2c2"]) == len(cpus["pod2c3"]) == 3' \
    'disjoint_sets(cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"])'

# pod3: 1 guaranteed container taking the last non-reserved CPU
# that can be taken from shared pools.
CPU=1 create guaranteed
report allowed
verify \
    'disjoint_sets(
         set.union(cpus["pod1c0"], cpus["pod1c1"]),
         set.union(cpus["pod3c0"], cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"]))'


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test03-simple-affinity/code.var.sh
================================================
# Test that guaranteed and burstable pods get the CPUs they require
# when there are enough CPUs available.

inject-affinities() {
    local var=$1 srcdst src dst hdr line
    shift
    if [ -z "$var" ] || [ -z "${!var}" ]; then
        return 0
    fi
    case "$var" in
        ANTI_*|*_ANTI_*) hdr="cri-resource-manager.intel.com/anti-affinity";;
        *) hdr="cri-resource-manager.intel.com/affinity";;
    esac
    for srcdst in ${!var}; do
        src=${srcdst%:*}
        dst=${srcdst#*:}
        [ -n "$hdr" ] && { echo " $hdr: |"; hdr=""; }
        line="$src: [ ${dst//,/, } ]"
        echo 1>&2 "* [affinity]: injecting affinity '$line'"
        echo " $line"
    done
}

deref_keys() {
    eval "echo \${!$1[@]}"
}

deref_value() {
    eval "echo \${$1[$2]}"
}

inject-annotations() {
    local var=$1 values key value
    shift
    if [ -z "$var" ] || [ -z "${!var}" ]; then
        return 0
    fi
    for key in $(deref_keys ${!var}); do
        value=$(deref_value ${!var} $key)
        line="$key: $value"
        echo 1>&2 "* [annotation]: injecting annotation '$line'"
        echo " $line"
    done
}

# pod0
# 4 containers, no affinities => spread out evenly over NUMA nodes
CONTCOUNT=4 CPU=1 create guaranteed+affinity
report allowed
verify \
    'nodes["pod0c0"] == {"node1"}' \
    'nodes["pod0c1"] == {"node2"}' \
    'nodes["pod0c2"] == {"node3"}' \
    'nodes["pod0c3"] == {"node0"}'
kubectl delete pods --all --now --wait

# pod1
# 4 containers, affinities [0,1], [2,3] => colocate c0,c1 in node1, c2,c3 in node2
CONTCOUNT=4 AFFINITIES="pod1c0:pod1c1 pod1c2:pod1c3" CPU=1 create guaranteed+affinity
report allowed
verify \
    'nodes["pod1c0"] == nodes["pod1c1"] == {"node1"}' \
    'nodes["pod1c2"] == nodes["pod1c3"] == {"node2"}'
kubectl delete pods --all --now --wait

# pod2
# 6 containers, anti-affinities 4:[0,1,2], 5:[0,2,3]
# => don't co-locate 4 with {0,1,2}, or 5 with {0,2,3}
CONTCOUNT=6 ANTI_AFFINITIES="pod2c4:pod2c0,pod2c1,pod2c2 pod2c5:pod2c0,pod2c2,pod2c3" CPU=1 \
    create guaranteed+affinity
report allowed
verify \
    'disjoint_sets(nodes["pod2c4"], nodes["pod2c0"], nodes["pod2c1"], nodes["pod2c2"])' \
    'disjoint_sets(nodes["pod2c5"], nodes["pod2c0"], nodes["pod2c2"], nodes["pod2c3"])'


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test03-simple-affinity/guaranteed+affinity.yaml.in
================================================
apiVersion: v1
kind: Pod
metadata:
  name: ${NAME}
  labels:
    app: ${NAME}
  annotations:
$([ -z "$(type -t inject-affinities)" ] || inject-affinities AFFINITIES)
$([ -z "$(type -t inject-affinities)" ] || inject-affinities ANTI_AFFINITIES)
$([ -z "$(type -t inject-annotations)" ] || inject-annotations ANNOTATIONS)
spec:
  containers:
$(for contnum in $(seq 1 ${CONTCOUNT}); do echo "
  - name: ${NAME}c$(( contnum - 1 ))
    image: busybox
    imagePullPolicy: IfNotPresent
    command:
      - sh
      - -c
      - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf)
    resources:
      requests:
        cpu: ${CPU}
        memory: '${MEM}'
      limits:
        cpu: ${CPU}
        memory: '${MEM}'
"; done )
  terminationGracePeriodSeconds: 1


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test04-available-resources/code.var.sh
================================================
# Test that AvailableResources are honored.

# Test explicit cpuset in AvailableResources.CPU
terminate cri-resmgr
AVAILABLE_CPU="cpuset:4-7,8-11" cri_resmgr_cfg=$(instantiate cri-resmgr-available-resources.cfg) launch cri-resmgr
# pod0: exclusive CPUs
CPU=3 create guaranteed
verify "cpus['pod0c0'] == {'cpu04', 'cpu05', 'cpu06'}" \
       "mems['pod0c0'] == {'node1'}"
# pod1: shared CPUs
CONTCOUNT=2 CPU=980m create guaranteed
verify "cpus['pod1c0'] == {'cpu08', 'cpu09', 'cpu10'}" \
       "cpus['pod1c1'] == {'cpu08', 'cpu09', 'cpu10'}" \
       "mems['pod1c0'] == {'node2'}" \
       "mems['pod1c1'] == {'node2'}"
kubectl delete pods --all --now --wait
reset counters

# Test cgroup cpuset directory in AvailableResources.CPU
test-and-verify-allowed() {
    # pod0: shared CPUs
    CONTCOUNT=2 CPU=980m create guaranteed
    report allowed
    verify "cpus['pod0c0'] == {'cpu0$1', 'cpu0$2', 'cpu0$3'}" \
           "cpus['pod0c1'] == {'cpu0$4'}"
    # pod1: exclusive CPU
    CPU=1 create guaranteed
    report allowed
    verify "disjoint_sets(cpus['pod1c0'], cpus['pod0c0'])" \
           "disjoint_sets(cpus['pod1c0'], cpus['pod0c1'])"
    kubectl delete pods --all --now --wait
    reset counters
}

if vm-command "[ -d /sys/fs/cgroup/cpuset ]"; then
    # cgroup v1
    CGROUP_CPUSET=/sys/fs/cgroup/cpuset
else
    # cgroup v2
    CGROUP_CPUSET=/sys/fs/cgroup
fi

CRIRM_CGROUP=$CGROUP_CPUSET/cri-resmgr-test-05-1
vm-command "rmdir $CRIRM_CGROUP; mkdir $CRIRM_CGROUP; echo 1-4,11 > $CRIRM_CGROUP/cpuset.cpus"
terminate cri-resmgr
AVAILABLE_CPU="\"$CRIRM_CGROUP\"" cri_resmgr_cfg=$(instantiate cri-resmgr-available-resources.cfg) launch cri-resmgr
test-and-verify-allowed 1 2 3 4
vm-command "rmdir $CRIRM_CGROUP || true"

CRIRM_CGROUP=$CGROUP_CPUSET/cri-resmgr-test-05-2
vm-command "rmdir $CRIRM_CGROUP; mkdir $CRIRM_CGROUP; echo 5-8,11 > $CRIRM_CGROUP/cpuset.cpus"
terminate cri-resmgr
AVAILABLE_CPU="\"${CRIRM_CGROUP#/sys/fs/cgroup/cpuset}\"" cri_resmgr_cfg=$(instantiate cri-resmgr-available-resources.cfg) launch cri-resmgr
test-and-verify-allowed 5 6 7 8
vm-command "rmdir $CRIRM_CGROUP || true"

# cleanup, do not leave weirdly configured cri-resmgr running
terminate cri-resmgr


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test04-available-resources/cri-resmgr-available-resources.cfg.in
================================================
policy:
  Active: topology-aware
  AvailableResources:
    cpu: ${AVAILABLE_CPU}
  ReservedResources:
    cpu: cpuset:11
logger:
  Debug: cri-resmgr,resource-manager,cache,policy
  Klog:
    skip_headers: true
dump:
  Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.*


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test05-reserved-resources/code.var.sh
================================================
# Test that
# - kube-system containers are pinned on Reserved CPUs.
# - Reserved CPU allocation and releasing works.
# - A pod cannot be launched if reserved CPU capacity is insufficient.

AVAILABLE_CPU="cpuset:4-7,8-13"
cri_resmgr_cfg_orig=$cri_resmgr_cfg

# This script will create pods in the kube-system namespace
# that is not automatically cleaned up by the framework.
# Make sure the namespace is clear when starting the test and clean it up
# if exiting with success. Otherwise leave the pod running for
# debugging in case of a failure.
cleanup-kube-system() {
    ( kubectl delete pods pod0 pod1 pod2 pod3 pod4 pod5 -n kube-system --now --wait --ignore-not-found ) || true
}
cleanup-kube-system

# Test launch failure, Reserved CPUs are not a subset of Available CPUs
terminate cri-resmgr
RESERVED_CPU="cpuset:3,7,11,15" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg)
( launch cri-resmgr ) && error "unexpected success" || {
    echo "Launch failed as expected"
}

# Test launch failure, there are more reserved CPUs than available CPUs
terminate cri-resmgr
RESERVED_CPU="11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg)
( launch cri-resmgr ) && error "unexpected success" || {
    echo "Launch failed as expected"
}

# Test that BestEffort containers are allowed to run on both Reserved
# CPUs when the CPUs are on the same NUMA node.
terminate cri-resmgr
RESERVED_CPU="cpuset:10-11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg)
launch cri-resmgr
namespace=kube-system CONTCOUNT=3 create besteffort
report allowed
verify "cpus['pod0c0'] == cpus['pod0c1'] == cpus['pod0c2'] == {'cpu10', 'cpu11'}"
kubectl delete -n kube-system pods pod0 --now --wait --ignore-not-found

# Test that BestEffort containers are pinned to reserved CPUs.
terminate cri-resmgr
RESERVED_CPU="cpuset:7,11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg)
launch cri-resmgr
namespace=kube-system CONTCOUNT=4 create besteffort
report allowed
verify "cpus['pod1c0'] == cpus['pod1c1'] == cpus['pod1c2'] == cpus['pod1c3']" \
       "cpus['pod1c0'] == {'cpu07', 'cpu11'}"

# Test that guaranteed kube-system pods are pinned to Reserved CPUs.
namespace=kube-system CPU=200m CONTCOUNT=4 create guaranteed
report allowed
verify "cpus['pod2c0'] == cpus['pod2c1'] == cpus['pod2c2'] == cpus['pod2c3']" \
       "cpus['pod2c0'] == {'cpu07', 'cpu11'}"

# Test requesting more reserved CPUs than are available on a single node
# but that still fit in the node tree.
# pod2 already consumed 4 * 200m of reserved CPUs that have been balanced
# so that at least 200m from both nodes have been consumed. There are
# at most 800m reserved CPUs free on both nodes. Root node still has
# 1200m free. That is, a 1000m-requesting, isolated-looking guaranteed
# pod should fit in because reserved CPUs are not isolated.
#
# Run this twice to make sure allocated reserved CPUs are released correctly.
for pod in pod3 pod4; do
    namespace=kube-system CPU=1 CONTCOUNT=1 create guaranteed
    verify "cpus['${pod}c0'] == {'cpu07', 'cpu11'}"
    kubectl delete -n kube-system pods/$pod --now --wait --ignore-not-found
done

# Test requesting more reserved CPUs than available in the system.
# pod5 is expected to run on shared CPUs.
namespace=kube-system CPU=2 CONTCOUNT=1 create guaranteed
report allowed
verify "cpus['pod5c0'] == {'cpu04', 'cpu05', 'cpu06', 'cpu08', 'cpu09', 'cpu10', 'cpu12', 'cpu13'}"
cleanup-kube-system

# Test that the first available CPUs are reserved when reserving milli CPUs.
# The number of reserved CPUs is the ceiling of the milli CPUs.
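# (Worked example of the rounding below: RESERVED_CPU=2250m rounds up to
# ceil(2250 / 1000) = 3 reserved CPUs, and with AVAILABLE_CPU="cpuset:4-7,8-13"
# the first available CPUs are cpu4, cpu5 and cpu6, which is exactly the set
# the verify below expects for the kube-system besteffort pod.)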
reset counters terminate cri-resmgr RESERVED_CPU="2250m" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg) launch cri-resmgr namespace=kube-system CPU=2 CONTCOUNT=1 create besteffort verify "cpus['pod0c0'] == {'cpu04', 'cpu05', 'cpu06'}" kubectl delete -n kube-system pods/pod0 --now --wait --ignore-not-found terminate cri-resmgr cri_resmgr_cfg=$cri_resmgr_cfg_orig launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test05-reserved-resources/cri-resmgr-reserved.cfg.in ================================================ policy: Active: topology-aware AvailableResources: cpu: ${AVAILABLE_CPU} ReservedResources: cpu: ${RESERVED_CPU} logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true dump: Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.* ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/code.var.sh ================================================ source $TEST_DIR/codelib.sh || { echo "error importing codelib.sh" exit 1 } # Clean test pods from the kube-system namespace ( kubectl delete pods --now --wait --ignore-not-found -n kube-system $(kubectl get pods -n kube-system | awk '/t[0-9]r[gb][ue]/{print $1}') ) || true # Run generated*.sh test scripts in this directory. genscriptcount=0 for genscript in "$TEST_DIR"/generated*.sh; do if [ ! -f "$genscript" ]; then continue fi ( paralleloutdir="$outdir/parallel$genscriptcount" [ -d "$paralleloutdir" ] && rm -rf "$paralleloutdir" mkdir "$paralleloutdir" OUTPUT_DIR="$paralleloutdir" COMMAND_OUTPUT_DIR="$paralleloutdir/commands" mkdir "$COMMAND_OUTPUT_DIR" source "$genscript" 2>&1 | sed -u -e "s/^/$(basename "$genscript"): /g" ) & genscriptcount=$(( genscriptcount + 1)) done if [[ "$genscriptcount" == "0" ]]; then echo "WARNING:" echo "WARNING: Skipping fuzz tests:" echo "WARNING: - Generated tests not found." echo "WARNING: - Generate a test by running:" echo "WARNING: $TEST_DIR/generate.sh" echo "WARNING: - See test generation options:" echo "WARNING: $TEST_DIR/generate.sh --help" echo "WARNING:" sleep 5 exit 0 fi echo "waiting for $genscriptcount generated tests to finish..." 
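# 'wait' with no arguments blocks until every backgrounded generated*.sh
# subshell started in the loop above has exited; each subshell's output was
# already tagged with its script name via the sed prefix. For example
# (a sketch, using the environment variables documented in generate.sh below):
#   TESTCOUNT=2 STEPS=200 ./generate.sh   # writes generated0.sh, generated1.sh
# after which this test runs both scripts in parallel and converges here.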
wait ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/codelib.sh ================================================ container-exit0() { # Terminate a container by killing the "sleep inf" child process in # echo CONTNAME $(sleep inf) local contname="$1" vm-command "contpid=\$(ps axf | grep -A1 'echo $contname' | grep -v grep | awk '/_ sleep inf/{print \$1}'); kill -KILL \$contpid" } container-signal() { local contname="$1" local signal="$2" vm-command "pkill -$signal -f 'echo $contname'" } ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/fuzz.aal ================================================ language python { max_mem=7500 # maximum memory on VM in MB max_cpu=15000 # maximum CPUs on node in mCPU max_reserved_cpu=1000 # maximum reserved CPUs on node in mCPU class Vars: # namespace for variables in input names def __repr__(self): return "{" + ",".join("%s:%s" % (a, getattr(self, a)) for a in sorted(self.__dict__.keys()) if not a.startswith("_")) + "}\n" def inputvars(input_name): # parse VAR=VALUE's from input_name v = Vars() for word in input_name.split(): keyvalue = word.split("=") if len(keyvalue) == 2: if (keyvalue[1].endswith("m") or keyvalue[1].endswith("M")) and len(keyvalue[1]) > 1 and keyvalue[1][-2] in '0123456789': keyvalue[1] = keyvalue[1][:-1] try: setattr(v, keyvalue[0], int(keyvalue[1])) except: setattr(v, keyvalue[0], keyvalue[1]) return v } variables { mem, cpu, reserved_cpu, pods } initial_state { mem=0 cpu=0 reserved_cpu=0 pods={} } # Create non-reserved CPU pods # On this topology, there is # - 2G mem/numanode, 4G mem/package, 8G mem in total # - 4 CPU/numanode, 8 CPU/package, 16 CPU in total input "NAME=gu0 CONTCOUNT=1 CPU=200m MEM=1500M create guaranteed", "NAME=gu1 CONTCOUNT=2 CPU=1000m MEM=500M create guaranteed", "NAME=gu2 CONTCOUNT=2 CPU=1200m MEM=4500M create guaranteed", "NAME=gu3 CONTCOUNT=3 CPU=2000m MEM=500M create guaranteed", "NAME=gu4 CONTCOUNT=1 CPU=4200m MEM=100M create guaranteed", "NAME=bu0 CONTCOUNT=1 CPU=1200m MEM=50M CPUREQ=900m MEMREQ=49M CPULIM=1200m MEMLIM=50M create burstable", "NAME=bu1 CONTCOUNT=2 CPU=1900m MEM=300M CPUREQ=1800m MEMREQ=299M CPULIM=1900m MEMLIM=300M create burstable", "NAME=be0 CONTCOUNT=1 CPU=0 MEM=0 create besteffort", "NAME=be1 CONTCOUNT=3 CPU=0 MEM=0 create besteffort" { guard { v = inputvars(input_name) return (v.NAME not in pods and (mem + v.MEM * v.CONTCOUNT < max_mem) and (cpu + v.CPU * v.CONTCOUNT < max_cpu)) } body { v = inputvars(input_name) v.namespace = getattr(v, "namespace", "default") mem += v.MEM * v.CONTCOUNT cpu += v.CPU * v.CONTCOUNT pods[v.NAME] = v } } # Create pods to the kube-system namespace input "NAME=rgu0 CONTCOUNT=2 CPU=100m MEM=1000M namespace=kube-system create guaranteed", "NAME=rbu0 CONTCOUNT=1 CPU=100m MEM=100M CPUREQ=99m MEMREQ=99M CPULIM=100m MEMLIM=100M namespace=kube-system create burstable", "NAME=rbe0 CONTCOUNT=2 CPU=0 MEM=0 namespace=kube-system create besteffort" { guard { v = inputvars(input_name) return (v.NAME not in pods and (mem + v.MEM * v.CONTCOUNT < max_mem) and (reserved_cpu + v.CPU * v.CONTCOUNT < max_reserved_cpu)) } body { v = inputvars(input_name) mem += v.MEM * v.CONTCOUNT reserved_cpu += v.CPU * v.CONTCOUNT pods[v.NAME] = v } } # Kill a process in a container # - "echo gu0c1" matches and kills process only in container gu0c1 in pod gu0 # - "echo gu0" matches and kills processes in all containers of pod gu0 input "NAME=gu0 container-exit0 
gu0c0", "NAME=gu1 container-exit0 gu1c0", "NAME=gu2 container-exit0 gu2c0", "NAME=gu3 container-exit0 gu3", "NAME=gu4 container-exit0 gu4c", "NAME=bu0 container-exit0 bu0c0", "NAME=bu1 container-exit0 bu1c0", "NAME=be0 container-exit0 be0c0", "NAME=be1 container-exit0 be0c0", "NAME=rgu0 container-exit0 rgu0c0", "NAME=rbu0 container-exit0 rbu0c0", "NAME=rbe0 container-exit0 rbe0c0" { guard { v = inputvars(input_name) return v.NAME in pods } } # Delete single pod input "NAME=gu0 kubectl delete pod gu0 --now --wait --ignore-not-found", "NAME=gu1 kubectl delete pod gu1 --now --wait --ignore-not-found", "NAME=gu2 kubectl delete pod gu2 --now --wait --ignore-not-found", "NAME=gu3 kubectl delete pod gu3 --now --wait --ignore-not-found", "NAME=gu4 kubectl delete pod gu4 --now --wait --ignore-not-found", "NAME=bu0 kubectl delete pod bu0 --now --wait --ignore-not-found", "NAME=bu1 kubectl delete pod bu1 --now --wait --ignore-not-found", "NAME=be0 kubectl delete pod be0 --now --wait --ignore-not-found", "NAME=be1 kubectl delete pod be1 --now --wait --ignore-not-found", "NAME=rgu0 kubectl delete pod rgu0 -n kube-system --now --wait --ignore-not-found", "NAME=rbu0 kubectl delete pod rbu0 -n kube-system --now --wait --ignore-not-found", "NAME=rbe0 kubectl delete pod rbe0 -n kube-system --now --wait --ignore-not-found" { guard { v = inputvars(input_name) return v.NAME in pods } body { v = inputvars(input_name) p = pods[v.NAME] mem -= p.MEM * p.CONTCOUNT if getattr(p, "namespace", "") == "kube-system": reserved_cpu -= p.CPU * p.CONTCOUNT else: cpu -= p.CPU * p.CONTCOUNT del pods[v.NAME] } } ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/fuzz.fmbt.conf ================================================ model = aal_remote(remote_pyaal --verbose-fmbt-log fuzz.aal) heuristic = mrandom(80,lookahead(1:2),20,random) coverage = perm(2) pass = coverage(10) pass = steps(100) ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/generate.sh ================================================ #!/bin/bash usage() { cat < Number of generated test scripts than run in parallel. MEM= Memory [MB] available for test pods in the system. CPU= Non-reserved CPU [mCPU] available for test pods in the system. RESERVED_CPU= Reserved CPU [mCPU] available for test pods in the system. STEPS= Total number of test steps in all parallel tests. FMBT_IMAGE= Generate the test using fmbt from docker image IMG:TAG. The default is fmbt-cli:latest. EOF exit 0 } if [ -n "$1" ]; then usage fi TESTCOUNT=${TESTCOUNT:-1} MEM=${MEM:-7500} # 950 mCPU taken by the control plane, split the remaining 15050 mCPU # available for test pods to CPU and RESERVED_CPU pods. CPU=${CPU:-14050} RESERVED_CPU=${RESERVED_CPU:-1000} STEPS=${STEPS:-100} FMBT_IMAGE=${FMBT_IMAGE:-"fmbt-cli:latest"} mem_per_test=$(( MEM / TESTCOUNT )) cpu_per_test=$(( CPU / TESTCOUNT )) reserved_cpu_per_test=$(( RESERVED_CPU / TESTCOUNT )) steps_per_test=$(( STEPS / TESTCOUNT )) # Check fmbt Docker image docker run "$FMBT_IMAGE" fmbt --version 2>&1 | grep ^Version: || { echo "error: cannot run fmbt from Docker image '$FMBT_IMAGE'" echo "You can build the image locally by running:" echo "( cd /tmp && git clone --branch devel https://github.com/intel/fmbt && cd fmbt && docker build . 
    exit 1
}

cd "$(dirname "$0")" || {
    echo "cannot cd to the directory of $0"
    exit 1
}

for testnum in $(seq 1 "$TESTCOUNT"); do
    testid=$(( testnum - 1))
    sed -e "s/max_mem=.*/max_mem=${mem_per_test}/" \
        -e "s/max_cpu=.*/max_cpu=${cpu_per_test}/" \
        -e "s/max_reserved_cpu=.*/max_reserved_cpu=${reserved_cpu_per_test}/" \
        < fuzz.aal > tmp.fuzz.aal
    sed -e "s/fuzz\.aal/tmp.fuzz.aal/" \
        -e "s/pass = steps(.*/pass = steps(${steps_per_test})/" \
        < fuzz.fmbt.conf > tmp.fuzz.fmbt.conf
    OUTFILE=generated${testid}.sh
    echo "generating $OUTFILE..."
    docker run -v "$(pwd):/mnt/models" "$FMBT_IMAGE" sh -c 'cd /mnt/models; fmbt tmp.fuzz.fmbt.conf 2>/dev/null | fmbt-log -f STEP\$sn\$as\$al' | grep -v AAL | sed -e 's/^, / /g' -e '/^STEP/! s/\(^.*\)/echo "TESTGEN: \1"/g' -e 's/^STEP\([0-9]*\)i:\(.*\)/echo "TESTGEN: STEP \1"; vm-command "date +%T.%N"; \2; vm-command "date +%T.%N"; kubectl get pods -A/g' | sed "s/\([^a-z0-9]\)\(r\?\)\(gu\|bu\|be\)\([0-9]\)/\1t${testid}\2\3\4/g" > "$OUTFILE"
done


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test07-mixed-allocations/code.var.sh
================================================
# Place pod0c0 and pod0c1 to shared pools on separate nodes.
CONTCOUNT=2 CPU=500m create guaranteed
report allowed
verify "len(mems['pod0c0']) == 1" \
       "len(mems['pod0c1']) == 1" \
       "disjoint_sets(mems['pod0c0'], mems['pod0c1'])" \
       "len(cpus['pod0c0']) == 4" \
       "len(cpus['pod0c1']) == 4" \
       "disjoint_sets(cpus['pod0c0'], cpus['pod0c1'])"

# Place pod1c0 to its own node, as there is still one 4-CPU node free.
# The placement of pod1c1 is more interesting:
# - node0 has only 3 CPUs (CPU #0 is reserved)
# - node1, node2 and node3 have containers in their shared pools
# - shared pools with pod0c* containers have more free space than node0
#   => pod1c0 should be placed to either of those
# - because pod1c1 should get one exclusive CPU, either of pod0c0 and
#   pod0c1 should run in a shared pool of only 3 CPUs from now on.
CONTCOUNT=2 CPU=1500m create guaranteed
report allowed
verify `# every container is placed on a single node (no socket, no root)` \
       "[len(mems[c]) for c in mems] == [1] * len(mems)" \
       `# pod1c0 and pod1c1 are on different nodes` \
       "disjoint_sets(mems['pod1c0'], mems['pod1c1'])" \
       `# either of pod0c0 and pod0c1 has only 3 CPUs, the other has 4.` \
       "len(cpus['pod0c0']) == 3 or len(cpus['pod0c1']) == 3" \
       "len(cpus['pod0c0']) == 4 or len(cpus['pod0c1']) == 4" \
       `# pod1c0 and pod1c1 are allowed to use all CPUs on their nodes` \
       "len(cpus['pod1c0']) == 4" \
       "len(cpus['pod1c1']) == 4" \
       `# pod1c1 should have one exclusive CPU on its node` \
       "len(cpus['pod1c1'] - cpus['pod0c0'] - cpus['pod0c1']) == 1"

# Place pod2c0 to node0, as it has the largest free shared pool (3 CPUs).
# Place pod2c1 to the node that has only either pod0c0 or pod0c1,
# while the other one of them already shares a node with pod1c1.
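# (A capacity sketch behind this expectation, assuming the n4c16 topology:
# each NUMA node pool has 4 CPUs. pod2c0 fits in node0 because its 3 CPUs
# are all still unallocated, while pod2c1 can only share the node holding
# just one pod0 container; the node with pod1c1 has already given one of
# its CPUs to pod1c1's exclusive allocation.)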
CONTCOUNT=2 CPU=2400m create guaranteed report allowed verify `# every container is placed on a single node (no socket, no root)` \ "[len(mems[c]) for c in mems] == [1] * len(mems)" \ `# pod1c1 should have kept its own exclusive CPU` \ "len(cpus['pod1c1'] - set.union(*[cpus[c] for c in cpus if c != 'pod1c1'])) == 1" \ `# pod2c0 is the only container in node0, so it happens to have 3 unshared CPUs for now` \ "len(cpus['pod2c0']) == 3" \ "len(cpus['pod2c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod2c0'])) == 3" \ `# pod2c1 shares its node and should not have exclusive CPUs` \ "len(cpus['pod2c1']) == 4" \ "len(cpus['pod2c1'] - set.union(*[cpus[c] for c in cpus if c != 'pod2c1'])) == 0" \ `# pod2c1 should run in the same node as either pod0c0 or pod0c1` \ "mems['pod2c1'] == mems['pod0c0'] or mems['pod2c1'] == mems['pod0c1']" # pod3c0 should get 2 exclusive CPUs and a 400m share from a shared pool. # To get that, annotate the pod to: # - opt-out from shared CPUs (=> opt-in to exclusive CPUs) # - opt-in to isolated CPUs (this should not matter, test opt-out with pod4). # There is only one node where the container fits: the same node as pod1c0. ANNOTATIONS=('prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"' 'prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"') CONTCOUNT=1 CPU=2400m create guaranteed-annotated report allowed verify `# every container is placed on a single node (no socket, no root)` \ "[len(mems[c]) for c in mems] == [1] * len(mems)" \ `# pod3c0 and pod1c0 are placed in the same node` \ "mems['pod3c0'] == mems['pod1c0']" \ `# pod1c0 has 1 exclusive CPU` \ "len(cpus['pod1c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod1c0'])) == 1" \ `# pod3c0 has 2 exclusive CPUs` \ "len(cpus['pod3c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod3c0'])) == 2" # Replace pod3 with pod4. # Test releasing/(re)allocating a mixed pod with exclusive CPUs and # no effect from the isolated preference. # - opt-out from shared CPUs (=> opt-in to exclusive CPUs) # - opt-out from isolated CPUs (this does not affect getting exclusive CPUs) kubectl delete pods pod3 --now --wait --ignore-not-found ANNOTATIONS=('prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"' 'prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "false"') CONTCOUNT=1 CPU=2400m create guaranteed-annotated report allowed verify `# every container is placed on a single node (no socket, no root)` \ "[len(mems[c]) for c in mems] == [1] * len(mems)" \ `# pod4c0 and pod1c0 are placed in the same node` \ "mems['pod4c0'] == mems['pod1c0']" \ `# pod1c0 has 1 exclusive CPU` \ "len(cpus['pod1c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod1c0'])) == 1" \ `# pod4c0 has 2 exclusive CPUs` \ "len(cpus['pod4c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod4c0'])) == 2" # Replace pod1 with pod5. # pod1 implicitly opted-in to exclusive CPUs due to its 1500 mCPU request. # Now explicitly opt-out of it by opting-in to shared-cpus. kubectl delete pods pod1 --now --wait --ignore-not-found # Make sure that the shared pool size increased correctly after the mixed pod deletion.
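# (Note: verify re-polls failed assertions -- "verify [--retry N]" defaults
#  to three retries with a one-second delay, see run.sh below -- so the
#  check here tolerates the short window in which cpuset changes propagate
#  after the pod1 deletion above.)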
verify `# pod0c0 or pod0c1 shared a node with pod1c1 and had only 3 CPUs` \ "len(cpus['pod0c0']) == 4" \ "len(cpus['pod0c1']) == 4" ANNOTATIONS=('prefer-shared-cpus.cri-resource-manager.intel.com/pod: "true"') CONTCOUNT=2 CPU=1500m create guaranteed-annotated report allowed verify `# every container is placed on a single node (no socket, no root)` \ "[len(mems[c]) for c in mems] == [1] * len(mems)" \ `# pod5c0 should share a node with pod0c0 or pod0c1 and have access to all CPUs` \ "mems['pod5c0'] == mems['pod0c0'] or mems['pod5c0'] == mems['pod0c1']" \ "len(cpus['pod5c0']) == 4" \ "len(cpus['pod0c0']) == 4" \ "len(cpus['pod0c1']) == 4" \ `# pod5c1 should run in a node with pod4c0 (this is where pod1c0 used to be)` \ "mems['pod5c1'] == mems['pod4c0']" \ "len(cpus['pod5c1']) == 2" \ `# pod5c0 and pod5c1 share a node with another container => all their CPUs should be shared` \ "len(cpus['pod5c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod5c0'])) == 0" \ "len(cpus['pod5c1'] - set.union(*[cpus[c] for c in cpus if c != 'pod5c1'])) == 0" ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test07-mixed-allocations/guaranteed-annotated.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "${ANNOTATIONS[0]}" ]; then echo " annotations: $(for annotation in "${ANNOTATIONS[@]}"; do echo " $annotation "; done) "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) resources: requests: cpu: ${CPU} memory: '${MEM}' limits: cpu: ${CPU} memory: '${MEM}' "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test08-isolcpus/code.var.sh ================================================ vm-command "grep isolcpus=8,9 /proc/cmdline" || { vm-set-kernel-cmdline "isolcpus=8,9" vm-reboot vm-command "grep isolcpus=8,9 /proc/cmdline" || { error "failed to set isolcpus kernel commandline parameter" } launch cri-resmgr vm-command "systemctl restart kubelet" sleep 1 vm-wait-process --timeout 120 kube-apiserver vm-run-until --timeout 120 "kubectl get node" } CONTCOUNT=1 # pod0: opt-in isolated CPUs ANNOTATIONS='prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' CPU=1 create guaranteed-annotated report allowed verify "cpus['pod0c0'] == {'cpu08'} or cpus['pod0c0'] == {'cpu09'}" \ "mems['pod0c0'] == {'node2'}" # pod1: opt-out isolated CPUs ANNOTATIONS='prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "false"' CPU=1 create guaranteed-annotated report allowed verify "disjoint_sets(cpus['pod1c0'], {'cpu08', 'cpu09'})" # pod2: without annotation CPU=1 guaranteed pod is eligible to run on isolated CPUs ANNOTATIONS='' CPU=1 create guaranteed-annotated report allowed verify "cpus['pod0c0'] == {'cpu08'} or cpus['pod0c0'] == {'cpu09'}" \ "cpus['pod2c0'] == {'cpu08'} or cpus['pod2c0'] == {'cpu09'}" \ "disjoint_sets(cpus['pod0c0'], cpus['pod2c0'])" \ "mems['pod0c0'] == {'node2'}" \ "mems['pod2c0'] == {'node2'}" # free isolated (and all other) cpus kubectl delete pods --all --now --wait # pod3: opt-in isolated CPUs, take all of them ANNOTATIONS='prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' CPU=2000m create guaranteed-annotated report allowed verify "cpus['pod3c0'] == {'cpu08', 'cpu09'}" \ 
"len(cpus['pod3c0']) == 2" # free isolated cpus kubectl delete pods --all --now --wait # pod4: opt-in isolated CPUs but require a fraction more CPUs than there are isolated CPUs ANNOTATIONS=('prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' 'prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"') CPU=2500m create guaranteed-annotated report allowed verify "'cpu08' in cpus['pod4c0'] and 'cpu09' in cpus['pod4c0']" \ "len(cpus['pod4c0']) == 4" # free isolated cpus kubectl delete pods --all --now --wait # pod5: opt-in isolated CPUs but require a fraction less CPUs than there are isolated CPUs ANNOTATIONS=('prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' 'prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"') CPU=1500m create guaranteed-annotated report allowed verify "'cpu08' in cpus['pod5c0'] or 'cpu09' in cpus['pod5c0']" \ "'cpu10' in cpus['pod5c0'] and 'cpu11' in cpus['pod5c0']" \ "len(cpus['pod5c0']) == 3" # free isolated cpus kubectl delete pods --all --now --wait # pod6: opt-in isolated CPUs but require a full CPU more than there # are isolated CPUs ANNOTATIONS=('prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' 'prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"') CPU=3000m create guaranteed-annotated report allowed verify "len(cpus['pod6c0']) == 3" \ "disjoint_sets(cpus['pod6c0'], {'cpu08', 'cpu09'})" \ "len(mems['pod6c0']) == 1" # pod7: sub-core is never eligble for isolated CPUs, even if annotated # to opt-in. ANNOTATIONS=('prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' 'prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"') CONTCOUNT=4 CPU=200m create guaranteed-annotated report allowed verify "disjoint_sets(set.union(cpus['pod7c0'], cpus['pod7c1'], cpus['pod7c2'], cpus['pod7c3']), {'cpu08', 'cpu09'})" \ "len(cpus['pod7c0']) >= 2" \ "len(cpus['pod7c1']) >= 2" \ "len(cpus['pod7c2']) >= 2" \ "len(cpus['pod7c3']) >= 2" # Cleanup kernel commandline, otherwise isolcpus will affect CPU # pinning and cause false negatives from other tests on this VM. vm-set-kernel-cmdline "" vm-reboot vm-command "grep isolcpus /proc/cmdline" && { error "failed to clean up isolcpus kernel commandline parameter" } echo "isolcpus removed from kernel commandline" launch cri-resmgr vm-command "systemctl restart kubelet" vm-wait-process --timeout 120 kube-apiserver ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test08-isolcpus/guaranteed-annotated.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "${ANNOTATIONS[0]}" ]; then echo " annotations: $(for annotation in "${ANNOTATIONS[@]}"; do echo " $annotation "; done) "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) resources: requests: cpu: ${CPU} memory: '${MEM}' limits: cpu: ${CPU} memory: '${MEM}' "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test09-container-exit/code.var.sh ================================================ # Test resource allocation / free on different container exit and # restart scenarios. 
CONTCOUNT=1 CPU=1000m MEM=64M create guaranteed report allowed verify 'len(cpus["pod0c0"]) == 1' \ '"pod0c0" in allocations' out '### Crash and restart pod0c0' vm-command "kubectl get pods pod0" vm-command "set -x; [[ -n \"\$(pgrep -f pod0c0)\" ]] && [[ \"\$(pgrep -f pod0c0 --oldest)\" != \"\$(pgrep -f pod0c0 --newest)\" ]]" || { command-error "There must be separate parent and child 'pod0c0' processes in order to run this test" } out '### Kill the root process in pod0c0. The container should get Restarted.' vm-command "kill -KILL \$(pgrep -f pod0c0 --oldest)" sleep 2 vm-command 'kubectl wait --for=condition=Ready pods/pod0' vm-run-until --timeout 30 "pgrep -f pod0c0 > /dev/null 2>&1" vm-command "kubectl get pods pod0" report allowed verify 'len(cpus["pod0c0"]) == 1' \ '"pod0c0" in allocations' out '### Kill the child process in pod0c0. The root process exits with status 0, the container should get Completed.' vm-command "kubectl get pods pod0" vm-command "ps axf | grep pod0c0; echo newest: \$(pgrep -f pod0c0 --newest)" vm-command "kill -KILL \$(pgrep -f pod0c0 --newest)" sleep 2 vm-command "kubectl get pods pod0" # pod0c0 process is not on vm anymore verify '"pod0c0" not in cpus' # pod0c0 is not allocated any resources on CRI-RM ( verify '"pod0c0" not in allocations' ) || { # pretty-print allocations contents pp allocations error "pod0c0 expected to disappear from allocations" } ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test10-additional-reserved-namespaces/code.var.sh ================================================ # Test that # - containers marked in the ReservedPoolNamespaces option are pinned on Reserved CPUs. (kubectl create namespace reserved-test) || true cri_resmgr_cfg_orig=$cri_resmgr_cfg # This script will create pods in the reserved and default namespaces. # Make sure the namespace is clear when starting the test and clean it up # if exiting with success. Otherwise leave the pod running for # debugging in case of a failure.
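# (For orientation: the configuration instantiated below sets
#    ReservedPoolNamespaces: ["reserved-pool", "reserved-*", "foobar"]
#  so whole namespaces are assigned to the reserved pool by exact name or
#  glob pattern, and kube-system pods land on reserved CPUs by default --
#  which is what the pod0 placement on cpu10-cpu11 verifies.)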
cleanup-test-pods() { ( kubectl delete pods pod0 -n kube-system --now --wait --ignore-not-found ) || true ( kubectl delete pods pod1 --now --wait --ignore-not-found ) || true } cleanup-test-pods terminate cri-resmgr AVAILABLE_CPU="cpuset:8-11" RESERVED_CPU="cpuset:10-11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved-namespaces.cfg) launch cri-resmgr CONTCOUNT=1 namespace=kube-system create besteffort CONTCOUNT=1 create besteffort report allowed verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}' verify 'cpus["pod1c0"] == {"cpu08", "cpu09"}' cleanup-test-pods # Test that # - containers that are namespace-assigned to reserved pools are pinned there # - containers that are annotated to opt-out are pinned elsewhere, and # - containers that are namespace-assigned and annotated to reserved pools are pinned there (kubectl create namespace foobar) || true cleanup-foobar-namespace() { (kubectl delete pods -n foobar --all --now --wait) || true } cleanup-foobar-namespace CONTCOUNT=1 namespace=foobar create besteffort ANN0='prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "false"' CONTCOUNT=1 namespace=foobar create besteffort ANN0='prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "true"' CONTCOUNT=1 namespace=foobar create besteffort report allowed verify 'cpus["pod2c0"] == {"cpu10", "cpu11"}' verify 'cpus["pod3c0"] == {"cpu08", "cpu09"}' verify 'cpus["pod4c0"] == {"cpu10", "cpu11"}' cleanup-foobar-namespace ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test10-additional-reserved-namespaces/cri-resmgr-reserved-namespaces.cfg.in ================================================ policy: Active: topology-aware ReservedResources: cpu: ${RESERVED_CPU} AvailableResources: cpu: ${AVAILABLE_CPU} topology-aware: ReservedPoolNamespaces: [\"reserved-pool\",\"reserved-*\",\"foobar\"] logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true dump: Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.* ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh ================================================ # Test that # - containers marked in Annotations are pinned on Reserved CPUs.
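# (Annotation scopes exercised below, both taken from this test:
#    prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "true"
#  opts in every container of the pod, whereas
#    prefer-reserved-cpus.cri-resource-manager.intel.com/container.special: "false"
#  targets only a container named "special". pod1's only container is
#  pod1c0, so the latter annotation leaves pod1c0 with the default
#  non-reserved placement that the verify below expects.)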
cri_resmgr_cfg_orig=$cri_resmgr_cfg cleanup-test-pods() { ( kubectl delete pods pod0 --now --wait --ignore-not-found ) || true ( kubectl delete pods pod1 --now --wait --ignore-not-found ) || true } cleanup-test-pods cri_resmgr_cfg_orig=$cri_resmgr_cfg terminate cri-resmgr AVAILABLE_CPU="cpuset:8-11" RESERVED_CPU="cpuset:10-11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved-annotations.cfg) launch cri-resmgr ANNOTATIONS='prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "true"' CONTCOUNT=1 create reserved-annotated report allowed ANNOTATIONS='prefer-reserved-cpus.cri-resource-manager.intel.com/container.special: "false"' CONTCOUNT=1 create reserved-annotated report allowed verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}' verify 'cpus["pod1c0"] == {"cpu08"}' cleanup-test-pods terminate cri-resmgr cri_resmgr_cfg=$cri_resmgr_cfg_orig launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/cri-resmgr-reserved-annotations.cfg.in ================================================ policy: Active: topology-aware ReservedResources: cpu: ${RESERVED_CPU} AvailableResources: cpu: ${AVAILABLE_CPU} logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true dump: Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.* ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/reserved-annotated.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "${ANNOTATIONS[0]}" ]; then echo " annotations: $(for annotation in "${ANNOTATIONS[@]}"; do echo " $annotation "; done) "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) resources: requests: cpu: ${CPU} memory: '${MEM}' limits: cpu: ${CPU} memory: '${MEM}' "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2} ] ================================================ FILE: test/e2e/run.sh ================================================ #!/bin/bash DEMO_TITLE="Container Runtime End-to-End Testing" DEFAULT_DISTRO="ubuntu-22.04" PV='pv -qL' binsrc=${binsrc-local} SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" DEMO_LIB_DIR=$(realpath "$SCRIPT_DIR/../../demo/lib") OUTPUT_DIR=${outdir-"$SCRIPT_DIR"/output} COMMAND_OUTPUT_DIR="$OUTPUT_DIR"/commands # shellcheck disable=SC1091 # shellcheck source=../../demo/lib/command.bash source "$DEMO_LIB_DIR"/command.bash # shellcheck disable=SC1091 # shellcheck source=../../demo/lib/host.bash source "$DEMO_LIB_DIR"/host.bash # shellcheck disable=SC1091 # shellcheck source=../../demo/lib/vm.bash source "$DEMO_LIB_DIR"/vm.bash script_source="$(< "$0") $(< "$DEMO_LIB_DIR/host.bash") $(< "$DEMO_LIB_DIR/command.bash") $(< "$DEMO_LIB_DIR/vm.bash")" usage() { echo "$DEMO_TITLE" echo "Usage: [VAR=VALUE] ./run.sh MODE [SCRIPT]" echo " MODE: \"play\" plays the test as a demo." echo " \"record\" plays and records the demo." echo " \"test\" runs fast, reports pass or fail." 
echo " \"debug\" enables k8scri pipe debugging and" echo " copies sources of all *_src VARs (see below) to vm." echo " \"interactive\" launches interactive shell" echo " for running test script commands" echo " (see ./run.sh help script [FUNCTION])." echo " SCRIPT: test script file to run instead of the default test." echo "" echo " VARs:" echo " vm: govm virtual machine name." echo " For non-govm-managed hosts: set VM_IP and VM_SSH_USER, too." echo " 'ssh \$VM_SSH_USER@\$VM_IP sudo id' must not require password." echo " containerd_src:" echo " \"/host/path/to/go/project\": replace vm /usr/bin binaries" echo " from /host/path/to/go/project/bin directory." echo " The default is to use vm OS package manager containerd." echo " crio_src:" echo " \"/host/path/to/go/project\": replace vm /usr/bin binaries" echo " from /host/path/to/go/project/bin directory." echo " Must be set if crio is a part of \$k8scri and the vm distro" echo " does not have (or implement installing) cri-o packages." echo " crirm_src:" echo " \"/host/path/to/go/project\": replace vm /usr/local/bin binaries" echo " from /host/path/to/go/project/bin directory." echo " The default is to use the project of these e2e tests." echo " runc_src:" echo " \"/host/path/to/go/project\": replace vm /usr/bin binaries" echo " from /host/path/to/go/project/bin directory." echo " distro_binaries:" echo " 0: use the normal binaries built for this host (the default)." echo " 1: use binaries cross-built for distros." echo " binsrc: Where to get cri-resmgr to the vm." echo " \"github\": go get from master and build inside vm." echo " \"local\": (the default) copy from \${crirm_src}/bin, or" echo " from \${crirm_src}/binaries/\$distro if \$distro_binaries=1." echo " \"packages/\": use distro packages from this dir" echo " reinstall_:" echo " If 1, stop the daemon (if not runc)," echo " then reinstall and restart it before starting test run." echo " The default is 0." echo " Set containerd_src/crio_src/runc_src to install a local build." echo " reinstall_k8s: if 1, destroy existing k8s cluster and create a new one." echo " reinstall_bootstrap: if 1, run the bootstrap and proxy setup commands." echo " Only available if VM_IP is set when calling the script." echo " reinstall_all: if 1, set all above reinstall_* options to 1." echo " omit_cri_resmgr: if 1, omit checking/installing/starting cri-resmgr." echo " omit_agent: if 1, omit checking/installing/starting cri-resmgr-agent." echo " outdir: Save output under given directory." echo " The default is \"${SCRIPT_DIR}/output\"." echo " speed: Demo play speed." echo " The default is 10 (keypresses per second)." echo " cleanup: Level of cleanup after a test run:" echo " 0: leave vm running (the default)" echo " 1: delete vm" echo " 2: stop vm, but do not delete it." echo " Hook VARs:" echo " on_vm_online: code to be executed when SSH connection to vm works." echo " on_k8s_online: code to be executed when Kubernetes is ready for use." echo " on_verify_fail, on_create_fail: code to be executed in case" echo " verify() or create() fails. Example: go to interactive" echo " mode if a verification fails: on_verify_fail=interactive" echo " on_verify, on_create, on_launch: code to be executed every time" echo " after verify/create/launch function" echo " on_{cri,runc,k8s}_install: code to be executed right after installing" echo " these components." echo "" echo " VM configuration VARs: (effective when vm is not already configured)" echo " topology: JSON to override NUMA node list used in tests." 
echo " See: python3 ${DEMO_LIB_DIR}/topology2qemuopts.py --help" echo " distro: Linux distribution to be / already installed on vm." echo " Supported values: debian-11, debian-12, debian-sid" echo " fedora, opensuse-tumbleweed," echo " opensuse-15.6 (same as opensuse), sles," echo " ubuntu-18.04, ubuntu-20.04, ubuntu-22.04, ubuntu-24.04" echo " If sles: set VM_SLES_REGCODE= to use official packages." echo " cgroups: cgroups version in the VM, v1 or v2. The default is v1." echo " cgroups=v2 is supported only on distro=fedora" echo " k8s: Kubernetes version to be installed on VM creation" echo " The default is the latest available on selected distro." echo " Example: k8s=1.31" echo " k8scri: The container runtime pipe where kubelet connects to." echo " Options are:" echo " \"cri-resmgr|containerd\" cri-resmgr is a proxy to containerd." echo " \"cri-resmgr|crio\" cri-resmgr is a proxy to cri-o." echo " \"containerd\" containerd, no cri-resmgr." echo " \"containerd&cri-resmgr\" containerd, cri-resmgr is an NRI plugin." echo " \"crio\" cri-o, no cri-resmgr." echo " \"crio&cri-resmgr\" cri-o, cri-resmgr is an NRI plugin." echo " The default is \"cri-resmgr|containerd\"." echo " k8scni: The container network interface plugin to install. Options are:" echo " \"cilium\" (the default), \"flannel\", \"weavenet\"." echo " k8smaster: Name of the existing vm whose cluster this vm will join." echo " If empty (default), this vm forms its own single-node cluster." echo " crio_version: Version of cri-o to try to pull in, if cri-o is" echo " not being installed from sources." echo " setup_proxies: Setup proxies even if not using govm based VM." echo " This is only needed if you have set VM_IP and want" echo " the proxy information set in the target host. By default" echo " the proxies are not set if VM_IP is set." echo "" echo " Test input VARs:" echo " cri_resmgr_cfg: configuration file forced to cri-resmgr." echo " cri_resmgr_extra_args: arguments to be added on cri-resmgr" echo " command line when launched" echo " cri_resmgr_agent_extra_args: arguments to be added on" echo " cri-resmgr-agent command line when launched" echo " use_host_images: if \"1\", export images from the host docker" echo " to vm whenever they are available." echo " The default is 0: always pull images from repositories to vm." echo " vm_files: \"serialized\" associative array of files to be created on vm" echo " associative array syntax:" echo " vm_files['/path/file']=file:/path/on/host" echo " ='data:,plain text content'" echo " =data:;base64,ZGF0YQ==" echo " =dir: (creates only /path/file directory)" echo " vm_files['/etc/motd']='data:,hello world'" echo " How to execute run.sh with serialized array:" echo " vm_files=\$(declare -p vm_files) ./run.sh" echo " code: Variable that contains test script code to be run" echo " if SCRIPT is not given." echo " py_consts: Python code that runs always before pyexec in SCRIPT." echo "" echo "Default test input VARs: ./run.sh help defaults" echo "" echo "Create VM 'foo' that runs k8s 1.28 on Debian Sid:" echo "vm=foo distro=debian-sid k8s=1.28 ./run.sh interactive" } error() { (echo ""; echo "error: $1" ) >&2 command-exit-if-not-interactive } out() { if [ -n "$PV" ]; then speed=${speed-10} echo "$1" | $PV "$speed" else echo "$1" fi echo "" } record() { clear out "Recording this screencast..." host-command "asciinema rec -t \"$DEMO_TITLE\" crirm-demo-blockio.cast -c \"./run.sh play\"" } screen-create-vm() { speed=60 out "### Running the test in vm=\"$VM_NAME\"." 
host-create-vm "$vm" "$topology" vm-networking if [ -z "$VM_IP" ]; then error "creating VM failed" fi } screen-install-cri-resmgr() { speed=60 out "### Installing CRI Resource Manager to VM." vm-install-cri-resmgr } screen-launch-cri-resmgr() { speed=60 out "### Launching cri-resmgr with config $cri_resmgr_cfg." if [ "${binsrc#packages}" != "$binsrc" ]; then launch cri-resmgr-systemd else launch cri-resmgr fi } screen-create-singlenode-cluster() { speed=60 out "### Setting up single-node Kubernetes cluster." speed=60 out "### Container runtime parts: $k8scri" vm-create-singlenode-cluster } screen-launch-cri-resmgr-agent() { speed=60 out "### Launching cri-resmgr-agent." speed=60 out "### The agent will make cri-resmgr configurable with ConfigMaps." launch cri-resmgr-agent } get-py-allowed() { topology_dump_file="$OUTPUT_DIR/topology_dump.$VM_NAME" res_allowed_file="$OUTPUT_DIR/res_allowed.$VM_NAME" if ! [ -f "$topology_dump_file" ]; then vm-command "$("$DEMO_LIB_DIR/topology.py" bash_topology_dump)" >/dev/null || { command-error "error fetching topology_dump from $VM_NAME" } echo -e "$COMMAND_OUTPUT" > "$topology_dump_file" fi get-res-allowed "$res_allowed_file" py_allowed=" import re allowed=$("$DEMO_LIB_DIR/topology.py" -t "$topology_dump_file" -r "$res_allowed_file" res_allowed -o json) _branch_pod=[(p, d, n, c, t, cpu, pod.rsplit('/', 1)[0]) for p in allowed for d in allowed[p] for n in allowed[p][d] for c in allowed[p][d][n] for t in allowed[p][d][n][c] for cpu in allowed[p][d][n][c][t] for pod in allowed[p][d][n][c][t][cpu]] # cpu resources allowed for a pod: packages, dies, nodes, cores, threads, cpus = {}, {}, {}, {}, {}, {} # mem resources allowed for a pod: mems = {} for p, d, n, c, t, cpu, pod in _branch_pod: if c == 'mem': # this _branch_pod entry is about memory if not pod in mems: mems[pod] = set() # topology.py can print memory nodes as children of cpu-ful nodes # if distance looks like they are behind the same memory controller. # The thread field, however, is the true node who contains the memory. mems[pod].add(t) continue # this _branch_pod entry is about cpu if not pod in packages: packages[pod] = set() dies[pod] = set() nodes[pod] = set() cores[pod] = set() threads[pod] = set() cpus[pod] = set() packages[pod].add(p) dies[pod].add('%s/%s' % (p, d)) nodes[pod].add(n) cores[pod].add('%s/%s' % (n, c)) threads[pod].add('%s/%s/%s' % (n, c, t)) cpus[pod].add(cpu) def disjoint_sets(*sets): 'set.isdisjoint() for n > 1 sets' s = sets[0] for next in sets[1:]: if not s.isdisjoint(next): return False s = s.union(next) return True def set_ids(str_ids, chars='[a-z]'): num_ids = set() for str_id in str_ids: if '/' in str_id: num_ids.add(tuple(int(re.sub(chars, '', s)) for s in str_id.split('/'))) else: num_ids.add(int(re.sub(chars, '', str_id))) return num_ids package_ids = lambda i: set_ids(i, '[package]') die_ids = lambda i: set_ids(i, '[packagedie]') node_ids = lambda i: set_ids(i, '[node]') core_ids = lambda i: set_ids(i, '[nodecore]') thread_ids = lambda i: set_ids(i, '[nodecorethread]') cpu_ids = lambda i: set_ids(i, '[cpu]') " } get-res-allowed() { local res_allowed_file="$1" local retries=5 while (( retries > 0 )); do # Fetch data and update allowed* variables from the virtual machine vm-command "$("$DEMO_LIB_DIR/topology.py" bash_res_allowed 'pod[0-9]*c[0-9]*')" >/dev/null || { command-error "error fetching res_allowed from $VM_NAME" } echo -e "$COMMAND_OUTPUT" > "$res_allowed_file" # Validate res_allowed_file. 
Retry if there is same container # name with two different sets of allowed CPUs or # memories. This is possible if cpuset.cpus of the cgroup has # been just changed and different processes in the same # container are just going through the change. Or if there are # several pods/containers running with the same name. awk -F "[ /]" '{if (pod[$1]!=0 && pod[$1]!=""$3""$4){print "error: ambiguous allowed resources for name "$1; exit(1)};pod[$1]=""$3""$4}' "$res_allowed_file" && return 0 mv "$res_allowed_file" "$res_allowed_file.retries${retries}" echo " see $res_allowed_file.retries${retries} for more details" retries=$(( retries - 1 )) done error "error: container/process name collision: test environment may need cleanup." } get-py-cache() { # Fetch current cri-resmgr cache from a virtual machine. speed=1000 vm-command "cat \"/var/lib/cri-resmgr/cache\"" >/dev/null 2>&1 || { command-error "fetching cache file failed" } cat > "${OUTPUT_DIR}/cache" <<<"$COMMAND_OUTPUT" py_cache=" import json cache=json.load(open(\"${OUTPUT_DIR}/cache\")) try: allocations=json.loads(cache['PolicyJSON']['allocations']) except KeyError: allocations=None containers=cache['Containers'] pods=cache['Pods'] for _contid in list(containers.keys()): try: _cmd = ' '.join(containers[_contid]['Command']) except: continue # Command may be None # Recognize echo podXcY ; sleep inf -type test pods and make them # easily accessible: containers['pod0c0'], pods['pod0'] if 'echo pod' in _cmd and 'sleep inf' in _cmd: _contname = _cmd.split()[3] # _contname is podXcY _podid = containers[_contid]['PodID'] _podname = pods[_podid]['Name'] # _podname is podX if not allocations is None and _contid in allocations: allocations[_contname] = allocations[_contid] containers[_contname] = containers[_contid] pods[_podname] = pods[_podid] " } resolve-template() { local name="$1" r="" d t shift for d in "$@"; do if [ -z "$d" ] || ! [ -d "$d" ]; then continue fi t="$d/$name.in" if ! [ -e "$t" ]; then continue fi if [ -z "$r" ]; then r="$t" echo 1>&2 "template $name resolved to file $r" else echo 1>&2 "WARNING: template file $r shadows $t" fi done if [ -n "$r" ]; then echo "$r" return 0 fi return 1 } is-hooked() { local hook_code_var hook_code hook_code_var=$1 hook_code="${!hook_code_var}" if [ -n "${hook_code}" ]; then return 0 # logic: if is-hooked xyz; then run-hook xyz; fi fi return 1 } run-hook() { local hook_code_var hook_code hook_code_var=$1 hook_code="${!hook_code_var}" echo "Running hook: $hook_code_var" eval "${hook_code}" } install-files() { # Usage: install-files $(declare -p files_assoc_array) # # Parameter is a serialized associative array with # key: target filepath on VM # value: source URL ("file:", limited "data:" and "dir:" schemes supported) # # Example: build an associative array and install files in the array # files['/path/file1']=file:/hostpath/file # files['/path/file2']=data:,hello # files['/path/file3']=data:;base64,aGVsbG8= # files['/path/dir1']='dir:' # install-files "$(declare -p files)" local -A files eval "files=${1#*=}" local tgt src data for tgt in "${!files[@]}"; do src="${files[$tgt]}" case $src in "data:,"*) data=${src#data:,} ;; "data:;base64,"*) data=$(base64 -d <<< "${src#data:;base64,}") ;; "file:"*) data=$(< "${src#file:}") ;; "dir:") echo -n "Creating on vm: $tgt/... " vm-command-q "mkdir -p \"$tgt\"" || { error "failed to make directory to vm \"$tgt\"" } echo "ok." 
continue ;; *) error "invalid source scheme \"${src}\", expected \"data:,\" \"data:;base64,\", \"file:\" or \"dir:\"" ;; esac echo -n "Writing on vm: $tgt... " vm-write-file "$tgt" "$data" || { error "failed to write to vm file \"$tgt\"" } echo "ok." done } ### Test script helpers install() { # script API # Usage: install TARGET # # Supported TARGETs: # cri-resmgr: install cri-resmgr to VM. # Install latest local build to VM: (the default) # $ install cri-resmgr # Fetch github master to VM, build and install on VM: # $ binsrc=github install cri-resmgr # cri-resmgr-webhook: install cri-resmgr-webhook to VM. # Installs from the latest webhook Docker image on the host. # # Example: # uninstall cri-resmgr # install cri-resmgr # launch cri-resmgr local target="$1" case "$target" in "cri-resmgr") vm-install-cri-resmgr ;; "cri-resmgr-agent") vm-install-cri-resmgr-agent ;; "cri-resmgr-webhook") vm-install-cri-resmgr-webhook ;; *) error "unknown target to install \"$1\"" ;; esac } uninstall() { # script API # Usage: uninstall TARGET # # Supported TARGETs: # cri-resmgr: stop (kill) cri-resmgr and purge all files from VM. # cri-resmgr-webhook: stop cri-resmgr-webhook and delete webhook files from VM. local target="$1" case $target in "cri-resmgr") terminate cri-resmgr terminate cri-resmgr-agent distro-remove-pkg cri-resource-manager vm-command "rm -rf /usr/local/bin/cri-resmgr /usr/bin/cri-resmgr /usr/local/bin/cri-resmgr-agent /usr/bin/cri-resmgr-agent /var/lib/cri-resmgr /etc/cri-resmgr" ;; "cri-resmgr-agent") terminate cri-resmgr-agent vm-command "rm -rf /usr/local/bin/cri-resmgr /usr/bin/cri-resmgr /usr/local/bin/cri-resmgr-agent /usr/bin/cri-resmgr-agent /var/lib/cri-resmgr /etc/cri-resmgr" ;; "cri-resmgr-webhook") terminate cri-resmgr-webhook vm-command "rm -rf webhook" ;; *) error "uninstall: invalid target \"$target\"" ;; esac } launch() { # script API # Usage: launch TARGET # # Supported TARGETs: # cri-resmgr: launch cri-resmgr on VM. Environment variables: # cri_resmgr_cfg: configuration filepath (on host) # cri_resmgr_extra_args: extra arguments on command line # cri_resmgr_config: "force" (default) or "fallback" # k8scri: if the CRI pipe starts with cri-resmgr # this launches cri-resmgr as a proxy, # otherwise as a dynamic NRI plugin. # # cri-resmgr-systemd: # launch cri-resmgr on VM using "systemctl start". # Works when installed with binsrc=packages/. # Environment variables: # cri_resmgr_cfg: configuration filepath (on host) # # cri-resmgr-agent: # launch cri-resmgr-agent on VM. Environment variables: # cri_resmgr_agent_extra_args: extra arguments on command line # # cri-resmgr-webhook: # deploy cri-resmgr-webhook from the image on VM. 
# # Example: # cri_resmgr_cfg=/tmp/topology-aware.cfg launch cri-resmgr local target="$1" local launch_cmd local adjustment_schema="$HOST_PROJECT_DIR/pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml" local cri_resmgr_config_option="-${cri_resmgr_config:-force}-config" local cri_resmgr_mode="" case $target in "cri-resmgr") host-command "$SCP \"$cri_resmgr_cfg\" $VM_SSH_USER@$VM_IP:" || { command-error "copying \"$cri_resmgr_cfg\" to VM failed" } vm-command "cat $(basename "$cri_resmgr_cfg")" if [[ "$k8scri" == cri-resmgr* ]]; then # launch cri-resmgr as the top element in the k8s container runtime stack cri_resmgr_mode="-relay-socket ${cri_resmgr_sock} -runtime-socket $cri_sock -image-socket $cri_sock" else # launch cri-resmgr as an NRI plugin to running container runtime cri_resmgr_mode="-use-nri-plugin" fi launch_cmd="cri-resmgr $cri_resmgr_mode $cri_resmgr_config_option $(basename "$cri_resmgr_cfg") $cri_resmgr_extra_args" vm-command-q "rm -f $cri_resmgr_pidfile" vm-command-q "echo '$launch_cmd' > cri-resmgr.launch.sh ; rm -f cri-resmgr.output.txt" vm-command "$launch_cmd >cri-resmgr.output.txt 2>&1 &" vm-wait-process --timeout 30 --pidfile "$cri_resmgr_pidfile" cri-resmgr vm-command "grep 'FATAL ERROR' cri-resmgr.output.txt" >/dev/null 2>&1 && { command-error "launching cri-resmgr failed with FATAL ERROR" } vm-command "fuser ${cri_resmgr_pidfile}" >/dev/null 2>&1 || { echo "cri-resmgr last output line:" vm-command-q "tail -n 1 cri-resmgr.output.txt" command-error "launching cri-resmgr failed, cannot find cri-resmgr PID" } ;; "cri-resmgr-agent") host-command "$SCP \"$adjustment_schema\" $VM_SSH_USER@$VM_IP:" || command-error "copying \"$adjustment_schema\" to VM failed" vm-command "kubectl delete -f $(basename "$adjustment_schema"); kubectl create -f $(basename "$adjustment_schema")" launch_cmd="NODE_NAME=\$(hostname) cri-resmgr-agent -kubeconfig /root/.kube/config $cri_resmgr_agent_extra_args" vm-command-q "echo '$launch_cmd' >cri-resmgr-agent.launch.sh; rm -f cri-resmgr-agent.output.txt" vm-command "$launch_cmd >cri-resmgr-agent.output.txt 2>&1 &" vm-wait-process --timeout 30 cri-resmgr-agent vm-command "grep 'FATAL ERROR' cri-resmgr-agent.output.txt" >/dev/null 2>&1 && command-error "launching cri-resmgr-agent failed with FATAL ERROR" vm-command "fuser ${cri_resmgr_agent_sock}" >/dev/null 2>&1 || command-error "launching cri-resmgr-agent failed, cannot find cri-resmgr-agent PID" ;; "cri-resmgr-systemd") host-command "$SCP \"$cri_resmgr_cfg\" $VM_SSH_USER@$VM_IP:" || command-error "copying \"$cri_resmgr_cfg\" to VM failed" vm-command "cp \"$(basename "$cri_resmgr_cfg")\" /etc/cri-resmgr/fallback.cfg" vm-command "systemctl daemon-reload ; systemctl start cri-resource-manager" || command-error "systemd failed to start cri-resource-manager" vm-wait-process --timeout 30 cri-resmgr vm-command "systemctl is-active cri-resource-manager" || { vm-command "systemctl status cri-resource-manager" command-error "cri-resource-manager did not become active after systemctl start" } ;; "cri-resmgr-webhook") kubectl apply -f webhook/webhook-deployment.yaml kubectl wait --for=condition=Available -n cri-resmgr deployments/cri-resmgr-webhook || error "cri-resmgr-webhook deployment did not become Available" kubectl apply -f webhook/mutating-webhook-config.yaml ;; *) error "launch: invalid target \"$1\"" ;; esac is-hooked on_launch && run-hook on_launch return 0 } terminate() { # script API # Usage: terminate TARGET # # Supported TARGETs: # cri-resmgr: stop (kill) cri-resmgr. 
# cri-resmgr-agent: stop (kill) cri-resmgr-agent. # cri-resmgr-webhook: delete cri-resmgr-webhook from k8s. local target="$1" case $target in "cri-resmgr") vm-command "fuser --kill ${cri_resmgr_pidfile} 2>/dev/null" ;; "cri-resmgr-agent") vm-command "fuser --kill ${cri_resmgr_agent_sock} 2>/dev/null" ;; "cri-resmgr-webhook") vm-command "kubectl delete -f webhook/mutating-webhook-config.yaml; kubectl delete -f webhook/webhook-deployment.yaml" ;; *) error "terminate: invalid target \"$target\"" ;; esac } sleep() { # script API # Usage: sleep PARAMETERS # # Run sleep PARAMETERS on host. host-command "sleep $*" } extended-resources() { # script API # Usage: extended-resources RESOURCE [VALUE] # # Examples: # extended-resources remove cmk.intel.com/exclusive-cpus # extended-resources add cmk.intel.com/exclusive-cpus 4 local action="$1" local resource="$2" local value="$3" local resource_escaped="${resource/\//~1}" if [ -z "$resource" ]; then error "extended-resource: missing resource" return 1 fi # make sure kubectl proxy is running vm-command-q "ss -ltn | grep -q 127.0.0.1:8001 || { kubectl proxy &>/dev/null "$PYEXEC_STATE_PY" for PYTHONCODE in "$@"; do { echo "from pyexec_state import *" echo -e "$PYTHONCODE" } > "$PYEXEC_PY" PYTHONPATH="$OUTPUT_DIR:$PYTHONPATH:$DEMO_LIB_DIR" python3 "$PYEXEC_PY" 2>&1 | tee "$PYEXEC_LOG" last_exit_status=${PIPESTATUS[0]} if [ "$last_exit_status" != "0" ]; then error "pyexec: non-zero exit status \"$last_exit_status\", see \"$PYEXEC_PY\" and \"$PYEXEC_LOG\"" fi done return "$last_exit_status" } pp() { # script API # Usage: pp EXPR # # Pretty-print the value of Python expression EXPR. pyexec "pp($*)" } report() { # script API # Usage: report [VARIABLE...] # # Updates and reports current value of VARIABLE. # # Supported VARIABLEs: # allocations # allowed # cache # # Example: print cri-resmgr policy allocations. In interactive mode # you may use a pager like less. # report allocations | less local varname for varname in "$@"; do if [ "$varname" == "allocations" ]; then get-py-cache pyexec " import pprint pprint.pprint(allocations) " elif [ "$varname" == "allowed" ]; then get-py-allowed pyexec " import topology print(topology.str_tree(allowed)) " elif [ "$varname" == "cache" ]; then get-py-cache pyexec " import pprint pprint.pprint(cache) " else error "report: unknown variable \"$varname\"" fi done } verify() { # script API # Usage: verify [--retry N] [EXPR...] # # Run python3 -c "assert(EXPR)" to test that every EXPR is True. # Stop immediately after the first failing assertion and fail the test. # # If a verify is expected to fail, failing the whole test can be # prevented by running the verify in a subshell (in parenthesis): # (verify 'False') || echo '...this was expected to fail.' # # --retry N reruns all assertions at most N times before failing # the test. All assertions must hold at the same time for a # successful verification. By default N=3. # # Variables available in EXPRs: # See variables in: help pyexec # # Note that all variables are updated every time verify is called # before evaluating (asserting) expressions. # # Example: require that containers pod0c0 and pod1c0 run on separate NUMA # nodes and that pod0c0 is allowed to run on 4 CPUs: # verify 'set.intersection(nodes["pod0c0"], nodes["pod1c0"]) == set()' \ # 'len(cpus["pod0c0"]) == 4' local retries=3 local poll_delay=1s if [[ "$1" == "--retry" ]]; then retries="$2" shift; shift fi while ! 
_verify "$@"; do if (( retries <= 0 )); then if is-hooked on_verify_fail; then run-hook on_verify_fail else command-exit-if-not-interactive fi return 1 fi out "### Retrying verify at most $retries time(s) after $poll_delay..." sleep "$poll_delay" retries=$(( retries - 1 )) done is-hooked on_verify && run-hook on_verify return 0 } _verify() { get-py-allowed get-py-cache for py_assertion in "$@"; do speed=1000 out "### Verifying assertion '$py_assertion'" ( speed=1000 pyexec " try: import time,sys assert(${py_assertion}) except KeyError as e: print('WARNING: *') print('WARNING: *** KeyError - %s' % str(e)) print('WARNING: *** Your verify expression might have a typo/thinko.') print('WARNING: *') sys.stdout.flush() time.sleep(5) raise e except IndexError as e: print('WARNING: *') print('WARNING: *** IndexError - %s' % str(e)) print('WARNING: *** Your verify expression might have a typo/thinko.') print('WARNING: *') sys.stdout.flush() time.sleep(5) raise e " ) || { out "### The assertion FAILED ### post-mortem debug help begin ### cd $OUTPUT_DIR python3 from pyexec_state import * $py_assertion ### post-mortem debug help end ###" echo "verify: assertion '$py_assertion' failed." >> "$SUMMARY_FILE" return 1 } speed=1000 out "### The assertion holds." done return 0 } kubectl-force-delete() { # script API # Usage: kubectl-force-delete RESOURCE NAME # # Force-deleting a "Terminating" namespace clears finalizers that # have failed to finish. Therefore there may be resources left in the # namespace NAME. Following command prints them. # # kubectl api-resources --verbs=list --namespaced -o name | \ # xargs -n 1 kubectl get --show-kind --ignore-not-found -n NAME # # Example: delete a namespace that is stuck in the "Terminating" phase # # kubectl-force-delete namespace my-namespace if [ -z "$1" ]; then error "missing RESOURCE" return 1 fi if [ -z "$2" ]; then error "missing resource NAME" return 1 fi if [[ "$1" == "namespace" ]] || [[ "$1" == "ns" ]]; then local ns="$2" vm-command " kubectl get namespace $ns -o json > force-delete-ns.json || exit 0 ( grep -E phase.*Terminating force-delete-ns.json || exit 0 tr -d '\n' < force-delete-ns.json \ | sed 's/\"finalizers\": \[[^]]\+\]/\"finalizers\": []/' \ | kubectl replace --raw /api/v1/namespaces/$ns/finalize -f - ) rm -f force-delete-ns.json " else error "unsupported force-delete resource: $1" return 1 fi } kubectl() { # script API # Usage: kubectl parameters # # Runs kubectl command on virtual machine. vm-command "kubectl $*" || { command-error "kubectl $* failed" } } delete() { # script API # Usage: delete PARAMETERS # # Run "kubectl delete PARAMETERS". vm-command "kubectl delete $*" || { command-error "kubectl delete failed" } } instantiate() { # script API # Usage: instantiate FILENAME # # Produces $OUTPUT_DIR/instance/FILENAME. Prints the filename on success. # Uses FILENAME.in as source (resolved from $TEST_DIR, $TOPOLOGY_DIR, ...) local FILENAME="$1" local RESULT="$OUTPUT_DIR/instance/$FILENAME" template_file=$(resolve-template "$FILENAME" "$TEST_DIR" "$TOPOLOGY_DIR" "$POLICY_DIR" "$SCRIPT_DIR") if [ ! 
-f "$template_file" ]; then error "error instantiating \"$FILENAME\": missing template ${template_file}" fi mkdir -p "$(dirname "$RESULT")" 2>/dev/null eval "echo -e \"$(<"${template_file}")\"" | grep -v '^ *$' > "$RESULT" || error "instantiating \"$FILENAME\" failed" echo "$RESULT" } declare -a pulled_images_on_vm create() { # script API # Usage: [VAR=VALUE][n=COUNT] create TEMPLATE_NAME # # Create n instances from TEMPLATE_NAME.yaml.in, copy each of them # from host to vm, kubectl create -f them, and wait for them # becoming Ready. Templates are searched in $TEST_DIR, $TOPOLOGY_DIR, # $POLICY_DIR, and $SCRIPT_DIR in this order of preference. The first # template found is used. # # Parameters: # TEMPLATE_NAME: the name of the template without extension (.yaml.in) # # Optional parameters (VAR=VALUE): # namespace: namespace to which instances are created # wait: condition to be waited for (see kubectl wait --for=condition=). # If empty (""), skip waiting. The default is wait="Ready". # wait_t: wait timeout. The default is wait_t=240s. local template_file template_file=$(resolve-template "$1.yaml" "$TEST_DIR" "$TOPOLOGY_DIR" "$POLICY_DIR" "$SCRIPT_DIR") local namespace_args local template_kind template_kind=$(awk '/kind/{print tolower($2)}' < "$template_file") local wait=${wait-Ready} local wait_t=${wait_t-240s} local images local image local tag local errormsg local default_name=${NAME:-""} if [ -z "$n" ]; then local n=1 fi if [ -n "${namespace:-}" ]; then namespace_args="-n $namespace" else namespace_args="" fi if [ ! -f "$template_file" ]; then error "error creating from template \"$template_file.yaml.in\": template file not found" fi for _ in $(seq 1 $n); do kind_count[$template_kind]=$(( ${kind_count[$template_kind]} + 1 )) if [ -n "$default_name" ]; then local NAME="$default_name" else local NAME="${template_kind}$(( ${kind_count[$template_kind]} - 1 ))" # the first pod is pod0 fi eval "echo -e \"$(<"${template_file}")\"" | grep -v '^ *$' > "$OUTPUT_DIR/$NAME.yaml" host-command "$SCP \"$OUTPUT_DIR/$NAME.yaml\" $VM_SSH_USER@$VM_IP:" || { command-error "copying \"$OUTPUT_DIR/$NAME.yaml\" to VM failed" } vm-command "cat $NAME.yaml" images="$(grep -E '^ *image: .*$' "$OUTPUT_DIR/$NAME.yaml" | sed -E 's/^ *image: *([^ ]*)$/\1/g' | sort -u)" if [ "${#pulled_images_on_vm[@]}" = "0" ]; then # Initialize pulled images available on VM vm-command "crictl -i unix://${k8scri_sock} images" >/dev/null && while read -r image tag _; do if [ "$image" = "IMAGE" ]; then continue fi local notopdir_image="${image#*/}" local norepo_image="${image##*/}" if [ "$tag" = "latest" ]; then pulled_images_on_vm+=("$image") pulled_images_on_vm+=("$notopdir_image") pulled_images_on_vm+=("$norepo_image") fi pulled_images_on_vm+=("$image:$tag") pulled_images_on_vm+=("$notopdir_image:$tag") pulled_images_on_vm+=("$norepo_image:$tag") done <<< "$COMMAND_OUTPUT" fi for image in $images; do if ! [[ " ${pulled_images_on_vm[*]} " == *" ${image} "* ]]; then if [ "$use_host_images" == "1" ] && vm-put-docker-image "$image"; then : # no need to pull the image to vm, it is now imported. else vm-command "crictl -i unix://${k8scri_sock} pull \"$image\"" || { errormsg="pulling image \"$image\" for \"$OUTPUT_DIR/$NAME.yaml\" failed." 
if is-hooked on_create_fail; then echo "$errormsg" run-hook on_create_fail else command-error "$errormsg" fi } fi pulled_images_on_vm+=("$image") fi done vm-command "kubectl create -f $NAME.yaml $namespace_args" || { if is-hooked on_create_fail; then echo "kubectl create error" run-hook on_create_fail else command-error "kubectl create error" fi } if [ "x$wait" != "x" ]; then speed=1000 vm-command "kubectl wait --timeout=${wait_t} --for=condition=${wait} $namespace_args ${template_kind}/$NAME" >/dev/null 2>&1 || { errormsg="waiting for ${template_kind} \"$NAME\" to become ready timed out" if is-hooked on_create_fail; then echo "$errormsg" run-hook on_create_fail else command-error "$errormsg" fi } fi done is-hooked on_create && run-hook on_create return 0 } reset() { # script API # Usage: reset counters # # Resets counters if [ "$1" == "counters" ]; then kind_count[pod]=0 else error "invalid reset \"$1\"" fi } interactive() { # script API # Usage: interactive # # Enter the interactive mode: read next script commands from # the standard input until "exit". echo "Entering the interactive mode until \"exit\"." INTERACTIVE_MODE=$(( INTERACTIVE_MODE + 1 )) # shellcheck disable=SC2162 while read -e -p "run.sh> " -a commands; do if [ "${commands[0]}" == "exit" ]; then break fi eval "${commands[@]}" done INTERACTIVE_MODE=$(( INTERACTIVE_MODE - 1 )) } help() { # script API # Usage: help [FUNCTION|all] # # Print help on all functions or on the FUNCTION available in script. awk -v f="$1" \ '/^[a-z].*script API/{split($1,a,"(");if(f==""||f==a[1]||f=="all"){print "";print a[1]":";l=2}} !/^ #/{l=l-1} /^ #/{if(l>=1){split($0,a,"#"); print " "a[2]; if (f=="") l=0}}' <<<"$script_source" } ### End of user code helpers test-user-code() { vm-command-q "kubectl get pods 2>&1 | grep -q NAME" && vm-command "kubectl delete pods --all --now --wait" ( eval "$code" ) || { TEST_FAILURES="${TEST_FAILURES} test script failed" } } # Validate parameters input_var_names="mode user_script_file distro k8scri k8smaster vm cgroups speed binsrc reinstall_all reinstall_containerd reinstall_crio reinstall_cri_resmgr reinstall_k8s reinstall_oneshot outdir cleanup on_verify_fail on_create_fail on_verify on_create on_launch topology cri_resmgr_cfg cri_resmgr_extra_args cri_resmgr_agent_extra_args code py_consts" INTERACTIVE_MODE=0 mode=$1 user_script_file=$2 distro=${distro:=$DEFAULT_DISTRO} k8s=${k8s:=} k8scri=${k8scri:="cri-resmgr|containerd"} k8smaster=${k8smaster:=} cri_resmgr_pidfile="/var/run/cri-resmgr*.pid" cri_resmgr_sock="/var/run/cri-resmgr/cri-resmgr.sock" cri_resmgr_agent_sock="/var/run/cri-resmgr/cri-resmgr-agent.sock" case "${k8scri}" in "cri-resmgr|containerd") k8scri_sock="${cri_resmgr_sock}" cri_sock="/var/run/containerd/containerd.sock" cri=containerd ;; "cri-resmgr|crio") k8scri_sock="${cri_resmgr_sock}" cri_sock="/var/run/crio/crio.sock" cri=crio ;; "containerd") k8scri_sock="/var/run/containerd/containerd.sock" cri_sock="/var/run/containerd/containerd.sock" cri=containerd omit_cri_resmgr=1 omit_agent=1 ;; "containerd&cri-resmgr") k8scri_sock="/var/run/containerd/containerd.sock" cri_sock="/var/run/containerd/containerd.sock" cri=containerd ;; "crio") k8scri_sock="/var/run/crio/crio.sock" cri_sock="/var/run/crio/crio.sock" cri=crio omit_cri_resmgr=1 omit_agent=1 ;; "crio&cri-resmgr") k8scri_sock="/var/run/crio/crio.sock" cri_sock="/var/run/crio/crio.sock" cri=crio ;; *) error "unsupported k8scri: \"${k8scri}\"" ;; esac distro_binaries=${distro_binaries:=0} containerd_src=${containerd_src:=} 
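# (Sketch of the BIN_DIR selection below: with distro_binaries=1 and, say,
#  distro=ubuntu-22.04, binaries are taken from
#  ${crirm_src}/binaries/ubuntu-22.04, otherwise from ${crirm_src}/bin.)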
crio_src=${crio_src:=} crirm_src=${crirm_src:=$HOST_PROJECT_DIR} runc_src=${runc_src:=} crio_version=${crio_version:=} if [ "$distro_binaries" = "1" ]; then if [ -z "$distro" ]; then error "distro_binaries=1 but distro is not set" fi BIN_DIR=${crirm_src}/binaries/$distro else BIN_DIR=${crirm_src}/bin fi TOPOLOGY_DIR=${TOPOLOGY_DIR:=e2e} vm=${vm:=$(basename ${TOPOLOGY_DIR})-${distro}-${cri}} vm_files=${vm_files:-""} cgroups=${cgroups:-v1} cri_resmgr_cfg=${cri_resmgr_cfg:-"${SCRIPT_DIR}/cri-resmgr-topology-aware.cfg"} cri_resmgr_extra_args=${cri_resmgr_extra_args:-""} cri_resmgr_agent_extra_args=${cri_resmgr_agent_extra_args:-""} cleanup=${cleanup:-0} reinstall_all=${reinstall_all:-0} reinstall_bootstrap=${reinstall_bootstrap:-0} reinstall_containerd=${reinstall_containerd:-0} reinstall_cri_resmgr=${reinstall_cri_resmgr:-0} reinstall_cri_resmgr_agent=${reinstall_cri_resmgr_agent:-0} reinstall_crio=${reinstall_crio:-0} reinstall_k8s=${reinstall_k8s:-0} reinstall_kubeadm=${reinstall_kubeadm:-0} reinstall_kubectl=${reinstall_kubectl:-0} reinstall_kubelet=${reinstall_kubelet:-0} reinstall_oneshot=${reinstall_oneshot:-0} reinstall_runc=${reinstall_runc:-0} if [ "$reinstall_all" == "1" ]; then for reinstall_var in ${!reinstall_*}; do eval "${reinstall_var}=1" done fi if [ "$reinstall_k8s" == "1" ]; then reinstall_kubeadm=1 reinstall_kubectl=1 reinstall_kubelet=1 fi if [ "$reinstall_bootstrap" == "1" ]; then setup_proxies=1 fi omit_agent=${omit_agent:-0} omit_cri_resmgr=${omit_cri_resmgr:-0} use_host_images=${use_host_images:-0} py_consts="${py_consts:-''}" topology=${topology:-'[ {"mem": "1G", "cores": 1, "nodes": 2, "packages": 2, "node-dist": {"4": 28, "5": 28}}, {"nvmem": "8G", "node-dist": {"5": 28, "0": 17}}, {"nvmem": "8G", "node-dist": {"2": 17}} ]'} code=${code:-" CPU=1 create guaranteed # creates pod 0, 1 CPU taken report allowed CPU=2 create guaranteed # creates pod 1, 3 CPUs taken report allowed CPU=3 create guaranteed # creates pod 2, 6 CPUs taken report allowed verify \\ 'len(cpus[\"pod0c0\"]) == 1' \\ 'len(cpus[\"pod1c0\"]) == 2' \\ 'len(cpus[\"pod2c0\"]) == 3' \\ 'len(set.union(cpus[\"pod0c0\"], cpus[\"pod1c0\"], cpus[\"pod2c0\"])) == 6' n=3 create besteffort # creates pods 3, 4 and 5 verify \\ 'set.intersection( set.union(cpus[\"pod0c0\"], cpus[\"pod1c0\"], cpus[\"pod2c0\"]), set.union(cpus[\"pod3c0\"], cpus[\"pod4c0\"], cpus[\"pod5c0\"])) == set()' delete pods pod2 # deletes pod 2, 3 CPUs taken n=2 create besteffort # creates pods 6 and 7 CPU=2 n=2 create guaranteed # creates pod 8 and 9, 7 CPUs taken verify \\ 'len(set.union(cpus[\"pod0c0\"], cpus[\"pod1c0\"], cpus[\"pod8c0\"], cpus[\"pod9c0\"])) == 7' "} warning_delay=${warning_delay:-5} yaml_in_defaults="CPU=1 MEM=100M ISO=true CPUREQ=1 CPULIM=2 MEMREQ=100M MEMLIM=200M CONTCOUNT=1" if [ "$mode" == "help" ]; then if [ "$2" == "defaults" ]; then echo "Test input defaults:" echo "" echo "topology=${topology}" echo "distro=${distro}" echo "k8s=${k8s}" echo "" echo "cri_resmgr_cfg=${cri_resmgr_cfg}" echo "" echo "cri_resmgr_extra_args=${cri_resmgr_extra_args}" echo "" echo -e "code=\"${code}\"" echo "" echo "The defaults to QOSCLASS.yaml.in variables:" echo " ${yaml_in_defaults}" elif [ "$2" == "script" ]; then if [ "x$3" == "x" ]; then help else help "$3" fi elif [ "x$2" == "x" ]; then usage else echo "invalid help page, try:" echo " ./run.sh help" echo " ./run.sh help defaults" echo " ./run.sh help script [FUNCTION|all]" exit 1 fi exit 0 elif [ "$mode" == "play" ]; then speed=${speed-10} elif [ "$mode" == "test" ]; then PV= elif [ 
"$mode" == "debug" ]; then PV= elif [ "$mode" == "interactive" ]; then PV= elif [ "$mode" == "record" ]; then record else usage error "missing valid MODE" exit 1 fi host-require-cmd jq host-require-cmd pv if [ -n "$user_script_file" ]; then if [ ! -f "$user_script_file" ]; then error "cannot find test script file \"$user_script_file\"" fi code=$(<"$user_script_file") fi # Prepare for test/demo mkdir -p "$OUTPUT_DIR" mkdir -p "$COMMAND_OUTPUT_DIR" rm -f "$COMMAND_OUTPUT_DIR"/0* ( echo x > "$OUTPUT_DIR"/x && rm -f "$OUTPUT_DIR"/x ) || { error "output directory outdir=$OUTPUT_DIR is not writable" } SUMMARY_FILE="$OUTPUT_DIR/summary.txt" echo -n "" > "$SUMMARY_FILE" || error "cannot write summary to \"$SUMMARY_FILE\"" ## Save test inputs and defaults for the record mkdir -p "$OUTPUT_DIR/input"; rm -f "$OUTPUT_DIR/input/*" for var in $input_var_names; do if [ -n "${!var}" ]; then echo -e "${!var}" > "$OUTPUT_DIR/input/${var}.var" fi done if [ "$binsrc" == "local" ]; then if [ "$omit_cri_resmgr" != "1" ]; then [ -f "${BIN_DIR}/cri-resmgr" ] || error "missing \"${BIN_DIR}/cri-resmgr\"" fi if [ "$omit_agent" != "1" ]; then [ -f "${BIN_DIR}/cri-resmgr-agent" ] || error "missing \"${BIN_DIR}/cri-resmgr-agent\"" fi fi host-get-vm-config "$vm" || host-set-vm-config "$vm" "$distro" "$cri" if [ -z "$VM_IP" ] || [ -z "$VM_SSH_USER" ]; then screen-create-vm else if [ "$setup_proxies" == "1" ]; then vm-setup-proxies fi if [ "$reinstall_bootstrap" == "1" ]; then vm-bootstrap fi fi is-hooked "on_vm_online" && run-hook "on_vm_online" if [ "$reinstall_oneshot" == "1" ] || ! vm-command-q "[ -f .vm-setup-oneshot ]"; then vm-setup-oneshot vm-command-q "touch .vm-setup-oneshot" fi if [ -n "$vm_files" ]; then install-files "$vm_files" fi if [ "$reinstall_containerd" == "1" ] || [ "$reinstall_crio" == "1" ] || ! vm-command-q "( type -p containerd || type -p crio ) >/dev/null"; then vm-install-cri is-hooked on_cri_install && run-hook on_cri_install fi # runc is installed as a dependency of containerd and crio. # If reinstalling runc is explictly wished for, it is safe to do # only after (re)installing contaienrd/crio. Otherwise # a custom locally built runc may be overridden from packages. if [ "$reinstall_runc" == "1" ] || ! vm-command-q "type -p runc >/dev/null"; then vm-install-runc is-hooked on_runc_install && run-hook on_runc_install fi if [ "$reinstall_k8s" == "1" ] || ! vm-command-q "type -p kubelet >/dev/null"; then vm-install-k8s is-hooked on_k8s_install && run-hook on_k8s_install fi if [ "$reinstall_cri_resmgr" == "1" ]; then uninstall cri-resmgr fi if [ "$reinstall_cri_resmgr_agent" == "1" ]; then uninstall cri-resmgr-agent fi if [[ "$k8scri" == cri-resmgr* ]] || [ -n "$crirm_src" ]; then if [ "$omit_cri_resmgr" != "1" ]; then if ! vm-command-q "type -p cri-resmgr >/dev/null"; then install cri-resmgr fi fi if [ "$omit_agent" != "1" ]; then if ! 
vm-command-q "type -p cri-resmgr-agent >/dev/null"; then install cri-resmgr-agent fi fi fi if [ "$mode" == "debug" ]; then vm-command-q "[ -x /root/go/bin/dlv ]" || vm-install-dlv if [ -d "$crio_src" ]; then vm-dlv-add-src "$crio_src" fi if [ -d "$containerd_src" ]; then vm-dlv-add-src "$containerd_src" fi if [ -d "$crirm_src" ]; then vm-dlv-add-src "$crirm_src" fi if [ -d "$runc_src" ]; then vm-dlv-add-src "$runc_src" fi echo "How to debug cri-resmgr:" echo "- Attach debugger to running cri-resmgr:" echo " ssh $VM_SSH_USER@$VM_IP" echo " sudo /root/go/bin/dlv attach \$(pidof cri-resmgr)" echo "- Relaunch cri-resmgr in debugger:" echo " ssh $VM_SSH_USER@$VM_IP" echo " sudo -i" echo " kill -9 \$(pidof cri-resmgr); /root/go/bin/dlv exec /usr/local/bin/cri-resmgr -- -force-config /home/$VM_SSH_USER/*.cfg" echo "dlv on VM is ready for use" exit 0 fi if [ -n "$containerd_src" ] && [[ "$k8scri" == *containerd* ]]; then vm-check-source-files-changed "$containerd_src" "$containerd_src/bin/containerd" vm-check-running-binary "$containerd_src/bin/containerd" fi if [ -n "$crio_src" ] && [[ "$k8scri" == *crio* ]]; then vm-check-source-files-changed "$crio_src" "$crio_src/bin/crio" vm-check-running-binary "$crio_src/bin/crio" fi # Start cri-resmgr if not already running if [ "$omit_cri_resmgr" != "1" ]; then if ! vm-command-q "fuser ${cri_resmgr_pidfile}" >/dev/null 2>&1; then screen-launch-cri-resmgr fi if [ -n "$crirm_src" ]; then vm-check-source-files-changed "$crirm_src" "$crirm_src/bin/cri-resmgr" vm-check-running-binary "$crirm_src/bin/cri-resmgr" fi fi # Create kubernetes cluster or wait that it is online if [ "$reinstall_k8s" == "1" ]; then vm-destroy-cluster fi if vm-command-q "[ ! -f /var/lib/kubelet/config.yaml ]"; then if [ -n "$k8smaster" ]; then vm-join "$k8smaster" else screen-create-singlenode-cluster fi else # Wait for kube-apiserver to launch (may be down if the VM was just booted) vm-wait-process kube-apiserver fi # Start cri-resmgr-agent if not already running if [ "$omit_agent" != "1" ]; then if ! vm-command-q "fuser ${cri_resmgr_agent_sock}" >/dev/null; then screen-launch-cri-resmgr-agent fi fi is-hooked "on_k8s_online" && run-hook "on_k8s_online" declare -A kind_count # associative arrays for counting created objects, like kind_count[pod]=1 eval "${yaml_in_defaults}" if [ "$mode" == "interactive" ]; then interactive else # Run test/demo TEST_FAILURES="" test-user-code fi # Save logs host-command "$SCP $VM_SSH_USER@$VM_IP:cri-resmgr*.output.txt \"$OUTPUT_DIR/\"" # Cleanup if [ "$cleanup" == "0" ]; then echo "The VM, Kubernetes and cri-resmgr are left running. 
Next steps:" vm-print-usage elif [ "$cleanup" == "1" ]; then host-stop-vm "$vm" host-delete-vm "$vm" elif [ "$cleanup" == "2" ]; then host-stop-vm "$vm" fi # Summarize results exit_status=0 if [ "$mode" == "test" ]; then if [ -n "$TEST_FAILURES" ]; then echo "Test verdict: FAIL" >> "$SUMMARY_FILE" else echo "Test verdict: PASS" >> "$SUMMARY_FILE" fi cat "$SUMMARY_FILE" fi exit $exit_status ================================================ FILE: test/e2e/run_all_configurations.sh ================================================ #!/bin/bash RUN_SH="${0%/*}/run.sh" PAIRWISE="${0%/*}/../../scripts/testing/pairwise" "${PAIRWISE}" \ distro={debian-sid,fedora-40,opensuse-tumbleweed} \ k8scri={containerd,crio,cri-resmgr\|containerd,cri-resmgr\|crio} \ k8scni={cilium,flannel,weavenet} | while read -r env_vars; do eval "export $env_vars" code='create besteffort' # shellcheck disable=SC2154 # ...as it cannot know that pairwise+eval exports distro et. al. vm="config-$distro-${k8scri/|/-}-$k8scni" outdir="output-configs/output-$vm" export code vm outdir govm rm "$vm" >/dev/null 2>&1 mkdir -p "$outdir" "$RUN_SH" test "$outdir/run.sh.output" 2>&1 govm rm "$vm" >/dev/null 2>&1 done ================================================ FILE: test/e2e/run_tests.sh ================================================ #!/bin/bash TESTS_DIR="$1" RUN_SH="${0%/*}/run.sh" DEFAULT_DISTRO="ubuntu-22.04" usage() { echo "Usage: run_tests.sh TESTS_DIR" echo "TESTS_DIR is expected to be structured as POLICY/TOPOLOGY/TEST with files:" echo "POLICY/cri-resmgr.cfg: configuration of cri-resmgr" echo "POLICY/TOPOLOGY/topology.var.json: contents of the topology variable for run.sh" echo "POLICY/TOPOLOGY/TEST/code.var.sh: contents of the code var (that is, test script)" } error() { (echo ""; echo "error: $1" ) >&2 exit 1 } warning() { echo "WARNING: $1" >&2 } export-var-files() { # export ENV_VAR from ENV_VAR.var.* file content local var_file_dir="$1" local var_filepath local var_file_name local var_name for var_filepath in "$var_file_dir"/*.var "$var_file_dir"/*.var.*; do if ! [ -f "$var_filepath" ] || [[ "$var_filepath" == *"~" ]] || [[ "$var_filepath" == *"#"* ]]; then continue fi var_file_name=$(basename "$var_filepath") var_name=${var_file_name%%.var*} if [ "$var_name" == "code" ] || [ "$var_name" == "py_consts" ]; then # append values in code variables echo "exporting $var_name - appending from $var_filepath" export "$var_name"="${!var_name}"" $(< "$var_filepath")" else # creating / replace other variables if [ -z "${!var_name}" ]; then echo "exporting $var_name - creating from $var_filepath" else echo "exporting $var_name - overriding from $var_filepath" fi if [[ "$var_file_name" == *.var.in.* ]]; then export "$var_name"="$(eval "echo -e \"$(<"${var_filepath}")\"")" else export "$var_name"="$(< "$var_filepath")" fi fi done } export-vm-files() { # update and export vm_files associative array from directory content local vm_files_dir="$1" if [ ! 
-d "$vm_files_dir" ]; then return fi if [[ "$vm_files" == *"="* ]] ; then eval "declare -A vm_files_aa=${vm_files#*=}" else declare -A vm_files_aa fi prefix_len=${#vm_files_dir} shopt -s globstar for f in "$vm_files_dir"/**; do file_vm_name=${f:$prefix_len} if [ -z "$file_vm_name" ] || [ "$file_vm_name" == "/" ]; then continue elif [ -f "$f" ]; then if [ -n "${vm_files_aa[$file_vm_name]}" ]; then warning "vm file $file_vm_name: new file \"$f\" overrides \"${vm_files_aa[$file_vm_name]}\"" fi vm_files_aa[$file_vm_name]="file:$(realpath "$f")" fi done # serialize from associative array local serialized_vm_files serialized_vm_files="$(declare -p vm_files_aa)" export vm_files="declare -A vm_files${serialized_vm_files#declare -A vm_files_aa}" } source-source-files() { # Test execution will source *.source.* files before it executes # the real test code. The files will be sourced starting from the # test suite (root) directory and ending up to the test directory, # which enables overriding inherited functions and variables. local src_file_dir="$1" local src_filepath for src_filepath in "$src_file_dir"/*.source "$src_file_dir"/*.source.*; do if ! [ -f "$src_filepath" ] || [[ "$src_filepath" == *"~" ]]; then continue fi echo "sourcing $src_filepath before running test code" source_libs="${source_libs}"" source \"$src_filepath\" " done } export-and-source-dir() { local dir="$1" export-var-files "$dir" export-vm-files "$dir/vm-files" source-source-files "$dir" } if [ -z "$TESTS_DIR" ] || [ "$TESTS_DIR" == "help" ] || [ "$TESTS_DIR" == "--help" ]; then usage error "missing TESTS_DIR" fi if ! [ -d "$TESTS_DIR" ]; then error "bad TESTS_DIR: \"$TESTS_DIR\"" fi # Find TESTS_DIR root by looking for POLICY_DIR/*.cfg. If TESTS_DIR was not the # root dir, then execute tests only under TESTS_DIR. root_dir_glob="*.test-suite" # shellcheck disable=SC2053 if [[ "$(basename "$TESTS_DIR")" == $root_dir_glob ]]; then TESTS_ROOT_DIR="$TESTS_DIR" elif [[ "$(basename "$(realpath "$TESTS_DIR"/..)")" == $root_dir_glob ]]; then TESTS_ROOT_DIR=$(realpath "$TESTS_DIR/..") TESTS_POLICY_FILTER=$(basename "${TESTS_DIR}") elif [[ "$(basename "$(realpath "$TESTS_DIR"/../..)")" == $root_dir_glob ]]; then TESTS_ROOT_DIR=$(realpath "$TESTS_DIR/../..") TESTS_POLICY_FILTER=$(basename "$(dirname "${TESTS_DIR}")") TESTS_TOPOLOGY_FILTER=$(basename "${TESTS_DIR}") elif [[ "$(basename "$(realpath "$TESTS_DIR"/../../..)")" == $root_dir_glob ]]; then TESTS_ROOT_DIR=$(realpath "$TESTS_DIR/../../..") TESTS_POLICY_FILTER=$(basename "$(dirname "$(dirname "${TESTS_DIR}")")") TESTS_TOPOLOGY_FILTER=$(basename "$(dirname "${TESTS_DIR}")") TESTS_TEST_FILTER=$(basename "${TESTS_DIR}") else error "TESTS_DIR=\"$TESTS_DIR\" is invalid tests/policy/topology/test dir: *.cfg not found" fi echo "Running tests matching:" echo " TESTS_ROOT_DIR=$TESTS_ROOT_DIR" echo " TESTS_POLICY_FILTER=$TESTS_POLICY_FILTER" echo " TESTS_TOPOLOGY_FILTER=$TESTS_TOPOLOGY_FILTER" echo " TESTS_TEST_FILTER=$TESTS_TEST_FILTER" cleanup() { rm -rf "$summary_dir" } summary_dir=$(mktemp -d) trap cleanup TERM EXIT QUIT summary_file="$summary_dir/summary.txt" echo -n "" > "$summary_file" export-and-source-dir "$TESTS_ROOT_DIR" for POLICY_DIR in "$TESTS_ROOT_DIR"/*; do if ! [ -d "$POLICY_DIR" ]; then continue fi if ! [[ "$(basename "$POLICY_DIR")" =~ .*"$TESTS_POLICY_FILTER".* ]]; then continue fi # Run exports in subshells so that variables exported for previous # tests do not affect any other tests. ( for CFG_FILE in "$POLICY_DIR"/*.cfg; do if ! 
[ -f "$CFG_FILE" ]; then continue fi export cri_resmgr_cfg=$CFG_FILE done export-and-source-dir "$POLICY_DIR" for TOPOLOGY_DIR in "$POLICY_DIR"/*; do if ! [ -d "$TOPOLOGY_DIR" ]; then continue fi if ! [[ "$(basename "$TOPOLOGY_DIR")" =~ .*"$TESTS_TOPOLOGY_FILTER".* ]]; then continue fi if [ "$(basename "$TOPOLOGY_DIR")" == "vm-files" ]; then continue fi ( distro=${distro:=$DEFAULT_DISTRO} export distro # Create name for the vm. # Needs topology, distro and container runtime stack. k8scri=${k8scri:-"cri-resmgr|containerd"} case "${k8scri}" in "cri-resmgr|containerd") criname=crirm-containerd ;; "cri-resmgr|crio") criname=crirm-crio ;; "containerd") criname=containerd ;; "containerd&cri-resmgr") criname=nrirm-containerd ;; "crio") criname=crio ;; "crio&cri-resmgr") criname=nrirm-crio ;; *) error "unsupported k8scri: \"${k8scri}\"" ;; esac vm="$(basename "$TOPOLOGY_DIR")-${distro}-${criname}" export vm export-and-source-dir "$TOPOLOGY_DIR" for TEST_DIR in "$TOPOLOGY_DIR"/*; do if ! [ -d "$TEST_DIR" ]; then continue fi if ! [[ "$(basename "$TEST_DIR")" =~ .*"$TESTS_TEST_FILTER".* ]]; then continue fi if [ "$(basename "$TEST_DIR")" == "vm-files" ]; then continue fi ( export outdir="$TEST_DIR/output" export-and-source-dir "$TEST_DIR" export code="${source_libs}"" ${code}" mkdir -p "$outdir" echo "Run $(basename "$TEST_DIR")" TEST_DIR=$TEST_DIR TOPOLOGY_DIR=$TOPOLOGY_DIR POLICY_DIR=$POLICY_DIR \ "$RUN_SH" test 2>&1 | tee "$outdir/run.sh.output" test_name="$(basename "$POLICY_DIR")/$(basename "$TOPOLOGY_DIR")/$(basename "$TEST_DIR")" if grep -q "Test verdict: PASS" "$outdir/run.sh.output"; then echo "PASS $test_name" >> "$summary_file" elif grep -q "Test verdict: FAIL" "$outdir/run.sh.output"; then echo "FAIL $test_name" >> "$summary_file" else echo "ERROR $test_name" >> "$summary_file" fi ) done ) done ) done echo "" echo "Tests summary:" cat "$summary_file" if grep -q ERROR "$summary_file" || grep -q FAIL "$summary_file"; then exit 1 fi ================================================ FILE: test/functional/e2e_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
================================================
FILE: test/functional/e2e_test.go
================================================
// Copyright 2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
    "context"
    "flag"
    "fmt"
    "net"
    "os"
    "path/filepath"
    "testing"
    "time"

    resmgr "github.com/intel/cri-resource-manager/pkg/cri/resource-manager"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes"
    "github.com/intel/cri-resource-manager/pkg/dump"
    "google.golang.org/grpc"
    criv1 "k8s.io/cri-api/pkg/apis/runtime/v1"

    logger "github.com/intel/cri-resource-manager/pkg/log"
)

const (
    testDir = "/tmp/cri-rm-test"
)

func init() {
    rate := logger.Rate{Limit: logger.Every(1 * time.Minute)}
    logger.SetGrpcLogger("grpc", &rate)

    if err := os.MkdirAll(testDir, 0700); err != nil {
        fmt.Printf("unable to create %q: %+v\n", testDir, err)
    }
}

type testEnv struct {
    t           *testing.T
    handlers    map[string]interface{}
    client      criv1.RuntimeServiceClient
    forceConfig string
    mgr         resmgr.ResourceManager
    cache       cache.Cache
}

func (env *testEnv) Run(name string, testFunction func(context.Context, *testEnv)) {
    t := env.t
    overriddenCriHandlers := env.handlers
    t.Helper()
    t.Run(name, func(t *testing.T) {
        tmpDir, err := os.MkdirTemp(testDir, "requests-")
        if err != nil {
            t.Fatalf("unable to create temp directory: %+v", err)
        }
        defer os.RemoveAll(tmpDir)

        if err := flag.Set("runtime-socket", filepath.Join(tmpDir, "fakecri.sock")); err != nil {
            t.Fatalf("unable to set runtime-socket")
        }
        if err := flag.Set("image-socket", filepath.Join(tmpDir, "fakecri.sock")); err != nil {
            t.Fatalf("unable to set image-socket")
        }
        if err := flag.Set("relay-socket", filepath.Join(tmpDir, "relay.sock")); err != nil {
            t.Fatalf("unable to set relay-socket")
        }
        if err := flag.Set("relay-dir", filepath.Join(tmpDir, "relaystorage")); err != nil {
            t.Fatalf("unable to set relay-dir")
        }
        if err := flag.Set("agent-socket", filepath.Join(tmpDir, "agent.sock")); err != nil {
            t.Fatalf("unable to set agent-socket")
        }
        if err := flag.Set("config-socket", filepath.Join(tmpDir, "config.sock")); err != nil {
            t.Fatalf("unable to set config-socket")
        }
        if err := flag.Set("allow-untested-runtimes", "true"); err != nil {
            t.Fatalf("unable to allow untested runtimes: %v", err)
        }

        if env.forceConfig != "" {
            path := filepath.Join(tmpDir, "forcedconfig.cfg")
            if err := os.WriteFile(path, []byte(env.forceConfig), 0644); err != nil {
                t.Fatalf("failed to create configuration file %s: %v", path, err)
            }
            if err := flag.Set("force-config", path); err != nil {
                t.Fatalf("unable to set force-config")
            }
        }

        flag.Parse()

        fakeCri := newFakeCriServer(t, filepath.Join(tmpDir, "fakecri.sock"), overriddenCriHandlers)
        defer fakeCri.stop()

        resMgr, err := resmgr.NewResourceManager()
        if err != nil {
            t.Fatalf("unable to create resource manager: %+v", err)
        }
        if err := resMgr.Start(); err != nil {
            t.Fatalf("unable to start resource manager: %+v", err)
        }
        defer resMgr.Stop()

        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        defer cancel()

        conn, err := grpc.DialContext(ctx, filepath.Join(tmpDir, "relay.sock"),
            grpc.WithInsecure(),
            grpc.WithBlock(),
            grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
                if deadline, ok := ctx.Deadline(); ok {
                    return net.DialTimeout("unix", addr, time.Until(deadline))
                }
                return net.DialTimeout("unix", addr, 0)
            }),
        )
        if err != nil {
            t.Fatalf("unable to connect to relay: %+v", err)
        }
        defer conn.Close()

        client := criv1.NewRuntimeServiceClient(conn)
        env.client = client
        env.mgr = resMgr
        env.cache = resMgr.GetCache()

        testFunction(ctx, env)

        // Until the pkg/log fixes get merged: wait until pkg/dump is done
        // with logging before we run the next test (and consequently do a
        // reconfig).
        dump.Sync()
    })
}
func TestListPodSandbox(t *testing.T) {
    tcases := []struct {
        name         string
        pods         []*criv1.PodSandbox
        expectedPods int
    }{
        {
            name: "empty",
        },
        {
            name:         "list one pod",
            pods:         []*criv1.PodSandbox{{}},
            expectedPods: 1,
        },
    }
    for _, tc := range tcases {
        criHandlers := map[string]interface{}{
            "ListPodSandbox": func(*fakeCriServer, context.Context, *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) {
                return &criv1.ListPodSandboxResponse{
                    Items: tc.pods,
                }, nil
            },
        }
        env := &testEnv{
            t:        t,
            handlers: criHandlers,
        }
        env.Run(tc.name, func(ctx context.Context, env *testEnv) {
            t := env.t
            client := env.client
            resp, err := client.ListPodSandbox(ctx, &criv1.ListPodSandboxRequest{})
            if err != nil {
                t.Errorf("Unexpected error: %+v", err)
                return
            }
            if len(resp.Items) != tc.expectedPods {
                t.Errorf("Expected %d pods, got %d", tc.expectedPods, len(resp.Items))
            }
        })
    }
}

func TestListContainers(t *testing.T) {
    tcases := []struct {
        name               string
        containers         []*criv1.Container
        expectedContainers int
    }{
        {
            name: "empty",
        },
        {
            name:               "list one container",
            containers:         []*criv1.Container{{}},
            expectedContainers: 1,
        },
    }
    for _, tc := range tcases {
        criHandlers := map[string]interface{}{
            "ListContainers": func(*fakeCriServer, context.Context, *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) {
                return &criv1.ListContainersResponse{
                    Containers: tc.containers,
                }, nil
            },
        }
        env := &testEnv{
            t:        t,
            handlers: criHandlers,
        }
        env.Run(tc.name, func(ctx context.Context, env *testEnv) {
            t := env.t
            client := env.client
            resp, err := client.ListContainers(ctx, &criv1.ListContainersRequest{})
            if err != nil {
                t.Errorf("Unexpected error: %+v", err)
                return
            }
            if len(resp.Containers) != tc.expectedContainers {
                t.Errorf("Expected %d containers, got %d", tc.expectedContainers, len(resp.Containers))
            }
        })
    }
}

func TestLingeringPodCleanup(t *testing.T) {
    cfg := `
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
`
    tcases := []struct {
        name         string
        reqs         []*criv1.RunPodSandboxRequest
        expectedPods int
    }{
        {
            name: "create Pod #1",
            reqs: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
            },
            expectedPods: 1,
        },
        {
            name: "create Pods #1 and #2",
            reqs: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID#2", "", nil, nil, ""),
            },
            expectedPods: 2,
        },
        {
            name: "create Pods #1, #2, and #3",
            reqs: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID#2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID#3", "", nil, nil, ""),
            },
            expectedPods: 3,
        },
        {
            name: "create Pods #1, #2, #3, #4, '1, '2, '3",
            reqs: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID#2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID#3", "", nil, nil, ""),
                createPodRequest("Pod#4", "UID#4", "", nil, nil, ""),
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID'2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID'3", "", nil, nil, ""),
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID#2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID#3", "", nil, nil, ""),
                createPodRequest("Pod#1", "UID'1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID'2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID'3", "", nil, nil, ""),
                createPodRequest("Pod#4", "UID#4", "", nil, nil, ""),
            },
            expectedPods: 7,
        },
    }
    numPods := 0
    for _, tc := range tcases {
        criHandlers := map[string]interface{}{
            "RunPodSandbox": func(*fakeCriServer, context.Context, *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) {
                numPods++
                return &criv1.RunPodSandboxResponse{
                    PodSandboxId: fmt.Sprintf("Pod#%d", numPods),
                }, nil
            },
        }
        env := &testEnv{
            t:           t,
            handlers:    criHandlers,
            forceConfig: cfg,
        }
        env.Run(tc.name, func(ctx context.Context, env *testEnv) {
            t := env.t
            client := env.client
            cache := env.cache
            for _, req := range tc.reqs {
                _, err := client.RunPodSandbox(ctx, req)
                if err != nil {
                    t.Errorf("failed to create pod %+v: %v", req, err)
                }
            }
            pods := cache.GetPods()
            if len(pods) != tc.expectedPods {
                t.Errorf("expected %d pods in cache, got %d (%v)", tc.expectedPods, len(pods), pods)
            }
        })
    }
}

func TestLingeringContainerCleanup(t *testing.T) {
    cfg := `
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
`
    type pod struct {
        UID string
        ID  string
        req *criv1.RunPodSandboxRequest
    }
    type container struct {
        pod    string
        name   string
        expect int
        req    *criv1.CreateContainerRequest
        ID     string
    }
    tcases := []struct {
        name       string
        pods       []*criv1.RunPodSandboxRequest
        containers []*container
    }{
        {
            name: "create containers per one pod",
            pods: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
            },
            containers: []*container{
                {pod: "UID#1", name: "Container#1", expect: 1},
                {pod: "UID#1", name: "Container#2", expect: 2},
            },
        },
        {
            name: "create lingering containers per one pod",
            pods: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
            },
            containers: []*container{
                {pod: "UID#1", name: "Container#1", expect: 1},
                {pod: "UID#1", name: "Container#2", expect: 2},
                {pod: "UID#1", name: "Container#3", expect: 3},
                {pod: "UID#1", name: "Container#3", expect: 3},
                {pod: "UID#1", name: "Container#2", expect: 3},
                {pod: "UID#1", name: "Container#1", expect: 3},
            },
        },
    }
    numPods := 0
    numContainers := 0
    for _, tc := range tcases {
        criHandlers := map[string]interface{}{
            "RunPodSandbox": func(*fakeCriServer, context.Context, *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) {
                numPods++
                return &criv1.RunPodSandboxResponse{
                    PodSandboxId: fmt.Sprintf("Pod#%d", numPods),
                }, nil
            },
            "CreateContainer": func(*fakeCriServer, context.Context, *criv1.CreateContainerRequest) (*criv1.CreateContainerResponse, error) {
                numContainers++
                return &criv1.CreateContainerResponse{
                    ContainerId: fmt.Sprintf("Container#%d", numContainers),
                }, nil
            },
        }
        env := &testEnv{
            t:           t,
            handlers:    criHandlers,
            forceConfig: cfg,
        }
        env.Run(tc.name, func(ctx context.Context, env *testEnv) {
            t := env.t
            client := env.client
            cache := env.cache
            pods := map[string]*pod{}
            for _, req := range tc.pods {
                rpl, err := client.RunPodSandbox(ctx, req)
                if err != nil {
                    t.Errorf("failed to create pod %+v: %v", req, err)
                } else {
                    id := rpl.PodSandboxId
                    uid := req.Config.Metadata.Uid
                    pods[uid] = &pod{
                        UID: uid,
                        ID:  id,
                        req: req,
                    }
                }
            }
            for _, c := range tc.containers {
                pod, ok := pods[c.pod]
                if !ok {
                    t.Errorf("failed to find pod by UID %s", c.pod)
                    continue
                }
                c.req = createContainerRequest(pod.ID, c.name, pod.req)
                rpl, err := client.CreateContainer(ctx, c.req)
                if err != nil {
                    t.Errorf("failed to create container %+v: %v", c.req, err)
                } else {
                    c.ID = rpl.ContainerId
                    cached := cache.GetContainers()
                    if len(cached) != c.expect {
                        t.Errorf("pod %s, container %s: expected %d containers in cache, got %d",
                            c.pod, c.name, c.expect, len(cached))
                    }
                }
            }
        })
    }
}

func createPodRequest(name, uid, namespace string, labels, annotations map[string]string, cgroupParent string)
    *criv1.RunPodSandboxRequest {
    if namespace == "" {
        namespace = "default"
    }
    if labels == nil {
        labels = map[string]string{}
    }
    labels[kubernetes.PodUIDLabel] = uid
    return &criv1.RunPodSandboxRequest{
        Config: &criv1.PodSandboxConfig{
            Metadata: &criv1.PodSandboxMetadata{
                Name:      name,
                Uid:       uid,
                Namespace: namespace,
            },
            Labels:      labels,
            Annotations: annotations,
            Linux: &criv1.LinuxPodSandboxConfig{
                CgroupParent: cgroupParent,
            },
        },
    }
}

func createContainerRequest(podID, name string, podReq *criv1.RunPodSandboxRequest) *criv1.CreateContainerRequest {
    return &criv1.CreateContainerRequest{
        PodSandboxId: podID,
        Config: &criv1.ContainerConfig{
            Metadata: &criv1.ContainerMetadata{
                Name: name,
            },
            Linux: &criv1.LinuxContainerConfig{},
        },
        SandboxConfig: podReq.Config,
    }
}

================================================
FILE: test/functional/fake_cri_server_test.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
    "context"
    "fmt"
    "net"
    "os"
    "path/filepath"
    "reflect"
    "runtime"
    "strings"
    "testing"
    "time"

    "github.com/intel/cri-resource-manager/pkg/utils"
    "google.golang.org/grpc"
    criv1 "k8s.io/cri-api/pkg/apis/runtime/v1"
)

const (
    fakeKubeAPIVersion    = "0.1.0"
    fakeRuntimeName       = "fake-CRI-runtime"
    fakeRuntimeVersion    = "v0.0.0"
    fakeRuntimeAPIVersion = "v1"
)

type fakeCriServer struct {
    t            *testing.T
    socket       string
    grpcServer   *grpc.Server
    fakeHandlers map[string]interface{}
}

func newFakeCriServer(t *testing.T, socket string, fakeHandlers map[string]interface{}) *fakeCriServer {
    t.Helper()

    if !filepath.IsAbs(socket) {
        t.Fatalf("invalid socket %q, absolute path expected", socket)
    }
    if err := os.MkdirAll(filepath.Dir(socket), 0700); err != nil {
        t.Fatalf("failed to create directory for socket %q: %v", socket, err)
    }

    srv := &fakeCriServer{
        t:            t,
        socket:       socket,
        grpcServer:   grpc.NewServer(),
        fakeHandlers: fakeHandlers,
    }

    criv1.RegisterRuntimeServiceServer(srv.grpcServer, srv)
    criv1.RegisterImageServiceServer(srv.grpcServer, srv)

    lis, err := net.Listen("unix", socket)
    if err != nil {
        if ls, err := utils.IsListeningSocket(socket); ls || err != nil {
            t.Fatalf("failed to create fake server: socket %s already exists", socket)
        }
        os.Remove(socket)
        lis, err = net.Listen("unix", socket)
        if err != nil {
            t.Fatalf("failed to create fake server on socket %q: %v", socket, err)
        }
    }

    go func() {
        if err := srv.grpcServer.Serve(lis); err != nil {
            fmt.Printf("unable to start gRPC server: %+v\n", err)
        }
    }()

    if err := utils.WaitForServer(socket, time.Second); err != nil {
        t.Fatalf("starting fake CRI server failed: %v", err)
    }

    return srv
}

func (s *fakeCriServer) stop() {
    s.t.Helper()
    s.grpcServer.Stop()
    os.Remove(s.socket)
}
func (s *fakeCriServer) callHandler(ctx context.Context, request interface{}, defaultHandler interface{}) (interface{}, error) {
    var err error

    pc, _, _, _ := runtime.Caller(1)
    nameFull := runtime.FuncForPC(pc).Name()
    nameEnd := filepath.Ext(nameFull)
    name := strings.TrimPrefix(nameEnd, ".")

    handler, found := s.fakeHandlers[name]
    if !found {
        if defaultHandler == nil {
            method := reflect.ValueOf(s).MethodByName(name)
            returnType := method.Type().Out(0)
            return reflect.New(returnType).Elem().Interface(), fmt.Errorf("%s() not implemented", name)
        }
        handler = defaultHandler
    }

    in := make([]reflect.Value, 3)
    in[0] = reflect.ValueOf(s)
    in[1] = reflect.ValueOf(ctx)
    in[2] = reflect.ValueOf(request)
    out := reflect.ValueOf(handler).Call(in)

    if !out[1].IsNil() {
        err = out[1].Interface().(error)
    }

    return out[0].Interface(), err
}

// Implementation of criv1.RuntimeServiceServer

func (s *fakeCriServer) Version(ctx context.Context, req *criv1.VersionRequest) (*criv1.VersionResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.VersionRequest) (*criv1.VersionResponse, error) {
            return &criv1.VersionResponse{
                Version:           fakeKubeAPIVersion,
                RuntimeName:       fakeRuntimeName,
                RuntimeVersion:    fakeRuntimeVersion,
                RuntimeApiVersion: fakeRuntimeAPIVersion,
            }, nil
        },
    )
    return response.(*criv1.VersionResponse), err
}

func (s *fakeCriServer) RunPodSandbox(ctx context.Context, req *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RunPodSandboxResponse), err
}

func (s *fakeCriServer) StopPodSandbox(ctx context.Context, req *criv1.StopPodSandboxRequest) (*criv1.StopPodSandboxResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.StopPodSandboxResponse), err
}

func (s *fakeCriServer) RemovePodSandbox(ctx context.Context, req *criv1.RemovePodSandboxRequest) (*criv1.RemovePodSandboxResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RemovePodSandboxResponse), err
}

func (s *fakeCriServer) PodSandboxStatus(ctx context.Context, req *criv1.PodSandboxStatusRequest) (*criv1.PodSandboxStatusResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.PodSandboxStatusResponse), err
}

func (s *fakeCriServer) ListPodSandbox(ctx context.Context, req *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) {
            return &criv1.ListPodSandboxResponse{}, nil
        })
    return response.(*criv1.ListPodSandboxResponse), err
}

func (s *fakeCriServer) CreateContainer(ctx context.Context, req *criv1.CreateContainerRequest) (*criv1.CreateContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.CreateContainerResponse), err
}

func (s *fakeCriServer) StartContainer(ctx context.Context, req *criv1.StartContainerRequest) (*criv1.StartContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.StartContainerResponse), err
}

func (s *fakeCriServer) StopContainer(ctx context.Context, req *criv1.StopContainerRequest) (*criv1.StopContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.StopContainerResponse), err
}

func (s *fakeCriServer) RemoveContainer(ctx context.Context, req *criv1.RemoveContainerRequest) (*criv1.RemoveContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RemoveContainerResponse), err
}
func (s *fakeCriServer) ListContainers(ctx context.Context, req *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) {
            return &criv1.ListContainersResponse{}, nil
        })
    return response.(*criv1.ListContainersResponse), err
}

func (s *fakeCriServer) ContainerStatus(ctx context.Context, req *criv1.ContainerStatusRequest) (*criv1.ContainerStatusResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ContainerStatusResponse), err
}

func (s *fakeCriServer) UpdateContainerResources(ctx context.Context, req *criv1.UpdateContainerResourcesRequest) (*criv1.UpdateContainerResourcesResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.UpdateContainerResourcesRequest) (*criv1.UpdateContainerResourcesResponse, error) {
            return &criv1.UpdateContainerResourcesResponse{}, nil
        },
    )
    return response.(*criv1.UpdateContainerResourcesResponse), err
}

func (s *fakeCriServer) ReopenContainerLog(ctx context.Context, req *criv1.ReopenContainerLogRequest) (*criv1.ReopenContainerLogResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ReopenContainerLogResponse), err
}

func (s *fakeCriServer) ExecSync(ctx context.Context, req *criv1.ExecSyncRequest) (*criv1.ExecSyncResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ExecSyncResponse), err
}

func (s *fakeCriServer) Exec(ctx context.Context, req *criv1.ExecRequest) (*criv1.ExecResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ExecResponse), err
}

func (s *fakeCriServer) Attach(ctx context.Context, req *criv1.AttachRequest) (*criv1.AttachResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.AttachResponse), err
}

func (s *fakeCriServer) PortForward(ctx context.Context, req *criv1.PortForwardRequest) (*criv1.PortForwardResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.PortForwardResponse), err
}

func (s *fakeCriServer) ContainerStats(ctx context.Context, req *criv1.ContainerStatsRequest) (*criv1.ContainerStatsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ContainerStatsResponse), err
}

func (s *fakeCriServer) ListContainerStats(ctx context.Context, req *criv1.ListContainerStatsRequest) (*criv1.ListContainerStatsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ListContainerStatsResponse), err
}

func (s *fakeCriServer) PodSandboxStats(ctx context.Context, req *criv1.PodSandboxStatsRequest) (*criv1.PodSandboxStatsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.PodSandboxStatsResponse), err
}

func (s *fakeCriServer) ListPodSandboxStats(ctx context.Context, req *criv1.ListPodSandboxStatsRequest) (*criv1.ListPodSandboxStatsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ListPodSandboxStatsResponse), err
}

func (s *fakeCriServer) UpdateRuntimeConfig(ctx context.Context, req *criv1.UpdateRuntimeConfigRequest) (*criv1.UpdateRuntimeConfigResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.UpdateRuntimeConfigResponse), err
}

func (s *fakeCriServer) Status(ctx context.Context, req *criv1.StatusRequest) (*criv1.StatusResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.StatusResponse), err
}

func (s *fakeCriServer) CheckpointContainer(ctx context.Context, req *criv1.CheckpointContainerRequest) (*criv1.CheckpointContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.CheckpointContainerResponse), err
}

func (s *fakeCriServer) GetContainerEvents(_ *criv1.GetEventsRequest, _ criv1.RuntimeService_GetContainerEventsServer) error {
    return nil
}

func (s *fakeCriServer) ListMetricDescriptors(ctx context.Context, req *criv1.ListMetricDescriptorsRequest) (*criv1.ListMetricDescriptorsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ListMetricDescriptorsResponse), err
}

func (s *fakeCriServer) ListPodSandboxMetrics(ctx context.Context, req *criv1.ListPodSandboxMetricsRequest) (*criv1.ListPodSandboxMetricsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ListPodSandboxMetricsResponse), err
}

func (s *fakeCriServer) RuntimeConfig(ctx context.Context, req *criv1.RuntimeConfigRequest) (*criv1.RuntimeConfigResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RuntimeConfigResponse), err
}

// Implementation of criv1.ImageServiceServer

func (s *fakeCriServer) ListImages(ctx context.Context, req *criv1.ListImagesRequest) (*criv1.ListImagesResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.ListImagesRequest) (*criv1.ListImagesResponse, error) {
            return &criv1.ListImagesResponse{}, nil
        },
    )
    return response.(*criv1.ListImagesResponse), err
}

func (s *fakeCriServer) ImageStatus(ctx context.Context, req *criv1.ImageStatusRequest) (*criv1.ImageStatusResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ImageStatusResponse), err
}

func (s *fakeCriServer) PullImage(ctx context.Context, req *criv1.PullImageRequest) (*criv1.PullImageResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.PullImageResponse), err
}

func (s *fakeCriServer) RemoveImage(ctx context.Context, req *criv1.RemoveImageRequest) (*criv1.RemoveImageResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RemoveImageResponse), err
}

func (s *fakeCriServer) ImageFsInfo(ctx context.Context, req *criv1.ImageFsInfoRequest) (*criv1.ImageFsInfoResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ImageFsInfoResponse), err
}