Repository: cloudworkz/kafka-minion Branch: master Commit: 06075b9f33bd Files: 107 Total size: 334.9 KB Directory structure: gitextract_ind4nfvb/ ├── .github/ │ ├── ai-opt-out │ └── workflows/ │ ├── docker-image.yml │ └── goreleaser.yml ├── .gitignore ├── .goreleaser.yml ├── Dockerfile ├── LICENSE ├── README.md ├── charts/ │ ├── README.md │ ├── archives/ │ │ ├── .gitkeep │ │ ├── index.yaml │ │ ├── kminion-0.1.0.tgz │ │ ├── kminion-0.1.1.tgz │ │ ├── kminion-0.1.2.tgz │ │ ├── kminion-0.1.3.tgz │ │ ├── kminion-0.11.1.tgz │ │ ├── kminion-0.11.2.tgz │ │ ├── kminion-0.11.3.tgz │ │ ├── kminion-0.12.0.tgz │ │ ├── kminion-0.2.0.tgz │ │ ├── kminion-0.2.1.tgz │ │ ├── kminion-0.2.2.tgz │ │ ├── kminion-0.3.0.tgz │ │ ├── kminion-0.3.1.tgz │ │ ├── kminion-0.4.0.tgz │ │ ├── kminion-0.5.0.tgz │ │ ├── kminion-0.6.0.tgz │ │ ├── kminion-0.7.0.tgz │ │ ├── kminion-0.8.0.tgz │ │ ├── kminion-0.8.1.tgz │ │ ├── kminion-0.8.2.tgz │ │ ├── kminion-0.8.3.tgz │ │ └── kminion-0.9.0.tgz │ └── kminion/ │ ├── .helmignore │ ├── Chart.yaml │ ├── templates/ │ │ ├── NOTES.txt │ │ ├── _helpers.tpl │ │ ├── configmap.yaml │ │ ├── daemonset.yaml │ │ ├── deployment.yaml │ │ ├── hpa.yaml │ │ ├── ingress.yaml │ │ ├── poddisruptionbudget.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ └── servicemonitor.yaml │ └── values.yaml ├── config.go ├── docker-compose.yml ├── docs/ │ ├── end-to-end.md │ ├── metrics.md │ └── reference-config.yaml ├── e2e/ │ ├── client_hooks.go │ ├── config.go │ ├── config_consumer.go │ ├── config_producer.go │ ├── config_topic.go │ ├── config_topic_test.go │ ├── consumer.go │ ├── endtoend_message.go │ ├── group_tracker.go │ ├── message_tracker.go │ ├── partition_planner.go │ ├── partition_planner_test.go │ ├── producer.go │ ├── service.go │ ├── topic.go │ └── utils.go ├── go.mod ├── go.sum ├── kafka/ │ ├── client_config_helper.go │ ├── client_logger.go │ ├── config.go │ ├── config_sasl.go │ ├── config_sasl_gssapi.go │ ├── config_sasl_oauthbearer.go │ ├── config_tls.go │ └── service.go ├── logging/ │ ├── config.go │ └── logger.go ├── main.go ├── minion/ │ ├── client_hooks.go │ ├── config.go │ ├── config_consumer_group.go │ ├── config_log_dirs.go │ ├── config_topic_config.go │ ├── consumer_group_offsets.go │ ├── describe_consumer_groups.go │ ├── describe_topic_config.go │ ├── list_offsets.go │ ├── log_dirs.go │ ├── metadata.go │ ├── offset_consumer.go │ ├── service.go │ ├── storage.go │ ├── utils.go │ └── versions.go └── prometheus/ ├── collect_broker_info.go ├── collect_cluster_info.go ├── collect_consumer_group_lags.go ├── collect_consumer_groups.go ├── collect_exporter_metrics.go ├── collect_log_dirs.go ├── collect_topic_info.go ├── collect_topic_partition_offsets.go ├── config.go └── exporter.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ai-opt-out ================================================ opt-out: true ================================================ FILE: .github/workflows/docker-image.yml ================================================ --- name: docker-image on: push: tags: ['*'] branches: ['master'] paths-ignore: ['charts/**'] permissions: id-token: write contents: read jobs: build: runs-on: ubuntu-latest steps: - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: ${{ vars.RP_AWS_CRED_REGION }} role-to-assume: arn:aws:iam::${{ secrets.RP_AWS_CRED_ACCOUNT_ID }}:role/${{ vars.RP_AWS_CRED_BASE_ROLE_NAME }}${{ github.event.repository.name }} - uses: 
aws-actions/aws-secretsmanager-get-secrets@v2 with: secret-ids: | ,sdlc/prod/github/dockerhub parse-json-secrets: true - uses: actions/checkout@v4 - uses: docker/setup-qemu-action@v3 - uses: docker/setup-buildx-action@v3 with: driver-opts: | image=moby/buildkit:v0.21.1 network=host - name: Set build date run: | echo "BUILT_AT=$(date --rfc-3339=date)" >> ${GITHUB_ENV} - uses: docker/metadata-action@v5 id: docker_meta with: # list of Docker images to use as base name for tags images: | redpandadata/kminion name=public.ecr.aws/l9j0i2e0/kminion,enable=${{ startsWith(github.ref, 'refs/tags/v') }} # generate Docker tags based on the following events/attributes # Semver type is only active on 'push tag' events, # hence no enable condition required tags: | type=sha,prefix={{branch}}-,format=short,enable={{is_default_branch}} type=semver,pattern={{raw}} - uses: docker/login-action@v3 with: username: ${{ env.DOCKERHUB_USER }} password: ${{ env.DOCKERHUB_TOKEN }} - uses: aws-actions/configure-aws-credentials@v4 if: ${{ startsWith(github.ref, 'refs/tags/v') }} with: aws-region: us-east-1 role-to-assume: arn:aws:iam::${{ secrets.RP_AWS_CRED_ACCOUNT_ID }}:role/${{ vars.RP_AWS_CRED_BASE_ROLE_NAME }}${{ github.event.repository.name }} - uses: aws-actions/amazon-ecr-login@v2 if: ${{ startsWith(github.ref, 'refs/tags/v') }} with: registry-type: public - uses: docker/build-push-action@v6 with: provenance: false push: true platforms: linux/amd64,linux/arm64 tags: ${{ steps.docker_meta.outputs.tags }} build-args: | VERSION=${{ fromJSON(steps.docker_meta.outputs.json).labels['org.opencontainers.image.version'] }} BUILT_AT=${{ env.BUILT_AT }} COMMIT=${{ github.sha }} cache-from: type=gha cache-to: type=gha,mode=max ================================================ FILE: .github/workflows/goreleaser.yml ================================================ --- name: goreleaser on: push: tags: ['*'] jobs: goreleaser: runs-on: ubuntu-latest permissions: contents: write steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - uses: actions/setup-go@v6 with: go-version-file: 'go.mod' - uses: goreleaser/goreleaser-action@v7 if: startsWith(github.ref, 'refs/tags/') with: version: latest args: release --clean workdir: . 
env: CGO_ENABLED: 0 GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GORELEASER_CURRENT_TAG: ${{ github.ref_name }} ================================================ FILE: .gitignore ================================================ # Binaries for programs and plugins *.exe *.exe~ *.dll *.so *.dylib # Test binary, build with `go test -c` *.test # Output of the go coverage tool, specifically when used with LiteIDE *.out zk-single-kafka-single zk-multiple-kafka-multiple .vscode .idea config /kminion ================================================ FILE: .goreleaser.yml ================================================ --- version: 2 release: name_template: '{{.Version}} / {{time "2006-01-02"}}' prerelease: auto mode: append footer: | ## Docker Image Use the following command to pull this release's Docker image: ```sh docker pull redpandadata/kminion:{{ .Tag }} ``` changelog: disable: false use: github filters: # Commit messages matching the regexp listed here will be removed from the changelog exclude: - '^docs:' - '^test:' - '^npm:' - '^go.mod:' - '^.github:' - 'Merge branch' builds: - id: kminion binary: kminion goos: - darwin - linux - windows goarch: - amd64 - arm64 ldflags: - -s -w -X main.version={{.Version}} -X main.builtAt={{.Date}} -X main.commit={{.Commit}} checksum: name_template: 'checksums.txt' ================================================ FILE: Dockerfile ================================================ ############################################################ # Build image ############################################################ FROM golang:1.26-alpine AS builder ARG VERSION ARG BUILT_AT ARG COMMIT RUN apk update && apk upgrade --no-cache && apk add --no-cache git ca-certificates && update-ca-certificates WORKDIR /app COPY go.mod . COPY go.sum . RUN go mod download COPY . . RUN CGO_ENABLED=0 go build \ -ldflags="-w -s \ -X main.version=$VERSION \ -X main.commit=$COMMIT \ -X main.builtAt=$BUILT_AT" \ -o ./bin/kminion ############################################################ # Runtime Image ############################################################ FROM alpine:3 COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ COPY --from=builder /app/bin/kminion /app/kminion RUN addgroup -S redpanda \ && adduser -S redpanda -G redpanda \ && chmod o+rx /app/kminion \ && apk upgrade --no-cache USER redpanda ENTRYPOINT ["/app/kminion"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2021 CloudHut Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Redpanda Minion (KMinion) - Prometheus Exporter via Kafka API [Redpanda](https://www.redpanda.com) Minion is a feature-rich and flexible Prometheus exporter to monitor your Redpanda or Kafka cluster. All valuable information that is accessible via the Kafka protocol should be accessible through KMinion as well. ## 🚀 Features - **Kafka versions:** Supports all Kafka versions v0.11+ - **Supported SASL mechanisms:** plain, scram-sha-256/512, gssapi/kerberos - **TLS support:** TLS is supported, regardless of whether you need mTLS, a custom CA, encrypted keys or just the trusted root certs - **Consumer Group Lags:** Number of messages a consumer group is lagging behind the latest offset - **Log dir sizes:** Metric for log dir sizes, either grouped by broker or by topic - **Broker info:** Metric for each broker with its address, broker ID, controller status and rack ID - **Configurable granularity:** Export metrics (e.g. consumer group lags) either per partition or per topic. Helps to reduce the number of exported metric series. - **End to End Monitoring:** Sends messages to its own topic and consumes them, measuring a message's real-world "roundtrip" latency. Also provides ack-latency and offset-commit-latency. [More Info](/docs/end-to-end.md) - **Configurable targets:** You can configure which topics or groups you'd like to export using regular expressions - **Multiple config parsers:** KMinion can be configured using YAML, environment variables, or a mix of both You can find a list of all exported metrics here: [/docs/metrics.md](/docs/metrics.md) ## Getting started ### 🐳 Docker image Images are built on every push to master and for every new release. You can find an overview of all available tags in our [DockerHub repository](https://hub.docker.com/r/redpandadata/kminion/tags). ```shell docker pull redpandadata/kminion:latest ``` ### ☸ Helm chart The Helm chart is maintained as part of Redpanda's [helm-charts](https://github.com/redpanda-data/helm-charts/tree/main/charts/kminion) repository. ### 🔧 Configuration All options in KMinion can be configured via YAML or environment variables; you can also mix both, in which case environment variables take precedence. You can find the reference config with additional documentation in [/docs/reference-config.yaml](/docs/reference-config.yaml). If you want to use a YAML config file, specify its path by setting the env variable `CONFIG_FILEPATH`. ### 📊 Grafana Dashboards Three separate Grafana dashboards are available that can serve as inspiration for creating your own. Note that they might not work out of the box due to different labeling in your Prometheus config. Cluster Dashboard: https://grafana.com/grafana/dashboards/14012 Consumer Group Dashboard: https://grafana.com/grafana/dashboards/14014 Topic Dashboard: https://grafana.com/grafana/dashboards/14013
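To make the precedence described in the Configuration section concrete, here is a minimal sketch. The YAML keys and the `KAFKA_SASL_PASSWORD` variable name follow the [reference config](/docs/reference-config.yaml); the broker address and credentials are placeholders:

```yaml
# config.yaml - referenced via the CONFIG_FILEPATH env variable
kafka:
  brokers:
    - kafka-0.mycompany.example:9092
  sasl:
    enabled: true
    mechanism: SCRAM-SHA-256
    username: kminion
exporter:
  port: 8080
```

```shell
# The secret is injected via the environment; if kafka.sasl.password were
# also set in the YAML file, the environment variable would take precedence.
export CONFIG_FILEPATH=/etc/kminion/config.yaml
export KAFKA_SASL_PASSWORD='<your-password>'
./kminion
```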

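The same settings can also be supplied purely via environment variables, with no YAML file at all. A minimal sketch, again with a placeholder broker address:

```shell
docker run -p 8080:8080 \
  -e KAFKA_BROKERS=kafka-0.mycompany.example:9092 \
  redpandadata/kminion:latest
```

KMinion then exposes its metrics at http://localhost:8080/metrics.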
### ⚡ Testing locally This repo contains a docker-compose file that you can run on your machine. It spins up a Kafka & ZooKeeper cluster and starts KMinion on port 8080, which is exposed to your host machine: ```shell # 1. Clone this repo # 2. Browse to the repo's root directory and run: docker-compose up ``` ## Chat with us We use Slack to communicate. If you are looking for more interactive discussions or support, you are invited to join our Slack server: https://redpanda.com/slack ## License KMinion is distributed under the [MIT License](https://github.com/cloudhut/kminion/blob/master/LICENSE). ================================================ FILE: charts/README.md ================================================ # Helm Chart ⚠️ This chart has been moved to https://github.com/redpanda-data/helm-charts/tree/main/charts/kminion . Please install the chart from there instead. The existing archives are still hosted here so that existing deployments don't break. --- This chart is intentionally very light on input validation. The goal was to offer a flexible Helm chart that lets users deploy KMinion the way they want to. This flexibility comes at the cost of less input validation, so a misconfiguration may only surface as a runtime error. All available inputs are documented in the [values.yaml](./kminion/values.yaml) file. ## Installing the Helm chart ```shell helm repo add kminion https://raw.githubusercontent.com/cloudhut/kminion/master/charts/archives helm repo update helm install -f values.yaml kminion kminion/kminion ``` ================================================ FILE: charts/archives/.gitkeep ================================================ ================================================ FILE: charts/archives/index.yaml ================================================ apiVersion: v1 entries: kminion: - apiVersion: v2 appVersion: v2.2.5 created: "2023-07-03T16:38:22.568312+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 32e2ee36d0b0a045061d4e1490780fef905b4c85d7a23659819c5cb128aaa119 name: kminion type: application urls: - kminion-0.12.0.tgz version: 0.12.0 - apiVersion: v2 appVersion: v2.2.5 created: "2023-07-03T16:38:22.567922+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 8a7be130d57f6f8ead720277b69319ff4dcd364859e80f4750416abe5ed460c3 name: kminion type: application urls: - kminion-0.11.3.tgz version: 0.11.3 - apiVersion: v2 appVersion: v2.2.3 created: "2023-07-03T16:38:22.5675+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 42991a871f58b6d31a9e5b38539eb3d1e9cd35c0097a0fcf63f21f818fa7a999 name: kminion type: application urls: - kminion-0.11.2.tgz version: 0.11.2 - apiVersion: v2 appVersion: v2.2.3 created: "2023-07-03T16:38:22.566877+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 65d7231f1e8ee586bec42bc383b66726d596fe03e0f3183e14b688174a3a8112 name: kminion type: application urls: - kminion-0.11.1.tgz version: 0.11.1 - apiVersion: v2 appVersion: v2.2.0 created: "2023-07-03T16:38:22.575384+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 358bdd509f573049d4bfe77d2edb94c7ad3938f609aea11a8e2c2dc65cca2a9a name: kminion type: application urls: - kminion-0.9.0.tgz version: 0.9.0 - apiVersion: v2 appVersion: v2.2.0 created: "2023-07-03T16:38:22.574906+01:00" description: A Helm chart to deploy KMinion - a
Prometheus Exporter for Apache Kafka digest: be8f0047b345d3954fc7c7e7f8953a848c909ef253107d6e77ed747843ddd167 name: kminion type: application urls: - kminion-0.8.3.tgz version: 0.8.3 - apiVersion: v2 appVersion: v2.1.0 created: "2023-07-03T16:38:22.573746+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 888bc665cddc6b6b99af1ce6dd1dea0b107a2e928dff6bfe1c077bc741e20ef7 name: kminion type: application urls: - kminion-0.8.2.tgz version: 0.8.2 - apiVersion: v2 appVersion: v2.1.0 created: "2023-07-03T16:38:22.573271+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: e59c5d5574f162708bf1434c266acbfd9040a89aa7a4abd4a0db70885248e38d name: kminion type: application urls: - kminion-0.8.1.tgz version: 0.8.1 - apiVersion: v2 appVersion: v2.1.0 created: "2023-07-03T16:38:22.572697+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: f54d8236f8cf03c863b53e077e1647164ffe2a7c34e1cf77101fa3312c589706 name: kminion type: application urls: - kminion-0.8.0.tgz version: 0.8.0 - apiVersion: v2 appVersion: v2.1.0 created: "2023-07-03T16:38:22.572269+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 4cc64cd9f78bd55673b00612579157e493020fb76440abbef10fe5152aef9acc name: kminion type: application urls: - kminion-0.7.0.tgz version: 0.7.0 - apiVersion: v2 appVersion: v2.1.0 created: "2023-07-03T16:38:22.571852+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 0955e04fe9ef4b516fb0d9ed439ae79778ccdffcf817f09099790cb7e183e4d4 name: kminion type: application urls: - kminion-0.6.0.tgz version: 0.6.0 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.571391+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: d3eb64d05535e136802538662eef7e9fdfdb3f0b93b6a42dfdcc93ee7deeadbd name: kminion type: application urls: - kminion-0.5.0.tgz version: 0.5.0 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.570618+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 6b4209352d1dffd7873791ee1573dc325eb08d67656b01b430729f45dea4c09a name: kminion type: application urls: - kminion-0.4.0.tgz version: 0.4.0 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.570281+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: c51e3b45791e9fd51f33036916b0d36f7ac695e2fa916a9e99882ea83914ed97 name: kminion type: application urls: - kminion-0.3.1.tgz version: 0.3.1 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.569892+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: a2be2dd8a02dc5222ec7386195a0e25b2682a39bbdcf52b60793c171acac7653 name: kminion type: application urls: - kminion-0.3.0.tgz version: 0.3.0 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.569445+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 321b6d5ff95ce310d2a3257b3d55f9ced51de99af6519d6d91723d7bdb6456fa name: kminion type: application urls: - kminion-0.2.2.tgz version: 0.2.2 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.569089+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: ed57df27158521a1eb33d215731fcc3248c71b3f36a4a029eb2d3a7b617ca519 name: kminion type: 
application urls: - kminion-0.2.1.tgz version: 0.2.1 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.568694+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 025661ee7cc574ad8dde7a68093a3b614fc92e26dd5dd398fc89d0b5308010e1 name: kminion type: application urls: - kminion-0.2.0.tgz version: 0.2.0 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.566269+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: e277e976d864b4bd2e505038dd865a9300486ae8c4323d3f0be40b84df75732b name: kminion type: application urls: - kminion-0.1.3.tgz version: 0.1.3 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.565773+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 562937d3613624c55984e51adbc6765e7898d1cf8cc2d7d241b6d671bbc12303 name: kminion type: application urls: - kminion-0.1.2.tgz version: 0.1.2 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.562776+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 25e83d7c7cc92a63268d76b13ecc13077758b48be093490f281498a4f55ad3ca name: kminion type: application urls: - kminion-0.1.1.tgz version: 0.1.1 - apiVersion: v2 appVersion: v2.0.0 created: "2023-07-03T16:38:22.562046+01:00" description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka digest: 7c10e9d9957e9752bc6f4b4a1fffb742d88cd57be06bf4f26ff7b5031645ccbd name: kminion type: application urls: - kminion-0.1.0.tgz version: 0.1.0 generated: "2023-07-03T16:38:22.560328+01:00" ================================================ FILE: charts/kminion/.helmignore ================================================ # Patterns to ignore when building packages. # This supports shell glob matching, relative path matching, and # negation (prefixed with !). Only one pattern per line. .DS_Store # Common VCS dirs .git/ .gitignore .bzr/ .bzrignore .hg/ .hgignore .svn/ # Common backup files *.swp *.bak *.tmp *.orig *~ # Various IDEs .project .idea/ *.tmproj .vscode/ ================================================ FILE: charts/kminion/Chart.yaml ================================================ apiVersion: v2 name: kminion description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka # A chart can be either an 'application' or a 'library' chart. # # Application charts are a collection of templates that can be packaged into versioned archives # to be deployed. # # Library charts provide useful utilities or functions for the chart developer. They're included as # a dependency of application charts to inject those utilities and functions into the rendering # pipeline. Library charts do not define any templates and therefore cannot be deployed. type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) version: 0.12.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
appVersion: "v2.2.5" ================================================ FILE: charts/kminion/templates/NOTES.txt ================================================ 1. Get the application URL by running these commands: {{- if .Values.ingress.enabled }} {{- range .Values.ingress.hosts }} http://{{ . }} {{- end }} {{- else if contains "NodePort" .Values.service.type }} export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "kminion.fullname" . }}) export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") echo http://$NODE_IP:$NODE_PORT {{- else if contains "LoadBalancer" .Values.service.type }} NOTE: It may take a few minutes for the LoadBalancer IP to be available. You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "kminion.fullname" . }}' export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "kminion.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") echo http://$SERVICE_IP:{{ .Values.service.port }} {{- else if contains "ClusterIP" .Values.service.type }} export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "kminion.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") echo "Visit http://127.0.0.1:8080 to use your application" kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT {{- end }} ================================================ FILE: charts/kminion/templates/_helpers.tpl ================================================ {{/* Expand the name of the chart. */}} {{- define "kminion.name" -}} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Create a default fully qualified app name. We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). If release name contains chart name it will be used as a full name. */}} {{- define "kminion.fullname" -}} {{- if .Values.fullnameOverride }} {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} {{- else }} {{- $name := default .Chart.Name .Values.nameOverride }} {{- if contains $name .Release.Name }} {{- .Release.Name | trunc 63 | trimSuffix "-" }} {{- else }} {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} {{- end }} {{- end }} {{- end }} {{/* Create chart name and version as used by the chart label. */}} {{- define "kminion.chart" -}} {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Common labels */}} {{- define "kminion.labels" -}} helm.sh/chart: {{ include "kminion.chart" . }} {{ include "kminion.selectorLabels" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} app.kubernetes.io/managed-by: {{ .Release.Service }} {{- if .Values.customLabels}} {{ toYaml .Values.customLabels }} {{- end}} {{- end }} {{/* Selector labels */}} {{- define "kminion.selectorLabels" -}} app.kubernetes.io/name: {{ include "kminion.name" . 
}} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} {{/* Create the name of the service account to use */}} {{- define "kminion.serviceAccountName" -}} {{- if .Values.serviceAccount.create }} {{- default (include "kminion.fullname" .) .Values.serviceAccount.name }} {{- else }} {{- default "default" .Values.serviceAccount.name }} {{- end }} {{- end }} {{/* Return the appropriate apiVersion for ingress. */}} {{- define "kminion.ingress.apiVersion" -}} {{- if and ($.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19-0" .Capabilities.KubeVersion.Version) }} {{- print "networking.k8s.io/v1" }} {{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" }} {{- print "networking.k8s.io/v1beta1" }} {{- else }} {{- print "extensions/v1beta1" }} {{- end }} {{- end }} {{/* Return if ingress is stable. */}} {{- define "kminion.ingress.isStable" -}} {{- eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1" }} {{- end }} {{/* Return if ingress supports ingressClassName. */}} {{- define "kminion.ingress.supportsIngressClassName" -}} {{- or (eq (include "kminion.ingress.isStable" .) "true") (and (eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) }} {{- end }} {{/* Return if ingress supports pathType. */}} {{- define "kminion.ingress.supportsPathType" -}} {{- or (eq (include "kminion.ingress.isStable" .) "true") (and (eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) }} {{- end }} {{/* Return the appropriate apiVersion for podDisruptionBudget. */}} {{- define "kminion.podDisruptionBudget.apiVersion" -}} {{- if $.Capabilities.APIVersions.Has "policy/v1/PodDisruptionBudget" }} {{- print "policy/v1" }} {{- else }} {{- print "policy/v1beta1" }} {{- end }} {{- end }} ================================================ FILE: charts/kminion/templates/configmap.yaml ================================================ apiVersion: v1 kind: ConfigMap metadata: name: {{include "kminion.fullname" .}} namespace: {{ .Release.Namespace | quote }} labels: {{- include "kminion.labels" . | nindent 4}} data: config.yaml: | {{- toYaml .Values.kminion.config | nindent 4}} ================================================ FILE: charts/kminion/templates/daemonset.yaml ================================================ {{- if .Values.daemonset.enabled }} apiVersion: apps/v1 kind: DaemonSet metadata: name: {{include "kminion.fullname" .}} namespace: {{ .Release.Namespace | quote }} labels: {{- include "kminion.labels" . | nindent 4}} spec: updateStrategy: type: OnDelete selector: matchLabels: {{- include "kminion.selectorLabels" . | nindent 6}} template: metadata: {{- with .Values.podAnnotations}} annotations: {{- toYaml . | nindent 8}} {{- end}} labels: {{- include "kminion.selectorLabels" . | nindent 8}} {{- if .Values.customLabels}} {{toYaml .Values.customLabels | nindent 8}} {{- end}} spec: {{- with .Values.imagePullSecrets}} imagePullSecrets: {{- toYaml . 
| nindent 8}} {{- end}} securityContext: {{- toYaml .Values.podSecurityContext | nindent 8}} serviceAccountName: {{ .Values.serviceAccount.name }} volumes: - name: config configMap: name: {{include "kminion.fullname" .}} {{- range .Values.deployment.volumes.secrets}} - name: {{.secretName}} secret: secretName: {{.secretName}} {{- end}} containers: - name: {{.Chart.Name}} securityContext: {{- toYaml .Values.securityContext | nindent 12}} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{.Values.image.pullPolicy}} ports: - name: metrics containerPort: {{.Values.service.port}} protocol: TCP env: - name: POD_NAME valueFrom: fieldRef: fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: fieldPath: metadata.namespace - name: CONFIG_FILEPATH value: /etc/kminion/config.yaml {{- range .Values.deployment.env.values}} - name: {{.name}} value: {{.value | quote}} {{- end}} {{- range .Values.deployment.env.secretKeyRefs}} - name: {{.name}} valueFrom: secretKeyRef: name: {{.secretName}} key: {{.secretKey}} {{- end}} {{- range .Values.deployment.env.configMapKeyRefs}} - name: {{.name}} valueFrom: configMapKeyRef: name: {{.configMapName}} key: {{.configMapKey}} {{- end}} volumeMounts: - name: config mountPath: /etc/kminion {{- range .Values.deployment.volumes.secrets}} - name: {{.secretName}} mountPath: {{.mountPath}} {{- end}} resources: {{- toYaml .Values.resources | nindent 12}} livenessProbe: failureThreshold: 3 httpGet: path: /ready port: metrics scheme: HTTP initialDelaySeconds: 10 periodSeconds: 10 successThreshold: 1 timeoutSeconds: 1 readinessProbe: failureThreshold: 3 httpGet: path: /ready port: metrics scheme: HTTP periodSeconds: 10 successThreshold: 1 timeoutSeconds: 1 {{- with .Values.affinity}} affinity: {{- toYaml . | nindent 8}} {{- end}} {{- with .Values.tolerations}} tolerations: {{- toYaml . | nindent 8}} {{- end}} {{- end }} ================================================ FILE: charts/kminion/templates/deployment.yaml ================================================ apiVersion: apps/v1 kind: Deployment metadata: name: {{include "kminion.fullname" .}} namespace: {{ .Release.Namespace | quote }} labels: {{- include "kminion.labels" . | nindent 4}} {{- with .Values.deployment.labels}} {{- toYaml . | nindent 4}} {{- end}} {{- with .Values.deployment.annotations}} annotations: {{- toYaml . | nindent 4}} {{- end}} spec: {{- if not .Values.autoscaling.enabled}} replicas: {{.Values.replicaCount}} {{- end}} selector: matchLabels: {{- include "kminion.selectorLabels" . | nindent 6}} template: metadata: {{- with .Values.podAnnotations}} annotations: {{- toYaml . | nindent 8}} {{- end}} labels: {{- include "kminion.selectorLabels" . | nindent 8}} {{- if .Values.customLabels}} {{toYaml .Values.customLabels | nindent 8}} {{- end}} spec: {{- with .Values.imagePullSecrets}} imagePullSecrets: {{- toYaml . | nindent 8}} {{- end}} serviceAccountName: {{include "kminion.serviceAccountName" .}} securityContext: {{- toYaml .Values.podSecurityContext | nindent 8}} volumes: - name: config configMap: name: {{include "kminion.fullname" .}} {{- range .Values.deployment.volumes.secrets}} - name: {{.secretName}} secret: secretName: {{.secretName}} {{- end}} {{- with .Values.deployment.volumes.extra }} {{- toYaml . | nindent 8 }} {{- end }} initContainers: {{- with .Values.deployment.initContainers }} {{- toYaml . 
| nindent 8 }} {{- end }} containers: - name: {{.Chart.Name}} securityContext: {{- toYaml .Values.securityContext | nindent 12}} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{.Values.image.pullPolicy}} ports: - name: metrics containerPort: {{.Values.service.port}} protocol: TCP env: - name: CONFIG_FILEPATH value: /etc/kminion/config.yaml {{- range .Values.deployment.env.values}} - name: {{.name}} value: {{.value | quote}} {{- end}} {{- range .Values.deployment.env.secretKeyRefs}} - name: {{.name}} valueFrom: secretKeyRef: name: {{.secretName}} key: {{.secretKey}} {{- end}} {{- range .Values.deployment.env.configMapKeyRefs}} - name: {{.name}} valueFrom: configMapKeyRef: name: {{.configMapName}} key: {{.configMapKey}} {{- end}} volumeMounts: - name: config mountPath: /etc/kminion {{- range .Values.deployment.volumes.secrets}} - name: {{.secretName}} mountPath: {{.mountPath}} {{- end}} resources: {{- toYaml .Values.resources | nindent 12}} {{- if .Values.deployment.readinessProbe.enabled }} readinessProbe: httpGet: path: /ready port: {{.Values.service.port}} initialDelaySeconds: 10 {{- end }} {{- with .Values.deployment.extraContainers }} {{- toYaml . | nindent 8 }} {{- end }} {{- with .Values.nodeSelector}} nodeSelector: {{- toYaml . | nindent 8}} {{- end}} {{- with .Values.affinity}} affinity: {{- toYaml . | nindent 8}} {{- end}} {{- with .Values.tolerations}} tolerations: {{- toYaml . | nindent 8}} {{- end}} ================================================ FILE: charts/kminion/templates/hpa.yaml ================================================ {{- if .Values.autoscaling.enabled }} apiVersion: {{ ternary "autoscaling/v2" "autoscaling/v2beta1" (.Capabilities.APIVersions.Has "autoscaling/v2") }} kind: HorizontalPodAutoscaler metadata: name: {{ include "kminion.fullname" . }} namespace: {{ .Release.Namespace | quote }} labels: {{- include "kminion.labels" . | nindent 4 }} spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment name: {{ include "kminion.fullname" . }} minReplicas: {{ .Values.autoscaling.minReplicas }} maxReplicas: {{ .Values.autoscaling.maxReplicas }} metrics: {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} - type: Resource resource: name: cpu {{- if .Capabilities.APIVersions.Has "autoscaling/v2" }} target: type: Utilization averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} {{ else }} targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} {{- end }} {{- end }} {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} - type: Resource resource: name: memory {{- if .Capabilities.APIVersions.Has "autoscaling/v2" }} target: type: Utilization averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} {{ else }} targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} {{- end }} {{- end }} {{- end }} ================================================ FILE: charts/kminion/templates/ingress.yaml ================================================ {{- if .Values.ingress.enabled -}} {{- $fullName := include "kminion.fullname" . -}} {{- $svcPort := .Values.service.port -}} {{- $ingressApiIsStable := eq (include "kminion.ingress.isStable" .) "true" -}} {{- $ingressSupportsIngressClassName := eq (include "kminion.ingress.supportsIngressClassName" .) "true" -}} {{- $ingressSupportsPathType := eq (include "kminion.ingress.supportsPathType" .) "true" -}} {{- $fullName := include "kminion.fullname" .
-}} {{- $servicePort := .Values.service.port -}} {{- $ingressPath := .Values.ingress.path -}} {{- $ingressPathType := .Values.ingress.pathType -}} {{- $extraPaths := .Values.ingress.extraPaths -}} apiVersion: {{ include "kminion.ingress.apiVersion" . }} kind: Ingress metadata: name: {{ $fullName }} namespace: {{ .Release.Namespace | quote }} labels: {{- include "kminion.labels" . | nindent 4 }} {{- with .Values.ingress.annotations }} annotations: {{- toYaml . | nindent 4 }} {{- end }} spec: {{- if and $ingressSupportsIngressClassName .Values.ingress.ingressClassName }} ingressClassName: {{ .Values.ingress.ingressClassName }} {{- end -}} {{- with .Values.ingress.tls }} tls: {{- tpl (toYaml .) $ | nindent 4 }} {{- end }} rules: {{- if .Values.ingress.hosts }} {{- range .Values.ingress.hosts }} - host: {{ tpl . $ }} http: paths: {{- with $extraPaths }} {{- toYaml . | nindent 10 }} {{- end }} - path: {{ $ingressPath }} {{- if $ingressSupportsPathType }} pathType: {{ $ingressPathType }} {{- end }} backend: {{- if $ingressApiIsStable }} service: name: {{ $fullName }} port: number: {{ $servicePort }} {{- else }} serviceName: {{ $fullName }} servicePort: {{ $servicePort }} {{- end }} {{- end }} {{- else }} - http: paths: - backend: {{- if $ingressApiIsStable }} service: name: {{ $fullName }} port: number: {{ $servicePort }} {{- else }} serviceName: {{ $fullName }} servicePort: {{ $servicePort }} {{- end }} {{- with $ingressPath }} path: {{ . }} {{- end }} {{- if $ingressSupportsPathType }} pathType: {{ $ingressPathType }} {{- end }} {{- end -}} {{- end }} ================================================ FILE: charts/kminion/templates/poddisruptionbudget.yaml ================================================ {{- if .Values.podDisruptionBudget }} apiVersion: {{ include "kminion.podDisruptionBudget.apiVersion" . }} kind: PodDisruptionBudget metadata: name: {{ template "kminion.fullname" . }} namespace: {{ .Release.Namespace | quote }} labels: {{- include "kminion.labels" . | nindent 4}} spec: {{- if .Values.podDisruptionBudget.minAvailable }} minAvailable: {{ .Values.podDisruptionBudget.minAvailable }} {{- end }} {{- if .Values.podDisruptionBudget.maxUnavailable }} maxUnavailable: {{ .Values.podDisruptionBudget.maxUnavailable }} {{- end }} selector: matchLabels: {{- include "kminion.selectorLabels" . | nindent 6}} {{- end }} ================================================ FILE: charts/kminion/templates/service.yaml ================================================ apiVersion: v1 kind: Service metadata: name: {{ include "kminion.fullname" . }} namespace: {{ .Release.Namespace | quote }} labels: {{- include "kminion.labels" . | nindent 4 }} {{- if .Values.service.annotations }} annotations: {{- toYaml .Values.service.annotations | nindent 4 }} {{- end }} spec: type: {{ .Values.service.type }} ports: - port: {{ .Values.service.port }} targetPort: metrics protocol: TCP name: metrics {{- if .Values.service.extraPorts }} {{- toYaml .Values.service.extraPorts | nindent 4 }} {{- end }} selector: {{- include "kminion.selectorLabels" . | nindent 4 }} ================================================ FILE: charts/kminion/templates/serviceaccount.yaml ================================================ {{- if .Values.serviceAccount.create -}} apiVersion: v1 kind: ServiceAccount metadata: name: {{ include "kminion.serviceAccountName" . }} namespace: {{ .Release.Namespace | quote }} labels: {{- include "kminion.labels" . | nindent 4 }} {{- with .Values.serviceAccount.annotations }} annotations: {{- toYaml . 
| nindent 4 }} {{- end }} {{- end }} ================================================ FILE: charts/kminion/templates/servicemonitor.yaml ================================================ {{- if .Values.serviceMonitor.create }} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: {{include "kminion.fullname" .}} namespace: {{ .Release.Namespace | quote }} labels: {{- include "kminion.labels" . | nindent 4}} {{- if .Values.serviceMonitor.additionalLabels}} {{toYaml .Values.serviceMonitor.additionalLabels | nindent 4}} {{- end}} spec: selector: matchLabels: {{- include "kminion.labels" . | nindent 6}} endpoints: - port: metrics path: /metrics honorLabels: {{ .Values.serviceMonitor.honorLabels }} scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }} interval: {{ .Values.serviceMonitor.interval }} {{- if .Values.serviceMonitor.relabelings }} relabelings: {{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }} {{- end }} {{- if .Values.serviceMonitor.targetLabels}} targetLabels: {{- toYaml .Values.serviceMonitor.targetLabels | nindent 4}} {{- end}} {{- if .Values.customLabels }} podTargetLabels: {{- (keys .Values.customLabels | sortAlpha) | toYaml | nindent 4 }} {{- end}} {{- end }} ================================================ FILE: charts/kminion/values.yaml ================================================ # Default values for kminion. # This is a YAML-formatted file. # Declare variables to be passed into your templates. replicaCount: 1 image: repository: redpandadata/kminion pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. tag: "" imagePullSecrets: [] nameOverride: "" fullnameOverride: "" serviceAccount: # Specifies whether a service account should be created create: true # Annotations to add to the service account annotations: {} # The name of the service account to use. # If not set and create is true, a name is generated using the fullname template name: "" podAnnotations: {} # prometheus.io/scrape: "true" # prometheus.io/port: "8080" # prometheus.io/path: "/metrics" podSecurityContext: runAsUser: 99 fsGroup: 99 ## See `kubectl explain poddisruptionbudget.spec` for more ## ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/ podDisruptionBudget: maxUnavailable: 1 # minAvailable: 1 securityContext: allowPrivilegeEscalation: false # capabilities: # drop: # - ALL # readOnlyRootFilesystem: true # runAsNonRoot: true # runAsUser: 1000 service: type: ClusterIP port: 8080 # This port is also used as exposed container port annotations: {} # # Annotations to add to the service extraPorts: [] # When extraContainers expose additional metrics, make them # discoverable for ServiceMonitors # - port: 8443 # targetPort: 8443 # protocol: TCP # name: expose-x509-for-ttl-checks ingress: enabled: false # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress # ingressClassName: nginx # Values can be templated annotations: {} # kubernetes.io/ingress.class: nginx # kubernetes.io/tls-acme: "true" labels: {} path: / # pathType is only for k8s >= 1.18 pathType: Prefix hosts: - chart-example.local ## Extra paths to prepend to every host configuration. This is useful when working with annotation-based services.
extraPaths: [] # - path: /* # backend: # serviceName: ssl-redirect # servicePort: use-annotation ## Or for k8s > 1.19 # - path: /* # pathType: Prefix # backend: # service: # name: ssl-redirect # port: # name: use-annotation tls: [] # - secretName: chart-example-tls # hosts: # - chart-example.local # ingressClassName: resources: {} # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases the chances that charts run in environments with limited # resources, such as Minikube. If you do want to specify resources, uncomment the following # lines, adjust them as necessary, and remove the curly braces after 'resources:'. # limits: # cpu: 100m # memory: 128Mi # requests: # cpu: 100m # memory: 128Mi autoscaling: enabled: false minReplicas: 1 maxReplicas: 100 targetCPUUtilizationPercentage: 80 # targetMemoryUtilizationPercentage: 80 nodeSelector: {} tolerations: [] affinity: {} customLabels: {} serviceMonitor: create: false additionalLabels: {} honorLabels: false scrapeTimeout: 10s interval: 15s relabelings: [] # - sourceLabels: [__meta_kubernetes_pod_label_my_label] # separator: ; # regex: (.*) # targetLabel: my_label # replacement: $1 # action: replace # For DaemonSet mode you may set daemonset to "true" and replicaCount to 0. daemonset: enabled: false deployment: readinessProbe: enabled: true labels: {} # Annotations to add to the Deployment resource annotations: {} volumes: # Mount files from Kubernetes secrets into the container secrets: [] # - secretName: vault-tls # mountPath: /vault/tls extra: [] # - name: example # emptyDir: {} # If you want to provide specific config settings like sensitive Kafka credentials via environment variables, you can # do so by making them available here. See the kminion reference config to figure out the expected variable names. env: # Configure environment variables which you want to make available values: [] # - name: KAFKA_SASL_MECHANISM # value: PLAIN # Configure environment variables which you want to make available from a config map configMapKeyRefs: [] # - name: KAFKA_SASL_USERNAME # configMapName: kafka-user-team-xy # key: username # Configure environment variables which you want to make available from a secret secretKeyRefs: [] # - name: KAFKA_SASL_PASSWORD # secretName: kafka-credentials-team-xy # secretKey: password # Add additional containers, e.g. for oauth2-proxy extraContainers: {} # Add additional init containers, e.g. for oauth2-proxy initContainers: {} kminion: # KMinion can be configured using environment variables and/or a YAML config. The YAML contents under config will # end up in a YAML file which will be mounted into the kminion container. # See reference config: https://github.com/cloudhut/kminion/blob/master/docs/reference-config.yaml config: {} # kafka: # brokers: [ ] # clientId: "kminion" # rackId: "" # tls: # enabled: false # caFilepath: "" # certFilepath: "" # keyFilepath: "" # passphrase: "" # insecureSkipTlsVerify: false # sasl: # # Whether or not SASL authentication will be used # enabled: false # # Username to use for PLAIN or SCRAM mechanism # username: "" # # Password to use for PLAIN or SCRAM mechanism # password: "" # # Mechanism to use for SASL Authentication.
Valid values are PLAIN, SCRAM-SHA-256, SCRAM-SHA-512, GSSAPI # mechanism: "PLAIN" # # GSSAPI / Kerberos config properties # gssapi: # authType: "" # keyTabPath: "" # kerberosConfigPath: "" # serviceName: "" # username: "" # password: "" # realm: "" # # Whether to retry the initial test connection to Kafka. False will exit with code 1 on error, # # while true will retry until success. # retryInitConnection: false # # minion: # consumerGroups: # # Enabled specifies whether consumer groups shall be scraped and exported or not. # enabled: true # # Mode specifies whether we export consumer group offsets using the Admin API or by consuming the internal # # __consumer_offsets topic. Both modes have their advantages and disadvantages. # scrapeMode: adminApi # Valid values: adminApi, offsetsTopic # # Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and # # you aren't interested in per partition lags you could choose "topic" where all partition lags will be summed # # and only topic lags will be exported. # granularity: partition # # AllowedGroups are regex strings of group ids that shall be exported # # You can specify allowed groups by providing literals like "my-consumergroup-name" or by providing regex expressions # # like "/internal-.*/". # allowedGroups: [ ] # # IgnoredGroups are regex strings of group ids that shall be ignored/skipped when exporting metrics. Ignored groups # # take precedence over allowed groups. # ignoredGroups: [ ] # topics: # # Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and # # you aren't interested in per partition metrics you could choose "topic". # granularity: partition # # AllowedTopics are regex strings of topic names whose metrics shall be exported. # # You can specify allowed topics by providing literals like "my-topic-name" or by providing regex expressions # # like "/internal-.*/". # allowedTopics: [ ] # # # IgnoredTopics are regex strings of topic names that shall be ignored/skipped when exporting metrics. Ignored topics # # take precedence over allowed topics. # ignoredTopics: [ ] # # infoMetric is a configuration object for the kminion_kafka_topic_info metric # infoMetric: # # ConfigKeys is a set of strings of topic configs that you want to have exported as part of the metric # configKeys: ["cleanup.policy"] # logDirs: # # Enabled specifies whether log dirs shall be scraped and exported or not. This should be disabled for clusters prior # # to version 1.0.0 as describing log dirs was not supported back then. # enabled: true # # exporter: # # Namespace is the prefix for all exported Prometheus metrics # namespace: "kminion" # # Host that shall be used to bind the HTTP server on # host: "" # # Port that shall be used to bind the HTTP server on # port: 8080 # # logger: # # Level is a logging priority. Higher levels are more important.
Valid values are: debug, info, warn, error, fatal, panic # level: info ================================================ FILE: config.go ================================================ package main import ( "fmt" "os" "strings" "github.com/cloudhut/kminion/v2/kafka" "github.com/cloudhut/kminion/v2/logging" "github.com/cloudhut/kminion/v2/minion" "github.com/cloudhut/kminion/v2/prometheus" "github.com/knadh/koanf" "github.com/knadh/koanf/parsers/yaml" "github.com/knadh/koanf/providers/env" "github.com/knadh/koanf/providers/file" "github.com/mitchellh/mapstructure" "go.uber.org/zap" ) type Config struct { Kafka kafka.Config `koanf:"kafka"` Minion minion.Config `koanf:"minion"` Exporter prometheus.Config `koanf:"exporter"` Logger logging.Config `koanf:"logger"` } func (c *Config) SetDefaults() { c.Kafka.SetDefaults() c.Minion.SetDefaults() c.Exporter.SetDefaults() c.Logger.SetDefaults() } func (c *Config) Validate() error { err := c.Kafka.Validate() if err != nil { return fmt.Errorf("failed to validate kafka config: %w", err) } err = c.Minion.Validate() if err != nil { return fmt.Errorf("failed to validate minion config: %w", err) } err = c.Logger.Validate() if err != nil { return fmt.Errorf("failed to validate logger config: %w", err) } return nil } func newConfig(logger *zap.Logger) (Config, error) { k := koanf.New(".") var cfg Config cfg.SetDefaults() // 1. Check if a config filepath is set via the CONFIG_FILEPATH env variable. If there is one we'll try to load the file using a YAML parser envKey := "CONFIG_FILEPATH" configFilepath := os.Getenv(envKey) if configFilepath == "" { logger.Info("the env variable '" + envKey + "' is not set, therefore no YAML config will be loaded") } else { err := k.Load(file.Provider(configFilepath), yaml.Parser()) if err != nil { return Config{}, fmt.Errorf("failed to parse YAML config: %w", err) } } // We could unmarshal the loaded koanf input after loading both providers, however we want to unmarshal the YAML // config with `ErrorUnused` set to true, but unmarshal environment variables with `ErrorUnused` set to false (default). // Rationale: Orchestrators like Kubernetes inject unrelated environment variables, which we still want to allow. err := k.UnmarshalWithConf("", &cfg, koanf.UnmarshalConf{ Tag: "", FlatPaths: false, DecoderConfig: &mapstructure.DecoderConfig{ DecodeHook: mapstructure.ComposeDecodeHookFunc( mapstructure.StringToTimeDurationHookFunc()), Metadata: nil, Result: &cfg, WeaklyTypedInput: true, ErrorUnused: true, }, }) if err != nil { return Config{}, err } err = k.Load(env.ProviderWithValue("", ".", func(s string, v string) (string, interface{}) { // Convert the env variable name into a koanf key: lowercase it and replace underscores with dots. key := strings.Replace(strings.ToLower(s), "_", ".", -1) // If there is a comma in the value, split the value into a slice by the comma.
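// For example (hypothetical values): KAFKA_SASL_USERNAME=alice becomes the koanf key "kafka.sasl.username" with the value "alice", and KAFKA_BROKERS=broker-0:9092,broker-1:9092 becomes "kafka.brokers" with the slice value []string{"broker-0:9092", "broker-1:9092"}.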
if strings.Contains(v, ",") { return key, strings.Split(v, ",") } // Otherwise return the new key with the unaltered value return key, v }), nil) if err != nil { return Config{}, err } err = k.Unmarshal("", &cfg) if err != nil { return Config{}, err } err = cfg.Validate() if err != nil { return Config{}, fmt.Errorf("failed to validate config: %w", err) } return cfg, nil } ================================================ FILE: docker-compose.yml ================================================ --- version: '2.1' services: zookeeper: image: confluentinc/cp-zookeeper:latest ports: - 2181:2181 environment: ZOOKEEPER_CLIENT_PORT: 2181 ZOOKEEPER_TICK_TIME: 2000 container_name: zookeeper hostname: zookeeper kafka: image: confluentinc/cp-kafka:latest hostname: kafka container_name: kafka depends_on: - zookeeper ports: - 9092:9092 environment: KAFKA_BROKER_ID: 1 KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 kafka-minion: build: context: . dockerfile: ./Dockerfile hostname: kafka-minion container_name: kafka-minion depends_on: - zookeeper - kafka ports: - 8080:8080 environment: KAFKA_BROKERS: kafka:29092 restart: unless-stopped ================================================ FILE: docs/end-to-end.md ================================================ # End-To-End Monitoring This page describes the end-to-end monitoring feature in KMinion, how it works, and what metrics it provides. ## Motivation > What is the issue? Why did we build this feature? We can monitor metrics like CPU usage, free disk space, or even consumer group lag. However, these metrics don't give us a good idea of the performance characteristics an actual, real-world client experiences when connected to the cluster. With the "classic" metrics, lots of questions go unanswered: - Can a client produce messages to the cluster? - Can clients produce & consume messages as well as commit group offsets with an acceptable latency? - Is the cluster in a healthy state from a client's perspective? ## Approach & Implementation > How do we solve those issues? How does the feature work? The most reliable way to get real-world performance and availability metrics is to actually run a producer/consumer ourselves. This is exactly what the end-to-end monitoring feature does! ## High Level Overview To determine whether the cluster is fully operational and its performance is within acceptable limits, KMinion continuously produces and consumes messages to/from the cluster. That way we can measure things like ack-latency, commit-latency, and roundtrip-time. KMinion creates and manages its own topic for the end-to-end test messages. The name of the topic can be configured. **The first step** is to create a message and send it to the cluster. - Every produced message is added to an internal tracker, so we can recognize messages being "lost". A message is considered lost if it doesn't arrive back at the consumer within the configured time span. **The second step** is to continuously consume the topic.
- As each message arrives, we calculate its roundtrip time (the time from when the message was created until KMinion received it again) - Consumer group offsets are committed periodically, recording the time each commit takes. ### Topic Management The topic KMinion uses is created and managed completely automatically (though the topic name can be configured). KMinion continuously checks the topic and fixes issues/imbalances automatically: - Adds partitions to the topic so it has at least as many partitions as there are brokers. - Reassigns partitions to ensure every broker leads at least one partition, and that all partitions' replicas are distributed evenly across the brokers. KMinion tries to assign each partition ID to the broker with the matching broker ID. ### Consumer Group Management On startup, each KMinion instance generates a unique identifier (UUID) that is used to create its own consumer group, incorporating the shared prefix from the config. That is necessary because: - Offsets must not be shared among multiple instances. - Each instance must always consume **all** partitions of the topic. The instance's UUID is also embedded in every message, so each instance can easily filter out messages it didn't produce. That's why it is perfectly fine to run multiple KMinion instances against the same cluster, using the same topic. KMinion also monitors and deletes consumer groups that use its configured prefix. That way, when an instance exits/restarts, previous consumer groups will be cleaned up quickly (the check happens every 20s). ## Available Metrics The end-to-end monitoring feature exports the following metrics. ### Counters | Name | Description | | --- | --- | | `kminion_end_to_end_messages_produced_total` | Messages KMinion *tried* to send | | `kminion_end_to_end_messages_received_total` | Number of messages received (only counts those that match, i.e. that this instance actually produced itself) | | `kminion_end_to_end_offset_commits_total` | Number of successful offset commits by KMinion's end-to-end test | | `kminion_end_to_end_messages_lost_total` | Number of messages that have been produced successfully but not received within the configured SLA duration | | `kminion_end_to_end_messages_produced_failed_total` | Number of messages that failed to produce to Kafka because of a timeout or failure | ### Histograms | Name | Description | | --- | --- | | `kminion_end_to_end_produce_latency_seconds` | Duration until the cluster acknowledged a message. | | `kminion_end_to_end_offset_commit_latency_seconds` | Time Kafka took to respond to KMinion's offset commit | | `kminion_end_to_end_roundtrip_latency_seconds` | Duration from creation of a message, until it was received/consumed again. | ### Gauges | Name | Description | | --- | --- | | `kminion_end_to_end_messages_produced_in_flight` | Number of messages that KMinion's end-to-end test has produced but not yet received back | ## Config Properties All config properties related to this feature are located in `minion.endToEnd`. ```yaml endToEnd: enabled: true probeInterval: 800ms # how often to send end-to-end test messages topicManagement: # You can disable topic management without disabling the testing feature. # This only makes sense if you have multiple kminion instances and for some reason want only one of them to create/configure the topic. # It is strongly recommended to leave this enabled.
enabled: true # Name of the topic kminion uses to send its test messages # You do *not* need to change this if you are running multiple kminion instances on the same cluster. # Different instances are perfectly fine with sharing the same topic! name: kminion-end-to-end # How often kminion checks its topic to validate configuration, partition count, and partition assignments reconciliationInterval: 10m # Useful for monitoring the performance of acks (if >1 this is best combined with 'producer.requiredAcks' set to 'all') replicationFactor: 1 # Rarely makes sense to change this, but maybe if you want some sort of cheap load test? partitionsPerBroker: 1 producer: # This defines the maximum time to wait for an ack response after producing a message, # and the upper bound for histogram buckets in "produce_latency_seconds" ackSla: 5s # Can be set to "all" (default) so kafka only reports an end-to-end test message as acknowledged if # the message was written to all in-sync replicas of the partition. # Or can be set to "leader" to only require the partition leader to have written the message to its log. requiredAcks: all consumer: # Prefix kminion uses when creating its consumer groups. Current kminion instance id will be appended automatically groupIdPrefix: kminion-end-to-end # Whether KMinion should try to delete empty consumer groups with the same prefix. This can be used if you want # KMinion to clean up its old consumer groups. It should only be used if you use a unique prefix for KMinion. deleteStaleConsumerGroups: false # Defines the time limit beyond which a message is considered "lost" (failed the roundtrip), # also used as the upper bound for histogram buckets in "roundtrip_latency" roundtripSla: 20s # Maximum time an offset commit is allowed to take before considering it failed, # also used as the upper bound for histogram buckets in "commit_latency_seconds" commitSla: 10s ``` ================================================ FILE: docs/metrics.md ================================================ # Exported Metrics This document lists all exported metrics with example values. ## Exporter Metrics ``` # HELP kminion_exporter_up Build info about this Prometheus Exporter. Gauge value is 0 if one or more scrapes have failed.
# TYPE kminion_exporter_up gauge kminion_exporter_up{version="sha-0ab0dcdf862f7a34b06998cd2d980148e048151a"} 1 # HELP kminion_exporter_offset_consumer_records_consumed_total The number of offset records that have been consumed by the internal offset consumer # TYPE kminion_exporter_offset_consumer_records_consumed_total counter kminion_exporter_offset_consumer_records_consumed_total 5.058244883e+09 ``` ## Kafka Metrics ### General / Cluster Metrics ``` # HELP kminion_kafka_broker_info Kafka broker information # TYPE kminion_kafka_broker_info gauge kminion_kafka_broker_info{address="broker-9.analytics-prod.kafka.cloudhut.dev",broker_id="9",is_controller="false",port="9092",rack_id="europe-west1-b"} 1 # HELP kminion_kafka_cluster_info Kafka cluster information # TYPE kminion_kafka_cluster_info gauge kminion_kafka_cluster_info{broker_count="12",cluster_id="UYZJg8bhT_6SxhsdaQZEQ",cluster_version="v2.6",controller_id="6"} 1 ``` ### Log Dir Metrics ``` # HELP kminion_kafka_broker_log_dir_size_total_bytes The summed size in bytes of all log dirs for a given broker # TYPE kminion_kafka_broker_log_dir_size_total_bytes gauge kminion_kafka_broker_log_dir_size_total_bytes{address="broker-9.analytics-prod.kafka.cloudhut.dev",broker_id="9",port="9092",rack_id="europe-west1-b"} 8.32654935115e+11 # HELP kminion_kafka_topic_log_dir_size_total_bytes The summed size in bytes of partitions for a given topic. This includes the used space for replica partitions. # TYPE kminion_kafka_topic_log_dir_size_total_bytes gauge kminion_kafka_topic_log_dir_size_total_bytes{topic_name="__consumer_offsets"} 9.026554258e+09 ``` ### Topic & Partition Metrics ``` # HELP kminion_kafka_topic_info Info labels for a given topic # TYPE kminion_kafka_topic_info gauge kminion_kafka_topic_info{cleanup_policy="compact",partition_count="1",replication_factor="1",topic_name="_confluent-ksql-default__command_topic"} 1 # HELP kminion_kafka_topic_partition_low_water_mark Partition Low Water Mark # TYPE kminion_kafka_topic_partition_low_water_mark gauge kminion_kafka_topic_partition_low_water_mark{partition_id="0",topic_name="__consumer_offsets"} 0 # HELP kminion_kafka_topic_low_water_mark_sum Sum of all the topic's partition low water marks # TYPE kminion_kafka_topic_low_water_mark_sum gauge kminion_kafka_topic_low_water_mark_sum{topic_name="__consumer_offsets"} 0 # HELP kminion_kafka_topic_partition_high_water_mark Partition High Water Mark # TYPE kminion_kafka_topic_partition_high_water_mark gauge kminion_kafka_topic_partition_high_water_mark{partition_id="0",topic_name="__consumer_offsets"} 2.04952001e+08 # HELP kminion_kafka_topic_high_water_mark_sum Sum of all the topic's partition high water marks # TYPE kminion_kafka_topic_high_water_mark_sum gauge kminion_kafka_topic_high_water_mark_sum{topic_name="__consumer_offsets"} 1.512023846873e+12 ``` ### Consumer Group Metrics ``` # HELP kminion_kafka_consumer_group_info Consumer Group info metrics. It will report 1 if the group is in the stable state, otherwise 0. # TYPE kminion_kafka_consumer_group_info gauge kminion_kafka_consumer_group_info{coordinator_id="0",group_id="bigquery-sink",protocol="range",protocol_type="consumer",state="Stable"} 1 # HELP kminion_kafka_consumer_group_members Consumer Group member count metrics. It will report the number of members in the consumer group # TYPE kminion_kafka_consumer_group_members gauge kminion_kafka_consumer_group_members{group_id="bigquery-sink"} 2 # HELP kminion_kafka_consumer_group_empty_members Consumer Group Empty Members. 
It will report the number of members in the consumer group with no partition assigned # TYPE kminion_kafka_consumer_group_empty_members gauge kminion_kafka_consumer_group_empty_members{group_id="bigquery-sink"} 1 # HELP kminion_kafka_consumer_group_topic_members Consumer Group topic member count metrics. It will report the number of members in the consumer group assigned on a given topic # TYPE kminion_kafka_consumer_group_topic_members gauge kminion_kafka_consumer_group_topic_members{group_id="bigquery-sink",topic_name="shop-activity"} 4 # HELP kminion_kafka_consumer_group_topic_assigned_partitions Consumer Group topic partitions count metrics. It will report the number of partitions assigned in the consumer group for a given topic # TYPE kminion_kafka_consumer_group_topic_assigned_partitions gauge kminion_kafka_consumer_group_topic_assigned_partitions{group_id="bigquery-sink",topic_name="shop-activity"} 32 # HELP kminion_kafka_consumer_group_topic_offset_sum The sum of all committed group offsets across all partitions in a topic # TYPE kminion_kafka_consumer_group_topic_offset_sum gauge kminion_kafka_consumer_group_topic_offset_sum{group_id="bigquery-sink",topic_name="shop-activity"} 4.259513e+06 # HELP kminion_kafka_consumer_group_topic_partition_lag The number of messages a consumer group is lagging behind the latest offset of a partition # TYPE kminion_kafka_consumer_group_topic_partition_lag gauge kminion_kafka_consumer_group_topic_partition_lag{group_id="bigquery-sink",partition_id="10",topic_name="shop-activity"} 147481 # HELP kminion_kafka_consumer_group_topic_lag The number of messages a consumer group is lagging behind across all partitions in a topic # TYPE kminion_kafka_consumer_group_topic_lag gauge kminion_kafka_consumer_group_topic_lag{group_id="bigquery-sink",topic_name="shop-activity"} 147481 # HELP kminion_kafka_consumer_group_offset_commits_total The number of offsets committed by a group # TYPE kminion_kafka_consumer_group_offset_commits_total counter kminion_kafka_consumer_group_offset_commits_total{group_id="bigquery-sink"} 1098 ``` ### End-to-End Metrics ``` # HELP kminion_end_to_end_messages_produced_total Number of messages that kminion's end-to-end test has tried to send to kafka # TYPE kminion_end_to_end_messages_produced_total counter kminion_end_to_end_messages_produced_total 384 # HELP kminion_end_to_end_offset_commits_total Counts how many times kminions end-to-end test has committed messages # TYPE kminion_end_to_end_offset_commits_total counter kminion_end_to_end_offset_commits_total 18 # HELP kminion_end_to_end_messages_received_total Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. 
Kminion only considers a message a match if it arrives within the configured roundtrip SLA (and it matches the minionID) # TYPE kminion_end_to_end_messages_received_total counter kminion_end_to_end_messages_received_total 383 # HELP kminion_end_to_end_produce_latency_seconds Time until we received an ack for a produced message # TYPE kminion_end_to_end_produce_latency_seconds histogram kminion_end_to_end_produce_latency_seconds_bucket{partitionId="0",le="0.005"} 0 # HELP kminion_end_to_end_offset_commit_latency_seconds Time kafka took to respond to kminion's offset commit # TYPE kminion_end_to_end_offset_commit_latency_seconds histogram kminion_end_to_end_offset_commit_latency_seconds_bucket{groupCoordinatorBrokerId="0",le="0.005"} 0 # HELP kminion_end_to_end_roundtrip_latency_seconds Time it took between sending (producing) and receiving (consuming) a message # TYPE kminion_end_to_end_roundtrip_latency_seconds histogram kminion_end_to_end_roundtrip_latency_seconds_bucket{partitionId="0",le="0.005"} 0 # HELP kminion_end_to_end_messages_lost_total Number of messages that have been produced successfully but not received within the configured SLA duration # TYPE kminion_end_to_end_messages_lost_total counter kminion_end_to_end_messages_lost_total{partition_id="0"} 0 # HELP kminion_end_to_end_messages_produced_failed_total Number of messages failed to produce to Kafka because of a timeout or failure # TYPE kminion_end_to_end_messages_produced_failed_total counter kminion_end_to_end_messages_produced_failed_total{partition_id="0"} 0 # HELP kminion_end_to_end_messages_produced_in_flight Number of messages that kminion's end-to-end test produced but has not received an answer for yet # TYPE kminion_end_to_end_messages_produced_in_flight gauge kminion_end_to_end_messages_produced_in_flight{partition_id="0"} 0 ``` ================================================ FILE: docs/reference-config.yaml ================================================ ##################################################################################### # This file documents all the available config options and their default values. # # All config options can be configured via environment variables as well. # If you specify both the env variable and yaml option for the same configuration, # the environment variable will take precedence. If you want to use a YAML config # file, specify the path to the config file by setting the env variable # CONFIG_FILEPATH. # # The env variable name is auto generated by upper casing everything and adding # an underscore for each indentation/level.
Some examples: # kafka.rackId => KAFKA_RACKID # kafka.tls.caFilepath => KAFKA_TLS_CAFILEPATH # minion.consumerGroups.allowedGroups => MINION_CONSUMERGROUPS_ALLOWEDGROUPS # # Env variables that expect array values can be provided by separating them using # a comma: KAFKA_BROKERS = "broker1:9092,broker2:9092,broker3:9092" ##################################################################################### logger: # Valid values are: debug, info, warn, error, fatal, panic level: info kafka: brokers: [ ] clientId: "kminion" rackId: "" tls: enabled: false caFilepath: "" certFilepath: "" keyFilepath: "" # base64 encoded tls CA, cannot be set if 'caFilepath' is set ca: "" # base64 encoded tls cert, cannot be set if 'certFilepath' is set cert: "" # base64 encoded tls key, cannot be set if 'keyFilepath' is set key: "" passphrase: "" insecureSkipTlsVerify: false sasl: # Whether SASL will be used for authentication enabled: false # Username to use for PLAIN or SCRAM mechanism username: "" # Password to use for PLAIN or SCRAM mechanism password: "" # Mechanism to use for SASL Authentication. Valid values are PLAIN, SCRAM-SHA-256, SCRAM-SHA-512, GSSAPI, OAUTHBEARER mechanism: "PLAIN" # GSSAPI / Kerberos config properties gssapi: # Required. One of USER_AUTH or KEYTAB_AUTH authType: "" keyTabPath: "" kerberosConfigPath: "" serviceName: "" username: "" password: "" realm: "" enableFast: true # OAUTHBEARER config properties oauth: tokenEndpoint: "" clientId: "" clientSecret: "" scope: "" minion: consumerGroups: # Enabled specifies whether consumer groups shall be scraped and exported or not. enabled: true # Mode specifies whether we export consumer group offsets using the Admin API or by consuming the internal # __consumer_offsets topic. Both modes have their advantages and disadvantages. # * adminApi: # - Useful for managed kafka clusters that do not provide access to the offsets topic. # * offsetsTopic: # - Enables kminion_kafka_consumer_group_offset_commits_total metrics. # - Processing the offsetsTopic requires slightly more memory and cpu than using the adminApi. The amount depends on the # size and throughput of the offsets topic. scrapeMode: adminApi # Valid values: adminApi, offsetsTopic # Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and # you aren't interested in per partition lags you could choose "topic" where all partition lags will be summed # and only topic lags will be exported. granularity: partition # AllowedGroups are regex strings of group ids that shall be exported # You can specify allowed groups by providing literals like "my-consumergroup-name" or by providing regex expressions # like "/internal-.*/". allowedGroups: [ ".*" ] # IgnoredGroups are regex strings of group ids that shall be ignored/skipped when exporting metrics. Ignored groups # take precedence over allowed groups. ignoredGroups: [ ] topics: # Enabled can be set to false in order to disable collecting any topic metrics. enabled: true # Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and # you aren't interested in per partition metrics you could choose "topic". granularity: partition # AllowedTopics are regex strings of topic names whose topic metrics shall be exported. # You can specify allowed topics by providing literals like "my-topic-name" or by providing regex expressions # like "/internal-.*/".
allowedTopics: [ ".*" ] # IgnoredTopics are regex strings of topic names that shall be ignored/skipped when exporting metrics. Ignored topics # take precedence over allowed topics. ignoredTopics: [ ] # infoMetric is a configuration object for the kminion_kafka_topic_info metric infoMetric: # ConfigKeys are a set of strings of topic configs that you want to have exported as part of the metric configKeys: [ "cleanup.policy" ] logDirs: # Enabled specifies whether log dirs shall be scraped and exported or not. This should be disabled for clusters prior # to version 1.0.0 as describing log dirs was not supported back then. enabled: true # EndToEnd Metrics # When enabled, kminion creates a topic which it produces to and consumes from, to measure various advanced metrics. See docs for more info endToEnd: enabled: false # How often to send end-to-end test messages probeInterval: 100ms topicManagement: # Controls whether kminion should create, validate, and alter the end-to-end topic. # # When enabled (true): # - kminion creates the topic if it doesn't exist # - Validates partition count and replica assignments on startup and periodically (based on reconciliationInterval) # - Alters the topic (reassigns partitions, adds partitions) to ensure one partition leader per broker # - Fails on startup if topic alteration operations fail # # When disabled (false): # - kminion will NOT create the topic if it doesn't exist (startup will fail) # - Will NOT validate or alter an existing topic (accepts current partition layout as-is) # - Logs warnings if the topic configuration is suboptimal # - Continues end-to-end testing with whatever partition layout exists # # Use disabled mode on managed Kafka platforms that restrict partition reassignment operations. # In this mode, you must manually create the topic with appropriate partition distribution # before starting kminion. enabled: true # Name of the topic kminion uses to send its test messages # You do *not* need to change this if you are running multiple kminion instances on the same cluster. # Different instances are perfectly fine with sharing the same topic! name: kminion-end-to-end # How often kminion checks its topic to validate configuration, partition count, and partition assignments reconciliationInterval: 10m # Depending on the desired monitoring (e.g. alerting on a single broker failure vs. a cluster that is no longer writable) # you will most commonly choose a replication factor of 1 or 3. replicationFactor: 1 # Rarely makes sense to change this, but maybe if you want some sort of cheap load test? # By default (1) every broker gets one partition partitionsPerBroker: 1 # Whether kminion should issue AlterPartitionAssignments requests to rebalance partition # leaders across brokers. Disable this when running against Kafka-compatible clusters that # reject AlterPartitionAssignments (e.g. Redpanda with autobalancer enabled, Confluent Cloud). # Topic creation and partition count management remain active regardless of this setting. rebalancePartitions: true producer: # This defines: # - Maximum time to wait for an ack response after producing a message # - Upper bound for histogram buckets in "produce_latency_seconds" ackSla: 5s # Can be set to "all" (default) so kafka only reports an end-to-end test message as acknowledged if # the message was written to all in-sync replicas of the partition. # Or can be set to "leader" to only require the partition leader to have written the message to its log. requiredAcks: all consumer: # Prefix kminion uses when creating its consumer groups.
Current kminion instance id will be appended automatically groupIdPrefix: kminion-end-to-end # Whether KMinion should try to delete empty consumer groups with the same prefix. This can be used if you want # KMinion to clean up its old consumer groups. It should only be used if you use a unique prefix for KMinion. deleteStaleConsumerGroups: false # This defines: # - Upper bound for histogram buckets in "roundtrip_latency" # - Time limit beyond which a message is considered "lost" (failed the roundtrip) roundtripSla: 20s # - Upper bound for histogram buckets in "commit_latency_seconds" # - Maximum time an offset commit is allowed to take before considering it failed commitSla: 10s exporter: # Namespace is the prefix for all exported Prometheus metrics namespace: "kminion" # Host that shall be used to bind the HTTP server on host: "" # Port that shall be used to bind the HTTP server on port: 8080 ================================================ FILE: e2e/client_hooks.go ================================================ package e2e import ( "net" "sync/atomic" "time" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" ) // in e2e we only use client hooks for logging connect/disconnect messages type clientHooks struct { logger *zap.Logger lastCoordinatorUpdate time.Time currentCoordinator *atomic.Value // kgo.BrokerMetadata } func newEndToEndClientHooks(logger *zap.Logger) *clientHooks { return &clientHooks{ logger: logger.Named("e2e_hooks"), currentCoordinator: &atomic.Value{}, } } func (c *clientHooks) OnBrokerConnect(meta kgo.BrokerMetadata, dialDur time.Duration, _ net.Conn, err error) { if err != nil { c.logger.Error("kafka connection failed", zap.String("broker_host", meta.Host), zap.Int32("broker_id", meta.NodeID), zap.Error(err)) return } c.logger.Debug("kafka connection succeeded", zap.String("host", meta.Host), zap.Int32("broker_id", meta.NodeID), zap.Int64("dial_duration_ms", dialDur.Milliseconds())) } func (c *clientHooks) OnDisconnect(meta kgo.BrokerMetadata, _ net.Conn) { c.logger.Warn("kafka broker disconnected", zap.Int32("broker_id", meta.NodeID), zap.String("host", meta.Host)) } // OnBrokerWrite is passed the broker metadata, the key for the request that // was written, the number of bytes written, how long the request // waited before being written, how long it took to write the request, // and any error. // // The bytes written does not count any tls overhead. // OnWrite is called after a write to a broker. // // OnWrite(meta BrokerMetadata, key int16, bytesWritten int, writeWait, timeToWrite time.Duration, err error) func (c *clientHooks) OnBrokerWrite(meta kgo.BrokerMetadata, key int16, bytesWritten int, writeWait, timeToWrite time.Duration, err error) { keyName := kmsg.NameForKey(key) if keyName != "OffsetCommit" { return } // c.logger.Info("hooks onWrite", // zap.Duration("timeToWrite", timeToWrite), // zap.NamedError("err", err)) } // OnBrokerRead is passed the broker metadata, the key for the response that // was read, the number of bytes read, how long the Client waited // before reading the response, how long it took to read the response, // and any error. // // The bytes read does not count any tls overhead. // OnRead is called after a read from a broker.
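// We use this hook to remember which broker most recently answered a consumer-group request
// (OffsetCommit, JoinGroup, Heartbeat, SyncGroup): that broker is our current group coordinator,
// and its ID is used as the label on the offset-commit metrics (see commitOffsets in consumer.go).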
// OnRead(meta BrokerMetadata, key int16, bytesRead int, readWait, timeToRead time.Duration, err error) func (c *clientHooks) OnBrokerRead(meta kgo.BrokerMetadata, key int16, bytesRead int, readWait, timeToRead time.Duration, err error) { consumerGroupMsgKeys := []int16{ (&kmsg.OffsetCommitResponse{}).Key(), (&kmsg.JoinGroupResponse{}).Key(), (&kmsg.HeartbeatResponse{}).Key(), (&kmsg.SyncGroupResponse{}).Key(), } isMessageFromGroupCoordinator := isInArray(key, consumerGroupMsgKeys) if !isMessageFromGroupCoordinator { return } if err == nil { c.currentCoordinator.Store(meta) c.lastCoordinatorUpdate = time.Now() } } ================================================ FILE: e2e/config.go ================================================ package e2e import ( "fmt" "time" ) type Config struct { Enabled bool `koanf:"enabled"` TopicManagement EndToEndTopicConfig `koanf:"topicManagement"` ProbeInterval time.Duration `koanf:"probeInterval"` Producer EndToEndProducerConfig `koanf:"producer"` Consumer EndToEndConsumerConfig `koanf:"consumer"` } func (c *Config) SetDefaults() { c.Enabled = false c.ProbeInterval = 100 * time.Millisecond c.TopicManagement.SetDefaults() c.Producer.SetDefaults() c.Consumer.SetDefaults() } func (c *Config) Validate() error { if !c.Enabled { return nil } // If the timeduration is 0s or 0ms or its variation of zero, it will be parsed as 0 if c.ProbeInterval == 0 { return fmt.Errorf("failed to validate probeInterval config, the duration can't be zero") } err := c.TopicManagement.Validate() if err != nil { return fmt.Errorf("failed to validate topicManagement config: %w", err) } _, err = time.ParseDuration(c.ProbeInterval.String()) if err != nil { return fmt.Errorf("failed to parse '%s' to time.Duration: %v", c.ProbeInterval.String(), err) } err = c.Producer.Validate() if err != nil { return fmt.Errorf("failed to validate producer config: %w", err) } err = c.Consumer.Validate() if err != nil { return fmt.Errorf("failed to validate consumer config: %w", err) } return nil } ================================================ FILE: e2e/config_consumer.go ================================================ package e2e import ( "fmt" "time" ) type EndToEndConsumerConfig struct { GroupIdPrefix string `koanf:"groupIdPrefix"` DeleteStaleConsumerGroups bool `koanf:"deleteStaleConsumerGroups"` // RoundtripSLA is the time duration from the moment where we try to produce until the moment where we consumed // the message. Therefore this should always be higher than the produceTimeout / SLA. 
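// For example, with the defaults (producer.ackSla: 5s), a produce may legitimately take just under
// 5s to be acknowledged, so RoundtripSla (default 20s) must leave extra headroom on top of that for
// consuming the message back.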
RoundtripSla time.Duration `koanf:"roundtripSla"` CommitSla time.Duration `koanf:"commitSla"` } func (c *EndToEndConsumerConfig) SetDefaults() { c.GroupIdPrefix = "kminion-end-to-end" c.DeleteStaleConsumerGroups = false c.RoundtripSla = 20 * time.Second c.CommitSla = 5 * time.Second } func (c *EndToEndConsumerConfig) Validate() error { if len(c.GroupIdPrefix) < 3 { return fmt.Errorf("kminion prefix should be at least 3 characters long") } if c.RoundtripSla <= 0 { return fmt.Errorf("consumer.roundtripSla must be greater than zero") } if c.CommitSla <= 0 { return fmt.Errorf("consumer.commitSla must be greater than zero") } return nil } ================================================ FILE: e2e/config_producer.go ================================================ package e2e import ( "fmt" "time" ) type EndToEndProducerConfig struct { AckSla time.Duration `koanf:"ackSla"` RequiredAcks string `koanf:"requiredAcks"` } func (c *EndToEndProducerConfig) SetDefaults() { c.AckSla = 5 * time.Second c.RequiredAcks = "all" } func (c *EndToEndProducerConfig) Validate() error { if c.RequiredAcks != "all" && c.RequiredAcks != "leader" { return fmt.Errorf("producer.requiredAcks must be 'all' or 'leader'") } if c.AckSla <= 0 { return fmt.Errorf("producer.ackSla must be greater than zero") } return nil } ================================================ FILE: e2e/config_topic.go ================================================ package e2e import ( "fmt" "time" ) type EndToEndTopicConfig struct { Enabled bool `koanf:"enabled"` Name string `koanf:"name"` ReplicationFactor int `koanf:"replicationFactor"` PartitionsPerBroker int `koanf:"partitionsPerBroker"` ReconciliationInterval time.Duration `koanf:"reconciliationInterval"` // RebalancePartitions controls whether kminion will issue AlterPartitionAssignments // requests to rebalance partition leaders across brokers. Set to false when running // against Kafka-compatible clusters (e.g. Redpanda with autobalancer enabled, Confluent // Cloud) that reject AlterPartitionAssignments. Topic creation and partition count // management remain active regardless of this setting.
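// In YAML this lives at minion.endToEnd.topicManagement.rebalancePartitions
// (see docs/reference-config.yaml).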
RebalancePartitions bool `koanf:"rebalancePartitions"` } func (c *EndToEndTopicConfig) SetDefaults() { c.Enabled = true c.Name = "kminion-end-to-end" c.ReplicationFactor = 1 c.PartitionsPerBroker = 1 c.ReconciliationInterval = 10 * time.Minute c.RebalancePartitions = true } func (c *EndToEndTopicConfig) Validate() error { if c.ReplicationFactor < 1 { return fmt.Errorf("failed to validate replicationFactor, it should be at least 1, retrieved value %v", c.ReplicationFactor) } if c.PartitionsPerBroker < 1 { return fmt.Errorf("failed to validate partitionsPerBroker, it should be at least 1, retrieved value %v", c.PartitionsPerBroker) } // If the timeduration is 0s or 0ms or its variation of zero, it will be parsed as 0 if c.ReconciliationInterval == 0 { return fmt.Errorf("failed to validate topic.ReconciliationInterval config, the duration can't be zero") } return nil } ================================================ FILE: e2e/config_topic_test.go ================================================ package e2e import ( "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap" ) func TestEndToEndTopicConfig_SetDefaults(t *testing.T) { var cfg EndToEndTopicConfig cfg.SetDefaults() assert.True(t, cfg.Enabled) assert.Equal(t, "kminion-end-to-end", cfg.Name) assert.Equal(t, 1, cfg.ReplicationFactor) assert.Equal(t, 1, cfg.PartitionsPerBroker) assert.Equal(t, 10*time.Minute, cfg.ReconciliationInterval) assert.True(t, cfg.RebalancePartitions, "RebalancePartitions should default to true for backward compatibility") } func TestEndToEndTopicConfig_Validate(t *testing.T) { tests := []struct { name string cfg EndToEndTopicConfig wantErr bool }{ { name: "valid config with rebalance enabled", cfg: EndToEndTopicConfig{ Enabled: true, Name: "test-topic", ReplicationFactor: 3, PartitionsPerBroker: 1, ReconciliationInterval: 10 * time.Minute, RebalancePartitions: true, }, wantErr: false, }, { name: "valid config with rebalance disabled", cfg: EndToEndTopicConfig{ Enabled: true, Name: "test-topic", ReplicationFactor: 3, PartitionsPerBroker: 1, ReconciliationInterval: 10 * time.Minute, RebalancePartitions: false, }, wantErr: false, }, { name: "invalid replication factor", cfg: EndToEndTopicConfig{ ReplicationFactor: 0, PartitionsPerBroker: 1, ReconciliationInterval: 10 * time.Minute, RebalancePartitions: false, }, wantErr: true, }, { name: "invalid partitions per broker", cfg: EndToEndTopicConfig{ ReplicationFactor: 1, PartitionsPerBroker: 0, ReconciliationInterval: 10 * time.Minute, RebalancePartitions: false, }, wantErr: true, }, { name: "zero reconciliation interval", cfg: EndToEndTopicConfig{ ReplicationFactor: 1, PartitionsPerBroker: 1, ReconciliationInterval: 0, RebalancePartitions: false, }, wantErr: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { err := tt.cfg.Validate() if tt.wantErr { require.Error(t, err) } else { require.NoError(t, err) } }) } } // TestPartitionPlanner_RebalancePartitionsDisabled verifies that when // RebalancePartitions is false the planner still produces a valid plan // (the caller in topic.go is responsible for not executing reassignments), // and that partition creation assignments are still generated normally. func TestPartitionPlanner_RebalancePartitionsDisabled(t *testing.T) { // Three brokers, topic already exists with suboptimal leader distribution: // all partitions led by broker 0.
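// (buildMeta appears to be a test helper: topic name, a broker ID -> rack map (empty string =
// no rack), and one replica list per partition with the leader at index 0.)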
meta := buildMeta("e2e", map[int32]string{0: "", 1: "", 2: ""}, [][]int32{ {0, 1, 2}, {0, 2, 1}, {0, 1, 2}, }, ) cfg := EndToEndTopicConfig{ Enabled: true, Name: "e2e", ReplicationFactor: 3, PartitionsPerBroker: 1, ReconciliationInterval: 10 * time.Minute, RebalancePartitions: false, } planner := NewPartitionPlanner(cfg, zap.NewNop()) plan, err := planner.Plan(meta) require.NoError(t, err) // The planner should still detect that reassignments are needed — // it's the caller's responsibility to skip executing them. assert.NotEmpty(t, plan.Reassignments, "planner should detect reassignments are needed") for _, ra := range plan.Reassignments { assertNoDuplicates(t, ra.Replicas) assert.Len(t, ra.Replicas, cfg.ReplicationFactor) } // No new partitions should be created (3 brokers × 1 per broker = 3 already exist). assert.Empty(t, plan.CreateAssignments) assert.Equal(t, 3, plan.FinalPartitionCount) } // TestPartitionPlanner_RebalancePartitionsDisabled_Creates verifies that when // RebalancePartitions is false and new partitions need to be created, Phase 3 // uses actual current leaders (not predicted leaders from staged reassignments) // to pick the preferred leader for new partitions. func TestPartitionPlanner_RebalancePartitionsDisabled_Creates(t *testing.T) { // 4 brokers, 3 partitions all led by broker 0. // PartitionsPerBroker=1 means desired = 4, so Phase 3 must create 1. meta := buildMeta("e2e", map[int32]string{0: "", 1: "", 2: "", 3: ""}, [][]int32{ {0, 1, 2}, {0, 2, 3}, {0, 1, 3}, }, ) cfg := EndToEndTopicConfig{ Enabled: true, Name: "e2e", ReplicationFactor: 3, PartitionsPerBroker: 1, ReconciliationInterval: 10 * time.Minute, RebalancePartitions: false, } planner := NewPartitionPlanner(cfg, zap.NewNop()) plan, err := planner.Plan(meta) require.NoError(t, err) // Phase 3 should create exactly 1 partition (4 desired - 3 existing). require.Len(t, plan.CreateAssignments, 1) // The new partition's preferred leader should NOT be broker 0, // because actual state shows broker 0 already leads 3 partitions. // With rebalancePartitions=false, Phase 3 counts from actual leaders, // so it should pick one of the under-represented brokers (1, 2, or 3). 
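// Replicas[0] is the preferred leader by convention (see CreateAssignment in partition_planner.go).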
newLeader := plan.CreateAssignments[0].Replicas[0] assert.NotEqual(t, int32(0), newLeader, "new partition should not be led by broker 0 (already leads 3 partitions in actual state)") } ================================================ FILE: e2e/consumer.go ================================================ package e2e import ( "context" "encoding/json" "strconv" "time" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" ) func (s *Service) startConsumeMessages(ctx context.Context, initializedCh chan<- bool) { client := s.client s.logger.Info("starting to consume end-to-end topic", zap.String("topic_name", s.config.TopicManagement.Name), zap.String("group_id", s.groupId)) isInitialized := false for { fetches := client.PollFetches(ctx) if !isInitialized { isInitialized = true initializedCh <- true close(initializedCh) } // Log all errors and continue afterwards as we might get errors and still have some fetch results errors := fetches.Errors() for _, err := range errors { s.logger.Error("kafka fetch error", zap.String("topic", err.Topic), zap.Int32("partition", err.Partition), zap.Error(err.Err)) } fetches.EachRecord(s.processMessage) } } func (s *Service) commitOffsets(ctx context.Context) { client := s.client uncommittedOffset := client.UncommittedOffsets() if uncommittedOffset == nil { return } startCommitTimestamp := time.Now() childCtx, cancel := context.WithTimeout(ctx, s.config.Consumer.CommitSla) client.CommitOffsets(childCtx, uncommittedOffset, func(_ *kgo.Client, req *kmsg.OffsetCommitRequest, r *kmsg.OffsetCommitResponse, err error) { cancel() coordinator := s.clientHooks.currentCoordinator.Load().(kgo.BrokerMetadata) coordinatorID := strconv.Itoa(int(coordinator.NodeID)) latency := time.Since(startCommitTimestamp) s.offsetCommitLatency.WithLabelValues(coordinatorID).Observe(latency.Seconds()) s.offsetCommitsTotal.WithLabelValues(coordinatorID).Inc() // We do this to ensure that a series with that coordinator id is initialized s.offsetCommitsTotal.WithLabelValues(coordinatorID).Add(0) // If we have at least one error in our commit response we want to report it as an error with an appropriate // reason as label. if errCode := s.logCommitErrors(r, err); errCode != "" { s.offsetCommitsFailedTotal.WithLabelValues(coordinatorID, errCode).Inc() return } }) } // processMessage: // - deserializes the message // - checks if it is from us, or from another kminion process running somewhere else // - hands it off to the service, which then reports metrics on it func (s *Service) processMessage(record *kgo.Record) { if record.Value == nil { // Init messages have nil values - we want to skip these. They are only used to make sure a consumer is ready. 
return } var msg EndToEndMessage if jerr := json.Unmarshal(record.Value, &msg); jerr != nil { s.logger.Error("failed to unmarshal message value", zap.Error(jerr)) return // maybe older version } if msg.MinionID != s.minionID { return // not from us } // restore partition, which is not serialized msg.partition = int(record.Partition) s.messageTracker.onMessageArrived(&msg) } ================================================ FILE: e2e/endtoend_message.go ================================================ package e2e import "time" const ( _ = iota EndToEndMessageStateCreated EndToEndMessageStateProducedSuccessfully ) type EndToEndMessage struct { MinionID string `json:"minionID"` // unique for each running kminion instance MessageID string `json:"messageID"` // unique for each message Timestamp int64 `json:"createdUtcNs"` // when the message was created, unix nanoseconds // The following properties are only used within the message tracker partition int state int produceLatency float64 } func (m *EndToEndMessage) creationTime() time.Time { return time.Unix(0, m.Timestamp) } ================================================ FILE: e2e/group_tracker.go ================================================ package e2e import ( "context" "strings" "time" "github.com/twmb/franz-go/pkg/kerr" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" ) const ( oldGroupCheckInterval = 5 * time.Second // how often to check for old kminion groups oldGroupMaxAge = 20 * time.Second // maximum age after which an old group should be deleted ) // groupTracker keeps checking for empty consumerGroups matching the kminion prefix. // When a group was seen empty for some time, we delete it. // Why? // Whenever a kminion instance starts up it creates a consumer-group for itself in order to not "collide" with other kminion instances. // When an instance restarts (for whatever reason), it creates a new group again, so we'd end up with a lot of unused groups. 
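// Example: with the default prefix, an instance's group id looks (roughly) like
// "kminion-end-to-end-<instance-uuid>"; after a restart, the old, now-empty group lingers
// until this tracker deletes it (it must have been seen empty for oldGroupMaxAge first).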
type groupTracker struct { cfg Config logger *zap.Logger client *kgo.Client // kafka client groupId string // our own groupId potentiallyEmptyGroups map[string]time.Time // groupName -> utc timestamp when the group was first seen } func newGroupTracker(cfg Config, logger *zap.Logger, client *kgo.Client, groupID string) *groupTracker { return &groupTracker{ cfg: cfg, logger: logger.Named("group_tracker"), client: client, groupId: groupID, potentiallyEmptyGroups: make(map[string]time.Time), } } func (g *groupTracker) start(ctx context.Context) { g.logger.Debug("starting group tracker") deleteOldGroupsTicker := time.NewTicker(oldGroupCheckInterval) for { select { case <-ctx.Done(): g.logger.Debug("stopping group tracker, context was cancelled") return case <-deleteOldGroupsTicker.C: childCtx, cancel := context.WithTimeout(ctx, 10*time.Second) err := g.checkAndDeleteOldConsumerGroups(childCtx) if err != nil { g.logger.Error("failed to check for old consumer groups", zap.Error(err)) } cancel() } } } func (g *groupTracker) checkAndDeleteOldConsumerGroups(ctx context.Context) error { groupsRq := kmsg.NewListGroupsRequest() groupsRq.StatesFilter = []string{"Empty"} g.logger.Debug("checking for stale kminion consumer groups") shardedResponse := g.client.RequestSharded(ctx, &groupsRq) // find groups that start with the kminion prefix matchingGroups := make([]string, 0) for _, shard := range shardedResponse { if shard.Err != nil { g.logger.Error("error in response to ListGroupsRequest", zap.Int32("broker_id", shard.Meta.NodeID), zap.Error(shard.Err)) continue } r, ok := shard.Resp.(*kmsg.ListGroupsResponse) if !ok { g.logger.Error("cannot cast responseShard.Resp to kmsg.ListGroupsResponse") continue } for _, group := range r.Groups { name := group.Group if name == g.groupId { continue // skip our own consumer group } if strings.HasPrefix(name, g.cfg.Consumer.GroupIdPrefix) { matchingGroups = append(matchingGroups, name) } } } // save new (previously unseen) groups to tracker g.logger.Debug("checked for stale consumer groups", zap.Int("found_groups", len(matchingGroups)), zap.Strings("groups", matchingGroups)) for _, name := range matchingGroups { _, exists := g.potentiallyEmptyGroups[name] if !exists { // add it with the current timestamp g.potentiallyEmptyGroups[name] = time.Now() g.logger.Debug("found new empty kminion group, adding it to the tracker", zap.String("group", name)) } } // go through saved groups: // - don't track the ones we don't see anymore (because they got deleted or are not empty anymore) // - mark the ones that are too old (have been observed as empty for too long) groupsToDelete := make([]string, 0) for name, firstSeen := range g.potentiallyEmptyGroups { exists, _ := containsStr(matchingGroups, name) if exists { // still there, check age and maybe delete it age := time.Since(firstSeen) if age > oldGroupMaxAge { // group was unused for too long, delete it groupsToDelete = append(groupsToDelete, name) delete(g.potentiallyEmptyGroups, name) } } else { // does not exist anymore, it must have been deleted, or is in use now (no longer empty) // don't track it anymore delete(g.potentiallyEmptyGroups, name) } } // actually delete the groups we've decided to delete if len(groupsToDelete) == 0 { return nil } deleteRq := kmsg.NewDeleteGroupsRequest() deleteRq.Groups = groupsToDelete deleteResp := g.client.RequestSharded(ctx, &deleteRq) // done, now just errors // if we get a not authorized error we'll disable deleting groups foundNotAuthorizedError := false deletedGroups := make([]string,
0) for _, shard := range deleteResp { if shard.Err != nil { g.logger.Error("sharded consumer group delete request failed", zap.Error(shard.Err)) continue } resp, ok := shard.Resp.(*kmsg.DeleteGroupsResponse) if !ok { g.logger.Error("failed to cast shard response to DeleteGroupsResponse while handling an error for deleting groups", zap.String("shard_host", shard.Meta.Host), zap.Int32("broker_id", shard.Meta.NodeID), zap.Error(shard.Err)) continue } for _, groupResp := range resp.Groups { err := kerr.ErrorForCode(groupResp.ErrorCode) if err != nil { g.logger.Error("failed to delete consumer group", zap.String("shard", shard.Meta.Host), zap.Int32("broker_id", shard.Meta.NodeID), zap.String("group", groupResp.Group), zap.Error(err)) if groupResp.ErrorCode == kerr.GroupAuthorizationFailed.Code { foundNotAuthorizedError = true } } else { deletedGroups = append(deletedGroups, groupResp.Group) } } } g.logger.Info("deleted old consumer groups", zap.Strings("deleted_groups", deletedGroups)) if foundNotAuthorizedError { g.logger.Info("disabling trying to delete old kminion consumer-groups since one of the last delete results had a 'GroupAuthorizationFailed' error") } return nil } ================================================ FILE: e2e/message_tracker.go ================================================ package e2e import ( "fmt" "strconv" "time" "github.com/jellydator/ttlcache/v2" "go.uber.org/zap" ) // messageTracker keeps track of the messages' lifetime // // When we successfully send a message, it will be added to this tracker. // Later, when we receive the message back in the consumer, the message is marked as completed and removed from the tracker. // If the message does not arrive within the configured `consumer.roundtripSla`, it is counted as lost. Messages that // failed to be produced will not be counted as lost messages. // // We use a dedicated counter to track messages that couldn't be produced to Kafka. type messageTracker struct { svc *Service logger *zap.Logger cache *ttlcache.Cache } func newMessageTracker(svc *Service) *messageTracker { defaultExpirationDuration := svc.config.Consumer.RoundtripSla cache := ttlcache.NewCache() cache.SetTTL(defaultExpirationDuration) t := &messageTracker{ svc: svc, logger: svc.logger.Named("message_tracker"), cache: cache, } t.cache.SetExpirationReasonCallback(func(key string, reason ttlcache.EvictionReason, value interface{}) { t.onMessageExpired(key, reason, value.(*EndToEndMessage)) }) return t } func (t *messageTracker) addToTracker(msg *EndToEndMessage) { t.cache.Set(msg.MessageID, msg) } // updateItemIfExists only updates a message if it still exists in the cache. The remaining time to live will not // be refreshed. // If it doesn't exist a ttlcache.ErrNotFound error will be returned. func (t *messageTracker) updateItemIfExists(msg *EndToEndMessage) error { _, ttl, err := t.cache.GetWithTTL(msg.MessageID) if err != nil { if err == ttlcache.ErrNotFound { return err } panic(err) } // Because the returned TTL is set to the original TTL duration (and not the remaining TTL) we have to calculate // the remaining TTL now as we want to update the existing cache item without changing the remaining time to live. expiryTimestamp := msg.creationTime().Add(ttl) remainingTTL := expiryTimestamp.Sub(time.Now()) if remainingTTL < 0 { // This entry should have been deleted already. Race condition.
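// (the cache's expiration callback will still fire for it and count the message as lost)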
return ttlcache.ErrNotFound } err = t.cache.SetWithTTL(msg.MessageID, msg, remainingTTL) if err != nil { panic(err) } return nil } // removeFromTracker removes an entry from the cache. If the key does not exist it will return a ttlcache.ErrNotFound error. func (t *messageTracker) removeFromTracker(messageID string) error { return t.cache.Remove(messageID) } func (t *messageTracker) onMessageArrived(arrivedMessage *EndToEndMessage) { cm, err := t.cache.Get(arrivedMessage.MessageID) if err != nil { if err == ttlcache.ErrNotFound { // message expired and was removed from the cache // it arrived too late, nothing to do here... return } else { panic(fmt.Errorf("failed to get message from cache: %w", err)) } } msg := cm.(*EndToEndMessage) expireTime := msg.creationTime().Add(t.svc.config.Consumer.RoundtripSla) isOnTime := time.Now().Before(expireTime) latency := time.Since(msg.creationTime()) if !isOnTime { // Message arrived late, but was still in cache. We don't increment the lost counter here because eventually // it will be evicted from the cache. This case only pops up if the SLA time is exceeded but the item has // not yet been evicted from the cache. t.logger.Info("message arrived late, will be marked as a lost message", zap.Int64("delay_ms", latency.Milliseconds()), zap.String("id", msg.MessageID)) return } // message arrived early enough pID := strconv.Itoa(msg.partition) t.svc.messagesReceived.WithLabelValues(pID).Inc() t.svc.roundtripLatency.WithLabelValues(pID).Observe(latency.Seconds()) // Remove message from cache, so that we don't track it any longer and won't mark it as lost when the entry expires. t.cache.Remove(msg.MessageID) } func (t *messageTracker) onMessageExpired(_ string, reason ttlcache.EvictionReason, value interface{}) { if reason == ttlcache.Removed { // We are not interested in messages that have been removed by us! return } msg := value.(*EndToEndMessage) created := msg.creationTime() age := time.Since(created) t.svc.lostMessages.WithLabelValues(strconv.Itoa(msg.partition)).Inc() t.logger.Debug("message expired/lost", zap.Int64("age_ms", age.Milliseconds()), zap.Int("partition", msg.partition), zap.String("message_id", msg.MessageID), zap.Bool("successfully_produced", msg.state == EndToEndMessageStateProducedSuccessfully), zap.Float64("produce_latency_seconds", msg.produceLatency), ) } ================================================ FILE: e2e/partition_planner.go ================================================ package e2e import ( "fmt" "sort" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" ) // PartitionPlanner is a thin orchestrator around the three planning phases. It wires in // configuration, logging, and the replica-selection strategy. The planning phases are: // 1. Fix replication factor and rack diversity on existing partitions. // 2. Ensure every broker is the preferred leader of at least 1 partition. // 3. Ensure total partition count meets the configured lower bound. // // The probe topic is created to continuously test end-to-end availability by // producing and consuming records per partition. The planner enforces: // // - Correct replication factor (RF == configured RF) and no duplicate brokers // within a single partition's replica set. // - Rack awareness: maximize the number of unique racks per partition (bounded // by min(RF, #racks)). This reduces the blast radius of a rack failure. // - Sufficient partition count: >= max(current, #brokers, partitionsPerBroker*#brokers).
// - Leader coverage: every broker must be the preferred leader (replicas[0]) of // at least one partition, so a per-broker failure is observable. // - Minimal movement: prefer to fix RF and rack issues first, then rotate/swap // leaders to fill gaps, and only create partitions when necessary. type PartitionPlanner struct { cfg EndToEndTopicConfig logger *zap.Logger sel ReplicaSelector } // NewPartitionPlanner constructs a Planner with the given config & logger. The replica // selector is chosen in Plan() once we have ClusterState ready. func NewPartitionPlanner(cfg EndToEndTopicConfig, logger *zap.Logger) *PartitionPlanner { return &PartitionPlanner{cfg: cfg, logger: logger} } // Plan produces an in-memory plan (reassignments + creations) for the probe // topic based on current cluster metadata. See the package header for the // invariants we enforce. func (p *PartitionPlanner) Plan(meta *kmsg.MetadataResponse) (*Plan, error) { if meta == nil || len(meta.Topics) == 0 { return nil, fmt.Errorf("metadata response has no topics") } if len(meta.Brokers) == 0 { return nil, fmt.Errorf("metadata response has no brokers") } if p.cfg.ReplicationFactor > len(meta.Brokers) { return nil, fmt.Errorf("replication factor %d exceeds available brokers %d", p.cfg.ReplicationFactor, len(meta.Brokers)) } // Build state required for the planning state := BuildState(meta) desired := ComputeDesired(state, p.cfg) tracker := NewLoadTracker(state) selector := NewRackAwareSelector(state, tracker) p.sel = selector b := NewPlanBuilder(state, desired, tracker, p.cfg.RebalancePartitions) // Phase 1: normalize RF and racks (low movement first) // Grow/trim replicas to configured RF and re-pick to maximize unique racks // per partition. We avoid moving the leader when possible. fixReplicationAndRack(b, selector, p.cfg.ReplicationFactor) // Phase 2: ensure each broker is preferred leader for >= 1 partition // This guarantees probe coverage: if a broker dies, some partition leader is // unavailable and the probe trips. ensureLeaderCoverage(b, selector) // Phase 3: ensure desired partition count // Enforce >= max(current, #brokers, partitionsPerBroker * #brokers). ensurePartitionCount(b, selector) // Build final plan and log the changes plan := b.Build() // Log with appropriate level based on whether changes are needed totalChanges := len(plan.Reassignments) + len(plan.CreateAssignments) if totalChanges == 0 { p.logger.Info("e2e probe topic partition leadership and replica distribution check completed - optimal", zap.String("topic", state.TopicName), zap.Int("brokers", len(state.Brokers)), zap.Int("partitions", len(state.Partitions)), zap.String("status", "optimal"), ) } else { p.logger.Info("plan to change partition leadership and replica placements on e2e topic has been prepared", zap.String("topic", state.TopicName), zap.Int("brokers", len(state.Brokers)), zap.Int("cur_partitions", len(state.Partitions)), zap.Int("final_partitions", plan.FinalPartitionCount), zap.Int("reassignments", len(plan.Reassignments)), zap.Int("creates", len(plan.CreateAssignments)), zap.Int("total_changes", totalChanges), ) } return plan, nil } // ----------------------------------------------------------------------------- // Planning phases // ----------------------------------------------------------------------------- // PlanBuilder holds a predictive view of partition -> replicas after applying // staged operations. 
We never mutate ClusterState.Partitions; instead, we write // new assignments into PlanBuilder.view and record high-level operations to // produce Kafka requests at the end (see Plan.ToRequests). // // This keeps the planning phases simple and side-effect free. type PlanBuilder struct { state ClusterState desired Desired tracker *LoadTracker // rebalancePartitions indicates whether reassignments will actually be executed. // When false, Phase 3 uses actual current leaders instead of predicted leaders // from the view, since reassignments won't be applied. rebalancePartitions bool // view is our predictive map: partitionID -> replicas (preferred leader at idx 0) view map[int32][]int32 reassignments []Reassignment // staged reassignments for existing partitions creations []CreateAssignment // staged creations of new partitions } // Reassignment captures a single partition’s new replica list. // // The order of Replicas matters: index 0 will become the preferred leader after // reassignment completes on the broker side. type Reassignment struct { Partition int32 Replicas []int32 } // CreateAssignment captures the replica list for a *new* partition that will be // appended to the topic during CreatePartitions. // // The order of Replicas matters: index 0 is the preferred leader for the new // partition. type CreateAssignment struct { Replicas []int32 } // Plan is the final, immutable result of planning. It can be turned into Kafka // requests via ToRequests. // // FinalPartitionCount is the topic’s partition count after applying creations. // (Reassignments do not change the count.) type Plan struct { Reassignments []Reassignment CreateAssignments []CreateAssignment FinalPartitionCount int } // NewPlanBuilder initializes a predictive view by cloning the current // partition->replicas map. We avoid accidental mutation by copying slices. func NewPlanBuilder(state ClusterState, desired Desired, tracker *LoadTracker, rebalancePartitions bool) *PlanBuilder { view := make(map[int32][]int32, len(state.Partitions)) for pid, p := range state.Partitions { view[pid] = append([]int32(nil), p.Replicas...) } return &PlanBuilder{state: state, desired: desired, tracker: tracker, rebalancePartitions: rebalancePartitions, view: view} } // Build freezes the current staged operations into a Plan. We compute the final // partition count as current + number of creates. func (b *PlanBuilder) Build() *Plan { return &Plan{ Reassignments: b.reassignments, CreateAssignments: b.creations, FinalPartitionCount: len(b.state.Partitions) + len(b.creations), } } // CommitReassignment records a reassignment and updates the predictive view. func (b *PlanBuilder) CommitReassignment(pid int32, reps []int32) { b.reassignments = append(b.reassignments, Reassignment{Partition: pid, Replicas: reps}) b.view[pid] = reps } // CommitCreate records a new-partition assignment. The final partition count is // computed when building the Plan. func (b *PlanBuilder) CommitCreate(reps []int32) { b.creations = append(b.creations, CreateAssignment{Replicas: reps}) } // fixReplicationAndRack enforces configured RF on each existing partition // (growing or shrinking as needed) and re-picks replicas when rack diversity can // be improved. We try to keep the current leader by always retaining replicas[0] // when shrinking. 
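// Illustrative example (hypothetical racks): with rf=3 and brokers {0: rackA, 1: rackA, 2: rackB,
// 3: rackB}, growing a partition whose replicas are [0, 1] keeps broker 0 as leader and prefers a
// rackB broker for the added replica before doubling up on rackA again.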
func fixReplicationAndRack(b *PlanBuilder, sel ReplicaSelector, rf int) { // Sort partition IDs for deterministic iteration pids := make([]int32, 0, len(b.view)) for pid := range b.view { pids = append(pids, pid) } sort.Slice(pids, func(i, j int) bool { return pids[i] < pids[j] }) for _, pid := range pids { replicas := b.view[pid] desiredRF := rf newReplicas := replicas switch { // Grow: re-pick the full set based on the current actual leader. case len(replicas) < desiredRF: // Use the actual current leader from the partition metadata currentPartition, exists := b.state.Partitions[pid] preferredLeader := replicas[0] // fallback to preferred leader if exists && currentPartition.Leader != -1 { preferredLeader = currentPartition.Leader } newReplicas = sel.ChooseReplicas(preferredLeader, desiredRF) // Shrink: keep leader; then pick remaining replicas preferring // new racks, then lower load, then lower broker ID. case len(replicas) > desiredRF: newReplicas = shrinkPreservingLeader(b, pid, replicas, desiredRF) // Same RF: if rack diversity can be improved, re-pick. default: if violatesRackDiversity(replicas, b.state.NumRacks, b.state.Brokers) { // Use the actual current leader from the partition metadata currentPartition, exists := b.state.Partitions[pid] preferredLeader := replicas[0] // fallback to preferred leader if exists && currentPartition.Leader != -1 { preferredLeader = currentPartition.Leader } newReplicas = sel.ChooseReplicas(preferredLeader, desiredRF) } } if !equalInt32s(newReplicas, replicas) { b.CommitReassignment(pid, newReplicas) } } } // shrinkPreservingLeader returns a replica set of size rf that keeps the // current leader and greedily prefers candidates that add a new rack; // among equals, chooses lower load, then lower broker ID. func shrinkPreservingLeader(b *PlanBuilder, pid int32, replicas []int32, rf int) []int32 { // Use the actual current leader from the partition metadata, not replicas[0] currentPartition, exists := b.state.Partitions[pid] leader := replicas[0] // fallback to preferred leader if no current leader found if exists && currentPartition.Leader != -1 { leader = currentPartition.Leader } keep := []int32{leader} seen := map[string]struct{}{ b.state.Brokers[leader].Rack: {}, } type cand struct { id int32 rack string load int } // Build the candidate pool from non-leader replicas. pool := make([]cand, 0, len(replicas)-1) for _, id := range replicas[1:] { pool = append(pool, cand{ id: id, rack: b.state.Brokers[id].Rack, load: b.tracker.Load(id).Replicas, }) } // Greedy selection with dynamic "seen racks". for len(keep) < rf && len(pool) > 0 { best := 0 for i := 1; i < len(pool); i++ { a, b2 := pool[i], pool[best] _, aSeen := seen[a.rack] _, bSeen := seen[b2.rack] switch { // Prefer a candidate that adds a new rack. case aSeen != bSeen: if !aSeen && bSeen { best = i } // Then prefer lower load. case a.load != b2.load: if a.load < b2.load { best = i } // Then prefer lower broker ID (stable tie-breaker). case a.id < b2.id: best = i } } chosen := pool[best] keep = append(keep, chosen.id) seen[chosen.rack] = struct{}{} // Remove chosen from pool (swap-delete). pool[best] = pool[len(pool)-1] pool = pool[:len(pool)-1] } return keep } // ensureLeaderCoverage guarantees that each broker becomes preferred leader for // at least one partition. We try the cheapest options first: // 1. If the broker already hosts a replica of some partition where it is not // leader, rotate it to index 0 (no RF change, minimal movement). // 2. 
Otherwise, replace a non-unique-rack replica in some donor partition and // rotate the target broker to index 0. // 3. If neither is possible, create a new partition led by the target broker. func ensureLeaderCoverage(b *PlanBuilder, sel ReplicaSelector) { // Guard: if actual leaders already cover all brokers, skip preferred leader rebalancing actualLeaders := make(map[int32][]int32, len(b.state.BrokerIDs)) for _, id := range b.state.BrokerIDs { actualLeaders[id] = nil } for pid, part := range b.state.Partitions { if part.Leader != -1 { actualLeaders[part.Leader] = append(actualLeaders[part.Leader], pid) } } if len(brokersMissingLeadership(b.state.BrokerIDs, actualLeaders)) == 0 { return // Actual coverage is perfect - no need to rebalance preferred leaders } // Build "leadersByBroker": broker -> list of partition IDs it currently leads (preferred). leadersByBroker := indexLeaders(b.state.BrokerIDs, b.view) // Brokers that currently lead zero partitions (preferred). // However, if a broker already has actual leadership (even if not preferred), // we can skip it to minimize unnecessary reassignments. missing := []int32{} for _, broker := range brokersMissingLeadership(b.state.BrokerIDs, leadersByBroker) { // Skip if this broker already has actual leadership if len(actualLeaders[broker]) > 0 { continue } missing = append(missing, broker) } if len(missing) == 0 { return } // Local helpers that both perform the action and update leadersByBroker. rotateIfReplica := func(target int32, donors []int32) bool { for _, donor := range donors { // Collect candidate partitions where target is already a replica pids := append([]int32(nil), leadersByBroker[donor]...) // Sort with preference: partitions where the donor is the ACTUAL leader first. // This ensures we're actually freeing up leadership from the donor, rather than // rotating a partition where the donor is only the preferred leader. // Then by partition ID for determinism. sort.Slice(pids, func(i, j int) bool { pi, pj := pids[i], pids[j] // Prefer partitions where the donor is the actual leader iDonorIsActual := b.state.Partitions[pi].Leader == donor jDonorIsActual := b.state.Partitions[pj].Leader == donor if iDonorIsActual != jDonorIsActual { return iDonorIsActual } // Then by partition ID for stability return pi < pj }) for _, pid := range pids { reps := b.view[pid] if !contains(reps, target) { continue } newReps := putFirst(reps, target) // make target the leader b.CommitReassignment(pid, newReps) // Update bookkeeping: pid moves from donor to target. leadersByBroker[donor] = remove(leadersByBroker[donor], pid) leadersByBroker[target] = append(leadersByBroker[target], pid) return true } } return false } replaceDuplicateAndRotate := func(target int32, donors []int32) bool { for _, donor := range donors { // Sort partition IDs for deterministic iteration pids := append([]int32(nil), leadersByBroker[donor]...) sort.Slice(pids, func(i, j int) bool { return pids[i] < pids[j] }) for _, pid := range pids { reps := b.view[pid] if contains(reps, target) { continue // covered by rotate path above } idx := victimIndex(reps, b.state.Brokers) if idx < 0 { continue // no safe replica to swap (unique racks already) } newReps := append([]int32{}, reps...) 
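// Work on a copy so we never mutate the replica slice stored in the
// predictive view directly: swap the duplicate-rack victim for the target
// broker, then rotate the target to index 0 so it becomes preferred leader.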
newReps[idx] = target newReps = putFirst(newReps, target) b.CommitReassignment(pid, newReps) leadersByBroker[donor] = remove(leadersByBroker[donor], pid) leadersByBroker[target] = append(leadersByBroker[target], pid) return true } } return false } for _, target := range missing { // Donors: brokers leading more partitions than their soft target, // sorted by largest surplus first (tie-breaker: smaller broker ID). donors := donorBrokers(b.state.BrokerIDs, leadersByBroker, b.desired.TargetLeaders) // 1) Cheapest: rotate target to lead where it already is a replica. if rotateIfReplica(target, donors) { continue } // 2) Next-cheapest: replace a duplicate-rack replica, then rotate. if replaceDuplicateAndRotate(target, donors) { continue } // 3) Last resort: create a new partition led by target. reps := sel.ChooseReplicas(target, b.desired.RF) b.CommitCreate(reps) // Track a synthetic partition ID so counts stay consistent within this loop. newPID := int32(len(b.state.Partitions) + len(b.creations) - 1) leadersByBroker[target] = append(leadersByBroker[target], newPID) } } // ensurePartitionCount adds partitions until we reach DesiredPartitions. // // Leader selection strategy per new partition: // 1. Prefer a broker still below its soft target (TargetLeaders). Among those, // pick the one closest to its target (smallest positive gap). // 2. If all brokers are at/above target, pick the broker that currently leads // the fewest partitions (stable tie-breaker via leastLoadedLeader). func ensurePartitionCount(b *PlanBuilder, sel ReplicaSelector) { desiredTotal := b.desired.DesiredPartitions total := len(b.state.Partitions) + len(b.creations) if total >= desiredTotal || len(b.state.BrokerIDs) == 0 { return } // Count current leaders per broker. leaderCount := make(map[int32]int, len(b.state.BrokerIDs)) if b.rebalancePartitions { // Use predictive view (reassignments will be applied) for _, reps := range b.view { if len(reps) > 0 { leaderCount[reps[0]]++ } } } else { // Use actual current leaders (reassignments won't be applied) for _, p := range b.state.Partitions { if p.Leader != -1 { leaderCount[p.Leader]++ } } } // Always include leaders from staged creates (Phase 2 fallback creates are always executed) for _, ca := range b.creations { if len(ca.Replicas) > 0 { leaderCount[ca.Replicas[0]]++ } } // Make sure every broker has an entry in leaderCount. for _, id := range b.state.BrokerIDs { if _, ok := leaderCount[id]; !ok { leaderCount[id] = 0 } } for total < desiredTotal { // Pick a preferred leader for the new partition. preferred := pickLeader(b.state.BrokerIDs, leaderCount, b.desired.TargetLeaders) // Materialize replicas and commit the create. reps := sel.ChooseReplicas(preferred, b.desired.RF) b.CommitCreate(reps) leaderCount[preferred]++ total++ } } // pickLeader returns the broker to prefer as the new partition's leader. // // If any brokers are still below their soft target, it returns the one with the // smallest positive remaining gap (# of missing leaderships for partitions). // Otherwise, it returns the least-loaded leader. func pickLeader(brokerIDs []int32, leaderCount map[int32]int, target map[int32]int) int32 { var ( chosen int32 bestGap int // smallest positive gap seen so far found bool ) for _, id := range brokerIDs { gap := target[id] - leaderCount[id] // how many leaders this broker still “should” get if gap <= 0 { continue } if !found || gap < bestGap { chosen = id bestGap = gap found = true } } if found { return chosen } // Everyone at/above target: fall back to least leaders. 
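// For example (hypothetical counts): with target={1:2, 2:2} and
// leaderCount={1:2, 2:3} no broker has a positive gap, so we fall through
// and broker 1 (the one leading the fewest partitions) is returned.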
return leastLoadedLeader(leaderCount, brokerIDs) } // ----------------------------------------------------------------------------- // State & Desired // ----------------------------------------------------------------------------- // Broker is an immutable snapshot of a broker’s ID and rack label at plan time. // An empty rack value means the broker did not advertise one. // // We never mutate this during planning; it mirrors the Metadata response. type Broker struct { ID int32 Rack string // empty string if unknown } // Partition is an immutable snapshot of a partition's current replica list. // // The order of Replicas matters: index 0 is the *preferred leader* for this // partition, i.e., where the controller will attempt to place leadership after // changes. The Leader field contains the *actual current leader* as reported // by Kafka metadata, which may differ from the preferred leader (replicas[0]). // // We do not mutate these structs; instead we work with a predictive "view" // inside PlanBuilder (below). type Partition struct { ID int32 Leader int32 // actual current leader from metadata Replicas []int32 // order matters; index 0 is preferred leader } // ClusterState is a convenience wrapper around the metadata we actually need in // this planner. It is deliberately small to keep the code readable. // // - TopicName: the probe topic name (for logging and request building) // - Brokers: map of broker id -> Broker // - BrokerIDs: sorted slice of broker IDs for deterministic iteration // - ByRack: rack -> broker IDs, to reason about rack diversity // - NumRacks: number of distinct racks (empty rack counts as one) // - Partitions: map of partition id -> Partition snapshot // // None of the members are mutated after construction. type ClusterState struct { TopicName string Brokers map[int32]Broker BrokerIDs []int32 // sorted ByRack map[string][]int32 NumRacks int Partitions map[int32]Partition } // BuildState converts Metadata response to ClusterState and copies slices so // planning cannot accidentally mutate the input. func BuildState(meta *kmsg.MetadataResponse) ClusterState { brokers := make(map[int32]Broker, len(meta.Brokers)) byRack := make(map[string][]int32) var ids []int32 for _, b := range meta.Brokers { rack := "" if b.Rack != nil { rack = *b.Rack } brokers[b.NodeID] = Broker{ID: b.NodeID, Rack: rack} byRack[rack] = append(byRack[rack], b.NodeID) ids = append(ids, b.NodeID) } sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] }) parts := make(map[int32]Partition) for _, p := range meta.Topics[0].Partitions { rep := append([]int32(nil), p.Replicas...) parts[p.Partition] = Partition{ID: p.Partition, Leader: p.Leader, Replicas: rep} } return ClusterState{ TopicName: pointerStrToStr(meta.Topics[0].Topic), Brokers: brokers, BrokerIDs: ids, ByRack: byRack, NumRacks: len(byRack), Partitions: parts, } } type Desired struct { RF int DesiredPartitions int TargetLeaders map[int32]int // per broker (even split) } // ComputeDesired derives the minimal partition count and a soft target for // preferred leader distribution. We require at least one partition per broker so // everyone can lead, and we honor PartitionsPerBroker as an additional lower // bound. 
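//
// A worked example with hypothetical numbers: 3 brokers, PartitionsPerBroker=2,
// and a topic that currently has 4 partitions give
// DesiredPartitions = max(4, 2*3) = 6; evenSplit then sets a TargetLeaders
// soft target of 2 per broker.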
func ComputeDesired(state ClusterState, cfg EndToEndTopicConfig) Desired { perBroker := cfg.PartitionsPerBroker if perBroker < 1 { perBroker = 1 } cur := len(state.Partitions) desiredPartitions := max(cur, perBroker*len(state.BrokerIDs)) target := evenSplit(desiredPartitions, state.BrokerIDs) return Desired{RF: cfg.ReplicationFactor, DesiredPartitions: desiredPartitions, TargetLeaders: target} } // ----------------------------------------------------------------------------- // Replica selection (rack-aware) & load tracking // ----------------------------------------------------------------------------- // Load captures, per broker, how many times it appears as a replica and as a // preferred leader across the predictive view of the topic. We use this to // bias selection towards less-loaded brokers to avoid hot spots. // // Note: this is a transient view local to a single planning run. Nothing here // is persisted and no attempt is made to perfectly balance replicas across the // cluster—only to avoid obviously uneven choices. type Load struct { Replicas int Leaders int } // LoadTracker is a tiny helper holding a per-broker Load map with convenience // methods for reading and incrementing counts while the plan is being // constructed. // // Why not compute loads on the fly? We do that initially (from current // assignments) and then update incrementally while choosing replicas for new or // changed partitions to keep subsequent choices informed by earlier ones. // Keeping it explicit in a struct makes the intent obvious and testing easier. type LoadTracker struct { l map[int32]Load } // NewLoadTracker builds initial loads from the current assignments in the // metadata (before any staged changes). The caller updates loads as it makes // predictive choices so the next decision can see the latest picture. func NewLoadTracker(state ClusterState) *LoadTracker { l := make(map[int32]Load, len(state.BrokerIDs)) for _, id := range state.BrokerIDs { l[id] = Load{} } for _, p := range state.Partitions { for _, r := range p.Replicas { ld := l[r] ld.Replicas++ l[r] = ld } // Count the actual current leader separately if p.Leader != -1 { // -1 indicates no leader (error state) ld := l[p.Leader] ld.Leaders++ l[p.Leader] = ld } } return &LoadTracker{l: l} } // AddReplica increments replica and (optionally) leader counts for a broker in // the predictive view. Call this after the planner decides to place a replica // (e.g., in ChooseReplicas). func (t *LoadTracker) AddReplica(id int32, leader bool) { ld := t.l[id] ld.Replicas++ if leader { ld.Leaders++ } t.l[id] = ld } // Load returns the current transient load counters for a broker. func (t *LoadTracker) Load(id int32) Load { return t.l[id] } // ReplicaSelector abstracts the heuristic used to pick a concrete replica set // for a given (preferred) leader and RF. Keeping this as an interface makes it // trivial to swap strategy in tests if you’d like to assert specific behaviors. // The production strategy we use is RackAwareSelector. type ReplicaSelector interface { ChooseReplicas(preferredLeader int32, rf int) []int32 } // RackAwareSelector is a simple greedy strategy that tries to: // 1. Always include the requested preferred leader at index 0. // 2. Maximize rack diversity by preferring brokers on new racks first. // 3. Among candidates on equally novel racks, prefer lower replica load. // 4. Use broker ID as a final tiebreaker for determinism. 
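//
// For instance (hypothetical cluster): brokers 1 and 2 on rack "a", broker 3
// on rack "b", equal loads, preferredLeader=1 and rf=3 yield [1 3 2]. Broker 3
// wins the second slot because rack "b" is new; broker 2 fills the last slot.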
// // After selecting, we update the transient LoadTracker so subsequent decisions // are informed by this choice. // // This is intentionally not perfect or global-optimal—just a pragmatic heuristic // that produces good, stable results for the probe topic. type RackAwareSelector struct { state ClusterState loads *LoadTracker } // NewRackAwareSelector constructs the default selection strategy. func NewRackAwareSelector(state ClusterState, loads *LoadTracker) *RackAwareSelector { return &RackAwareSelector{state: state, loads: loads} } // ChooseReplicas returns an ordered replica list of length rf where index 0 is // the preferred leader. Candidates are scored by new rack first, then lower // load, then lower broker ID. func (s *RackAwareSelector) ChooseReplicas(preferredLeader int32, rf int) []int32 { rf = min(rf, len(s.state.BrokerIDs)) res := make([]int32, 0, rf) res = append(res, preferredLeader) seen := map[int32]struct{}{preferredLeader: {}} usedRack := map[string]struct{}{s.state.Brokers[preferredLeader].Rack: {}} type cand struct { id int32 rack string load int } build := func() []cand { out := make([]cand, 0, len(s.state.BrokerIDs)) for _, id := range s.state.BrokerIDs { if _, ok := seen[id]; ok { continue } out = append(out, cand{id: id, rack: s.state.Brokers[id].Rack, load: s.loads.Load(id).Replicas}) } sort.Slice(out, func(i, j int) bool { _, iu := usedRack[out[i].rack] _, ju := usedRack[out[j].rack] if iu != ju { return !iu && ju } if out[i].load != out[j].load { return out[i].load < out[j].load } return out[i].id < out[j].id }) return out } for len(res) < rf { cands := build() if len(cands) == 0 { break } c := cands[0] res = append(res, c.id) seen[c.id] = struct{}{} usedRack[c.rack] = struct{}{} } // update transient loads for i, id := range res { s.loads.AddReplica(id, i == 0) } return res } // ----------------------------------------------------------------------------- // Plan -> Kafka requests // ----------------------------------------------------------------------------- // ToRequests converts a Plan to Kafka admin requests. Either result may be nil // if the plan contains no operations of that type. // // rebalancePartitions controls whether explicit replica assignments are included // in the CreatePartitions request. Set it to false for Redpanda Cloud, which // disallows explicit partition assignments via the Kafka API and returns // INVALID_REQUEST when they are present. func (p *Plan) ToRequests(topic string, rebalancePartitions bool) (*kmsg.AlterPartitionAssignmentsRequest, *kmsg.CreatePartitionsRequest) { var alter *kmsg.AlterPartitionAssignmentsRequest var create *kmsg.CreatePartitionsRequest if len(p.Reassignments) > 0 { r := kmsg.NewAlterPartitionAssignmentsRequest() t := kmsg.NewAlterPartitionAssignmentsRequestTopic() t.Topic = topic for _, ra := range p.Reassignments { pr := kmsg.NewAlterPartitionAssignmentsRequestTopicPartition() pr.Partition = ra.Partition pr.Replicas = append([]int32(nil), ra.Replicas...) t.Partitions = append(t.Partitions, pr) } r.Topics = []kmsg.AlterPartitionAssignmentsRequestTopic{t} alter = &r } if len(p.CreateAssignments) > 0 { r := kmsg.NewCreatePartitionsRequest() t := kmsg.NewCreatePartitionsRequestTopic() t.Topic = topic t.Count = int32(p.FinalPartitionCount) // Redpanda Cloud disallows explicit partition assignments via the Kafka API // (returning INVALID_REQUEST), the same restriction that applies to // AlterPartitionAssignments. 
Omit the Assignment list and let the broker // auto-place new partitions when rebalancing is disabled. if rebalancePartitions { for _, ca := range p.CreateAssignments { ta := kmsg.NewCreatePartitionsRequestTopicAssignment() ta.Replicas = append([]int32(nil), ca.Replicas...) t.Assignment = append(t.Assignment, ta) } } r.Topics = []kmsg.CreatePartitionsRequestTopic{t} create = &r } return alter, create } // ----------------------------------------------------------------------------- // Utils // ----------------------------------------------------------------------------- // indexLeaders builds broker -> list of partition IDs it currently leads (from view). func indexLeaders(brokerIDs []int32, view map[int32][]int32) map[int32][]int32 { m := make(map[int32][]int32, len(brokerIDs)) for _, id := range brokerIDs { m[id] = nil } for pid, reps := range view { if len(reps) > 0 { m[reps[0]] = append(m[reps[0]], pid) } } return m } // brokersMissingLeadership returns brokers that lead zero partitions. func brokersMissingLeadership(brokerIDs []int32, leadersByBroker map[int32][]int32) []int32 { var out []int32 for _, id := range brokerIDs { if len(leadersByBroker[id]) == 0 { out = append(out, id) } } return out } // donorBrokers returns brokers that currently lead more than their soft target, // sorted by largest surplus first; ties broken by broker ID ascending. func donorBrokers(brokerIDs []int32, leadersByBroker map[int32][]int32, target map[int32]int) []int32 { var donors []int32 for _, id := range brokerIDs { if len(leadersByBroker[id]) > target[id] { donors = append(donors, id) } } sort.Slice(donors, func(i, j int) bool { surplusI := len(leadersByBroker[donors[i]]) - target[donors[i]] surplusJ := len(leadersByBroker[donors[j]]) - target[donors[j]] if surplusI != surplusJ { return surplusI > surplusJ } return donors[i] < donors[j] }) return donors } // violatesRackDiversity returns true if a partition’s replicas do not use as // many unique racks as they could (bounded by min(len(reps), numRacks)). func violatesRackDiversity(reps []int32, numRacks int, brokers map[int32]Broker) bool { if len(reps) <= 1 { return false } seen := map[string]struct{}{} for _, r := range reps { seen[brokers[r].Rack] = struct{}{} } maxUnique := min(len(reps), numRacks) return len(seen) < maxUnique } // victimIndex returns the index of a replica that sits on a rack appearing more // than once within the replica set (i.e., a duplicate-rack candidate). This is // used when we need to swap in a new broker to keep/restore diversity. If none // exists, we fall back to the last replica (a stable, simple choice that avoids // touching the leader at index 0). func victimIndex(reps []int32, brokers map[int32]Broker) int { rc := map[string]int{} for _, r := range reps { rc[brokers[r].Rack]++ } for i, r := range reps { if rc[brokers[r].Rack] > 1 { return i } } return len(reps) - 1 // fallback: last (never the leader) } // remove filters v out of a slice in place (reusing the backing array) while // preserving the relative order of the remaining elements. Used for maintaining // the leaders map in ensureLeaderCoverage. func remove(xs []int32, v int32) []int32 { out := xs[:0] for _, x := range xs { if x != v { out = append(out, x) } } return out } // putFirst moves id to index 0 while preserving the relative order of the // remaining elements. This models changing the preferred leader.
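// For example, putFirst([]int32{3, 1, 2}, 2) returns [2 3 1].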
func putFirst(reps []int32, id int32) []int32 { out := make([]int32, 0, len(reps)) out = append(out, id) for _, r := range reps { if r != id { out = append(out, r) } } return out } // contains reports whether v is present in xs. func contains(xs []int32, v int32) bool { for _, x := range xs { if x == v { return true } } return false } // evenSplit returns a soft target leader count per broker such that totals sum // to n. The remainder (+1) is assigned to the lowest broker IDs for stability. func evenSplit(n int, ids []int32) map[int32]int { m := make(map[int32]int, len(ids)) if len(ids) == 0 { return m } base := n / len(ids) rem := n % len(ids) for i, id := range ids { m[id] = base if i < rem { m[id]++ } } return m } // leastLoadedLeader returns the broker with the smallest number of preferred // leader assignments (ties broken by smaller broker ID). func leastLoadedLeader(leaders map[int32]int, ids []int32) int32 { best, bestCnt := ids[0], 1<<30 for _, id := range ids { if leaders[id] < bestCnt || (leaders[id] == bestCnt && id < best) { bestCnt, best = leaders[id], id } } return best } func equalInt32s(a, b []int32) bool { if len(a) != len(b) { return false } for i := range a { if a[i] != b[i] { return false } } return true } ================================================ FILE: e2e/partition_planner_test.go ================================================ package e2e import ( "sort" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" ) // buildMeta constructs a MetadataResponse for tests. // brokers: brokerID -> rack label ("" means no rack/unknown). // partitions: list of replica lists where index 0 is the preferred leader. func buildMeta(topic string, brokers map[int32]string, partitions [][]int32) *kmsg.MetadataResponse { // Brokers bs := make([]kmsg.MetadataResponseBroker, 0, len(brokers)) ids := make([]int32, 0, len(brokers)) for id := range brokers { ids = append(ids, id) } sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] }) for _, id := range ids { rack := brokers[id] // copy for address stability bs = append(bs, kmsg.MetadataResponseBroker{ NodeID: id, Rack: &rack, // empty string is allowed and treated as one "rack" bucket by planner }) } // Partitions ps := make([]kmsg.MetadataResponseTopicPartition, 0, len(partitions)) for i, reps := range partitions { cp := append([]int32(nil), reps...) leader := int32(-1) if len(reps) > 0 { leader = reps[0] // In tests, assume preferred leader is actual leader } ps = append(ps, kmsg.MetadataResponseTopicPartition{ Partition: int32(i), Leader: leader, Replicas: cp, }) } return &kmsg.MetadataResponse{ Brokers: bs, Topics: []kmsg.MetadataResponseTopic{ { Topic: kmsg.StringPtr(topic), Partitions: ps, }, }, } } // applyPlan returns the final predictive assignments after applying the plan // to the given metadata snapshot (without mutating meta). func applyPlan(meta *kmsg.MetadataResponse, plan *Plan) map[int32][]int32 { final := map[int32][]int32{} for _, p := range meta.Topics[0].Partitions { final[p.Partition] = append([]int32(nil), p.Replicas...) } for _, ra := range plan.Reassignments { final[ra.Partition] = append([]int32(nil), ra.Replicas...) } nextID := int32(len(meta.Topics[0].Partitions)) for _, ca := range plan.CreateAssignments { final[nextID] = append([]int32(nil), ca.Replicas...) 
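// Staged creates receive sequential IDs after the existing partitions,
// mirroring how Kafka numbers partitions added via CreatePartitions.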
nextID++ } return final } func countLeaders(assigns map[int32][]int32) map[int32]int { m := map[int32]int{} for _, reps := range assigns { if len(reps) > 0 { m[reps[0]]++ } } return m } func assertNoDuplicates(t *testing.T, reps []int32) { t.Helper() seen := map[int32]struct{}{} for _, r := range reps { _, dup := seen[r] assert.Falsef(t, dup, "duplicate broker in replica set: %v", reps) seen[r] = struct{}{} } } func max(a, b int) int { if a > b { return a } return b } // --- tests ----------------------------------------------------------------- func TestPartitionPlanner_Plan(t *testing.T) { type tc struct { name string brokers map[int32]string // brokerID -> rack ("" for no rack) partitions [][]int32 // ordered replicas (index 0 = preferred leader) cfg EndToEndTopicConfig // uses ReplicationFactor & PartitionsPerBroker check func(t *testing.T, meta *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) } tests := []tc{ { name: "single broker creates one partition", brokers: map[int32]string{1: ""}, partitions: nil, // empty topic cfg: EndToEndTopicConfig{ ReplicationFactor: 1, PartitionsPerBroker: 1, }, check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) { require.NotNil(t, plan) assert.Equal(t, 1, len(plan.CreateAssignments), "should create exactly one partition") assert.Equal(t, 1, plan.FinalPartitionCount) for pid, reps := range final { assert.Lenf(t, reps, 1, "pid %d must have RF=1", pid) } leaders := countLeaders(final) assert.Equal(t, 1, leaders[1], "broker 1 should lead one partition") }, }, { name: "three brokers, no racks, RF grows to 3; ensure coverage and count", brokers: map[int32]string{ 1: "", 2: "", 3: "", }, // start with 2 partitions at RF=2 both led by broker with id 1 (skewed) partitions: [][]int32{ {1, 2}, {1, 2}, }, cfg: EndToEndTopicConfig{ ReplicationFactor: 3, PartitionsPerBroker: 1, }, check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) { assert.Equal(t, 3, plan.FinalPartitionCount, "desired should be max(cur=2, ppb*brokers=3)=3") for pid, reps := range final { assert.Lenf(t, reps, 3, "pid %d must have RF=3", pid) assertNoDuplicates(t, reps) } leaders := countLeaders(final) assert.GreaterOrEqual(t, leaders[1], 1) assert.GreaterOrEqual(t, leaders[2], 1) assert.GreaterOrEqual(t, leaders[3], 1) }, }, { name: "rack diversity improves on same-RF partition (2 racks, RF=2)", brokers: map[int32]string{ 1: "a", 2: "a", 3: "b", }, // single partition with both replicas on rack "a" -> re-pick to include "b" partitions: [][]int32{ {1, 2}, }, cfg: EndToEndTopicConfig{ ReplicationFactor: 2, PartitionsPerBroker: 1, // desired total becomes 3, but p0 should be improved first }, check: func(t *testing.T, meta *kmsg.MetadataResponse, _ *Plan, final map[int32][]int32) { reps := final[0] require.Len(t, reps, 2) assertNoDuplicates(t, reps) state := BuildState(meta) assert.False(t, violatesRackDiversity(reps, state.NumRacks, state.Brokers), "p0 should span both racks a/b") }, }, { name: "shrink RF preserves current leader", brokers: map[int32]string{ 1: "", 2: "", 3: "", }, // RF=3 currently, leader is 2; configured RF=2 -> leader must remain 2 partitions: [][]int32{ {2, 1, 3}, }, cfg: EndToEndTopicConfig{ ReplicationFactor: 2, PartitionsPerBroker: 1, }, check: func(t *testing.T, _ *kmsg.MetadataResponse, _ *Plan, final map[int32][]int32) { reps := final[0] require.Len(t, reps, 2) assert.Equal(t, int32(2), reps[0], "leader 2 should be preserved after shrink") assertNoDuplicates(t, reps) }, }, { name: 
"rotate-if-replica covers missing brokers without extra swaps", brokers: map[int32]string{ 1: "", 2: "", 3: "", }, // 2 partitions, both led by 1; 2 and 3 are replicas only in separate partitions. partitions: [][]int32{ {1, 2}, {1, 3}, }, cfg: EndToEndTopicConfig{ ReplicationFactor: 2, PartitionsPerBroker: 1, // desired total 3 -> one create expected }, check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) { assert.Equal(t, 1, len(plan.CreateAssignments), "one create due to partition target") leaders := countLeaders(final) assert.GreaterOrEqual(t, leaders[1], 1) assert.GreaterOrEqual(t, leaders[2], 1) assert.GreaterOrEqual(t, leaders[3], 1) for pid, reps := range final { assert.Lenf(t, reps, 2, "pid %d must have RF=2", pid) assertNoDuplicates(t, reps) } }, }, { name: "replace-duplicate-and-rotate when target broker is not a replica anywhere", brokers: map[int32]string{ 1: "a", 2: "a", 3: "b", }, // 2 partitions both {1,2}; broker 3 is nowhere; donors exist (1 leads 2), // and duplicates exist (rack "a" twice) so we can safely swap in 3 and rotate. partitions: [][]int32{ {1, 2}, {1, 2}, }, cfg: EndToEndTopicConfig{ ReplicationFactor: 2, PartitionsPerBroker: 1, // desired total 3; coverage for 3 should be via swap+rotate }, check: func(t *testing.T, _ *kmsg.MetadataResponse, _ *Plan, final map[int32][]int32) { leaders := countLeaders(final) assert.GreaterOrEqual(t, leaders[3], 1, "broker 3 should lead at least one partition") for pid, reps := range final { assert.Lenf(t, reps, 2, "pid %d RF=2", pid) assertNoDuplicates(t, reps) } }, }, { name: "even split leaders when scaling partitions (ppb=2)", brokers: map[int32]string{ 1: "", 2: "", 3: "", }, // Start with 3 partitions, each broker already leads one partitions: [][]int32{ {1, 2}, {2, 1}, {3, 1}, }, cfg: EndToEndTopicConfig{ ReplicationFactor: 2, PartitionsPerBroker: 2, // desired = 2 * 3 = 6 }, check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) { assert.Equal(t, 6, plan.FinalPartitionCount) leaders := countLeaders(final) assert.Equal(t, 2, leaders[1]) assert.Equal(t, 2, leaders[2]) assert.Equal(t, 2, leaders[3]) }, }, { name: "one defined rack for all brokers -> no diversity churn; no ops needed", brokers: map[int32]string{ 1: "a", 2: "a", 3: "a", }, // Already meets RF and coverage and desired count == 3 partitions: [][]int32{ {1, 2}, {2, 3}, {3, 1}, }, cfg: EndToEndTopicConfig{ ReplicationFactor: 2, PartitionsPerBroker: 1, }, check: func(t *testing.T, meta *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) { assert.Equal(t, 0, len(plan.Reassignments), "no reassignments expected") assert.Equal(t, 0, len(plan.CreateAssignments), "no creates expected") state := BuildState(meta) // Diversity can't be improved (NumRacks=1) for pid, reps := range final { assert.Falsef(t, violatesRackDiversity(reps, state.NumRacks, state.Brokers), "pid %d should not violate with NumRacks=1", pid) } }, }, { name: "broker present but leading nothing (restarting?) gets coverage (via swap or create)", brokers: map[int32]string{ 1: "a", 2: "b", 3: "c", }, // Two partitions led by 1 and 2; broker 3 has no replicas/leadership. 
partitions: [][]int32{ {1, 2}, {2, 1}, }, cfg: EndToEndTopicConfig{ ReplicationFactor: 2, PartitionsPerBroker: 1, // desired = max(2, 3)=3 -> at least one create or a swap+rotate + create for count }, check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) { assert.Equal(t, 3, plan.FinalPartitionCount) leaders := countLeaders(final) assert.GreaterOrEqual(t, leaders[1], 1) assert.GreaterOrEqual(t, leaders[2], 1) assert.GreaterOrEqual(t, leaders[3], 1) for pid, reps := range final { assert.Lenf(t, reps, 2, "pid %d RF=2", pid) assertNoDuplicates(t, reps) } }, }, { name: "phase 3 accounts for leaders created in phase 2 - no over-assignment", brokers: map[int32]string{ 1: "a", 2: "b", 3: "c", 4: "d", }, // Start with 2 partitions, both led by broker 1 // Brokers 2,3,4 have no leadership -> phase 2 will create partitions for them // Then phase 3 should NOT over-assign broker 1 when creating additional partitions partitions: [][]int32{ {1, 2}, // broker 1 leads {1, 3}, // broker 1 leads (over-represented) }, cfg: EndToEndTopicConfig{ ReplicationFactor: 2, PartitionsPerBroker: 2, // 4*2=8 total desired, currently have 2, so need 6 more }, check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) { // Should create 6 new partitions (3 for coverage in phase 2, 3 more for count in phase 3) assert.Equal(t, 8, plan.FinalPartitionCount, "should reach desired partition count of 8") assert.Equal(t, 6, len(plan.CreateAssignments), "should create 6 new partitions") leaders := countLeaders(final) // Each broker should lead exactly 2 partitions (8 total / 4 brokers = 2 each) // This test will fail if phase 3 doesn't account for leaders created in phase 2 // because it will see broker 1 as leading 2 partitions and think it needs 0 more, // while it actually leads 2 from existing + potentially more from phase 2 for brokerID := int32(1); brokerID <= 4; brokerID++ { assert.Equal(t, 2, leaders[brokerID], "broker %d should lead exactly 2 partitions (even distribution), but leads %d", brokerID, leaders[brokerID]) } }, }, { name: "uses actual leader not preferred leader when they differ", brokers: map[int32]string{ 1: "a", 2: "b", 3: "c", }, // This case simulates when the actual leader differs from preferred leader (replicas[0]) // We'll manually construct metadata where leader != replicas[0] partitions: [][]int32{ {1, 2, 3}, // preferred leader is 1, but we'll set actual leader to 2 in buildMetaWithLeader {2, 3, 1}, // preferred leader is 2, but we'll set actual leader to 3 in buildMetaWithLeader }, cfg: EndToEndTopicConfig{ ReplicationFactor: 3, PartitionsPerBroker: 1, // 3*1=3 total desired, have 2, need 1 more RebalancePartitions: true, }, check: func(t *testing.T, meta *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) { // With our manually set leaders (2, 3), broker 1 has no leadership // The planner should recognize this and either rotate leadership to broker 1 // or create a new partition led by broker 1 leaders := countLeaders(final) assert.GreaterOrEqual(t, leaders[1], 1, "broker 1 should lead at least one partition") assert.GreaterOrEqual(t, leaders[2], 1, "broker 2 should lead at least one partition") assert.GreaterOrEqual(t, leaders[3], 1, "broker 3 should lead at least one partition") }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { var meta *kmsg.MetadataResponse if tt.name == "uses actual leader not preferred leader when they differ" { // Special case: manually set leaders to differ from preferred leaders 
meta = buildMeta("probe", tt.brokers, tt.partitions) // Override the leaders: partition 0 actual leader = 2, partition 1 actual leader = 3 meta.Topics[0].Partitions[0].Leader = 2 // preferred is 1, actual is 2 meta.Topics[0].Partitions[1].Leader = 3 // preferred is 2, actual is 3 } else { meta = buildMeta("probe", tt.brokers, tt.partitions) } planner := NewPartitionPlanner(tt.cfg, zap.NewNop()) plan, err := planner.Plan(meta) require.NoError(t, err, "Plan() should not error") require.NotNil(t, plan, "Plan() returned nil plan") // Sanity: final >= max(current, ppb*brokers) expectedMin := max(len(meta.Topics[0].Partitions), tt.cfg.PartitionsPerBroker*len(tt.brokers)) assert.GreaterOrEqual(t, plan.FinalPartitionCount, expectedMin, "final partition count must meet lower bound") // Apply and enforce universal invariants. final := applyPlan(meta, plan) for pid, reps := range final { assert.Lenf(t, reps, tt.cfg.ReplicationFactor, "pid %d RF mismatch", pid) assertNoDuplicates(t, reps) } // Scenario-specific checks. tt.check(t, meta, plan, final) }) } } func TestPartitionPlanner_Plan_Deterministic(t *testing.T) { // Test that the same input produces identical plans across multiple runs brokers := map[int32]string{ 10: "rack1", 20: "rack2", 30: "rack3", 40: "rack1", 50: "rack2", 60: "rack3", } // Many partitions with suboptimal RF to force fixReplicationAndRack to iterate over map partitions := [][]int32{ {10}, {20}, {30}, {40}, {50}, {60}, // RF=1, needs growth to 3 {10, 20}, {20, 30}, {30, 40}, {40, 50}, // RF=2, needs growth to 3 {50, 60, 10}, {60, 10, 20}, {10, 30, 40}, // RF=3, may need rack fixes } cfg := EndToEndTopicConfig{ ReplicationFactor: 3, PartitionsPerBroker: 3, // 6*3=18 total desired, have 13, need 5 more } meta := buildMeta("probe", brokers, partitions) var plans []*Plan // Run the same plan many times to increase chance of hitting different map iteration orders for i := 0; i < 10; i++ { planner := NewPartitionPlanner(cfg, zap.NewNop()) plan, err := planner.Plan(meta) require.NoError(t, err, "Plan() should not error on run %d", i) require.NotNil(t, plan, "Plan() returned nil plan on run %d", i) plans = append(plans, plan) } // All plans should be identical firstPlan := plans[0] for i := 1; i < len(plans); i++ { assert.Equal(t, len(firstPlan.Reassignments), len(plans[i].Reassignments), "run %d: reassignment count should be identical", i) assert.Equal(t, len(firstPlan.CreateAssignments), len(plans[i].CreateAssignments), "run %d: create count should be identical", i) assert.Equal(t, firstPlan.FinalPartitionCount, plans[i].FinalPartitionCount, "run %d: final partition count should be identical", i) // Build maps for comparison reassign1 := make(map[int32][]int32) for _, r := range firstPlan.Reassignments { reassign1[r.Partition] = r.Replicas } reassign2 := make(map[int32][]int32) for _, r := range plans[i].Reassignments { reassign2[r.Partition] = r.Replicas } assert.Equal(t, reassign1, reassign2, "run %d: reassignments should be identical", i) // Create assignments order matters for determinism assert.Equal(t, firstPlan.CreateAssignments, plans[i].CreateAssignments, "run %d: create assignments should be identical", i) } } // TestActualLeaderCoverageSkipsPreferredRebalancing tests the fix for the bug where // ensureLeaderCoverage would trigger unnecessary reassignments when actual leader // coverage was perfect but preferred leader coverage was unbalanced. 
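//
// The guard under test sits at the top of ensureLeaderCoverage: when every
// broker already holds actual leadership of at least one partition, the phase
// returns early without staging any reassignments.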
func TestActualLeaderCoverageSkipsPreferredRebalancing(t *testing.T) { // Simulate the exact scenario from the bug report: // - All brokers in same rack (no rack diversity benefit possible) // - Actual leaders perfectly distributed: broker 0→p1, broker 1→p0, broker 2→p2 // - But preferred leaders (replicas[0]) unbalanced: broker 1→p0&p2, broker 0→p1, broker 2→none brokers := map[int32]string{ 0: "europe-west1-b", 1: "europe-west1-b", 2: "europe-west1-b", } // Build metadata with specific replica assignments matching the bug report meta := buildMeta("probe", brokers, [][]int32{ {1, 2, 0}, // partition 0: preferred leader = 1 {0, 1, 2}, // partition 1: preferred leader = 0 {1, 2, 0}, // partition 2: preferred leader = 1 }) // Override actual leaders to match the bug report scenario meta.Topics[0].Partitions[0].Leader = 1 // p0: preferred=1, actual=1 (same) meta.Topics[0].Partitions[1].Leader = 0 // p1: preferred=0, actual=0 (same) meta.Topics[0].Partitions[2].Leader = 2 // p2: preferred=1, actual=2 (DIFFERENT!) cfg := EndToEndTopicConfig{ ReplicationFactor: 3, PartitionsPerBroker: 1, // 3*1=3 total desired, have 3, perfect } planner := NewPartitionPlanner(cfg, zap.NewNop()) plan, err := planner.Plan(meta) require.NoError(t, err, "Plan() should not error") require.NotNil(t, plan, "Plan() returned nil plan") // This is the key assertion: should have ZERO reassignments because: // 1. All brokers in same rack → no rack diversity violations possible // 2. Actual leader coverage already perfect (each broker leads exactly 1 partition) // 3. RF and partition count already correct assert.Equal(t, 0, len(plan.Reassignments), "should have no reassignments when actual coverage is perfect") assert.Equal(t, 0, len(plan.CreateAssignments), "should have no creates when partition count is perfect") assert.Equal(t, 3, plan.FinalPartitionCount, "should maintain existing partition count") // Verify the fix: no changes to replica assignments final := applyPlan(meta, plan) assert.Equal(t, []int32{1, 2, 0}, final[0], "partition 0 replicas should be unchanged") assert.Equal(t, []int32{0, 1, 2}, final[1], "partition 1 replicas should be unchanged") assert.Equal(t, []int32{1, 2, 0}, final[2], "partition 2 replicas should be unchanged") // Verify all partitions still have correct RF and no duplicates for pid, reps := range final { assert.Lenf(t, reps, 3, "pid %d must have RF=3", pid) assertNoDuplicates(t, reps) } } // TestPlan_ToRequests_RebalancePartitions verifies that ToRequests omits explicit // replica assignments from the CreatePartitions request when RebalancePartitions // is false, but still sets the correct total Count. func TestPlan_ToRequests_RebalancePartitions(t *testing.T) { brokers := map[int32]string{1: "a", 2: "b", 3: "c", 4: "d", 5: "e", 6: "f"} // Topic currently has 3 partitions; 6 brokers → planner will want 6 total. 
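// (ComputeDesired: max(cur=3, ppb=1 * 6 brokers) = 6, so three creates are expected.)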
partitions := [][]int32{ {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, } meta := buildMeta("probe", brokers, partitions) t.Run("rebalancePartitions=true includes explicit assignments", func(t *testing.T) { cfg := EndToEndTopicConfig{ ReplicationFactor: 3, PartitionsPerBroker: 1, RebalancePartitions: true, } plan, err := NewPartitionPlanner(cfg, zap.NewNop()).Plan(meta) require.NoError(t, err) _, createReq := plan.ToRequests("probe", true) require.NotNil(t, createReq, "should have a CreatePartitions request") topic := createReq.Topics[0] assert.Equal(t, int32(plan.FinalPartitionCount), topic.Count) assert.NotEmpty(t, topic.Assignment, "assignments must be present when rebalancePartitions=true") assert.Equal(t, len(plan.CreateAssignments), len(topic.Assignment), "one assignment entry per new partition") }) t.Run("rebalancePartitions=false omits assignments", func(t *testing.T) { cfg := EndToEndTopicConfig{ ReplicationFactor: 3, PartitionsPerBroker: 1, RebalancePartitions: false, } plan, err := NewPartitionPlanner(cfg, zap.NewNop()).Plan(meta) require.NoError(t, err) // Planner should still compute create assignments (used for logging / count), // but ToRequests must NOT include them in the wire request. require.NotEmpty(t, plan.CreateAssignments, "planner should still compute assignments for count tracking") _, createReq := plan.ToRequests("probe", false) require.NotNil(t, createReq, "should still produce a CreatePartitions request") topic := createReq.Topics[0] assert.Equal(t, int32(plan.FinalPartitionCount), topic.Count, "Count must reflect the desired total even without explicit assignments") assert.Empty(t, topic.Assignment, "assignments must be absent when rebalancePartitions=false") }) t.Run("no creates needed produces nil create request regardless of flag", func(t *testing.T) { // Already-optimal topic: 3 brokers, 3 partitions, each broker leads one. optMeta := buildMeta("probe", map[int32]string{1: "", 2: "", 3: ""}, [][]int32{{1, 2, 3}, {2, 3, 1}, {3, 1, 2}}, ) for _, rebalance := range []bool{true, false} { cfg := EndToEndTopicConfig{ ReplicationFactor: 3, PartitionsPerBroker: 1, RebalancePartitions: rebalance, } plan, err := NewPartitionPlanner(cfg, zap.NewNop()).Plan(optMeta) require.NoError(t, err) assert.Empty(t, plan.CreateAssignments) _, createReq := plan.ToRequests("probe", rebalance) assert.Nil(t, createReq, "no CreatePartitions request when nothing to create (rebalance=%v)", rebalance) } }) } func TestMinimalReassignmentsWhenActualLeadersDivergeFromPreferred(t *testing.T) { // Scenario: all partitions have same preferred leader (broker 0), but actual // leaders are distributed. Algorithm should recognize brokers with actual // leadership and only fix gaps, not realign everything to preferred. 
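// Such divergence can occur in practice, e.g. after a rolling restart, when
// leadership fails over to another in-sync replica while replicas[0] still
// names the old preferred leader.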
brokers := map[int32]string{ 0: "rack-a", 1: "rack-b", 2: "rack-c", } meta := buildMeta("_redpanda_e2e_probe", brokers, [][]int32{ {0, 1, 2}, // partition 0: preferred leader = 0 {0, 1, 2}, // partition 1: preferred leader = 0 {0, 1, 2}, // partition 2: preferred leader = 0 }) // Set actual leaders to show divergence from preferred meta.Topics[0].Partitions[0].Leader = 0 // p0: preferred=0, actual=0 (match) meta.Topics[0].Partitions[1].Leader = 1 // p1: preferred=0, actual=1 (DIVERGED) meta.Topics[0].Partitions[2].Leader = 0 // p2: preferred=0, actual=0 (match) cfg := EndToEndTopicConfig{ ReplicationFactor: 3, PartitionsPerBroker: 1, } planner := NewPartitionPlanner(cfg, zap.NewNop()) plan, err := planner.Plan(meta) require.NoError(t, err) require.NotNil(t, plan) // Key assertion: should have ONLY ONE reassignment // Broker 1 already has actual leadership (p1), even though preferred leader of // p1 is broker 0. Only broker 2 is missing from actual leadership, so we only // need to fix that one gap. assert.Equal(t, 1, len(plan.Reassignments), "should need only ONE reassignment since broker 1 already has actual leadership") assert.Equal(t, 0, len(plan.CreateAssignments), "should have no creates") // Verify the reassignment gives broker 2 preferred leadership require.Len(t, plan.Reassignments, 1) reassignment := plan.Reassignments[0] assert.Equal(t, int32(2), reassignment.Replicas[0], "reassignment should give broker 2 preferred leadership") // Verify all partitions still have correct RF and no duplicates final := applyPlan(meta, plan) for pid, reps := range final { assert.Lenf(t, reps, 3, "pid %d must have RF=3", pid) assertNoDuplicates(t, reps) } } ================================================ FILE: e2e/producer.go ================================================ package e2e import ( "context" "encoding/json" "strconv" "time" "github.com/google/uuid" "github.com/twmb/franz-go/pkg/kgo" "go.uber.org/zap" ) // produceMessagesToAllPartitions sends an EndToEndMessage to every partition on the given topic func (s *Service) produceMessagesToAllPartitions(ctx context.Context) { for i := 0; i < s.partitionCount; i++ { s.produceMessage(ctx, i) } } // produceMessage produces an end to end record to a single given partition. If it succeeds producing the record // it will add it to the message tracker. If producing fails a message will be logged and the respective metrics // will be incremented. func (s *Service) produceMessage(ctx context.Context, partition int) { topicName := s.config.TopicManagement.Name record, msg := createEndToEndRecord(s.minionID, topicName, partition) startTime := time.Now() // This childCtx will ensure that we will abort our efforts to produce (including retries) when we exceed // the SLA for producers. childCtx, cancel := context.WithTimeout(ctx, s.config.Producer.AckSla+2*time.Second) pID := strconv.Itoa(partition) s.messagesProducedInFlight.WithLabelValues(pID).Inc() s.messageTracker.addToTracker(msg) s.client.TryProduce(childCtx, record, func(r *kgo.Record, err error) { defer cancel() ackDuration := time.Since(startTime) s.messagesProducedInFlight.WithLabelValues(pID).Dec() s.messagesProducedTotal.WithLabelValues(pID).Inc() // We add 0 in order to ensure that the "failed" metric series for that partition id are initialized as well. 
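// A child series of a prometheus *Vec is only exported once it has been
// touched; Add(0) materializes the series at an explicit zero so dashboards
// and rate() queries see a value instead of a missing series.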
s.messagesProducedFailed.WithLabelValues(pID).Add(0) s.lostMessages.WithLabelValues(pID).Add(0) if err != nil { s.messagesProducedFailed.WithLabelValues(pID).Inc() _ = s.messageTracker.removeFromTracker(msg.MessageID) s.logger.Info("failed to produce message to end-to-end topic", zap.String("topic_name", r.Topic), zap.Int32("partition", r.Partition), zap.Error(err)) return } else { // Update the message's state. If this message expires and is marked as successfully produced we will // report this as a lost message, which would indicate that the producer was told that the message got // produced successfully, but it got lost somewhere. // We need to use updateItemIfExists() because it's possible that the message has already been consumed // before we have received the message here (because we were awaiting the produce ack). msg.state = EndToEndMessageStateProducedSuccessfully msg.produceLatency = ackDuration.Seconds() // TODO: Enable again as soon as https://github.com/ReneKroon/ttlcache/issues/60 is fixed // Because we cannot update cache items in an atomic fashion we currently can't use this method // as this would cause a race condition which ends up in records being reported as lost/expired. // s.messageTracker.updateItemIfExists(msg) } s.produceLatency.WithLabelValues(pID).Observe(ackDuration.Seconds()) }) } func createEndToEndRecord(minionID string, topicName string, partition int) (*kgo.Record, *EndToEndMessage) { message := &EndToEndMessage{ MinionID: minionID, MessageID: uuid.NewString(), Timestamp: time.Now().UnixNano(), partition: partition, state: EndToEndMessageStateCreated, } mjson, err := json.Marshal(message) if err != nil { // Should never happen since the struct is so simple, // but if it does, something is completely broken anyway panic("cannot serialize EndToEndMessage") } record := &kgo.Record{ Topic: topicName, Value: mjson, Partition: int32(partition), // we set partition for producing so our customPartitioner can make use of it } return record, message } ================================================ FILE: e2e/service.go ================================================ package e2e import ( "context" "fmt" "strings" "time" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "github.com/twmb/franz-go/pkg/kgo" "go.uber.org/zap" "github.com/cloudhut/kminion/v2/kafka" ) type Service struct { // General config Config logger *zap.Logger kafkaSvc *kafka.Service // creates kafka client for us client *kgo.Client // Service minionID string // unique identifier, reported in metrics, in case multiple instances run at the same time groupId string // our own consumer group groupTracker *groupTracker // tracks consumer groups starting with the kminion prefix and deletes them if they are unused for some time messageTracker *messageTracker // tracks successfully produced messages, clientHooks *clientHooks // logs broker events, tracks the coordinator (i.e. 
which broker last responded to our offset commit) partitionCount int // number of partitions of our test topic, used to send messages to all partitions // Metrics messagesProducedInFlight *prometheus.GaugeVec messagesProducedTotal *prometheus.CounterVec messagesProducedFailed *prometheus.CounterVec messagesReceived *prometheus.CounterVec offsetCommitsTotal *prometheus.CounterVec offsetCommitsFailedTotal *prometheus.CounterVec lostMessages *prometheus.CounterVec produceLatency *prometheus.HistogramVec roundtripLatency *prometheus.HistogramVec offsetCommitLatency *prometheus.HistogramVec } // NewService creates a new instance of the e2e monitoring service (wow) func NewService(ctx context.Context, cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, promRegisterer prometheus.Registerer) (*Service, error) { minionID := uuid.NewString() groupID := fmt.Sprintf("%v-%v", cfg.Consumer.GroupIdPrefix, minionID) // Producer options kgoOpts := []kgo.Opt{ kgo.ProduceRequestTimeout(3 * time.Second), kgo.RecordRetries(3), // We use the manual partitioner so that the records' partition id will be used as target partition kgo.RecordPartitioner(kgo.ManualPartitioner()), } if cfg.Producer.RequiredAcks == "all" { kgoOpts = append(kgoOpts, kgo.RequiredAcks(kgo.AllISRAcks())) } else { kgoOpts = append(kgoOpts, kgo.RequiredAcks(kgo.LeaderAck())) kgoOpts = append(kgoOpts, kgo.DisableIdempotentWrite()) } // Consumer configs kgoOpts = append(kgoOpts, kgo.ConsumerGroup(groupID), kgo.ConsumeTopics(cfg.TopicManagement.Name), kgo.Balancers(kgo.CooperativeStickyBalancer()), kgo.DisableAutoCommit(), kgo.ConsumeResetOffset(kgo.NewOffset().AtEnd()), ) // Prepare hooks hooks := newEndToEndClientHooks(logger) kgoOpts = append(kgoOpts, kgo.WithHooks(hooks)) // Create kafka service and check if client can successfully connect to Kafka cluster logger.Info("connecting to Kafka seed brokers, trying to fetch cluster metadata", zap.String("seed_brokers", strings.Join(kafkaSvc.Brokers(), ","))) client, err := kafkaSvc.CreateAndTestClient(ctx, logger, kgoOpts) if err != nil { return nil, fmt.Errorf("failed to create kafka client for e2e: %w", err) } logger.Info("successfully connected to kafka cluster") svc := &Service{ config: cfg, logger: logger.Named("e2e"), kafkaSvc: kafkaSvc, client: client, minionID: minionID, groupId: groupID, clientHooks: hooks, } svc.groupTracker = newGroupTracker(cfg, logger, client, groupID) svc.messageTracker = newMessageTracker(svc) makeCounterVec := func(name string, labelNames []string, help string) *prometheus.CounterVec { cv := prometheus.NewCounterVec(prometheus.CounterOpts{ Subsystem: "end_to_end", Name: name, Help: help, }, labelNames) promRegisterer.MustRegister(cv) return cv } makeGaugeVec := func(name string, labelNames []string, help string) *prometheus.GaugeVec { gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Subsystem: "end_to_end", Name: name, Help: help, }, labelNames) promRegisterer.MustRegister(gv) return gv } makeHistogramVec := func(name string, maxLatency time.Duration, labelNames []string, help string) *prometheus.HistogramVec { hv := prometheus.NewHistogramVec(prometheus.HistogramOpts{ Subsystem: "end_to_end", Name: name, Help: help, Buckets: createHistogramBuckets(maxLatency), }, labelNames) promRegisterer.MustRegister(hv) return hv } // Low-level info // Users can construct alerts like "can't produce messages" themselves from those svc.messagesProducedInFlight = makeGaugeVec("messages_produced_in_flight", []string{"partition_id"}, "Number of messages that kminion's 
end-to-end test produced but has not received an answer for yet") svc.messagesProducedTotal = makeCounterVec("messages_produced_total", []string{"partition_id"}, "Number of all messages produced to Kafka. This counter will be incremented when we receive a response (failure/timeout or success) from Kafka") svc.messagesProducedFailed = makeCounterVec("messages_produced_failed_total", []string{"partition_id"}, "Number of messages failed to produce to Kafka because of a timeout or failure") svc.messagesReceived = makeCounterVec("messages_received_total", []string{"partition_id"}, "Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. Kminion only considers a message a match if it arrives within the configured roundtrip SLA (and it matches the minionID)") svc.offsetCommitsTotal = makeCounterVec("offset_commits_total", []string{"coordinator_id"}, "Counts how many times kminion's end-to-end test has committed offsets") svc.offsetCommitsFailedTotal = makeCounterVec("offset_commits_failed_total", []string{"coordinator_id", "reason"}, "Number of offset commits that returned an error or timed out") svc.lostMessages = makeCounterVec("messages_lost_total", []string{"partition_id"}, "Number of messages that have been produced successfully but not received within the configured SLA duration") // Latency Histograms // More detailed info about how long stuff took // Since histograms also have an 'infinite' bucket, they can be used to detect small hiccups and "lost" messages svc.produceLatency = makeHistogramVec("produce_latency_seconds", cfg.Producer.AckSla, []string{"partition_id"}, "Time until we received an ack for a produced message") svc.roundtripLatency = makeHistogramVec("roundtrip_latency_seconds", cfg.Consumer.RoundtripSla, []string{"partition_id"}, "Time it took between sending (producing) and receiving (consuming) a message") svc.offsetCommitLatency = makeHistogramVec("offset_commit_latency_seconds", cfg.Consumer.CommitSla, []string{"coordinator_id"}, "Time Kafka took to respond to kminion's offset commit") return svc, nil } // Start starts the service (wow) func (s *Service) Start(ctx context.Context) error { // Ensure topic exists and is configured correctly if err := s.validateManagementTopic(ctx); err != nil { return fmt.Errorf("could not validate end-to-end topic: %w", err) } // finally start everything else (producing, consuming, continuous validation, consumer group tracking) go s.startReconciliation(ctx) // Start consumer and wait until we've received a response for the first poll // which would indicate that the consumer is ready. Only if the consumer is // ready do we want to start the e2e producer, to ensure that we will not miss // messages because the consumer wasn't ready. However, if this initialization // does not succeed within 30s we have to assume that something is wrong on the // consuming or producing side. KMinion is supposed to report these kinds of // issues and therefore this should not block KMinion from starting. initCh := make(chan bool, 1) s.logger.Info("initializing consumer and waiting until it has received the first record batch") go s.startConsumeMessages(ctx, initCh) // Produce an init message until the consumer has received at least one fetch initTicker := time.NewTicker(1 * time.Second) isInitialized := false // We send a first message immediately, but we'll keep sending more messages later // since the consumers start at the latest offset and may have missed this message.
initCtx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() s.sendInitMessage(initCtx, s.client, s.config.TopicManagement.Name) for !isInitialized { select { case <-initTicker.C: s.sendInitMessage(initCtx, s.client, s.config.TopicManagement.Name) case <-initCh: isInitialized = true s.logger.Info("consumer has been successfully initialized") case <-initCtx.Done(): // At this point we just assume the consumers are running fine. // The entire cluster may be down or producing fails. s.logger.Warn("initializing the consumers timed out, proceeding with the startup") isInitialized = true case <-ctx.Done(): return nil } } go s.startOffsetCommits(ctx) go s.startProducer(ctx) // keep track of groups, delete old unused groups if s.config.Consumer.DeleteStaleConsumerGroups { go s.groupTracker.start(ctx) } return nil } func (s *Service) sendInitMessage(ctx context.Context, client *kgo.Client, topicName string) { // Try to produce one record into each partition. This is important because // one or more partitions may be offline, while others may still be writable. for i := 0; i < s.partitionCount; i++ { client.TryProduce(ctx, &kgo.Record{ Key: []byte("init-message"), Value: nil, Topic: topicName, Partition: int32(i), }, nil) } } func (s *Service) startReconciliation(ctx context.Context) { if !s.config.TopicManagement.Enabled { return } validateTopicTicker := time.NewTicker(s.config.TopicManagement.ReconciliationInterval) for { select { case <-ctx.Done(): return case <-validateTopicTicker.C: err := s.validateManagementTopic(ctx) if err != nil { s.logger.Error("failed to validate end-to-end topic", zap.Error(err)) } } } } func (s *Service) startProducer(ctx context.Context) { produceTicker := time.NewTicker(s.config.ProbeInterval) for { select { case <-ctx.Done(): return case <-produceTicker.C: s.produceMessagesToAllPartitions(ctx) } } } func (s *Service) startOffsetCommits(ctx context.Context) { commitTicker := time.NewTicker(5 * time.Second) for { select { case <-ctx.Done(): return case <-commitTicker.C: s.commitOffsets(ctx) } } } ================================================ FILE: e2e/topic.go ================================================ package e2e import ( "context" "errors" "fmt" "sort" "time" "github.com/twmb/franz-go/pkg/kerr" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" ) // Check our end-to-end test topic and adapt accordingly if something does not match our expectations. // - does it exist? // // - is it configured correctly? // - does it have enough partitions? // - is the replicationFactor correct? // // - are assignments good? // - is each broker leading at least one partition? // - are replicas distributed correctly? func (s *Service) validateManagementTopic(ctx context.Context) error { s.logger.Debug("validating end-to-end topic...") meta, err := s.getTopicMetadata(ctx) if err != nil { return fmt.Errorf("validateManagementTopic cannot get metadata of e2e topic: %w", err) } typedErr := kerr.TypedErrorForCode(meta.Topics[0].ErrorCode) topicExists := false switch { case typedErr == nil: topicExists = true case errors.Is(typedErr, kerr.UnknownTopicOrPartition): // UnknownTopicOrPartition (Error code 3) means that the topic does not exist. // When the topic doesn't exist, continue to create it further down in the code. 
topicExists = false default: // If the topic (possibly) exists, but there's an error, then this should result in a failure. // Note that err is nil at this point (the metadata request itself succeeded), so we wrap the typed topic error. return fmt.Errorf("failed to get metadata for end-to-end topic: %w", typedErr) } // Create topic if it doesn't exist if !topicExists { if !s.config.TopicManagement.Enabled { return fmt.Errorf("the configured end-to-end topic does not exist. The topic will not be created " + "because topic management is disabled") } if err = s.createManagementTopic(ctx, meta); err != nil { return err } // Topic was just created with optimal assignments from the partition planner. // We can skip the validation/planning phase since the topic already has the correct // partition count and optimal replica assignments. We only need to update our // internal partition count tracking for KMinion's e2e monitoring operations. return s.updatePartitionCount(ctx) } // If topic management is disabled, skip validation and alteration of the existing topic. // This allows kminion to work on managed Kafka platforms (e.g., Confluent Cloud) that // block partition reassignment operations. if !s.config.TopicManagement.Enabled { topicMeta := meta.Topics[0] brokerIDs := make([]int32, len(meta.Brokers)) for i, broker := range meta.Brokers { brokerIDs[i] = broker.NodeID } s.logger.Info("topic management is disabled, skipping validation and alteration of existing topic", zap.String("topic", s.config.TopicManagement.Name), zap.Int("current_partitions", len(topicMeta.Partitions)), zap.Int("replication_factor", len(topicMeta.Partitions[0].Replicas))) // Log warnings if the topic configuration differs from expectations expectedPartitions := s.config.TopicManagement.PartitionsPerBroker * len(brokerIDs) if len(topicMeta.Partitions) != expectedPartitions { s.logger.Warn("topic partition count differs from expected configuration", zap.Int("current_partitions", len(topicMeta.Partitions)), zap.Int("expected_partitions", expectedPartitions), zap.Int("brokers", len(brokerIDs)), zap.Int("partitions_per_broker_config", s.config.TopicManagement.PartitionsPerBroker), zap.String("reason", "topic management is disabled, will not alter")) } // Check if each broker is leading at least one partition leaderCounts := make(map[int32]int) for _, partition := range topicMeta.Partitions { leaderCounts[partition.Leader]++ } brokersWithoutLeader := []int32{} for _, brokerID := range brokerIDs { if leaderCounts[brokerID] == 0 { brokersWithoutLeader = append(brokersWithoutLeader, brokerID) } } if len(brokersWithoutLeader) > 0 { s.logger.Warn("some brokers are not leading any partitions on the e2e topic", zap.Int32s("brokers_without_leader", brokersWithoutLeader), zap.String("reason", "topic management is disabled, will not alter"), zap.String("impact", "end-to-end monitoring may not cover all brokers")) } return s.updatePartitionCount(ctx) } // Topic already exists - use partition planner to validate and potentially fix assignments planner := NewPartitionPlanner(s.config.TopicManagement, s.logger) plan, err := planner.Plan(meta) if err != nil { return fmt.Errorf("failed to create partition plan: %w", err) } // Convert the plan to Kafka requests topicName := pointerStrToStr(meta.Topics[0].Topic) alterReq, createReq := plan.ToRequests(topicName, s.config.TopicManagement.RebalancePartitions) if s.config.TopicManagement.RebalancePartitions { if len(plan.Reassignments) > 0 { s.logPlannedReassignments(meta, plan, topicName) } err = s.executeAlterPartitionAssignments(ctx, alterReq) if err != nil { return fmt.Errorf("failed to alter partition assignments: 
%w", err) } } else if len(plan.Reassignments) > 0 { s.logger.Info("skipping partition reassignment because rebalancePartitions is disabled", zap.String("topic", topicName), zap.Int("skipped_reassignments", len(plan.Reassignments)), ) } if len(plan.CreateAssignments) > 0 { s.logPlannedCreations(meta, plan, topicName, s.config.TopicManagement.RebalancePartitions) } err = s.executeCreatePartitions(ctx, createReq) if err != nil { return fmt.Errorf("failed to create partitions: %w", err) } return s.updatePartitionCount(ctx) } // updatePartitionCount retrieves metadata to inform kminion about the updated // partition count of its e2e topic. It must be updated after topic validation // because the validation process may lead to the creation of new partitions. // This can occur when new brokers are added to the cluster. func (s *Service) updatePartitionCount(ctx context.Context) error { retryTicker := time.NewTicker(1 * time.Second) defer retryTicker.Stop() for { select { case <-ctx.Done(): return ctx.Err() case <-retryTicker.C: meta, err := s.getTopicMetadata(ctx) if err != nil { return fmt.Errorf("could not get topic metadata while updating partition count: %w", err) } typedErr := kerr.TypedErrorForCode(meta.Topics[0].ErrorCode) if typedErr == nil { s.partitionCount = len(meta.Topics[0].Partitions) s.logger.Debug("updatePartitionCount: successfully updated partition count", zap.Int("partition_count", s.partitionCount)) return nil } if !errors.Is(typedErr, kerr.UnknownTopicOrPartition) { return fmt.Errorf("unexpected error while updating partition count: %w", typedErr) } s.logger.Warn("updatePartitionCount: received UNKNOWN_TOPIC_OR_PARTITION error, possibly due to timing issue. Retrying...") // The UNKNOWN_TOPIC_OR_PARTITION error occurs occasionally even though the topic is created // in the validateManagementTopic function. It appears to be a timing issue where the topic metadata // is not immediately available after creation. In practice, waiting for a short period and then retrying // the operation resolves the issue. } } } func (s *Service) executeCreatePartitions(ctx context.Context, req *kmsg.CreatePartitionsRequest) error { if req == nil { return nil } res, err := req.RequestWith(ctx, s.client) if err != nil { return err } for _, topic := range res.Topics { err := kerr.ErrorForCode(topic.ErrorCode) if err != nil { return fmt.Errorf("inner Kafka error: %w", err) } } return nil } func (s *Service) executeAlterPartitionAssignments(ctx context.Context, req *kmsg.AlterPartitionAssignmentsRequest) error { if req == nil { return nil } res, err := req.RequestWith(ctx, s.client) if err != nil { return err } typedErr := kerr.TypedErrorForCode(res.ErrorCode) if typedErr != nil { s.logger.Error("alter partition assignments: failed to alter partition assignments", zap.Any("request_topics", req.Topics)) return fmt.Errorf("inner Kafka error: %w", typedErr) } for _, topic := range res.Topics { for _, partition := range topic.Partitions { typedErr = kerr.TypedErrorForCode(partition.ErrorCode) if typedErr != nil { return fmt.Errorf("inner Kafka partition error on partition '%v': %w", partition.Partition, typedErr) } } } return nil } // logPlannedReassignments logs current partition state and planned reassignment details. 
func (s *Service) logPlannedReassignments(meta *kmsg.MetadataResponse, plan *Plan, topicName string) { topicMeta := meta.Topics[0] // Log current partition state s.logger.Info("current partition assignments for e2e topic", zap.String("topic", topicName), zap.Int("current_partitions", len(topicMeta.Partitions)), zap.Int("brokers_available", len(meta.Brokers)), ) // Log each current partition assignment (sorted by partition ID) sortedPartitions := make([]kmsg.MetadataResponseTopicPartition, len(topicMeta.Partitions)) copy(sortedPartitions, topicMeta.Partitions) sort.Slice(sortedPartitions, func(i, j int) bool { return sortedPartitions[i].Partition < sortedPartitions[j].Partition }) for _, partition := range sortedPartitions { s.logger.Info("current partition assignment", zap.String("topic", topicName), zap.Int32("partition", partition.Partition), zap.Int32s("replicas", partition.Replicas), zap.Int32("leader", partition.Leader), ) } s.logger.Info("planned partition reassignments", zap.String("topic", topicName), zap.Int("reassignment_count", len(plan.Reassignments)), ) // Sort reassignments by partition ID for consistent logging sortedReassignments := make([]Reassignment, len(plan.Reassignments)) copy(sortedReassignments, plan.Reassignments) sort.Slice(sortedReassignments, func(i, j int) bool { return sortedReassignments[i].Partition < sortedReassignments[j].Partition }) for _, reassignment := range sortedReassignments { // Find current assignment for this partition var currentReplicas []int32 var currentLeader int32 = -1 for _, partition := range topicMeta.Partitions { if partition.Partition == reassignment.Partition { currentReplicas = partition.Replicas currentLeader = partition.Leader break } } s.logger.Info("partition reassignment", zap.String("topic", topicName), zap.Int32("partition", reassignment.Partition), zap.Int32s("current_replicas", currentReplicas), zap.Int32s("new_replicas", reassignment.Replicas), zap.Int32("current_leader", currentLeader), zap.Int32("new_leader", reassignment.Replicas[0]), ) } } // logPlannedCreations logs planned partition creation details. 
func (s *Service) logPlannedCreations(meta *kmsg.MetadataResponse, plan *Plan, topicName string, rebalancePartitions bool) { topicMeta := meta.Topics[0] s.logger.Info("planned partition creations", zap.String("topic", topicName), zap.Int("creation_count", len(plan.CreateAssignments)), zap.Int("current_partitions", len(topicMeta.Partitions)), zap.Int("final_partitions", plan.FinalPartitionCount), ) if !rebalancePartitions { s.logger.Info("partition assignments will be auto-placed by broker (rebalancePartitions is disabled)", zap.String("topic", topicName), ) return } nextPartitionID := int32(len(topicMeta.Partitions)) for i, creation := range plan.CreateAssignments { s.logger.Info("new partition creation", zap.String("topic", topicName), zap.Int32("new_partition", nextPartitionID+int32(i)), zap.Int32s("replicas", creation.Replicas), zap.Int32("leader", creation.Replicas[0]), ) } } func (s *Service) createManagementTopic(ctx context.Context, allMeta *kmsg.MetadataResponse) error { topicCfg := s.config.TopicManagement brokerCount := len(allMeta.Brokers) totalPartitions := brokerCount * topicCfg.PartitionsPerBroker s.logger.Info("e2e topic does not exist, creating it...", zap.String("topic_name", topicCfg.Name), zap.Int("partitions_per_broker", topicCfg.PartitionsPerBroker), zap.Int("replication_factor", topicCfg.ReplicationFactor), zap.Int("broker_count", brokerCount), zap.Int("total_partitions", totalPartitions), ) // Use partition planner to determine optimal assignments for the new topic. // The metadata already contains broker info, and since the topic doesn't exist, // meta.Topics[0].Partitions will be empty, which is exactly what we want. planner := NewPartitionPlanner(topicCfg, s.logger) plan, err := planner.Plan(allMeta) if err != nil { return fmt.Errorf("failed to create partition plan for new topic: %w", err) } // Create topic with specific replica assignments from the planner topic := kmsg.NewCreateTopicsRequestTopic() topic.Topic = topicCfg.Name topic.NumPartitions = -1 // Must be -1 when using ReplicaAssignment topic.ReplicationFactor = -1 // Must be -1 when using ReplicaAssignment topic.Configs = createTopicConfig(topicCfg) // Convert planner's CreateAssignments to Kafka's ReplicaAssignment format for i, assignment := range plan.CreateAssignments { replica := kmsg.NewCreateTopicsRequestTopicReplicaAssignment() replica.Partition = int32(i) replica.Replicas = append([]int32(nil), assignment.Replicas...) 
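// Note: append([]int32(nil), assignment.Replicas...) above copies the planner's
// replica slice, so the request owns its own backing array instead of aliasing the
// planner's internal state.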
topic.ReplicaAssignment = append(topic.ReplicaAssignment, replica) } req := kmsg.NewCreateTopicsRequest() req.Topics = []kmsg.CreateTopicsRequestTopic{topic} res, err := req.RequestWith(ctx, s.client) if err != nil { return fmt.Errorf("failed to create e2e topic: %w", err) } if len(res.Topics) > 0 { err := kerr.ErrorForCode(res.Topics[0].ErrorCode) if err != nil { return fmt.Errorf("failed to create e2e topic: %w", err) } } return nil } func (s *Service) getTopicMetadata(ctx context.Context) (*kmsg.MetadataResponse, error) { topicReq := kmsg.NewMetadataRequestTopic() topicName := s.config.TopicManagement.Name topicReq.Topic = &topicName req := kmsg.NewMetadataRequest() req.Topics = []kmsg.MetadataRequestTopic{topicReq} return req.RequestWith(ctx, s.client) } func (s *Service) getTopicsConfigs(ctx context.Context, configNames []string) (*kmsg.DescribeConfigsResponse, error) { req := kmsg.NewDescribeConfigsRequest() req.IncludeDocumentation = false req.IncludeSynonyms = false req.Resources = []kmsg.DescribeConfigsRequestResource{ { ResourceType: kmsg.ConfigResourceTypeTopic, ResourceName: s.config.TopicManagement.Name, ConfigNames: configNames, }, } return req.RequestWith(ctx, s.client) } func createTopicConfig(cfgTopic EndToEndTopicConfig) []kmsg.CreateTopicsRequestTopicConfig { topicConfig := func(name string, value interface{}) kmsg.CreateTopicsRequestTopicConfig { prop := kmsg.NewCreateTopicsRequestTopicConfig() prop.Name = name valStr := fmt.Sprintf("%v", value) prop.Value = &valStr return prop } minISR := 1 if cfgTopic.ReplicationFactor >= 3 { // Only with 3+ replicas does it make sense to require acks from 2 brokers // todo: think about whether we should change how 'producer.requiredAcks' works. // we probably don't even need this configured on the topic directly... minISR = 2 } // Even though kminion's end-to-end feature does not require any // real persistence beyond a few minutes, it might be good to keep messages // around a bit for debugging. return []kmsg.CreateTopicsRequestTopicConfig{ topicConfig("cleanup.policy", "delete"), topicConfig("segment.ms", (time.Hour * 12).Milliseconds()), // new segment every 12h topicConfig("retention.ms", (time.Hour * 24).Milliseconds()), // discard segments older than 24h topicConfig("min.insync.replicas", minISR), } } ================================================ FILE: e2e/utils.go ================================================ package e2e import ( "context" "math" "time" "github.com/prometheus/client_golang/prometheus" "github.com/twmb/franz-go/pkg/kerr" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" )
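// Worked example of the bucket math in createHistogramBuckets below, for a
// hypothetical 1s SLA: maxLatency = 1s gives math.Logb(1000/10) = 6 and
// count = 6+3 = 9, so prometheus.ExponentialBuckets(0.005, 2, 9) yields buckets at
// 0.005, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64 and 1.28 seconds. The largest
// explicit bucket just exceeds the SLA, and Prometheus adds an implicit +Inf bucket.

// createHistogramBuckets creates the buckets for the histogram; the bucket count is
// derived from the given upper bucket size (the configured SLA).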
func createHistogramBuckets(maxLatency time.Duration) []float64 { // Buckets grow exponentially with base 2, starting at 5ms (0.005s). The count is chosen so that // the largest explicit bucket just exceeds maxLatency: Logb (the binary log) of maxLatency-in-ms/10 // is roughly the number of doublings needed from the 20ms bucket, +2 covers the 5ms and 10ms buckets // below the 20ms base, and another +1 compensates for the integer truncation of the log. latencyCount := math.Logb(float64(maxLatency.Milliseconds() / 10)) count := int(latencyCount) + 3 bucket := prometheus.ExponentialBuckets(0.005, 2, count) return bucket } func containsStr(ar []string, x string) (bool, int) { for i, item := range ar { if item == x { return true, i } } return false, -1 } // logCommitErrors logs all errors in the commit response and returns a well-formatted error code if there was one func (s *Service) logCommitErrors(r *kmsg.OffsetCommitResponse, err error) string { if err != nil { if err == context.DeadlineExceeded { s.logger.Warn("offset commit failed because SLA has been exceeded") return "OFFSET_COMMIT_SLA_EXCEEDED" } s.logger.Warn("offset commit failed", zap.Error(err)) return "RESPONSE_ERROR" } lastErrCode := "" for _, t := range r.Topics { for _, p := range t.Partitions { typedErr := kerr.TypedErrorForCode(p.ErrorCode) if typedErr == nil { continue } s.logger.Warn("error committing partition offset", zap.String("topic", t.Topic), zap.Int32("partition_id", p.Partition), zap.Error(typedErr), ) lastErrCode = typedErr.Message } } return lastErrCode } // brokerMetadataByBrokerID returns a map of all broker metadata keyed by their BrokerID func brokerMetadataByBrokerID(meta []kmsg.MetadataResponseBroker) map[int32]kmsg.MetadataResponseBroker { res := make(map[int32]kmsg.MetadataResponseBroker) for _, broker := range meta { res[broker.NodeID] = broker } return res } // brokerMetadataByRackID returns a map of all broker metadata keyed by their Rack identifier func brokerMetadataByRackID(meta []kmsg.MetadataResponseBroker) map[string][]kmsg.MetadataResponseBroker { res := make(map[string][]kmsg.MetadataResponseBroker) for _, broker := range meta { rackID := "" if broker.Rack != nil { rackID = *broker.Rack } res[rackID] = append(res[rackID], broker) } return res } func pointerStrToStr(str *string) string { if str == nil { return "" } return *str } func safeUnwrap(err error) string { if err == nil { return "" } return err.Error() } func isInArray(num int16, arr []int16) bool { for _, n := range arr { if num == n { return true } } return false } ================================================ FILE: go.mod ================================================ module github.com/cloudhut/kminion/v2 go 1.26 require ( github.com/google/uuid v1.6.0 github.com/jcmturner/gokrb5/v8 v8.4.4 github.com/jellydator/ttlcache/v2 v2.11.1 github.com/knadh/koanf v1.5.0 github.com/mitchellh/mapstructure v1.5.0 github.com/orcaman/concurrent-map v1.0.0 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.23.2 github.com/stretchr/testify v1.11.1 github.com/twmb/franz-go v1.20.6 github.com/twmb/franz-go/pkg/kadm v1.17.1 github.com/twmb/franz-go/pkg/kmsg v1.12.0 github.com/twmb/franz-go/pkg/sasl/kerberos v1.1.0 go.uber.org/atomic v1.11.0 go.uber.org/zap v1.27.1 golang.org/x/sync v0.19.0 ) require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/hashicorp/go-uuid
v1.0.3 // indirect github.com/jcmturner/aescts/v2 v2.0.0 // indirect github.com/jcmturner/dnsutils/v2 v2.0.0 // indirect github.com/jcmturner/gofork v1.7.6 // indirect github.com/jcmturner/rpc/v2 v2.0.3 // indirect github.com/klauspost/compress v1.18.3 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml v1.9.1 // indirect github.com/pierrec/lz4/v4 v4.1.25 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.19.2 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect golang.org/x/crypto v0.47.0 // indirect golang.org/x/net v0.49.0 // indirect golang.org/x/sys v0.40.0 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) ================================================ FILE: go.sum ================================================ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/aws/aws-sdk-go-v2 v1.9.2/go.mod h1:cK/D0BBs0b/oWPIcX/Z/obahJK1TT7IPVjy53i/mX/4= github.com/aws/aws-sdk-go-v2/config v1.8.3/go.mod h1:4AEiLtAb8kLs7vgw2ZV3p2VZ1+hBavOc84hqxVNpCyw= github.com/aws/aws-sdk-go-v2/credentials v1.4.3/go.mod h1:FNNC6nQZQUuyhq5aE5c7ata8o9e4ECGmS4lAXC7o1mQ= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.6.0/go.mod h1:gqlclDEZp4aqJOancXK6TN24aKhT0W0Ae9MHk3wzTMM= github.com/aws/aws-sdk-go-v2/internal/ini v1.2.4/go.mod h1:ZcBrrI3zBKlhGFNYWvju0I3TR93I7YIgAfy82Fh4lcQ= github.com/aws/aws-sdk-go-v2/service/appconfig v1.4.2/go.mod h1:FZ3HkCe+b10uFZZkFdvf98LHW21k49W8o8J366lqVKY= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.3.2/go.mod h1:72HRZDLMtmVQiLG2tLfQcaWLCssELvGl+Zf2WVxMmR8= github.com/aws/aws-sdk-go-v2/service/sso v1.4.2/go.mod h1:NBvT9R1MEF+Ud6ApJKM0G+IkPchKS7p7c2YPKwHmBOk= github.com/aws/aws-sdk-go-v2/service/sts v1.7.2/go.mod h1:8EzeIqfWt2wWT4rJVu3f21TfrhJ8AEMzVybRNSb/b4g= github.com/aws/smithy-go v1.8.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E= 
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= github.com/fatih/structs v1.1.0/go.mod h1:9NiDSp5zOcgEDl+j00MP/WkGVPOlPRLejGD8Ga6PJ7M= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= github.com/go-ldap/ldap v3.0.2+incompatible/go.mod h1:qfd9rJvER9Q0/D/Sqn1DfHRoBp40uXYvFoEVrNEPqRc= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-stack/stack v1.8.0/go.mod 
h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-test/deep v1.0.2-0.20181118220953-042da051cf31/go.mod h1:wGDj63lr65AM2AQyKZd/NYHGb0R+1RLqB8NKt3aSFNA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.7/go.mod h1:n+brtR0CgQNWTVd5ZUFpTBC8YFBDLK/h/bpaJ8/DtOE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/securecookie v1.1.1/go.mod h1:ra0sb63/xPlUeL+yeDciTfxMRAA+MP+HVt/4epWDjd4= 
github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/zI+bUmuGM= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/consul/api v1.13.0/go.mod h1:ZlVrynguJKcYr54zGaDbaL3fOvKC9m72FhPvA8T35KQ= github.com/hashicorp/consul/sdk v0.8.0/go.mod h1:GBvyrGALthsZObzUGsfgHZQDXjg4lOjagTIwIR1vPms= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= github.com/hashicorp/go-hclog v0.0.0-20180709165350-ff2cf002a8dd/go.mod h1:9bjs9uLqI8l75knNv3lV1kA55veR+WUPSiKIWcQHudI= github.com/hashicorp/go-hclog v0.8.0/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ= github.com/hashicorp/go-hclog v0.12.0/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ= github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA= github.com/hashicorp/go-plugin v1.0.1/go.mod h1:++UyYGoz3o5w9ZzAdZxtQKrWWP+iqPBn3cQptSMzBuY= github.com/hashicorp/go-retryablehttp v0.5.4/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs= github.com/hashicorp/go-rootcerts v1.0.1/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= github.com/hashicorp/go-sockaddr v1.0.2/go.mod h1:rB4wwRAUzs07qva3c5SdrY/NEtAUjGlgmH/UkBUC97A= github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-uuid v1.0.2/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8= github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-version v1.1.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= github.com/hashicorp/mdns v1.0.4/go.mod h1:mtBihi+LeNXGtG8L9dX59gAEa12BDtBQSp4v/YAJqrc= github.com/hashicorp/memberlist v0.3.0/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= github.com/hashicorp/serf v0.9.6/go.mod h1:TXZNMjZQijwlDvp+r0b63xZ45H7JmCmgg4gpTwn9UV4= github.com/hashicorp/vault/api v1.0.4/go.mod h1:gDcqh3WGcR1cpF5AJz/B1UFheUEneMoIospckxBxk6Q= github.com/hashicorp/vault/sdk v0.1.13/go.mod 
h1:B+hVj7TpuQY1Y/GPbCpffmgd+tSEwvhkWnjtSYCaS2M= github.com/hashicorp/yamux v0.0.0-20180604194846-3520598351bb/go.mod h1:+NfK9FKeTrX5uv1uIXGdwYDTeHna2qgaIlx54MXqjAM= github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d/go.mod h1:+NfK9FKeTrX5uv1uIXGdwYDTeHna2qgaIlx54MXqjAM= github.com/hjson/hjson-go/v4 v4.0.0 h1:wlm6IYYqHjOdXH1gHev4VoXCaW20HdQAGCxdOEEg2cs= github.com/hjson/hjson-go/v4 v4.0.0/go.mod h1:KaYt3bTw3zhBjYqnXkYywcYctk0A2nxeEFTse3rH13E= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs= github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= github.com/jcmturner/dnsutils/v2 v2.0.0/go.mod h1:b0TnjGOvI/n42bZa+hmXL+kFJZsFT7G4t3HTlQ184QM= github.com/jcmturner/gofork v1.7.6 h1:QH0l3hzAU1tfT3rZCnW5zXl+orbkNMMRGJfdJjHVETg= github.com/jcmturner/gofork v1.7.6/go.mod h1:1622LH6i/EZqLloHfE7IeZ0uEJwMSUyQ/nDd82IeqRo= github.com/jcmturner/goidentity/v6 v6.0.1 h1:VKnZd2oEIMorCTsFBnJWbExfNN7yZr3EhJAxwOkZg6o= github.com/jcmturner/goidentity/v6 v6.0.1/go.mod h1:X1YW3bgtvwAXju7V3LCIMpY0Gbxyjn/mY9zx4tFonSg= github.com/jcmturner/gokrb5/v8 v8.4.3/go.mod h1:dqRwJGXznQrzw6cWmyo6kH+E7jksEQG/CyVWsJEsJO0= github.com/jcmturner/gokrb5/v8 v8.4.4 h1:x1Sv4HaTpepFkXbt2IkL29DXRf8sOfZXo8eRKh687T8= github.com/jcmturner/gokrb5/v8 v8.4.4/go.mod h1:1btQEpgT6k+unzCwX1KdWMEwPPkkgBtP+F6aCACiMrs= github.com/jcmturner/rpc/v2 v2.0.3 h1:7FXXj8Ti1IaVFpSAziCZWNzbNuZmnvw/i6CqLNdWfZY= github.com/jcmturner/rpc/v2 v2.0.3/go.mod h1:VUJYCIDm3PVOEHw8sgt091/20OJjskO/YJki3ELg/Hc= github.com/jellydator/ttlcache/v2 v2.11.1 h1:AZGME43Eh2Vv3giG6GeqeLeFXxwxn1/qHItqWZl6U64= github.com/jellydator/ttlcache/v2 v2.11.1/go.mod h1:RtE5Snf0/57e+2cLWFYWCCsLas2Hy3c5Z4n14XmSvTI= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/joho/godotenv v1.3.0 h1:Zjp+RcGpHhGlrMbJzXTrZZPrWj+1vfm90La1wgB6Bhc= github.com/joho/godotenv v1.3.0/go.mod h1:7hK45KPybAkOC6peb+G5yklZfMxEjkZhHbwpqxOKXbg= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw= github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= github.com/knadh/koanf v1.5.0 h1:q2TSd/3Pyc/5yP9ldIrSdIz26MCcyNQzW0pEAugLPNs= github.com/knadh/koanf v1.5.0/go.mod h1:Hgyjp4y8v44hpZtPzs7JZfRAW5AhN7KfZcwv1RYggDs= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod 
h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-colorable v0.1.6/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.10/go.mod h1:qgIWMr58cqv1PHHyhnkY9lrL7etaEgOFcMEpPG5Rm84= github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE= github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/miekg/dns v1.1.26/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso= github.com/miekg/dns v1.1.41/go.mod h1:p6aan82bvRIyn+zDIv9xYNUpwa73JcSh9BKwknJysuI= github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= github.com/mitchellh/cli v1.1.0/go.mod h1:xcISNoH86gajksDmfB23e/pu+B+GeFRMYmoHXxx3xhI= github.com/mitchellh/copystructure v1.0.0/go.mod h1:SNtv71yrdKgLRyLFxmLdkAbkKEFWgYaq1OVrnRcwhnw= github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-testing-interface v0.0.0-20171004221916-a61a99592b77/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= github.com/mitchellh/go-testing-interface v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= github.com/mitchellh/go-wordwrap v1.0.0/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo= github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= 
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/npillmayer/nestext v0.1.3/go.mod h1:h2lrijH8jpicr25dFY+oAJLyzlya6jhnuG+zWp9L0Uk= github.com/oklog/run v1.0.0/go.mod h1:dlhp/R75TPv97u0XWUtDeV/lRKWPKSdTuV0TZvrmrQA= github.com/orcaman/concurrent-map v1.0.0 h1:I/2A2XPCb4IuQWcQhBhSwGfiuybl/J0ev9HDbW65HOY= github.com/orcaman/concurrent-map v1.0.0/go.mod h1:Lu3tH6HLW3feq74c2GC+jIMS/K2CFcDWnWD9XkenwhI= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pelletier/go-toml v1.7.0/go.mod h1:vwGMzjaWMwyfHwgIBhI2YUM4fB6nL6lVAvS1LBMMhTE= github.com/pelletier/go-toml v1.9.1 h1:a6qW1EVNZWH9WGI6CsYdD8WAylkoXBS5yv0XHlh17Tc= github.com/pelletier/go-toml v1.9.1/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= github.com/pierrec/lz4/v4 v4.1.15/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pierrec/lz4/v4 v4.1.25 h1:kocOqRffaIbU5djlIBr7Wh+cx82C0vtFb0fOurZHqD0= github.com/pierrec/lz4/v4 v4.1.25/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= github.com/posener/complete v1.2.3/go.mod h1:WZIdtGGp+qx0sLrYKtIRAruyNpv6hFCicSgv7Sy7s/s= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= github.com/prometheus/client_golang v1.11.1/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model 
v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= github.com/rhnvrm/simples3 v0.6.1/go.mod h1:Y+3vYm2V7Y4VijFoJHHTrja6OgPrJ2cBti8dPGkC3sA= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/ryanuber/go-glob v1.0.0/go.mod h1:807d1WSdnB0XRJzKNil9Om6lcp/3a0v4qIHxIXzX/Yc= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/twmb/franz-go v1.7.0/go.mod h1:PMze0jNfNghhih2XHbkmTFykbMF5sJqmNJB31DOOzro= github.com/twmb/franz-go v1.20.6 h1:TpQTt4QcixJ1cHEmQGPOERvTzo99s8jAutmS7rbSD6w= github.com/twmb/franz-go v1.20.6/go.mod h1:u+FzH2sInp7b9HNVv2cZN8AxdXy6y/AQ1Bkptu4c0FM= github.com/twmb/franz-go/pkg/kadm v1.17.1 h1:Bt02Y/RLgnFO2NP2HVP1kd2TFtGRiJZx+fSArjZDtpw= github.com/twmb/franz-go/pkg/kadm v1.17.1/go.mod h1:s4duQmrDbloVW9QTMXhs6mViTepze7JLG43xwPcAeTg= github.com/twmb/franz-go/pkg/kmsg v1.2.0/go.mod h1:SxG/xJKhgPu25SamAq0rrucfp7lbzCpEXOC+vH/ELrY= github.com/twmb/franz-go/pkg/kmsg v1.12.0 h1:CbatD7ers1KzDNgJqPbKOq0Bz/WLBdsTH75wgzeVaPc= github.com/twmb/franz-go/pkg/kmsg v1.12.0/go.mod h1:+DPt4NC8RmI6hqb8G09+3giKObE6uD2Eya6CfqBpeJY= github.com/twmb/franz-go/pkg/sasl/kerberos v1.1.0 h1:alKdbddkPw3rDh+AwmUEwh6HNYgTvDSFIe/GWYRR9RM= github.com/twmb/franz-go/pkg/sasl/kerberos v1.1.0/go.mod h1:k8BoBjyUbFj34f0rRbn+Ky12sZFAPbmShrg0karAIMo= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.etcd.io/etcd/api/v3 v3.5.4/go.mod h1:5GB2vv4A4AOn3yk7MftYGHkUfGtDHnEraIjym4dYz5A= go.etcd.io/etcd/client/pkg/v3 v3.5.4/go.mod h1:IJHfcCEKxYu1Os13ZdwCwIUTUVGYTSAM3YSwc9/Ac1g= go.etcd.io/etcd/client/v3 v3.5.4/go.mod h1:ZaRkVgBZC+L+dLCjTcF1hRXpgZXQPOvnA/Ak/gq3kiY= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc= go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190923035154-9ee001bba392/go.mod h1:/lpIB1dKB+9EgE3H3cr1v9wB50oz8l4C4h62xy7jSTY= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto 
v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.0.0-20220817201139-bc19a97f63c8/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58= golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8= golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20201208152925-83fdc39ff7b5/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/lint v0.0.0-20210508222113-6edffad5e616/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190923162816-aa69164e4478/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod 
h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20210410081132-afb366fc7cd1/go.mod h1:9tjilg8BloeKEkVJvy7fQ90B1CfIiPueXVOjqfkSzI8= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.0.0-20220725212005-46097bf591d3/go.mod h1:AaygXjzTFtRAg2ttMY5RMuhpJ3cNnI0XpyFJD1iQRSM= golang.org/x/net v0.0.0-20220812174116-3211cb980234/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190129075346-302c3dd5f1cc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190403152447-81d4e9dc473e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys 
v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190922100055-0a153f010e69/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190924154521-2837fb4f24fe/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200124204421-9fbb57f87de9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210303074136-134d130e1a04/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20181227161524-e6919f6577db/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.2/go.mod 
h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190907020128-2ca718005c18/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210112230658-8b4aab62c064/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190404172233-64821d5d2107/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200513103714-09dca8ec2884/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c/go.mod h1:UODoCrxHCcBojKKwX1terBiRUaqAsFqJiF615XL43r0= google.golang.org/grpc v1.14.0/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc 
v1.22.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTpR3n0= google.golang.org/grpc v1.38.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/asn1-ber.v1 v1.0.0-20181015200546-f715ec2f112d/go.mod h1:cuepJuh7vyXfUyUwEgHQXw849cJrilpS5NeIjOWESAw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/square/go-jose.v2 v2.3.1/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= ================================================ FILE: kafka/client_config_helper.go ================================================ package kafka import ( "context" "crypto/tls" "crypto/x509" "encoding/pem" "fmt" "io/ioutil" "net" "time" "github.com/jcmturner/gokrb5/v8/client" "github.com/jcmturner/gokrb5/v8/keytab" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/sasl" "github.com/twmb/franz-go/pkg/sasl/kerberos" "github.com/twmb/franz-go/pkg/sasl/oauth" "github.com/twmb/franz-go/pkg/sasl/plain" "github.com/twmb/franz-go/pkg/sasl/scram" "go.uber.org/zap" krbconfig "github.com/jcmturner/gokrb5/v8/config" ) // NewKgoConfig creates a new Config for the Kafka Client as exposed by the franz-go library. // If TLS certificates can't be read an error will be returned. // logger is only used to print warnings about TLS. func NewKgoConfig(cfg Config, logger *zap.Logger) ([]kgo.Opt, error) { opts := []kgo.Opt{ kgo.SeedBrokers(cfg.Brokers...), kgo.ClientID(cfg.ClientID), kgo.FetchMaxBytes(5 * 1000 * 1000), // 5MB kgo.MaxConcurrentFetches(10), // Allow metadata to be refreshed more often than 5s (default) if needed. // That will mitigate issues with unknown partitions shortly after creating // them. kgo.MetadataMinAge(time.Second), } // Create Logger kgoLogger := KgoZapLogger{ logger: logger.Sugar(), } opts = append(opts, kgo.WithLogger(kgoLogger)) // Add Rack Awareness if configured if cfg.RackID != "" { opts = append(opts, kgo.Rack(cfg.RackID)) } // Configure SASL if cfg.SASL.Enabled { // SASL Plain if cfg.SASL.Mechanism == "PLAIN" { mechanism := plain.Auth{ User: cfg.SASL.Username, Pass: cfg.SASL.Password, }.AsMechanism() opts = append(opts, kgo.SASL(mechanism)) } // SASL SCRAM if cfg.SASL.Mechanism == "SCRAM-SHA-256" || cfg.SASL.Mechanism == "SCRAM-SHA-512" { var mechanism sasl.Mechanism scramAuth := scram.Auth{ User: cfg.SASL.Username, Pass: cfg.SASL.Password, } if cfg.SASL.Mechanism == "SCRAM-SHA-256" { mechanism = scramAuth.AsSha256Mechanism() } if cfg.SASL.Mechanism == "SCRAM-SHA-512" { mechanism = scramAuth.AsSha512Mechanism() } opts = append(opts, kgo.SASL(mechanism)) } // Kerberos if cfg.SASL.Mechanism == "GSSAPI" { var krbClient *client.Client kerbCfg, err := krbconfig.Load(cfg.SASL.GSSAPI.KerberosConfigPath) if err != nil { return nil, fmt.Errorf("failed to create kerberos config from specified config filepath: %w", err) } switch cfg.SASL.GSSAPI.AuthType { case "USER_AUTH": krbClient = client.NewWithPassword( cfg.SASL.GSSAPI.Username, cfg.SASL.GSSAPI.Realm, cfg.SASL.GSSAPI.Password, kerbCfg, client.DisablePAFXFAST(!cfg.SASL.GSSAPI.EnableFast)) case "KEYTAB_AUTH": ktb, err := keytab.Load(cfg.SASL.GSSAPI.KeyTabPath) if err != nil { return nil, fmt.Errorf("failed to load keytab: %w", err) } krbClient = client.NewWithKeytab( cfg.SASL.GSSAPI.Username, cfg.SASL.GSSAPI.Realm, ktb, kerbCfg, client.DisablePAFXFAST(!cfg.SASL.GSSAPI.EnableFast)) } if krbClient == nil { return nil, fmt.Errorf("kafka.sasl.gssapi.authType must be one of USER_AUTH or KEYTAB_AUTH") } kerberosMechanism := kerberos.Auth{ Client: krbClient, Service: cfg.SASL.GSSAPI.ServiceName, PersistAfterAuth: true, }.AsMechanism() opts = append(opts,
kgo.SASL(kerberosMechanism)) } // OAuthBearer if cfg.SASL.Mechanism == "OAUTHBEARER" { mechanism := oauth.Oauth(func(ctx context.Context) (oauth.Auth, error) { token, err := cfg.SASL.OAuthBearer.getToken(ctx) return oauth.Auth{ Zid: cfg.SASL.OAuthBearer.ClientID, Token: token, }, err }) opts = append(opts, kgo.SASL(mechanism)) } } // Configure TLS var caCertPool *x509.CertPool if cfg.TLS.Enabled { // Root CA if cfg.TLS.CaFilepath != "" || len(cfg.TLS.Ca) > 0 { ca := []byte(cfg.TLS.Ca) if cfg.TLS.CaFilepath != "" { caBytes, err := ioutil.ReadFile(cfg.TLS.CaFilepath) if err != nil { return nil, fmt.Errorf("failed to load ca cert: %w", err) } ca = caBytes } caCertPool = x509.NewCertPool() isSuccessful := caCertPool.AppendCertsFromPEM(ca) if !isSuccessful { logger.Warn("failed to append ca file to cert pool, is this a valid PEM format?") } } // If configured load TLS cert & key - Mutual TLS var certificates []tls.Certificate hasCertFile := cfg.TLS.CertFilepath != "" || len(cfg.TLS.Cert) > 0 hasKeyFile := cfg.TLS.KeyFilepath != "" || len(cfg.TLS.Key) > 0 if hasCertFile || hasKeyFile { cert := []byte(cfg.TLS.Cert) privateKey := []byte(cfg.TLS.Key) // 1. Read certificates if cfg.TLS.CertFilepath != "" { certBytes, err := ioutil.ReadFile(cfg.TLS.CertFilepath) if err != nil { return nil, fmt.Errorf("failed to read TLS certificate: %w", err) } cert = certBytes } if cfg.TLS.KeyFilepath != "" { keyBytes, err := ioutil.ReadFile(cfg.TLS.KeyFilepath) if err != nil { return nil, fmt.Errorf("failed to read TLS key: %w", err) } privateKey = keyBytes } // 2. Check if private key needs to be decrypted. Decrypt it if passphrase is given, otherwise return error pemBlock, _ := pem.Decode(privateKey) if pemBlock == nil { return nil, fmt.Errorf("no valid private key found") } if x509.IsEncryptedPEMBlock(pemBlock) { decryptedKey, err := x509.DecryptPEMBlock(pemBlock, []byte(cfg.TLS.Passphrase)) if err != nil { return nil, fmt.Errorf("private key is encrypted, but could not decrypt it: %s", err) } // If private key was encrypted we can overwrite the original contents now with the decrypted version privateKey = pem.EncodeToMemory(&pem.Block{Type: pemBlock.Type, Bytes: decryptedKey}) } tlsCert, err := tls.X509KeyPair(cert, privateKey) if err != nil { return nil, fmt.Errorf("cannot parse pem: %s", err) } certificates = []tls.Certificate{tlsCert} } tlsDialer := &tls.Dialer{ NetDialer: &net.Dialer{Timeout: 10 * time.Second}, Config: &tls.Config{ InsecureSkipVerify: cfg.TLS.InsecureSkipTLSVerify, Certificates: certificates, RootCAs: caCertPool, }, } opts = append(opts, kgo.Dialer(tlsDialer.DialContext)) } return opts, nil } ================================================ FILE: kafka/client_logger.go ================================================ package kafka import ( "github.com/twmb/franz-go/pkg/kgo" "go.uber.org/zap" ) type KgoZapLogger struct { logger *zap.SugaredLogger } // Level implements the kgo.Logger interface. It returns the log level to log at. // We pin this to debug as the zap logger decides what to actually send to the output stream. func (k KgoZapLogger) Level() kgo.LogLevel { return kgo.LogLevelDebug } // Log implements the kgo.Logger interface func (k KgoZapLogger) Log(level kgo.LogLevel, msg string, keyvals ...interface{}) { switch level { case kgo.LogLevelDebug: k.logger.Debugw(msg, keyvals...) case kgo.LogLevelInfo: k.logger.Infow(msg, keyvals...) case kgo.LogLevelWarn: k.logger.Warnw(msg, keyvals...) case kgo.LogLevelError: k.logger.Errorw(msg, keyvals...)
} } ================================================ FILE: kafka/config.go ================================================ package kafka import "fmt" type Config struct { // General Brokers []string `koanf:"brokers"` ClientID string `koanf:"clientId"` RackID string `koanf:"rackId"` TLS TLSConfig `koanf:"tls"` SASL SASLConfig `koanf:"sasl"` RetryInitConnection bool `koanf:"retryInitConnection"` } func (c *Config) SetDefaults() { c.ClientID = "kminion" c.TLS.SetDefaults() c.SASL.SetDefaults() } func (c *Config) Validate() error { if len(c.Brokers) == 0 { return fmt.Errorf("no seed brokers specified, at least one must be configured") } err := c.TLS.Validate() if err != nil { return fmt.Errorf("failed to validate TLS config: %w", err) } err = c.SASL.Validate() if err != nil { return fmt.Errorf("failed to validate SASL config: %w", err) } return nil } ================================================ FILE: kafka/config_sasl.go ================================================ package kafka import "fmt" const ( SASLMechanismPlain = "PLAIN" SASLMechanismScramSHA256 = "SCRAM-SHA-256" SASLMechanismScramSHA512 = "SCRAM-SHA-512" SASLMechanismGSSAPI = "GSSAPI" SASLMechanismOAuthBearer = "OAUTHBEARER" ) // SASLConfig for Kafka Client type SASLConfig struct { Enabled bool `koanf:"enabled"` Username string `koanf:"username"` Password string `koanf:"password"` Mechanism string `koanf:"mechanism"` // SASL Mechanisms that require more configuration than username & password GSSAPI SASLGSSAPIConfig `koanf:"gssapi"` OAuthBearer OAuthBearerConfig `koanf:"oauth"` } // SetDefaults for SASL Config func (c *SASLConfig) SetDefaults() { c.Enabled = false c.Mechanism = SASLMechanismPlain c.GSSAPI.SetDefaults() } // Validate SASL config input func (c *SASLConfig) Validate() error { if !c.Enabled { return nil } switch c.Mechanism { case SASLMechanismPlain, SASLMechanismScramSHA256, SASLMechanismScramSHA512, SASLMechanismGSSAPI: // Valid and supported case SASLMechanismOAuthBearer: return c.OAuthBearer.Validate() default: return fmt.Errorf("given sasl mechanism '%v' is invalid", c.Mechanism) } return nil } ================================================ FILE: kafka/config_sasl_gssapi.go ================================================ package kafka // SASLGSSAPIConfig represents the Kafka Kerberos config type SASLGSSAPIConfig struct { AuthType string `koanf:"authType"` KeyTabPath string `koanf:"keyTabPath"` KerberosConfigPath string `koanf:"kerberosConfigPath"` ServiceName string `koanf:"serviceName"` Username string `koanf:"username"` Password string `koanf:"password"` Realm string `koanf:"realm"` // EnableFAST enables FAST, which is a pre-authentication framework for Kerberos. // It includes a mechanism for tunneling pre-authentication exchanges using armoured KDC messages. // FAST provides increased resistance to passive password guessing attacks. 
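//
// As an illustration only (all values below are hypothetical and not taken from this repository),
// a keytab-based GSSAPI setup would populate this struct roughly like so:
//
//	cfg := SASLGSSAPIConfig{
//		AuthType:           "KEYTAB_AUTH", // or "USER_AUTH" together with Password
//		KeyTabPath:         "/etc/krb5.keytab",
//		KerberosConfigPath: "/etc/krb5.conf",
//		ServiceName:        "kafka",
//		Username:           "kminion",
//		Realm:              "EXAMPLE.COM",
//		EnableFast:         true,
//	}
//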
EnableFast bool `koanf:"enableFast"` } func (s *SASLGSSAPIConfig) SetDefaults() { s.EnableFast = true } ================================================ FILE: kafka/config_sasl_oauthbearer.go ================================================ package kafka import ( "context" "encoding/base64" "encoding/json" "fmt" "net/http" "net/url" "strings" ) type OAuthBearerConfig struct { TokenEndpoint string `koanf:"tokenEndpoint"` ClientID string `koanf:"clientId"` ClientSecret string `koanf:"clientSecret"` Scope string `koanf:"scope"` } func (c *OAuthBearerConfig) Validate() error { if c.TokenEndpoint == "" { return fmt.Errorf("OAuthBearer token endpoint is not specified") } if c.ClientID == "" || c.ClientSecret == "" { return fmt.Errorf("OAuthBearer client credentials are not specified") } return nil } // same as AcquireToken in Console https://github.com/redpanda-data/console/blob/master/backend/pkg/config/kafka_sasl_oauth.go#L56 func (c *OAuthBearerConfig) getToken(ctx context.Context) (string, error) { authHeaderValue := base64.StdEncoding.EncodeToString([]byte(c.ClientID + ":" + c.ClientSecret)) queryParams := url.Values{ "grant_type": []string{"client_credentials"}, "scope": []string{c.Scope}, } req, err := http.NewRequestWithContext(ctx, "POST", c.TokenEndpoint, strings.NewReader(queryParams.Encode())) if err != nil { return "", fmt.Errorf("failed to create HTTP request: %w", err) } req.URL.RawQuery = queryParams.Encode() req.Header.Set("Authorization", "Basic "+authHeaderValue) req.Header.Set("Content-Type", "application/x-www-form-urlencoded") client := &http.Client{} resp, err := client.Do(req) if err != nil { return "", fmt.Errorf("HTTP request failed: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return "", fmt.Errorf("token request failed with status code %d", resp.StatusCode) } var tokenResponse map[string]interface{} decoder := json.NewDecoder(resp.Body) if err := decoder.Decode(&tokenResponse); err != nil { return "", fmt.Errorf("failed to parse token response: %w", err) } accessToken, ok := tokenResponse["access_token"].(string) if !ok { return "", fmt.Errorf("access_token not found in token response") } return accessToken, nil } ================================================ FILE: kafka/config_tls.go ================================================ package kafka import "fmt" // TLSConfig to connect to Kafka via TLS type TLSConfig struct { Enabled bool `koanf:"enabled"` CaFilepath string `koanf:"caFilepath"` CertFilepath string `koanf:"certFilepath"` KeyFilepath string `koanf:"keyFilepath"` Ca string `koanf:"ca"` Cert string `koanf:"cert"` Key string `koanf:"key"` Passphrase string `koanf:"passphrase"` InsecureSkipTLSVerify bool `koanf:"insecureSkipTlsVerify"` } func (c *TLSConfig) SetDefaults() { c.Enabled = false } func (c *TLSConfig) Validate() error { if len(c.CaFilepath) > 0 && len(c.Ca) > 0 { return fmt.Errorf("config keys 'caFilepath' and 'ca' are both set. only one can be used at the same time") } if len(c.CertFilepath) > 0 && len(c.Cert) > 0 { return fmt.Errorf("config keys 'certFilepath' and 'cert' are both set. only one can be used at the same time") } if len(c.KeyFilepath) > 0 && len(c.Key) > 0 { return fmt.Errorf("config keys 'keyFilepath' and 'key' are both set. 
only one can be used at the same time") } return nil } ================================================ FILE: kafka/service.go ================================================ package kafka import ( "context" "fmt" "time" "github.com/twmb/franz-go/pkg/kerr" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/kmsg" "github.com/twmb/franz-go/pkg/kversion" "go.uber.org/zap" ) type Service struct { cfg Config logger *zap.Logger } func NewService(cfg Config, logger *zap.Logger) *Service { return &Service{ cfg: cfg, logger: logger.Named("kafka_service"), } } // CreateAndTestClient creates a client with the services default settings // logger: will be used to log connections, errors, warnings about tls config, ... func (s *Service) CreateAndTestClient(ctx context.Context, l *zap.Logger, opts []kgo.Opt) (*kgo.Client, error) { logger := l.Named("kgo_client") // Config with default options kgoOpts, err := NewKgoConfig(s.cfg, logger) if err != nil { return nil, fmt.Errorf("failed to create a valid kafka Client config: %w", err) } // Append user (the service calling this method) provided options kgoOpts = append(kgoOpts, opts...) // Create kafka client client, err := kgo.NewClient(kgoOpts...) if err != nil { return nil, fmt.Errorf("failed to create kafka Client: %w", err) } // Test connection for { err = s.testConnection(client, ctx) if err == nil { break } if !s.cfg.RetryInitConnection { return nil, fmt.Errorf("failed to test connectivity to Kafka cluster %w", err) } logger.Warn("failed to test connectivity to Kafka cluster, retrying in 5 seconds", zap.Error(err)) time.Sleep(time.Second * 5) } return client, nil } // Brokers returns list of brokers this service is connecting to func (s *Service) Brokers() []string { return s.cfg.Brokers } // testConnection tries to fetch Broker metadata and prints some information if connection succeeds. An error will be // returned if connecting fails. func (s *Service) testConnection(client *kgo.Client, ctx context.Context) error { connectCtx, cancel := context.WithTimeout(ctx, 15*time.Second) defer cancel() req := kmsg.MetadataRequest{ Topics: nil, } res, err := req.RequestWith(connectCtx, client) if err != nil { return fmt.Errorf("failed to request metadata: %w", err) } // Request versions in order to guess Kafka Cluster version versionsReq := kmsg.NewApiVersionsRequest() versionsRes, err := versionsReq.RequestWith(connectCtx, client) if err != nil { return fmt.Errorf("failed to request api versions: %w", err) } err = kerr.ErrorForCode(versionsRes.ErrorCode) if err != nil { return fmt.Errorf("failed to request api versions. 
Inner kafka error: %w", err) } versions := kversion.FromApiVersionsResponse(versionsRes) s.logger.Debug("successfully connected to kafka cluster", zap.Int("advertised_broker_count", len(res.Brokers)), zap.Int("topic_count", len(res.Topics)), zap.Int32("controller_id", res.ControllerID), zap.String("kafka_version", versions.VersionGuess())) return nil } ================================================ FILE: logging/config.go ================================================ package logging import ( "fmt" "go.uber.org/zap" ) type Config struct { Level string `koanf:"level"` } func (c *Config) SetDefaults() { c.Level = "info" } func (c *Config) Validate() error { level := zap.NewAtomicLevel() err := level.UnmarshalText([]byte(c.Level)) if err != nil { return fmt.Errorf("failed to parse logger level: %w", err) } return nil } ================================================ FILE: logging/logger.go ================================================ package logging import ( "os" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "go.uber.org/zap/zapcore" "go.uber.org/zap" ) // NewLogger creates a preconfigured global logger and configures the global zap logger func NewLogger(cfg Config, metricsNamespace string) *zap.Logger { encoderCfg := zap.NewProductionEncoderConfig() encoderCfg.EncodeTime = zapcore.ISO8601TimeEncoder // Parse log level text to zap.LogLevel. Error check isn't required because the input is already validated. level := zap.NewAtomicLevel() _ = level.UnmarshalText([]byte(cfg.Level)) core := zapcore.NewCore( zapcore.NewJSONEncoder(encoderCfg), zapcore.Lock(os.Stdout), level, ) core = zapcore.RegisterHooks(core, prometheusHook(metricsNamespace)) logger := zap.New(core) zap.ReplaceGlobals(logger) return logger } // prometheusHook is a hook for the zap library which exposes Prometheus counters for various log levels. func prometheusHook(metricsNamespace string) func(zapcore.Entry) error { messageCounterVec := promauto.NewCounterVec(prometheus.CounterOpts{ Namespace: metricsNamespace, Name: "log_messages_total", Help: "Total number of log messages by log level emitted by KMinion.", }, []string{"level"}) // Initialize counters for all supported log levels so that they expose 0 for each level on startup supportedLevels := []zapcore.Level{ zapcore.DebugLevel, zapcore.InfoLevel, zapcore.WarnLevel, zapcore.ErrorLevel, zapcore.FatalLevel, zapcore.PanicLevel, } for _, level := range supportedLevels { messageCounterVec.WithLabelValues(level.String()) } return func(entry zapcore.Entry) error { messageCounterVec.WithLabelValues(entry.Level.String()).Inc() return nil } } ================================================ FILE: main.go ================================================ package main import ( "context" "errors" "fmt" "net" "net/http" "os" "os/signal" "strconv" "github.com/cloudhut/kminion/v2/e2e" "github.com/cloudhut/kminion/v2/kafka" "github.com/cloudhut/kminion/v2/logging" "github.com/cloudhut/kminion/v2/minion" "github.com/cloudhut/kminion/v2/prometheus" promclient "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "go.uber.org/zap" ) var ( // ------------------------------------------------------------------------ // Below parameters are set at build time using ldflags. // ------------------------------------------------------------------------ // version is KMinion's SemVer version (for example: v1.0.0). 
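//
// A build would inject them roughly like this (illustrative values; the real invocation
// lives in this repository's GoReleaser/CI configuration, which is outside this excerpt):
//
//	go build -ldflags "-X main.version=v1.0.0 -X main.builtAt=2021-01-01 -X main.commit=abc1234"
//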
version = "development" // builtAt is a string that represent a human-readable date when the binary was built. builtAt = "N/A" // commit is a string that represents the last git commit for this build. commit = "N/A" ) func main() { startupLogger, err := zap.NewProduction() if err != nil { panic(fmt.Errorf("failed to create startup logger: %w", err)) } cfg, err := newConfig(startupLogger) if err != nil { startupLogger.Fatal("failed to parse config", zap.Error(err)) } logger := logging.NewLogger(cfg.Logger, cfg.Exporter.Namespace).Named("main") if err != nil { startupLogger.Fatal("failed to create new logger", zap.Error(err)) } logger.Info("started kminion", zap.String("version", version), zap.String("built_at", builtAt)) // Setup context that stops when the application receives an interrupt signal ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) defer stop() wrappedRegisterer := promclient.WrapRegistererWithPrefix(cfg.Exporter.Namespace+"_", promclient.DefaultRegisterer) // Create kafka service kafkaSvc := kafka.NewService(cfg.Kafka, logger) // Create minion service // Prometheus exporter only talks to the minion service which // issues all the requests to Kafka and wraps the interface accordingly. minionSvc, err := minion.NewService(cfg.Minion, logger, kafkaSvc, cfg.Exporter.Namespace, ctx) if err != nil { logger.Fatal("failed to setup minion service", zap.Error(err)) } err = minionSvc.Start(ctx) if err != nil { logger.Fatal("failed to start minion service", zap.Error(err)) } // Create end to end testing service if cfg.Minion.EndToEnd.Enabled { e2eService, err := e2e.NewService( ctx, cfg.Minion.EndToEnd, logger, kafkaSvc, wrappedRegisterer, ) if err != nil { logger.Fatal("failed to create end-to-end monitoring service: %w", zap.Error(err)) } if err = e2eService.Start(ctx); err != nil { logger.Fatal("failed to start end-to-end monitoring service", zap.Error(err)) } } // The Prometheus exporter that implements the Prometheus collector interface exporter, err := prometheus.NewExporter(cfg.Exporter, logger, minionSvc) if err != nil { logger.Fatal("failed to setup prometheus exporter", zap.Error(err)) } exporter.InitializeMetrics() promclient.MustRegister(exporter) http.Handle("/metrics", promhttp.InstrumentMetricHandler( promclient.DefaultRegisterer, promhttp.HandlerFor( promclient.DefaultGatherer, promhttp.HandlerOpts{}, ), ), ) http.Handle("/ready", minionSvc.HandleIsReady()) // Start HTTP server address := net.JoinHostPort(cfg.Exporter.Host, strconv.Itoa(cfg.Exporter.Port)) srv := &http.Server{Addr: address} go func() { <-ctx.Done() if err := srv.Shutdown(context.Background()); err != nil { logger.Error("error stopping HTTP server", zap.Error(err)) os.Exit(1) } }() logger.Info("listening on address", zap.String("listen_address", address)) if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { logger.Error("error starting HTTP server", zap.Error(err)) os.Exit(1) } logger.Info("kminion stopped") } ================================================ FILE: minion/client_hooks.go ================================================ package minion import ( "net" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "github.com/twmb/franz-go/pkg/kgo" "go.uber.org/zap" ) // clientHooks implements the various hook interfaces from the franz-go (kafka) library. We can use these hooks to // log additional information, collect Prometheus metrics and similar. 
type clientHooks struct { logger *zap.Logger requestSentCount prometheus.Counter bytesSent prometheus.Counter requestsReceivedCount prometheus.Counter bytesReceived prometheus.Counter } func newMinionClientHooks(logger *zap.Logger, metricsNamespace string) *clientHooks { requestSentCount := promauto.NewCounter(prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: "kafka", Name: "requests_sent_total"}) bytesSent := promauto.NewCounter(prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: "kafka", Name: "sent_bytes", }) requestsReceivedCount := promauto.NewCounter(prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: "kafka", Name: "requests_received_total"}) bytesReceived := promauto.NewCounter(prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: "kafka", Name: "received_bytes", }) return &clientHooks{ logger: logger, requestSentCount: requestSentCount, bytesSent: bytesSent, requestsReceivedCount: requestsReceivedCount, bytesReceived: bytesReceived, } } func (c clientHooks) OnBrokerConnect(meta kgo.BrokerMetadata, dialDur time.Duration, _ net.Conn, err error) { if err != nil { c.logger.Debug("kafka connection failed", zap.String("broker_host", meta.Host), zap.Error(err)) return } c.logger.Debug("kafka connection succeeded", zap.String("host", meta.Host), zap.Duration("dial_duration", dialDur)) } func (c clientHooks) OnBrokerDisconnect(meta kgo.BrokerMetadata, _ net.Conn) { c.logger.Debug("kafka broker disconnected", zap.String("host", meta.Host)) } // OnBrokerRead is passed the broker metadata, the key for the response that // was read, the number of bytes read, how long the Client waited // before reading the response, how long it took to read the response, // and any error. // // The bytes read does not count any tls overhead. // OnRead is called after a read from a broker. func (c clientHooks) OnBrokerRead(_ kgo.BrokerMetadata, _ int16, bytesRead int, _, _ time.Duration, _ error) { c.requestsReceivedCount.Inc() c.bytesReceived.Add(float64(bytesRead)) } // OnBrokerWrite is passed the broker metadata, the key for the request that // was written, the number of bytes written, how long the request // waited before being written, how long it took to write the request, // and any error. // // The bytes written does not count any tls overhead. // OnWrite is called after a write to a broker.
func (c clientHooks) OnBrokerWrite(_ kgo.BrokerMetadata, _ int16, bytesWritten int, _, _ time.Duration, _ error) { c.requestSentCount.Inc() c.bytesSent.Add(float64(bytesWritten)) } ================================================ FILE: minion/config.go ================================================ package minion import ( "fmt" "github.com/cloudhut/kminion/v2/e2e" ) type Config struct { ConsumerGroups ConsumerGroupConfig `koanf:"consumerGroups"` Topics TopicConfig `koanf:"topics"` LogDirs LogDirsConfig `koanf:"logDirs"` EndToEnd e2e.Config `koanf:"endToEnd"` } func (c *Config) SetDefaults() { c.ConsumerGroups.SetDefaults() c.Topics.SetDefaults() c.LogDirs.SetDefaults() c.EndToEnd.SetDefaults() } func (c *Config) Validate() error { err := c.ConsumerGroups.Validate() if err != nil { return fmt.Errorf("failed to validate consumer group config: %w", err) } err = c.Topics.Validate() if err != nil { return fmt.Errorf("failed to validate topic config: %w", err) } err = c.LogDirs.Validate() if err != nil { return fmt.Errorf("failed to validate log dirs config: %w", err) } err = c.EndToEnd.Validate() if err != nil { return fmt.Errorf("failed to validate endToEnd config: %w", err) } return nil } ================================================ FILE: minion/config_consumer_group.go ================================================ package minion import ( "fmt" ) const ( ConsumerGroupScrapeModeOffsetsTopic string = "offsetsTopic" ConsumerGroupScrapeModeAdminAPI string = "adminApi" ConsumerGroupGranularityTopic string = "topic" ConsumerGroupGranularityPartition string = "partition" ) type ConsumerGroupConfig struct { // Enabled specifies whether consumer groups shall be scraped and exported or not. Enabled bool `koanf:"enabled"` // Mode specifies whether we export consumer group offsets using the Admin API or by consuming the internal // __consumer_offsets topic. ScrapeMode string `koanf:"scrapeMode"` // Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and // you aren't interested in per partition lags you could choose "topic" where all partition lags will be summed // and only topic lags will be exported. Granularity string `koanf:"granularity"` // AllowedGroupIDs are regex strings of group ids that shall be exported AllowedGroupIDs []string `koanf:"allowedGroups"` // IgnoredGroupIDs are regex strings of group ids that shall be ignored/skipped when exporting metrics. Ignored groups // take precedence over allowed groups. IgnoredGroupIDs []string `koanf:"ignoredGroups"` } func (c *ConsumerGroupConfig) SetDefaults() { c.Enabled = true c.ScrapeMode = ConsumerGroupScrapeModeAdminAPI c.Granularity = ConsumerGroupGranularityPartition c.AllowedGroupIDs = []string{"/.*/"} } func (c *ConsumerGroupConfig) Validate() error { switch c.ScrapeMode { case ConsumerGroupScrapeModeOffsetsTopic, ConsumerGroupScrapeModeAdminAPI: default: return fmt.Errorf("invalid scrape mode '%v' specified. Valid modes are '%v' or '%v'", c.ScrapeMode, ConsumerGroupScrapeModeOffsetsTopic, ConsumerGroupScrapeModeAdminAPI) } switch c.Granularity { case ConsumerGroupGranularityTopic, ConsumerGroupGranularityPartition: default: return fmt.Errorf("invalid consumer group granularity '%v' specified.
Valid modes are '%v' or '%v'", c.Granularity, ConsumerGroupGranularityTopic, ConsumerGroupGranularityPartition) } // Check if all group strings are valid regex or literals for _, groupID := range c.AllowedGroupIDs { _, err := compileRegex(groupID) if err != nil { return fmt.Errorf("allowed group string '%v' is not valid regex", groupID) } } for _, groupID := range c.IgnoredGroupIDs { _, err := compileRegex(groupID) if err != nil { return fmt.Errorf("ignored group string '%v' is not valid regex", groupID) } } return nil } ================================================ FILE: minion/config_log_dirs.go ================================================ package minion type LogDirsConfig struct { // Enabled specifies whether log dirs shall be scraped and exported or not. This should be disabled for clusters prior // to version 1.0.0 as describing log dirs was not supported back then. Enabled bool `koanf:"enabled"` } // Validate if provided LogDirsConfig is valid. func (c *LogDirsConfig) Validate() error { return nil } // SetDefaults for topic config func (c *LogDirsConfig) SetDefaults() { c.Enabled = true } ================================================ FILE: minion/config_topic_config.go ================================================ package minion import ( "fmt" ) const ( TopicGranularityTopic string = "topic" TopicGranularityPartition string = "partition" ) type TopicConfig struct { // Enabled can be set to false in order to not collect any topic metrics at all. Enabled bool `koanf:"enabled"` // Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and // you aren't interested in per partition metrics you could choose "topic". Granularity string `koanf:"granularity"` // AllowedTopics are regex strings of topic names whose topic metrics that shall be exported. AllowedTopics []string `koanf:"allowedTopics"` // IgnoredTopics are regex strings of topic names that shall be ignored/skipped when exporting metrics. Ignored topics // take precedence over allowed topics. IgnoredTopics []string `koanf:"ignoredTopics"` // InfoMetric configures how the kafka_topic_info metric is populated InfoMetric InfoMetricConfig `koanf:"infoMetric"` } type InfoMetricConfig struct { // ConfigKeys configures optional topic configuration keys that should be exported // as prometheus metric labels. // By default only "cleanup.policy" is exported ConfigKeys []string `koanf:"configKeys"` } // Validate if provided TopicConfig is valid. 
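// Both the topic and the consumer group filters use the same allow/ignore convention: each entry is
// compiled by compileRegex (defined in minion/utils.go, outside this excerpt), and ignored entries
// take precedence over allowed ones. A minimal sketch of that precedence rule, assuming the
// expressions have already been compiled into *regexp.Regexp slices:
//
//	func isTopicAllowed(topic string, allowed, ignored []*regexp.Regexp) bool {
//		for _, re := range ignored {
//			if re.MatchString(topic) {
//				return false // ignore list wins over allow list
//			}
//		}
//		for _, re := range allowed {
//			if re.MatchString(topic) {
//				return true
//			}
//		}
//		return false
//	}
//
// Validate if provided TopicConfig is valid.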
func (c *TopicConfig) Validate() error { switch c.Granularity { case TopicGranularityPartition, TopicGranularityTopic: default: return fmt.Errorf("given granularity '%v' is invalid", c.Granularity) } // Check whether each provided string is valid regex for _, topic := range c.AllowedTopics { _, err := compileRegex(topic) if err != nil { return fmt.Errorf("allowed topic string '%v' is not valid regex", topic) } } for _, topic := range c.IgnoredTopics { _, err := compileRegex(topic) if err != nil { return fmt.Errorf("ignored topic string '%v' is not valid regex", topic) } } return nil } // SetDefaults for topic config func (c *TopicConfig) SetDefaults() { c.Enabled = true c.Granularity = TopicGranularityPartition c.AllowedTopics = []string{"/.*/"} c.InfoMetric = InfoMetricConfig{ConfigKeys: []string{"cleanup.policy"}} } ================================================ FILE: minion/consumer_group_offsets.go ================================================ package minion import ( "context" "fmt" "sync" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" "golang.org/x/sync/errgroup" ) // ListAllConsumerGroupOffsetsInternal returns a map from the in memory storage. The map value is the offset commit // value and is grouped by group id, topic, partition id as keys of the nested maps. func (s *Service) ListAllConsumerGroupOffsetsInternal() map[string]map[string]map[int32]OffsetCommit { return s.storage.getGroupOffsets() } // ListAllConsumerGroupOffsetsAdminAPI returns all consumer group offsets using Kafka's Admin API. func (s *Service) ListAllConsumerGroupOffsetsAdminAPI(ctx context.Context) (map[string]*kmsg.OffsetFetchResponse, error) { groupsRes, err := s.listConsumerGroupsCached(ctx) if err != nil { return nil, fmt.Errorf("failed to list consumer groups: %w", err) } groupIDs := make([]string, len(groupsRes.Groups)) for i, group := range groupsRes.Groups { groupIDs[i] = group.Group } return s.listConsumerGroupOffsetsBulk(ctx, groupIDs) } // listConsumerGroupOffsetsBulk returns a map which has the Consumer group name as key func (s *Service) listConsumerGroupOffsetsBulk(ctx context.Context, groups []string) (map[string]*kmsg.OffsetFetchResponse, error) { eg, _ := errgroup.WithContext(ctx) mutex := sync.Mutex{} res := make(map[string]*kmsg.OffsetFetchResponse) f := func(group string) func() error { return func() error { offsets, err := s.listConsumerGroupOffsets(ctx, group) if err != nil { s.logger.Warn("failed to fetch consumer group offsets, inner kafka error", zap.String("consumer_group", group), zap.Error(err)) return nil } mutex.Lock() res[group] = offsets mutex.Unlock() return nil } } for _, group := range groups { eg.Go(f(group)) } if err := eg.Wait(); err != nil { return nil, err } return res, nil } // listConsumerGroupOffsets returns the committed group offsets for a single group func (s *Service) listConsumerGroupOffsets(ctx context.Context, group string) (*kmsg.OffsetFetchResponse, error) { req := kmsg.NewOffsetFetchRequest() req.Group = group req.Topics = nil res, err := req.RequestWith(ctx, s.client) if err != nil { return nil, fmt.Errorf("failed to request group offsets for group '%v': %w", group, err) } return res, nil } ================================================ FILE: minion/describe_consumer_groups.go ================================================ package minion import ( "context" "fmt" "time" "github.com/twmb/franz-go/pkg/kerr" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" ) type DescribeConsumerGroupsResponse struct {
BrokerMetadata kgo.BrokerMetadata Groups *kmsg.DescribeGroupsResponse } func (s *Service) listConsumerGroupsCached(ctx context.Context) (*kmsg.ListGroupsResponse, error) { reqId := ctx.Value("requestId").(string) key := "list-consumer-groups-" + reqId if cachedRes, exists := s.getCachedItem(key); exists { return cachedRes.(*kmsg.ListGroupsResponse), nil } res, err, _ := s.requestGroup.Do(key, func() (interface{}, error) { res, err := s.listConsumerGroups(ctx) if err != nil { return nil, err } s.setCachedItem(key, res, 120*time.Second) return res, nil }) if err != nil { return nil, err } return res.(*kmsg.ListGroupsResponse), nil } func (s *Service) listConsumerGroups(ctx context.Context) (*kmsg.ListGroupsResponse, error) { listReq := kmsg.NewListGroupsRequest() res, err := listReq.RequestWith(ctx, s.client) if err != nil { return nil, fmt.Errorf("failed to list consumer groups: %w", err) } err = kerr.ErrorForCode(res.ErrorCode) if err != nil { return nil, fmt.Errorf("failed to list consumer groups. inner kafka error: %w", err) } return res, nil } func (s *Service) DescribeConsumerGroups(ctx context.Context) ([]DescribeConsumerGroupsResponse, error) { listRes, err := s.listConsumerGroupsCached(ctx) if err != nil { return nil, err } groupIDs := make([]string, len(listRes.Groups)) for i, group := range listRes.Groups { groupIDs[i] = group.Group } describeReq := kmsg.NewDescribeGroupsRequest() describeReq.Groups = groupIDs describeReq.IncludeAuthorizedOperations = false shardedResp := s.client.RequestSharded(ctx, &describeReq) describedGroups := make([]DescribeConsumerGroupsResponse, 0) for _, kresp := range shardedResp { if kresp.Err != nil { s.logger.Warn("broker failed to respond to the describe groups request", zap.Int32("broker_id", kresp.Meta.NodeID), zap.Error(kresp.Err)) continue } res := kresp.Resp.(*kmsg.DescribeGroupsResponse) describedGroups = append(describedGroups, DescribeConsumerGroupsResponse{ BrokerMetadata: kresp.Meta, Groups: res, }) } return describedGroups, nil } ================================================ FILE: minion/describe_topic_config.go ================================================ package minion import ( "context" "fmt" "github.com/pkg/errors" "github.com/twmb/franz-go/pkg/kmsg" ) func (s *Service) GetTopicConfigs(ctx context.Context) (*kmsg.DescribeConfigsResponse, error) { metadata, err := s.GetMetadataCached(ctx) if err != nil { return nil, errors.Wrap(err, "failed to get metadata") } req := kmsg.NewDescribeConfigsRequest() for _, topic := range metadata.Topics { resourceReq := kmsg.NewDescribeConfigsRequestResource() resourceReq.ResourceType = kmsg.ConfigResourceTypeTopic resourceReq.ResourceName = *topic.Topic req.Resources = append(req.Resources, resourceReq) } res, err := req.RequestWith(ctx, s.client) if err != nil { return nil, fmt.Errorf("failed to describe topic configs: %w", err) } return res, nil } ================================================ FILE: minion/list_offsets.go ================================================ package minion import ( "context" "errors" "fmt" "time" "github.com/twmb/franz-go/pkg/kadm" "go.uber.org/zap" ) func (s *Service) ListEndOffsetsCached(ctx context.Context) (kadm.ListedOffsets, error) { return s.listOffsetsCached(ctx, "end") } func (s *Service) ListStartOffsetsCached(ctx context.Context) (kadm.ListedOffsets, error) { return s.listOffsetsCached(ctx, "start") } func (s *Service) listOffsetsCached(ctx context.Context, offsetType string) (kadm.ListedOffsets, error) { reqId := ctx.Value("requestId").(string) key :=
fmt.Sprintf("partition-%s-offsets-%s", offsetType, reqId) if cachedRes, exists := s.getCachedItem(key); exists { return cachedRes.(kadm.ListedOffsets), nil } var listFunc func(context.Context) (kadm.ListedOffsets, error) switch offsetType { case "end": listFunc = s.ListEndOffsets case "start": listFunc = s.ListStartOffsets default: return nil, fmt.Errorf("invalid offset type: %s", offsetType) } res, err, _ := s.requestGroup.Do(key, func() (interface{}, error) { offsets, err := listFunc(ctx) if err != nil { return nil, err } s.setCachedItem(key, offsets, 120*time.Second) return offsets, nil }) if err != nil { return nil, err } return res.(kadm.ListedOffsets), nil } // ListEndOffsets fetches the high water mark for all topic partitions. func (s *Service) ListEndOffsets(ctx context.Context) (kadm.ListedOffsets, error) { return s.listOffsetsInternal(ctx, s.admClient.ListEndOffsets, "end") } // ListStartOffsets fetches the low water mark for all topic partitions. func (s *Service) ListStartOffsets(ctx context.Context) (kadm.ListedOffsets, error) { return s.listOffsetsInternal(ctx, s.admClient.ListStartOffsets, "start") } type listOffsetsFunc func(context.Context, ...string) (kadm.ListedOffsets, error) func (s *Service) listOffsetsInternal(ctx context.Context, listFunc listOffsetsFunc, offsetType string) (kadm.ListedOffsets, error) { listedOffsets, err := listFunc(ctx) if err != nil { var se *kadm.ShardErrors if !errors.As(err, &se) { return nil, fmt.Errorf("failed to list %s offsets: %w", offsetType, err) } if se.AllFailed { return nil, fmt.Errorf("failed to list %s offsets, all shard responses failed: %w", offsetType, err) } s.logger.Info(fmt.Sprintf("failed to list %s offset from some shards", offsetType), zap.Int("failed_shards", len(se.Errs))) for _, shardErr := range se.Errs { s.logger.Warn(fmt.Sprintf("shard error for listing %s offsets", offsetType), zap.Int32("broker_id", shardErr.Broker.NodeID), zap.Error(shardErr.Err)) } } // Log inner errors before returning them. We do that inside of this function to avoid duplicate logging as the response // are cached for each scrape anyways. // // Create two metrics to aggregate error logs in few messages. Logging one message per occured partition error // is too much. Typical errors are LEADER_NOT_AVAILABLE etc. 
errorCountByErrCode := make(map[error]int) errorCountByTopic := make(map[string]int) // Iterate on all partitions listedOffsets.Each(func(offset kadm.ListedOffset) { if offset.Err != nil { errorCountByTopic[offset.Topic]++ errorCountByErrCode[offset.Err]++ } }) // Print log line for each error type for err, count := range errorCountByErrCode { s.logger.Warn(fmt.Sprintf("failed to list some partitions %s watermarks", offsetType), zap.Error(err), zap.Int("error_count", count)) } if len(errorCountByTopic) > 0 { s.logger.Warn(fmt.Sprintf("some topics had one or more partitions whose %s watermarks could not be fetched from Kafka", offsetType), zap.Int("topics_with_errors", len(errorCountByTopic))) } return listedOffsets, nil } ================================================ FILE: minion/log_dirs.go ================================================ package minion import ( "context" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/kmsg" ) type LogDirResponseShard struct { Err error Broker kgo.BrokerMetadata LogDirs *kmsg.DescribeLogDirsResponse } func (s *Service) DescribeLogDirs(ctx context.Context) []LogDirResponseShard { req := kmsg.NewDescribeLogDirsRequest() req.Topics = nil // Describe all topics responses := s.client.RequestSharded(ctx, &req) res := make([]LogDirResponseShard, len(responses)) for i, responseShard := range responses { logDirs, ok := responseShard.Resp.(*kmsg.DescribeLogDirsResponse) if !ok { logDirs = &kmsg.DescribeLogDirsResponse{} } res[i] = LogDirResponseShard{ Err: responseShard.Err, Broker: responseShard.Meta, LogDirs: logDirs, } } return res } ================================================ FILE: minion/metadata.go ================================================ package minion import ( "context" "fmt" "time" "github.com/twmb/franz-go/pkg/kmsg" ) func (s *Service) GetMetadataCached(ctx context.Context) (*kmsg.MetadataResponse, error) { reqId := ctx.Value("requestId").(string) key := "metadata-" + reqId if cachedRes, exists := s.getCachedItem(key); exists { return cachedRes.(*kmsg.MetadataResponse), nil } res, err, _ := s.requestGroup.Do(key, func() (interface{}, error) { metadata, err := s.GetMetadata(ctx) if err != nil { return nil, err } s.setCachedItem(key, metadata, 120*time.Second) return metadata, nil }) if err != nil { return nil, err } return res.(*kmsg.MetadataResponse), nil } func (s *Service) GetMetadata(ctx context.Context) (*kmsg.MetadataResponse, error) { req := kmsg.NewMetadataRequest() req.Topics = nil res, err := req.RequestWith(ctx, s.client) if err != nil { return nil, fmt.Errorf("failed to request metadata: %w", err) } return res, nil } ================================================ FILE: minion/offset_consumer.go ================================================ package minion import ( "context" "fmt" "time" "github.com/twmb/franz-go/pkg/kbin" "github.com/twmb/franz-go/pkg/kerr" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/kmsg" "go.uber.org/zap" ) // startConsumingOffsets consumes the __consumer_offsets topic and forwards the kafka messages to their respective // methods where they'll be decoded and further processed. 
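//
// Note that this consumer only runs when consumerGroups.scrapeMode is set to "offsetsTopic"
// (see Service.Start in minion/service.go); in the default "adminApi" mode the group offsets
// are instead fetched on demand via the Admin API.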
func (s *Service) startConsumingOffsets(ctx context.Context) { client := s.client s.logger.Info("starting to consume messages from offsets topic") go s.checkIfConsumerLagIsCaughtUp(ctx) for { select { case <-ctx.Done(): return default: fetches := client.PollFetches(ctx) errors := fetches.Errors() for _, err := range errors { // Log all errors and continue afterwards as we might get errors and still have some fetch results s.logger.Error("failed to fetch records from kafka", zap.String("topic", err.Topic), zap.Int32("partition", err.Partition), zap.Error(err.Err)) } iter := fetches.RecordIter() for !iter.Done() { record := iter.Next() s.storage.markRecordConsumed(record) err := s.decodeOffsetRecord(record) if err != nil { s.logger.Warn("failed to decode offset record", zap.Error(err)) } } } } } // checkIfConsumerLagIsCaughtUp fetches the newest partition offsets for all partitions in the __consumer_offsets // topic and compares these against the last consumed messages from our offset consumer. If the consumed offsets are // higher than the partition offsets this means we caught up the initial lag and can mark our storage as ready. A ready // store will start to expose consumer group offsets. func (s *Service) checkIfConsumerLagIsCaughtUp(ctx context.Context) { for { time.Sleep(12 * time.Second) s.logger.Debug("checking if lag in consumer offsets topic is caught up") // 1. Get topic high watermarks for the __consumer_offsets topic metadataReq := kmsg.NewMetadataRequest() metadataReqTopic := kmsg.NewMetadataRequestTopic() topicName := "__consumer_offsets" metadataReqTopic.Topic = &topicName metadataReq.Topics = []kmsg.MetadataRequestTopic{metadataReqTopic} res, err := metadataReq.RequestWith(ctx, s.client) if err != nil { s.logger.Warn("failed to check if consumer lag on offsets topic is caught up because metadata request failed", zap.Error(err)) continue } // 2. Request high watermarks for consumer offset partitions topicReqs := make([]kmsg.ListOffsetsRequestTopic, len(res.Topics)) for i, topic := range res.Topics { req := kmsg.NewListOffsetsRequestTopic() req.Topic = *topic.Topic partitionReqs := make([]kmsg.ListOffsetsRequestTopicPartition, len(topic.Partitions)) for j, partition := range topic.Partitions { partitionReqs[j] = kmsg.NewListOffsetsRequestTopicPartition() partitionReqs[j].Partition = partition.Partition partitionReqs[j].Timestamp = -1 // Newest } req.Partitions = partitionReqs topicReqs[i] = req } offsetReq := kmsg.NewListOffsetsRequest() offsetReq.Topics = topicReqs highMarksRes, err := offsetReq.RequestWith(ctx, s.client) if err != nil { s.logger.Warn("failed to check if consumer lag on offsets topic is caught up because high watermark request failed", zap.Error(err)) continue } if len(highMarksRes.Topics) != 1 { s.logger.Error("expected exactly one topic response for high water mark request") continue } // 3. Check if high watermarks have been consumed. To avoid a race condition here we will wait some time before // comparing, so that the consumer has enough time to catch up to the new high watermarks we just fetched.
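// For reference, the loop below derives the offset of the last record in a partition as
// partition.Offset - 1 and computes the partition's lag as that value minus the highest
// offset our own consumer has recorded for it, floored at zero.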
time.Sleep(3 * time.Second) consumedOffsets := s.storage.getConsumedOffsets() topicRes := highMarksRes.Topics[0] isReady := true type laggingPartition struct { Name string Id int32 Lag int64 } var partitionsLagging []laggingPartition totalLag := int64(0) for _, partition := range topicRes.Partitions { err := kerr.ErrorForCode(partition.ErrorCode) if err != nil { s.logger.Warn("failed to check if consumer lag on offsets topic is caught up because high "+ "watermark request failed, with an inner error", zap.Error(err)) } highWaterMark := partition.Offset - 1 consumedOffset := consumedOffsets[partition.Partition] partitionLag := highWaterMark - consumedOffset if partitionLag < 0 { partitionLag = 0 } if partitionLag > 0 { partitionsLagging = append(partitionsLagging, laggingPartition{ Name: topicRes.Topic, Id: partition.Partition, Lag: partitionLag, }) totalLag += partitionLag s.logger.Debug("consumer_offsets topic lag has not been caught up yet", zap.Int32("partition_id", partition.Partition), zap.Int64("high_water_mark", highWaterMark), zap.Int64("consumed_offset", consumedOffset), zap.Int64("partition_lag", partitionLag)) isReady = false continue } } if isReady { s.logger.Info("successfully consumed all consumer offsets. consumer group lags will be exported from now on") s.storage.setReadyState(true) return } else { s.logger.Info("catching up the message lag on consumer offsets", zap.Int("lagging_partitions_count", len(partitionsLagging)), zap.Any("lagging_partitions", partitionsLagging), zap.Int64("total_lag", totalLag)) } } } // decodeOffsetRecord decodes all messages in the consumer offsets topic by routing records to the correct decoding // method. func (s *Service) decodeOffsetRecord(record *kgo.Record) error { if len(record.Key) < 2 { return fmt.Errorf("offset commit key is supposed to be at least 2 bytes long") } messageVer := (&kbin.Reader{Src: record.Key}).Int16() switch messageVer { case 0, 1: err := s.decodeOffsetCommit(record) if err != nil { return err } case 2: err := s.decodeOffsetMetadata(record) if err != nil { return err } } return nil } // decodeOffsetMetadata decodes group metadata records, which include the following information: // - group // - protocolType (connect/consumer/...) // - generation // - protocol // - currentStateTimestamp // - groupMembers (member metadata such as: memberId, groupInstanceId, clientId, clientHost, rebalanceTimeout, ...)
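//
// (The routing in decodeOffsetRecord above relies on the first two bytes of every record key in
// __consumer_offsets being a big-endian int16 schema version: versions 0 and 1 identify offset
// commit records, version 2 identifies group metadata records.)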
func (s *Service) decodeOffsetMetadata(record *kgo.Record) error { childLogger := s.logger.With( zap.String("topic", record.Topic), zap.Int32("partition_id", record.Partition), zap.Int64("offset", record.Offset)) metadataKey := kmsg.NewGroupMetadataKey() err := metadataKey.ReadFrom(record.Key) if err != nil { childLogger.Warn("failed to decode offset metadata key", zap.Error(err)) return fmt.Errorf("failed to decode offset metadata key: %w", err) } if record.Value == nil { return nil } metadataValue := kmsg.NewGroupMetadataValue() err = metadataValue.ReadFrom(record.Value) if err != nil { childLogger.Warn("failed to decode offset metadata value", zap.Error(err)) return fmt.Errorf("failed to decode offset metadata value: %w", err) } return nil } // decodeOffsetCommit decodes group offset commits, which include the following information: // - group, topic, partition // - offset // - leaderEpoch // - metadata (user specified string for each offset commit) // - commitTimestamp // - expireTimestamp (only version 1 offset commits / deprecated) func (s *Service) decodeOffsetCommit(record *kgo.Record) error { childLogger := s.logger.With( zap.String("topic", record.Topic), zap.Int32("partition_id", record.Partition), zap.Int64("offset", record.Offset)) offsetCommitKey := kmsg.NewOffsetCommitKey() err := offsetCommitKey.ReadFrom(record.Key) if err != nil { childLogger.Warn("failed to decode offset commit key", zap.Error(err)) return fmt.Errorf("failed to decode offset commit key: %w", err) } if record.Value == nil { // Tombstone - The group offset is expired or no longer valid (e.g. because the topic has been deleted) s.storage.deleteOffsetCommit(offsetCommitKey) return nil } offsetCommitValue := kmsg.NewOffsetCommitValue() err = offsetCommitValue.ReadFrom(record.Value) if err != nil { childLogger.Warn("failed to decode offset commit value", zap.Error(err)) return fmt.Errorf("failed to decode offset commit value: %w", err) } s.storage.addOffsetCommit(offsetCommitKey, offsetCommitValue) return nil } func (s *Service) GetNumberOfOffsetRecordsConsumed() float64 { return s.storage.getNumberOfConsumedRecords() } ================================================ FILE: minion/service.go ================================================ package minion import ( "context" "encoding/json" "fmt" "net/http" "regexp" "strings" "sync" "time" "github.com/twmb/franz-go/pkg/kadm" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/kmsg" "github.com/twmb/franz-go/pkg/kversion" "go.uber.org/zap" "golang.org/x/sync/singleflight" "github.com/cloudhut/kminion/v2/kafka" ) type Service struct { Cfg Config logger *zap.Logger // requestGroup is used to deduplicate multiple concurrent requests to kafka requestGroup *singleflight.Group cache map[string]interface{} cacheLock sync.RWMutex AllowedGroupIDsExpr []*regexp.Regexp IgnoredGroupIDsExpr []*regexp.Regexp AllowedTopicsExpr []*regexp.Regexp IgnoredTopicsExpr []*regexp.Regexp client *kgo.Client admClient *kadm.Client storage *Storage } func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricsNamespace string, ctx context.Context) (*Service, error) { storage, err := newStorage(logger) if err != nil { return nil, fmt.Errorf("failed to create storage: %w", err) } // Kafka client minionHooks := newMinionClientHooks(logger.Named("kafka_hooks"), metricsNamespace) kgoOpts := []kgo.Opt{ kgo.WithHooks(minionHooks), } if cfg.ConsumerGroups.Enabled && cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeOffsetsTopic { kgoOpts = append(kgoOpts,

// decodeOffsetMetadata decodes the group metadata, which includes the following information:
// - group
// - protocolType (connect/consumer/...)
// - generation
// - protocol
// - currentStateTimestamp
// - groupMembers (member metadata such as: memberId, groupInstanceId, clientId, clientHost, rebalanceTimeout, ...)
func (s *Service) decodeOffsetMetadata(record *kgo.Record) error {
	childLogger := s.logger.With(
		zap.String("topic", record.Topic),
		zap.Int32("partition_id", record.Partition),
		zap.Int64("offset", record.Offset))

	metadataKey := kmsg.NewGroupMetadataKey()
	err := metadataKey.ReadFrom(record.Key)
	if err != nil {
		childLogger.Warn("failed to decode offset metadata key", zap.Error(err))
		return fmt.Errorf("failed to decode offset metadata key: %w", err)
	}

	if record.Value == nil {
		return nil
	}
	metadataValue := kmsg.NewGroupMetadataValue()
	err = metadataValue.ReadFrom(record.Value)
	if err != nil {
		childLogger.Warn("failed to decode offset metadata value", zap.Error(err))
		return fmt.Errorf("failed to decode offset metadata value: %w", err)
	}

	return nil
}

// decodeOffsetCommit decodes group offsets, which include the following information:
// - group, topic, partition
// - offset
// - leaderEpoch
// - metadata (user specified string for each offset commit)
// - commitTimestamp
// - expireTimestamp (only version 1 offset commits / deprecated)
func (s *Service) decodeOffsetCommit(record *kgo.Record) error {
	childLogger := s.logger.With(
		zap.String("topic", record.Topic),
		zap.Int32("partition_id", record.Partition),
		zap.Int64("offset", record.Offset))
	offsetCommitKey := kmsg.NewOffsetCommitKey()
	err := offsetCommitKey.ReadFrom(record.Key)
	if err != nil {
		childLogger.Warn("failed to decode offset commit key", zap.Error(err))
		return fmt.Errorf("failed to decode offset commit key: %w", err)
	}

	if record.Value == nil {
		// Tombstone - the group offset is expired or no longer valid (e.g. because the topic has been deleted)
		s.storage.deleteOffsetCommit(offsetCommitKey)
		return nil
	}

	offsetCommitValue := kmsg.NewOffsetCommitValue()
	err = offsetCommitValue.ReadFrom(record.Value)
	if err != nil {
		childLogger.Warn("failed to decode offset commit value", zap.Error(err))
		return fmt.Errorf("failed to decode offset commit value: %w", err)
	}
	s.storage.addOffsetCommit(offsetCommitKey, offsetCommitValue)

	return nil
}
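
// For illustration (hypothetical names, not from the source): when the offsets of group
// "billing" on topic "orders", partition 3 expire, the broker writes a record whose key
// still decodes to {Group: "billing", Topic: "orders", Partition: 3} but whose value is
// nil. The nil-value branch above treats this as a log compaction tombstone and removes
// the corresponding entry from storage instead of decoding a value.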

func (s *Service) GetNumberOfOffsetRecordsConsumed() float64 {
	return s.storage.getNumberOfConsumedRecords()
}

================================================
FILE: minion/service.go
================================================
package minion

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/twmb/franz-go/pkg/kadm"
	"github.com/twmb/franz-go/pkg/kgo"
	"github.com/twmb/franz-go/pkg/kmsg"
	"github.com/twmb/franz-go/pkg/kversion"
	"go.uber.org/zap"
	"golang.org/x/sync/singleflight"

	"github.com/cloudhut/kminion/v2/kafka"
)

type Service struct {
	Cfg    Config
	logger *zap.Logger

	// requestGroup is used to deduplicate multiple concurrent requests to kafka
	requestGroup *singleflight.Group
	cache        map[string]interface{}
	cacheLock    sync.RWMutex

	AllowedGroupIDsExpr []*regexp.Regexp
	IgnoredGroupIDsExpr []*regexp.Regexp
	AllowedTopicsExpr   []*regexp.Regexp
	IgnoredTopicsExpr   []*regexp.Regexp

	client    *kgo.Client
	admClient *kadm.Client

	storage *Storage
}

func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricsNamespace string, ctx context.Context) (*Service, error) {
	storage, err := newStorage(logger)
	if err != nil {
		return nil, fmt.Errorf("failed to create storage: %w", err)
	}

	// Kafka client
	minionHooks := newMinionClientHooks(logger.Named("kafka_hooks"), metricsNamespace)
	kgoOpts := []kgo.Opt{
		kgo.WithHooks(minionHooks),
	}
	if cfg.ConsumerGroups.Enabled && cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeOffsetsTopic {
		kgoOpts = append(kgoOpts,
			kgo.ConsumeResetOffset(kgo.NewOffset().AtStart()),
			kgo.ConsumeTopics("__consumer_offsets"))
	}

	logger.Info("connecting to Kafka seed brokers, trying to fetch cluster metadata",
		zap.String("seed_brokers", strings.Join(kafkaSvc.Brokers(), ",")))
	client, err := kafkaSvc.CreateAndTestClient(ctx, logger, kgoOpts)
	if err != nil {
		return nil, fmt.Errorf("failed to create kafka client: %w", err)
	}
	logger.Info("successfully connected to kafka cluster")

	// Compile regexes. We can ignore the errors here because the expressions have already been validated.
	allowedGroupIDsExpr, _ := compileRegexes(cfg.ConsumerGroups.AllowedGroupIDs)
	ignoredGroupIDsExpr, _ := compileRegexes(cfg.ConsumerGroups.IgnoredGroupIDs)
	allowedTopicsExpr, _ := compileRegexes(cfg.Topics.AllowedTopics)
	ignoredTopicsExpr, _ := compileRegexes(cfg.Topics.IgnoredTopics)

	service := &Service{
		Cfg:    cfg,
		logger: logger.Named("minion_service"),

		requestGroup: &singleflight.Group{},
		cache:        make(map[string]interface{}),
		cacheLock:    sync.RWMutex{},

		AllowedGroupIDsExpr: allowedGroupIDsExpr,
		IgnoredGroupIDsExpr: ignoredGroupIDsExpr,
		AllowedTopicsExpr:   allowedTopicsExpr,
		IgnoredTopicsExpr:   ignoredTopicsExpr,

		client:    client,
		admClient: kadm.NewClient(client),

		storage: storage,
	}

	return service, nil
}

func (s *Service) Start(ctx context.Context) error {
	err := s.ensureCompatibility(ctx)
	if err != nil {
		return fmt.Errorf("failed to check feature compatibility against Kafka: %w", err)
	}

	if s.Cfg.ConsumerGroups.Enabled && s.Cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeOffsetsTopic {
		go s.startConsumingOffsets(ctx)
	}

	return nil
}

func (s *Service) isReady() bool {
	if s.Cfg.ConsumerGroups.ScrapeMode == ConsumerGroupScrapeModeAdminAPI {
		return true
	}

	return s.storage.isReady()
}

func (s *Service) HandleIsReady() http.HandlerFunc {
	type response struct {
		StatusCode int `json:"statusCode"`
	}

	return func(w http.ResponseWriter, r *http.Request) {
		status := http.StatusOK
		if !s.isReady() {
			status = http.StatusServiceUnavailable
		}
		res := response{StatusCode: status}
		resJson, _ := json.Marshal(res)
		w.WriteHeader(status)
		w.Write(resJson)
	}
}

// ensureCompatibility checks whether the options as configured are available in the connected cluster. For example
// we will check if the target Kafka's API versions support the LogDirs request. If that's not the case we will
// disable the option and print a warning message.
func (s *Service) ensureCompatibility(ctx context.Context) error {
	ctx, cancel := context.WithTimeout(ctx, 15*time.Second)
	defer cancel()
	versionsRes, err := s.GetAPIVersions(ctx)
	if err != nil {
		return fmt.Errorf("kafka api versions couldn't be fetched: %w", err)
	}
	versions := kversion.FromApiVersionsResponse(versionsRes)

	// Check Describe Log Dirs
	if s.Cfg.LogDirs.Enabled {
		k := kmsg.NewDescribeLogDirsRequest()
		isSupported := versions.HasKey(k.Key())
		if !isSupported {
			s.logger.Warn("describing log dirs is enabled, but it is not supported because your Kafka cluster " +
				"version is too old. The feature will be disabled.")
			s.Cfg.LogDirs.Enabled = false
		}
	}

	return nil
}

func (s *Service) getCachedItem(key string) (interface{}, bool) {
	s.cacheLock.RLock()
	defer s.cacheLock.RUnlock()

	val, exists := s.cache[key]
	return val, exists
}

// setCachedItem stores the given value and schedules its expiry after the given timeout.
func (s *Service) setCachedItem(key string, val interface{}, timeout time.Duration) {
	s.cacheLock.Lock()
	defer s.cacheLock.Unlock()

	go func() {
		time.Sleep(timeout)
		s.deleteCachedItem(key)
	}()

	s.cache[key] = val
}

func (s *Service) deleteCachedItem(key string) {
	s.cacheLock.Lock()
	defer s.cacheLock.Unlock()

	delete(s.cache, key)
}
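
// For illustration only (a minimal sketch, not part of the original source): the read
// path of this service typically combines getCachedItem/setCachedItem with the
// singleflight requestGroup, so that concurrent Prometheus scrapes trigger at most one
// Kafka request per cache key. fetchMetadata is a hypothetical helper standing in for
// the real request methods elsewhere in this package.
//
//	func (s *Service) getMetadataCachedSketch(ctx context.Context) (interface{}, error) {
//		const key = "metadata"
//		if v, ok := s.getCachedItem(key); ok {
//			return v, nil
//		}
//		v, err, _ := s.requestGroup.Do(key, func() (interface{}, error) {
//			res, err := s.fetchMetadata(ctx) // hypothetical Kafka request
//			if err != nil {
//				return nil, err
//			}
//			s.setCachedItem(key, res, 5*time.Second)
//			return res, nil
//		})
//		return v, err
//	}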

================================================
FILE: minion/storage.go
================================================
package minion

import (
	"fmt"
	"strconv"
	"time"

	cmap "github.com/orcaman/concurrent-map"
	"github.com/twmb/franz-go/pkg/kgo"
	"github.com/twmb/franz-go/pkg/kmsg"
	"go.uber.org/atomic"
	"go.uber.org/zap"
)

// Storage stores the current state of all consumer group information that has been consumed using the offset consumer.
type Storage struct {
	logger *zap.Logger

	// offsetCommits is a map of all consumer offsets.
	// A unique key in the format "group:topic:partition" is used as map key.
	// Value is of type OffsetCommit
	offsetCommits cmap.ConcurrentMap

	// progressTracker is a map that tracks what offsets in each partition have already been consumed
	progressTracker cmap.ConcurrentMap
	isReadyBool     *atomic.Bool

	// Number of consumed records (used for a Prometheus metric)
	consumedRecords *atomic.Float64
}

// OffsetCommit is used as value for the offsetCommits map
type OffsetCommit struct {
	Key   kmsg.OffsetCommitKey
	Value kmsg.OffsetCommitValue

	// CommitCount is the number of offset commits for this group-topic-partition combination
	CommitCount int

	// ExpireTimestamp is a timestamp that indicates when this offset commit will expire on the Kafka cluster
	ExpireTimestamp time.Time
}

func newStorage(logger *zap.Logger) (*Storage, error) {
	return &Storage{
		logger:          logger.Named("storage"),
		offsetCommits:   cmap.New(),
		progressTracker: cmap.New(),
		isReadyBool:     atomic.NewBool(false),
		consumedRecords: atomic.NewFloat64(0),
	}, nil
}

func (s *Storage) isReady() bool {
	return s.isReadyBool.Load()
}

func (s *Storage) setReadyState(isReady bool) {
	s.isReadyBool.Store(isReady)
}

// markRecordConsumed stores the latest consumed offset for each partition. This is necessary in order to figure out
// whether we have caught up on the message lag when starting KMinion, as we start consuming from the very oldest
// offset commit.
func (s *Storage) markRecordConsumed(rec *kgo.Record) {
	key := fmt.Sprintf("%v", rec.Partition)
	s.progressTracker.Set(key, rec.Offset)
	s.consumedRecords.Add(1)
}

// addOffsetCommit stores an offset commit under a flat "unique key". For performance reasons we do this because
// writes happen way more frequently than reads (Prometheus scraping the endpoint). Hence we group everything by group
// or topic on the read path as needed, instead of writing it into nested maps like a map[GroupID]map[Topic]map[Partition].
func (s *Storage) addOffsetCommit(key kmsg.OffsetCommitKey, value kmsg.OffsetCommitValue) {
	uniqueKey := encodeOffsetCommitKey(key)

	commitCount := 0
	commitInterface, exists := s.offsetCommits.Get(uniqueKey)
	if exists {
		offsetCommit := commitInterface.(OffsetCommit)
		commitCount = offsetCommit.CommitCount
	}

	timeDay := 24 * time.Hour
	commit := OffsetCommit{
		Key:         key,
		Value:       value,
		CommitCount: commitCount + 1,
		// The 7 day expiry mirrors Kafka's default offset retention (offsets.retention.minutes).
		ExpireTimestamp: time.Unix(0, value.CommitTimestamp*int64(time.Millisecond)).Add(7 * timeDay),
	}
	s.offsetCommits.Set(uniqueKey, commit)
}
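
// For illustration (hypothetical names, not from the source): a commit by group
// "billing" for topic "orders", partition 3 is stored under the flat key
// "billing:orders:3" (see encodeOffsetCommitKey below). The key is only ever used as a
// map key and never parsed back, and Kafka topic names cannot contain ':', so the
// encoding stays unambiguous; getGroupOffsets rebuilds the nested
// group -> topic -> partition view on the read path.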

func (s *Storage) getConsumedOffsets() map[int32]int64 {
	offsetsByPartition := make(map[int32]int64)
	offsets := s.progressTracker.Items()
	for partitionIDStr, offset := range offsets {
		val := offset.(int64)
		partitionID, _ := strconv.ParseInt(partitionIDStr, 10, 32)
		offsetsByPartition[int32(partitionID)] = val
	}

	return offsetsByPartition
}

func (s *Storage) getNumberOfConsumedRecords() float64 {
	return s.consumedRecords.Load()
}

func (s *Storage) getGroupOffsets() map[string]map[string]map[int32]OffsetCommit {
	// Offsets by group, topic, partition
	offsetsByGroup := make(map[string]map[string]map[int32]OffsetCommit)
	if !s.isReady() {
		s.logger.Info("tried to fetch consumer group offsets, but haven't consumed the whole topic yet")
		return offsetsByGroup
	}

	offsets := s.offsetCommits.Items()
	for _, offset := range offsets {
		val := offset.(OffsetCommit)

		// Initialize inner maps as necessary
		if _, exists := offsetsByGroup[val.Key.Group]; !exists {
			offsetsByGroup[val.Key.Group] = make(map[string]map[int32]OffsetCommit)
		}
		if _, exists := offsetsByGroup[val.Key.Group][val.Key.Topic]; !exists {
			offsetsByGroup[val.Key.Group][val.Key.Topic] = make(map[int32]OffsetCommit)
		}
		offsetsByGroup[val.Key.Group][val.Key.Topic][val.Key.Partition] = val
	}

	return offsetsByGroup
}

func (s *Storage) deleteOffsetCommit(key kmsg.OffsetCommitKey) {
	uniqueKey := encodeOffsetCommitKey(key)
	s.offsetCommits.Remove(uniqueKey)
}

func encodeOffsetCommitKey(key kmsg.OffsetCommitKey) string {
	return fmt.Sprintf("%v:%v:%v", key.Group, key.Topic, key.Partition)
}

================================================
FILE: minion/utils.go
================================================
package minion

import (
	"fmt"
	"regexp"
	"strings"
)

func (s *Service) IsGroupAllowed(groupName string) bool {
	isAllowed := false
	for _, regex := range s.AllowedGroupIDsExpr {
		if regex.MatchString(groupName) {
			isAllowed = true
			break
		}
	}

	for _, regex := range s.IgnoredGroupIDsExpr {
		if regex.MatchString(groupName) {
			isAllowed = false
			break
		}
	}
	return isAllowed
}

func (s *Service) IsTopicAllowed(topicName string) bool {
	isAllowed := false
	for _, regex := range s.AllowedTopicsExpr {
		if regex.MatchString(topicName) {
			isAllowed = true
			break
		}
	}

	for _, regex := range s.IgnoredTopicsExpr {
		if regex.MatchString(topicName) {
			isAllowed = false
			break
		}
	}
	return isAllowed
}

func compileRegex(expr string) (*regexp.Regexp, error) {
	if strings.HasPrefix(expr, "/") && strings.HasSuffix(expr, "/") {
		substr := expr[1 : len(expr)-1]
		regex, err := regexp.Compile(substr)
		if err != nil {
			return nil, err
		}
		return regex, nil
	}

	// If the input is not marked as a regex (enclosing slashes), we anchor it with ^ and $ so that it has to match
	// the whole string.
	regex, err := regexp.Compile("^" + expr + "$")
	if err != nil {
		return nil, err
	}
	return regex, nil
}

func compileRegexes(expr []string) ([]*regexp.Regexp, error) {
	compiledExpressions := make([]*regexp.Regexp, len(expr))
	for i, exprStr := range expr {
		regex, err := compileRegex(exprStr)
		if err != nil {
			return nil, fmt.Errorf("failed to compile expression string '%v': %w", exprStr, err)
		}
		compiledExpressions[i] = regex
	}
	return compiledExpressions, nil
}
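
// For illustration (hypothetical expressions, not from the source): the allow/ignore
// lists accept either plain strings or regexes marked by enclosing slashes.
//
//	r, _ := compileRegex("payments")      // compiled as ^payments$, matches only "payments"
//	r.MatchString("payments-v2")          // false
//	r, _ = compileRegex("/^payments-.*/") // slashes stripped, compiled as ^payments-.*
//	r.MatchString("payments-v2")          // true
//
// Note that the plain form is anchored but not escaped, so regex metacharacters in it
// are still interpreted.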

================================================
FILE: minion/versions.go
================================================
package minion

import (
	"context"
	"fmt"

	"github.com/twmb/franz-go/pkg/kerr"
	"github.com/twmb/franz-go/pkg/kmsg"
	"github.com/twmb/franz-go/pkg/kversion"
)

func (s *Service) GetClusterVersion(ctx context.Context) (string, error) {
	res, err := s.GetAPIVersions(ctx)
	if err != nil {
		return "", err
	}
	versions := kversion.FromApiVersionsResponse(res)
	return versions.VersionGuess(), nil
}

func (s *Service) GetAPIVersions(ctx context.Context) (*kmsg.ApiVersionsResponse, error) {
	versionsReq := kmsg.NewApiVersionsRequest()
	versionsReq.ClientSoftwareName = "kminion"
	versionsReq.ClientSoftwareVersion = "v2"
	res, err := versionsReq.RequestWith(ctx, s.client)
	if err != nil {
		return nil, fmt.Errorf("failed to request api versions: %w", err)
	}

	err = kerr.ErrorForCode(res.ErrorCode)
	if err != nil {
		return nil, fmt.Errorf("failed to request api versions. Inner kafka error: %w", err)
	}

	return res, nil
}

================================================
FILE: prometheus/collect_broker_info.go
================================================
package prometheus

import (
	"context"
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"
)

func (e *Exporter) collectBrokerInfo(ctx context.Context, ch chan<- prometheus.Metric) bool {
	metadata, err := e.minionSvc.GetMetadataCached(ctx)
	if err != nil {
		e.logger.Error("failed to get kafka metadata", zap.Error(err))
		return false
	}

	for _, broker := range metadata.Brokers {
		rack := ""
		if broker.Rack != nil {
			rack = *broker.Rack
		}
		isController := metadata.ControllerID == broker.NodeID

		ch <- prometheus.MustNewConstMetric(
			e.brokerInfo,
			prometheus.GaugeValue,
			1,
			strconv.Itoa(int(broker.NodeID)),
			broker.Host,
			strconv.Itoa(int(broker.Port)),
			rack,
			strconv.FormatBool(isController),
		)
	}
	return true
}

================================================
FILE: prometheus/collect_cluster_info.go
================================================
package prometheus

import (
	"context"
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"
)

func (e *Exporter) collectClusterInfo(ctx context.Context, ch chan<- prometheus.Metric) bool {
	version, err := e.minionSvc.GetClusterVersion(ctx)
	if err != nil {
		e.logger.Error("failed to get kafka cluster version", zap.Error(err))
		return false
	}

	metadata, err := e.minionSvc.GetMetadataCached(ctx)
	if err != nil {
		e.logger.Error("failed to get kafka metadata", zap.Error(err))
		return false
	}
	brokerCount := len(metadata.Brokers)

	clusterID := ""
	if metadata.ClusterID != nil {
		clusterID = *metadata.ClusterID
	}

	ch <- prometheus.MustNewConstMetric(
		e.clusterInfo,
		prometheus.GaugeValue,
		1,
		version,
		strconv.Itoa(brokerCount),
		strconv.Itoa(int(metadata.ControllerID)),
		clusterID,
	)
	return true
}

================================================
FILE: prometheus/collect_consumer_group_lags.go
================================================
package prometheus

import (
	"context"
	"math"
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/twmb/franz-go/pkg/kadm"
	"github.com/twmb/franz-go/pkg/kerr"
	"go.uber.org/zap"

	"github.com/cloudhut/kminion/v2/minion"
)

type waterMark struct {
	TopicName     string
	PartitionID   int32
	LowWaterMark  int64
	HighWaterMark int64
}

func (e *Exporter) collectConsumerGroupLags(ctx context.Context, ch chan<- prometheus.Metric) bool {
	if !e.minionSvc.Cfg.ConsumerGroups.Enabled {
		return true
	}

	// Low watermarks (at the moment they are not needed at all; they could be used to calculate the lag on partitions
	// that don't have any active offsets)
	lowWaterMarks, err := e.minionSvc.ListStartOffsetsCached(ctx)
	if err != nil {
		e.logger.Error("failed to fetch low water marks", zap.Error(err))
		return false
	}
	// High watermarks
	highWaterMarks, err := e.minionSvc.ListEndOffsetsCached(ctx)
	if err != nil {
		e.logger.Error("failed to fetch high water marks", zap.Error(err))
		return false
	}
	waterMarksByTopic := e.waterMarksByTopic(lowWaterMarks, highWaterMarks)

	// We have two different options to get consumer group offsets - either via the AdminAPI or by consuming the
	// __consumer_offsets topic.
	if e.minionSvc.Cfg.ConsumerGroups.ScrapeMode == minion.ConsumerGroupScrapeModeAdminAPI {
		return e.collectConsumerGroupLagsAdminAPI(ctx, ch, waterMarksByTopic)
	}
	return e.collectConsumerGroupLagsOffsetTopic(ctx, ch, waterMarksByTopic)
}

func (e *Exporter) collectConsumerGroupLagsOffsetTopic(_ context.Context, ch chan<- prometheus.Metric, marks map[string]map[int32]waterMark) bool {
	offsets := e.minionSvc.ListAllConsumerGroupOffsetsInternal()
	for groupName, group := range offsets {
		if !e.minionSvc.IsGroupAllowed(groupName) {
			continue
		}
		offsetCommits := 0

		for topicName, topic := range group {
			topicLag := float64(0)
			topicOffsetSum := float64(0)
			for partitionID, partition := range topic {
				childLogger := e.logger.With(
					zap.String("consumer_group", groupName),
					zap.String("topic_name", topicName),
					zap.Int32("partition_id", partitionID),
					zap.Int64("group_offset", partition.Value.Offset))

				topicMark, exists := marks[topicName]
				if !exists {
					childLogger.Warn("consumer group has committed offsets on a topic we don't have watermarks for")
					break // We can stop trying to find any other offsets for that topic, so let's quit this loop
				}
				partitionMark, exists := topicMark[partitionID]
				if !exists {
					childLogger.Warn("consumer group has committed offsets on a partition we don't have watermarks for")
					continue
				}

				lag := float64(partitionMark.HighWaterMark - partition.Value.Offset)
				// Lag might be negative because we fetch the partition watermarks before we fetch the group offsets;
				// this is effectively a race condition. Negative lags do not make sense, so use at least 0 as lag.
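				// For illustration (hypothetical numbers, not from the source): if the high watermark was fetched as
				// 100 and the group then commits offset 103 before its offsets are read, the raw lag would be
				// 100 - 103 = -3; the clamp below reports it as 0.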
				lag = math.Max(0, lag)
				topicLag += lag
				topicOffsetSum += float64(partition.Value.Offset)

				// Offset commit count for this consumer group
				offsetCommits += partition.CommitCount

				if e.minionSvc.Cfg.ConsumerGroups.Granularity == minion.ConsumerGroupGranularityTopic {
					continue
				}
				ch <- prometheus.MustNewConstMetric(
					e.consumerGroupTopicPartitionLag,
					prometheus.GaugeValue,
					lag,
					groupName,
					topicName,
					strconv.Itoa(int(partitionID)),
				)
			}

			ch <- prometheus.MustNewConstMetric(
				e.consumerGroupTopicLag,
				prometheus.GaugeValue,
				topicLag,
				groupName,
				topicName,
			)
			ch <- prometheus.MustNewConstMetric(
				e.consumerGroupTopicOffsetSum,
				prometheus.GaugeValue,
				topicOffsetSum,
				groupName,
				topicName,
			)
		}
		ch <- prometheus.MustNewConstMetric(
			e.offsetCommits,
			prometheus.CounterValue,
			float64(offsetCommits),
			groupName,
		)
	}
	return true
}

func (e *Exporter) collectConsumerGroupLagsAdminAPI(ctx context.Context, ch chan<- prometheus.Metric, marks map[string]map[int32]waterMark) bool {
	isOk := true

	groupOffsets, err := e.minionSvc.ListAllConsumerGroupOffsetsAdminAPI(ctx)
	if err != nil {
		e.logger.Error("failed to fetch consumer group offsets via the admin API", zap.Error(err))
		return false
	}

	for groupName, offsetRes := range groupOffsets {
		if !e.minionSvc.IsGroupAllowed(groupName) {
			continue
		}

		err = kerr.ErrorForCode(offsetRes.ErrorCode)
		if err != nil {
			e.logger.Warn("failed to get offsets from consumer group, inner kafka error",
				zap.String("consumer_group", groupName),
				zap.Error(err))
			isOk = false
			continue
		}

		for _, topic := range offsetRes.Topics {
			topicLag := float64(0)
			topicOffsetSum := float64(0)
			for _, partition := range topic.Partitions {
				err := kerr.ErrorForCode(partition.ErrorCode)
				if err != nil {
					e.logger.Warn("failed to get consumer group offsets for a partition, inner kafka error",
						zap.String("consumer_group", groupName),
						zap.Error(err))
					isOk = false
					continue
				}
				childLogger := e.logger.With(
					zap.String("consumer_group", groupName),
					zap.String("topic_name", topic.Topic),
					zap.Int32("partition_id", partition.Partition),
					zap.Int64("group_offset", partition.Offset))

				topicMark, exists := marks[topic.Topic]
				if !exists {
					childLogger.Warn("consumer group has committed offsets on a topic we don't have watermarks for")
					isOk = false
					break // We can stop trying to find any other offsets for that topic, so let's quit this loop
				}
				partitionMark, exists := topicMark[partition.Partition]
				if !exists {
					childLogger.Warn("consumer group has committed offsets on a partition we don't have watermarks for")
					isOk = false
					continue
				}
				lag := float64(partitionMark.HighWaterMark - partition.Offset)
				// Negative lag can occur due to the same watermark/offset fetch-order race described above; clamp it
				// to at least 0.
				lag = math.Max(0, lag)

				topicLag += lag
				topicOffsetSum += float64(partition.Offset)

				if e.minionSvc.Cfg.ConsumerGroups.Granularity == minion.ConsumerGroupGranularityTopic {
					continue
				}
				ch <- prometheus.MustNewConstMetric(
					e.consumerGroupTopicPartitionLag,
					prometheus.GaugeValue,
					lag,
					groupName,
					topic.Topic,
					strconv.Itoa(int(partition.Partition)),
				)
			}
			ch <- prometheus.MustNewConstMetric(
				e.consumerGroupTopicLag,
				prometheus.GaugeValue,
				topicLag,
				groupName,
				topic.Topic,
			)
			ch <- prometheus.MustNewConstMetric(
				e.consumerGroupTopicOffsetSum,
				prometheus.GaugeValue,
				topicOffsetSum,
				groupName,
				topic.Topic,
			)
		}
	}
	return isOk
}

func (e *Exporter) waterMarksByTopic(lowMarks kadm.ListedOffsets, highMarks kadm.ListedOffsets) map[string]map[int32]waterMark {
	type partitionID = int32
	type topicName = string
	waterMarks := make(map[topicName]map[partitionID]waterMark)

	for topic, lowMarksByPartitionID := range lowMarks {
		_, exists := waterMarks[topic]
		if !exists {
			waterMarks[topic] = make(map[partitionID]waterMark)
		}

		for _, lowOffset := range lowMarksByPartitionID {
			if lowOffset.Err != nil {
				e.logger.Debug("failed to get partition low water mark, inner kafka error",
					zap.String("topic_name", lowOffset.Topic),
					zap.Int32("partition_id", lowOffset.Partition),
					zap.Error(lowOffset.Err))
				continue
			}

			highOffset, exists := highMarks.Lookup(lowOffset.Topic, lowOffset.Partition)
			if !exists {
				e.logger.Error("got low water marks for a topic's partition but no high watermarks",
					zap.String("topic_name", lowOffset.Topic),
					zap.Int32("partition_id", lowOffset.Partition),
					zap.Int64("offset", lowOffset.Offset))
				delete(waterMarks, lowOffset.Topic)
				break // Topic watermarks are invalid -> delete & skip this topic
			}
			if highOffset.Err != nil {
				e.logger.Debug("failed to get partition high water mark, inner kafka error",
					zap.String("topic_name", lowOffset.Topic),
					zap.Int32("partition_id", lowOffset.Partition),
					zap.Error(highOffset.Err))
				continue
			}

			waterMarks[lowOffset.Topic][lowOffset.Partition] = waterMark{
				TopicName:     lowOffset.Topic,
				PartitionID:   lowOffset.Partition,
				LowWaterMark:  lowOffset.Offset,
				HighWaterMark: highOffset.Offset,
			}
		}
	}

	return waterMarks
}

================================================
FILE: prometheus/collect_consumer_groups.go
================================================
package prometheus

import (
	"context"
	"fmt"
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/twmb/franz-go/pkg/kerr"
	"github.com/twmb/franz-go/pkg/kmsg"
	"go.uber.org/zap"
)

func (e *Exporter) collectConsumerGroups(ctx context.Context, ch chan<- prometheus.Metric) bool {
	if !e.minionSvc.Cfg.ConsumerGroups.Enabled {
		return true
	}
	groups, err := e.minionSvc.DescribeConsumerGroups(ctx)
	if err != nil {
		e.logger.Error("failed to collect consumer groups, because Kafka request failed", zap.Error(err))
		return false
	}

	// The list of groups may be incomplete due to group coordinators that might fail to respond. We do log an error
	// message in that case (in the kafka request method) and those groups will not be included in this list.
	for _, grp := range groups {
		coordinator := grp.BrokerMetadata.NodeID
		for _, group := range grp.Groups.Groups {
			err := kerr.ErrorForCode(group.ErrorCode)
			if err != nil {
				e.logger.Warn("failed to describe consumer group, internal kafka error",
					zap.Error(err),
					zap.String("group_id", group.Group),
				)
				continue
			}
			if !e.minionSvc.IsGroupAllowed(group.Group) {
				continue
			}
			state := 0
			if group.State == "Stable" {
				state = 1
			}
			ch <- prometheus.MustNewConstMetric(
				e.consumerGroupInfo,
				prometheus.GaugeValue,
				float64(state),
				group.Group,
				group.Protocol,
				group.ProtocolType,
				group.State,
				strconv.FormatInt(int64(coordinator), 10),
			)

			// total number of members in consumer groups
			ch <- prometheus.MustNewConstMetric(
				e.consumerGroupMembers,
				prometheus.GaugeValue,
				float64(len(group.Members)),
				group.Group,
			)

			// iterate all members and build two maps:
			// - {topic -> number-of-consumers}
			// - {topic -> number-of-partitions-assigned}
			topicConsumers := make(map[string]int)
			topicPartitionsAssigned := make(map[string]int)
			membersWithEmptyAssignment := 0
			failedAssignmentsDecode := 0
			for _, member := range group.Members {
				if len(member.MemberAssignment) == 0 {
					membersWithEmptyAssignment++
					continue
				}
				kassignment, err := decodeMemberAssignments(group.ProtocolType, member)
				if err != nil {
					e.logger.Debug("failed to decode consumer group member assignment, internal kafka error",
						zap.Error(err),
						zap.String("group_id", group.Group),
						zap.String("client_id", member.ClientID),
						zap.String("member_id", member.MemberID),
						zap.String("client_host", member.ClientHost),
					)
					failedAssignmentsDecode++
					continue
				}
				if kassignment == nil {
					// This is expected for protocolTypes that don't provide valuable information
					continue
				}

				if len(kassignment.Topics) == 0 {
					membersWithEmptyAssignment++
				}
				for _, topic := range kassignment.Topics {
					topicConsumers[topic.Topic]++
					topicPartitionsAssigned[topic.Topic] += len(topic.Partitions)
				}
			}

			if failedAssignmentsDecode > 0 {
				e.logger.Error("failed to decode consumer group member assignments, internal kafka error",
					zap.String("group_id", group.Group),
					zap.Int("assignment_decode_failures", failedAssignmentsDecode),
				)
			}

			// number of members with no assignment in a stable consumer group
			if membersWithEmptyAssignment > 0 {
				ch <- prometheus.MustNewConstMetric(
					e.consumerGroupMembersEmpty,
					prometheus.GaugeValue,
					float64(membersWithEmptyAssignment),
					group.Group,
				)
			}
			// number of members in consumer groups for each topic
			for topicName, consumers := range topicConsumers {
				ch <- prometheus.MustNewConstMetric(
					e.consumerGroupTopicMembers,
					prometheus.GaugeValue,
					float64(consumers),
					group.Group,
					topicName,
				)
			}
			// number of partitions assigned in consumer groups for each topic
			for topicName, partitions := range topicPartitionsAssigned {
				ch <- prometheus.MustNewConstMetric(
					e.consumerGroupAssignedTopicPartitions,
					prometheus.GaugeValue,
					float64(partitions),
					group.Group,
					topicName,
				)
			}
		}
	}
	return true
}

func decodeMemberAssignments(protocolType string, member kmsg.DescribeGroupsResponseGroupMember) (*kmsg.ConsumerMemberAssignment, error) {
	switch protocolType {
	case "consumer":
		a := kmsg.NewConsumerMemberAssignment()
		if err := a.ReadFrom(member.MemberAssignment); err != nil {
			return nil, fmt.Errorf("failed to decode member assignment: %w", err)
		}
		return &a, nil
	case "connect":
		return nil, nil
	default:
		return nil, nil
	}
}
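
// For illustration (hypothetical assignments, not from the source): in a "consumer"
// protocol group where member A is assigned partitions 0-2 of topic "orders" and member
// B partitions 3-5, the loop above yields topicConsumers["orders"] = 2 and
// topicPartitionsAssigned["orders"] = 6. For the "connect" protocol type (and any other
// unknown protocol type) decodeMemberAssignments deliberately returns nil, so only the
// member count metrics are emitted for such groups.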
"github.com/prometheus/client_golang/prometheus" ) func (e *Exporter) collectExporterMetrics(_ context.Context, ch chan<- prometheus.Metric) bool { recordsConsumed := e.minionSvc.GetNumberOfOffsetRecordsConsumed() ch <- prometheus.MustNewConstMetric( e.offsetConsumerRecordsConsumed, prometheus.CounterValue, recordsConsumed, ) return true } ================================================ FILE: prometheus/collect_log_dirs.go ================================================ package prometheus import ( "context" "github.com/prometheus/client_golang/prometheus" "github.com/twmb/franz-go/pkg/kerr" "github.com/twmb/franz-go/pkg/kgo" "go.uber.org/zap" "strconv" ) func (e *Exporter) collectLogDirs(ctx context.Context, ch chan<- prometheus.Metric) bool { if !e.minionSvc.Cfg.LogDirs.Enabled { return true } isOk := true sizeByBroker := make(map[kgo.BrokerMetadata]int64) sizeByTopicName := make(map[string]int64) logDirsSharded := e.minionSvc.DescribeLogDirs(ctx) for _, logDirRes := range logDirsSharded { childLogger := e.logger.With(zap.String("broker_address", logDirRes.Broker.Host), zap.String("broker_id", strconv.Itoa(int(logDirRes.Broker.NodeID)))) if logDirRes.Err != nil { childLogger.Error("failed to describe a broker's log dirs", zap.Error(logDirRes.Err)) isOk = false continue } for _, dir := range logDirRes.LogDirs.Dirs { err := kerr.ErrorForCode(dir.ErrorCode) if err != nil { childLogger.Error("failed to describe a broker's log dir", zap.String("log_dir", dir.Dir), zap.Error(err)) isOk = false continue } for _, topic := range dir.Topics { topicSize := int64(0) for _, partition := range topic.Partitions { topicSize += partition.Size } sizeByTopicName[topic.Topic] += topicSize sizeByBroker[logDirRes.Broker] += topicSize } } } // Report the total log dir size per broker for broker, size := range sizeByBroker { rackID := "" if broker.Rack != nil { rackID = *broker.Rack } ch <- prometheus.MustNewConstMetric( e.brokerLogDirSize, prometheus.GaugeValue, float64(size), strconv.Itoa(int(broker.NodeID)), broker.Host, strconv.Itoa(int(broker.Port)), rackID, ) } // If one of the log dir responses returned an error we can not reliably report the topic log dirs, as there might // be additional data on the brokers that failed to respond. 
	if !isOk {
		return false
	}

	// Report the total log dir size per topic
	for topicName, size := range sizeByTopicName {
		ch <- prometheus.MustNewConstMetric(
			e.topicLogDirSize,
			prometheus.GaugeValue,
			float64(size),
			topicName,
		)
	}

	return isOk
}

================================================
FILE: prometheus/collect_topic_info.go
================================================
package prometheus

import (
	"context"
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/twmb/franz-go/pkg/kerr"
	"go.uber.org/zap"
)

func (e *Exporter) collectTopicInfo(ctx context.Context, ch chan<- prometheus.Metric) bool {
	if !e.minionSvc.Cfg.Topics.Enabled {
		return true
	}

	metadata, err := e.minionSvc.GetMetadataCached(ctx)
	if err != nil {
		e.logger.Error("failed to get metadata", zap.Error(err))
		return false
	}

	topicConfigs, err := e.minionSvc.GetTopicConfigs(ctx)
	if err != nil {
		e.logger.Error("failed to get topic configs", zap.Error(err))
		return false
	}

	isOk := true

	// configsByTopic is indexed by topic name and config resource name (inner key)
	configsByTopic := make(map[string]map[string]string)
	for _, resource := range topicConfigs.Resources {
		configsByTopic[resource.ResourceName] = make(map[string]string)
		typedErr := kerr.TypedErrorForCode(resource.ErrorCode)
		if typedErr != nil {
			isOk = false
			e.logger.Warn("failed to get topic config of a specific topic",
				zap.String("topic_name", resource.ResourceName),
				zap.Error(typedErr))
			continue
		}

		for _, config := range resource.Configs {
			confVal := "nil"
			if config.Value != nil {
				confVal = *config.Value
			}
			configsByTopic[resource.ResourceName][config.Name] = confVal
		}
	}

	for _, topic := range metadata.Topics {
		topicName := *topic.Topic
		if !e.minionSvc.IsTopicAllowed(topicName) {
			continue
		}
		typedErr := kerr.TypedErrorForCode(topic.ErrorCode)
		if typedErr != nil {
			isOk = false
			e.logger.Warn("failed to get metadata of a specific topic",
				zap.String("topic_name", topicName),
				zap.Error(typedErr))
			continue
		}
		partitionCount := len(topic.Partitions)
		replicationFactor := -1
		if partitionCount > 0 {
			// The partition list should never be empty here, but we check it anyway so that we don't cause a panic.
			replicationFactor = len(topic.Partitions[0].Replicas)
		}
		var labelsValues []string
		labelsValues = append(labelsValues, topicName)
		labelsValues = append(labelsValues, strconv.Itoa(partitionCount))
		labelsValues = append(labelsValues, strconv.Itoa(replicationFactor))
		for _, key := range e.minionSvc.Cfg.Topics.InfoMetric.ConfigKeys {
			labelsValues = append(labelsValues, getOrDefault(configsByTopic[topicName], key, "N/A"))
		}
		ch <- prometheus.MustNewConstMetric(
			e.topicInfo,
			prometheus.GaugeValue,
			float64(1),
			labelsValues...,
		)
	}

	return isOk
}

func getOrDefault(m map[string]string, key string, defaultValue string) string {
	if value, exists := m[key]; exists {
		return value
	}
	return defaultValue
}

================================================
FILE: prometheus/collect_topic_partition_offsets.go
================================================
package prometheus

import (
	"context"
	"strconv"

	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"

	"github.com/cloudhut/kminion/v2/minion"
)

func (e *Exporter) collectTopicPartitionOffsets(ctx context.Context, ch chan<- prometheus.Metric) bool {
	if !e.minionSvc.Cfg.Topics.Enabled {
		return true
	}

	isOk := true

	// Low watermarks
	lowWaterMarks, err := e.minionSvc.ListStartOffsetsCached(ctx)
	if err != nil {
		e.logger.Error("failed to fetch low water marks", zap.Error(err))
		return false
	}
	// High watermarks
	highWaterMarks, err := e.minionSvc.ListEndOffsetsCached(ctx)
	if err != nil {
		e.logger.Error("failed to fetch high water marks", zap.Error(err))
		return false
	}

	// Process low watermarks
	for topicName, partitions := range lowWaterMarks {
		if !e.minionSvc.IsTopicAllowed(topicName) {
			continue
		}

		waterMarkSum := int64(0)
		hasErrors := false
		for _, offset := range partitions {
			if offset.Err != nil {
				hasErrors = true
				isOk = false
				continue
			}
			waterMarkSum += offset.Offset
			// Skip the per-partition metric if partition metrics shall not be exposed
			if e.minionSvc.Cfg.Topics.Granularity == minion.TopicGranularityTopic {
				continue
			}
			ch <- prometheus.MustNewConstMetric(
				e.partitionLowWaterMark,
				prometheus.GaugeValue,
				float64(offset.Offset),
				topicName,
				strconv.Itoa(int(offset.Partition)),
			)
		}
		// We only want to report the sum of all partition marks if we received watermarks from all partitions
		if !hasErrors {
			ch <- prometheus.MustNewConstMetric(
				e.topicLowWaterMarkSum,
				prometheus.GaugeValue,
				float64(waterMarkSum),
				topicName,
			)
		}
	}

	// Process high watermarks
	for topicName, partitions := range highWaterMarks {
		if !e.minionSvc.IsTopicAllowed(topicName) {
			continue
		}

		waterMarkSum := int64(0)
		hasErrors := false
		for _, offset := range partitions {
			if offset.Err != nil {
				hasErrors = true
				isOk = false
				continue
			}
			waterMarkSum += offset.Offset
			// Skip the per-partition metric if partition metrics shall not be exposed
			if e.minionSvc.Cfg.Topics.Granularity == minion.TopicGranularityTopic {
				continue
			}
			ch <- prometheus.MustNewConstMetric(
				e.partitionHighWaterMark,
				prometheus.GaugeValue,
				float64(offset.Offset),
				topicName,
				strconv.Itoa(int(offset.Partition)),
			)
		}
		// We only want to report the sum of all partition marks if we received watermarks from all partitions
		if !hasErrors {
			ch <- prometheus.MustNewConstMetric(
				e.topicHighWaterMarkSum,
				prometheus.GaugeValue,
				float64(waterMarkSum),
				topicName,
			)
		}
	}

	return isOk
}

================================================
FILE: prometheus/config.go
================================================
package prometheus

type Config struct {
	Host      string `koanf:"host"`
	Port      int    `koanf:"port"`
	Namespace string `koanf:"namespace"`
}

func (c *Config) SetDefaults() {
	c.Port = 8080
	c.Namespace = "kminion"
}

================================================
FILE: prometheus/exporter.go
================================================
package prometheus

import (
	"context"
	"os"
	"strings"
	"time"

	uuid2 "github.com/google/uuid"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"

	"github.com/cloudhut/kminion/v2/minion"
)

// Exporter is the Prometheus exporter that implements the prometheus.Collector interface
type Exporter struct {
	cfg       Config
	logger    *zap.Logger
	minionSvc *minion.Service

	// Exporter metrics
	exporterUp                    *prometheus.Desc
	offsetConsumerRecordsConsumed *prometheus.Desc

	// Kafka metrics
	// General
	clusterInfo *prometheus.Desc
	brokerInfo  *prometheus.Desc

	// Log Dir Sizes
	brokerLogDirSize *prometheus.Desc
	topicLogDirSize  *prometheus.Desc

	// Topic / Partition
	topicInfo              *prometheus.Desc
	topicHighWaterMarkSum  *prometheus.Desc
	partitionHighWaterMark *prometheus.Desc
	topicLowWaterMarkSum   *prometheus.Desc
	partitionLowWaterMark  *prometheus.Desc

	// Consumer Groups
	consumerGroupInfo                    *prometheus.Desc
	consumerGroupMembers                 *prometheus.Desc
	consumerGroupMembersEmpty            *prometheus.Desc
	consumerGroupTopicMembers            *prometheus.Desc
	consumerGroupAssignedTopicPartitions *prometheus.Desc
	consumerGroupTopicOffsetSum          *prometheus.Desc
	consumerGroupTopicPartitionLag       *prometheus.Desc
	consumerGroupTopicLag                *prometheus.Desc
	offsetCommits                        *prometheus.Desc
}

func NewExporter(cfg Config, logger *zap.Logger, minionSvc *minion.Service) (*Exporter, error) {
	return &Exporter{cfg: cfg, logger: logger.Named("prometheus"), minionSvc: minionSvc}, nil
}

func (e *Exporter) InitializeMetrics() {
	// Exporter / internal metrics
	// Exporter up
	e.exporterUp = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "exporter", "up"),
		"Build info about this Prometheus Exporter. Gauge value is 0 if one or more scrapes have failed.",
		nil,
		map[string]string{"version": os.Getenv("VERSION")},
	)
	// OffsetConsumer records consumed
	e.offsetConsumerRecordsConsumed = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "exporter", "offset_consumer_records_consumed_total"),
		"The number of offset records that have been consumed by the internal offset consumer",
		[]string{},
		nil,
	)

	// Kafka metrics
	// Cluster info
	e.clusterInfo = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "cluster_info"),
		"Kafka cluster information",
		[]string{"cluster_version", "broker_count", "controller_id", "cluster_id"},
		nil,
	)
	// Broker Info
	e.brokerInfo = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "broker_info"),
		"Kafka broker information",
		[]string{"broker_id", "address", "port", "rack_id", "is_controller"},
		nil,
	)

	// LogDir sizes
	e.brokerLogDirSize = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "broker_log_dir_size_total_bytes"),
		"The summed size in bytes of all log dirs for a given broker",
		[]string{"broker_id", "address", "port", "rack_id"},
		nil,
	)
	e.topicLogDirSize = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "topic_log_dir_size_total_bytes"),
		"The summed size in bytes of partitions for a given topic. This includes the used space for replica partitions.",
		[]string{"topic_name"},
		nil,
	)

	// Topic / Partition metrics
	// Topic info
	var labels = []string{"topic_name", "partition_count", "replication_factor"}
	for _, key := range e.minionSvc.Cfg.Topics.InfoMetric.ConfigKeys {
		// Prometheus does not allow '.' in label keys
		labels = append(labels, strings.ReplaceAll(key, ".", "_"))
	}
	e.topicInfo = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "topic_info"),
		"Info labels for a given topic",
		labels,
		nil,
	)
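
	// For illustration (assuming the default namespace "kminion" from prometheus/config.go): BuildFQName joins its
	// parts with underscores, so the descriptor above is exposed as "kminion_kafka_topic_info", and a configured
	// info-metric config key such as "cleanup.policy" becomes the label "cleanup_policy".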

	// Partition Low Water Mark
	e.partitionLowWaterMark = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "topic_partition_low_water_mark"),
		"Partition Low Water Mark",
		[]string{"topic_name", "partition_id"},
		nil,
	)
	// Topic Low Water Mark Sum
	e.topicLowWaterMarkSum = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "topic_low_water_mark_sum"),
		"Sum of all the topic's partition low water marks",
		[]string{"topic_name"},
		nil,
	)
	// Partition High Water Mark
	e.partitionHighWaterMark = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "topic_partition_high_water_mark"),
		"Partition High Water Mark",
		[]string{"topic_name", "partition_id"},
		nil,
	)
	// Topic High Water Mark Sum
	e.topicHighWaterMarkSum = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "topic_high_water_mark_sum"),
		"Sum of all the topic's partition high water marks",
		[]string{"topic_name"},
		nil,
	)

	// Consumer Group Metrics
	// Group Info
	e.consumerGroupInfo = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "consumer_group_info"),
		"Consumer Group info metrics. It will report 1 if the group is in the stable state, otherwise 0.",
		[]string{"group_id", "protocol", "protocol_type", "state", "coordinator_id"},
		nil,
	)
	// Group Members
	e.consumerGroupMembers = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "consumer_group_members"),
		"Consumer Group member count metrics. It will report the number of members in the consumer group",
		[]string{"group_id"},
		nil,
	)
	// Group Empty Members
	e.consumerGroupMembersEmpty = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "consumer_group_empty_members"),
		"It will report the number of members in the consumer group with no partition assigned",
		[]string{"group_id"},
		nil,
	)
	// Group Topic Members
	e.consumerGroupTopicMembers = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "consumer_group_topic_members"),
		"It will report the number of members in the consumer group assigned on a given topic",
		[]string{"group_id", "topic_name"},
		nil,
	)
	// Group Topic Assigned Partitions
	e.consumerGroupAssignedTopicPartitions = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "consumer_group_topic_assigned_partitions"),
		"It will report the number of partitions assigned in the consumer group for a given topic",
		[]string{"group_id", "topic_name"},
		nil,
	)
	// Topic / Partition Offset Sum (useful for calculating the consumed messages / sec on a topic)
	e.consumerGroupTopicOffsetSum = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "consumer_group_topic_offset_sum"),
		"The sum of all committed group offsets across all partitions in a topic",
		[]string{"group_id", "topic_name"},
		nil,
	)
	// Partition Lag
	e.consumerGroupTopicPartitionLag = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "consumer_group_topic_partition_lag"),
		"The number of messages a consumer group is lagging behind the latest offset of a partition",
		[]string{"group_id", "topic_name", "partition_id"},
		nil,
	)
	// Topic Lag (sum of all partition lags)
	e.consumerGroupTopicLag = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "consumer_group_topic_lag"),
		"The number of messages a consumer group is lagging behind across all partitions in a topic",
		[]string{"group_id", "topic_name"},
		nil,
	)
	// Offset commits by group id
	e.offsetCommits = prometheus.NewDesc(
		prometheus.BuildFQName(e.cfg.Namespace, "kafka", "consumer_group_offset_commits_total"),
		"The number of offsets committed by a group",
		[]string{"group_id"},
		nil,
	)
}

// Describe implements the prometheus.Collector interface. It sends a subset of the possible descriptors of metrics
// collected by this Collector to the provided channel; the remaining metrics are built at collection time and are
// effectively unchecked.
func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
	ch <- e.exporterUp
	ch <- e.clusterInfo
}

func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
	defer cancel()

	// Attach a unique id which will be used for caching (and its invalidation) of the kafka requests
	uuid := uuid2.New()
	ctx = context.WithValue(ctx, "requestId", uuid.String())

	ok := e.collectClusterInfo(ctx, ch)
	ok = e.collectExporterMetrics(ctx, ch) && ok
	ok = e.collectBrokerInfo(ctx, ch) && ok
	ok = e.collectLogDirs(ctx, ch) && ok
	ok = e.collectConsumerGroups(ctx, ch) && ok
	ok = e.collectTopicPartitionOffsets(ctx, ch) && ok
	ok = e.collectConsumerGroupLags(ctx, ch) && ok
	ok = e.collectTopicInfo(ctx, ch) && ok

	if ok {
		ch <- prometheus.MustNewConstMetric(e.exporterUp, prometheus.GaugeValue, 1.0)
	} else {
		ch <- prometheus.MustNewConstMetric(e.exporterUp, prometheus.GaugeValue, 0.0)
	}
}
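
// For reference, a minimal sketch of how this collector is typically wired up (assumed
// wiring for illustration; the authoritative setup lives in main.go):
//
//	exporter, _ := NewExporter(cfg, logger, minionSvc)
//	exporter.InitializeMetrics()
//	prometheus.MustRegister(exporter)
//	http.Handle("/metrics", promhttp.Handler()) // github.com/prometheus/client_golang/prometheus/promhttp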