Repository: cloudworkz/kafka-minion
Branch: master
Commit: 06075b9f33bd
Files: 107
Total size: 334.9 KB
Directory structure:
gitextract_ind4nfvb/
├── .github/
│ ├── ai-opt-out
│ └── workflows/
│ ├── docker-image.yml
│ └── goreleaser.yml
├── .gitignore
├── .goreleaser.yml
├── Dockerfile
├── LICENSE
├── README.md
├── charts/
│ ├── README.md
│ ├── archives/
│ │ ├── .gitkeep
│ │ ├── index.yaml
│ │ ├── kminion-0.1.0.tgz
│ │ ├── kminion-0.1.1.tgz
│ │ ├── kminion-0.1.2.tgz
│ │ ├── kminion-0.1.3.tgz
│ │ ├── kminion-0.11.1.tgz
│ │ ├── kminion-0.11.2.tgz
│ │ ├── kminion-0.11.3.tgz
│ │ ├── kminion-0.12.0.tgz
│ │ ├── kminion-0.2.0.tgz
│ │ ├── kminion-0.2.1.tgz
│ │ ├── kminion-0.2.2.tgz
│ │ ├── kminion-0.3.0.tgz
│ │ ├── kminion-0.3.1.tgz
│ │ ├── kminion-0.4.0.tgz
│ │ ├── kminion-0.5.0.tgz
│ │ ├── kminion-0.6.0.tgz
│ │ ├── kminion-0.7.0.tgz
│ │ ├── kminion-0.8.0.tgz
│ │ ├── kminion-0.8.1.tgz
│ │ ├── kminion-0.8.2.tgz
│ │ ├── kminion-0.8.3.tgz
│ │ └── kminion-0.9.0.tgz
│ └── kminion/
│ ├── .helmignore
│ ├── Chart.yaml
│ ├── templates/
│ │ ├── NOTES.txt
│ │ ├── _helpers.tpl
│ │ ├── configmap.yaml
│ │ ├── daemonset.yaml
│ │ ├── deployment.yaml
│ │ ├── hpa.yaml
│ │ ├── ingress.yaml
│ │ ├── poddisruptionbudget.yaml
│ │ ├── service.yaml
│ │ ├── serviceaccount.yaml
│ │ └── servicemonitor.yaml
│ └── values.yaml
├── config.go
├── docker-compose.yml
├── docs/
│ ├── end-to-end.md
│ ├── metrics.md
│ └── reference-config.yaml
├── e2e/
│ ├── client_hooks.go
│ ├── config.go
│ ├── config_consumer.go
│ ├── config_producer.go
│ ├── config_topic.go
│ ├── config_topic_test.go
│ ├── consumer.go
│ ├── endtoend_message.go
│ ├── group_tracker.go
│ ├── message_tracker.go
│ ├── partition_planner.go
│ ├── partition_planner_test.go
│ ├── producer.go
│ ├── service.go
│ ├── topic.go
│ └── utils.go
├── go.mod
├── go.sum
├── kafka/
│ ├── client_config_helper.go
│ ├── client_logger.go
│ ├── config.go
│ ├── config_sasl.go
│ ├── config_sasl_gssapi.go
│ ├── config_sasl_oauthbearer.go
│ ├── config_tls.go
│ └── service.go
├── logging/
│ ├── config.go
│ └── logger.go
├── main.go
├── minion/
│ ├── client_hooks.go
│ ├── config.go
│ ├── config_consumer_group.go
│ ├── config_log_dirs.go
│ ├── config_topic_config.go
│ ├── consumer_group_offsets.go
│ ├── describe_consumer_groups.go
│ ├── describe_topic_config.go
│ ├── list_offsets.go
│ ├── log_dirs.go
│ ├── metadata.go
│ ├── offset_consumer.go
│ ├── service.go
│ ├── storage.go
│ ├── utils.go
│ └── versions.go
└── prometheus/
├── collect_broker_info.go
├── collect_cluster_info.go
├── collect_consumer_group_lags.go
├── collect_consumer_groups.go
├── collect_exporter_metrics.go
├── collect_log_dirs.go
├── collect_topic_info.go
├── collect_topic_partition_offsets.go
├── config.go
└── exporter.go
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ai-opt-out
================================================
opt-out: true
================================================
FILE: .github/workflows/docker-image.yml
================================================
---
name: docker-image
on:
push:
tags: ['*']
branches: ['master']
paths-ignore: ['charts/**']
permissions:
id-token: write
contents: read
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: ${{ vars.RP_AWS_CRED_REGION }}
role-to-assume: arn:aws:iam::${{ secrets.RP_AWS_CRED_ACCOUNT_ID }}:role/${{ vars.RP_AWS_CRED_BASE_ROLE_NAME }}${{ github.event.repository.name }}
- uses: aws-actions/aws-secretsmanager-get-secrets@v2
with:
secret-ids: |
,sdlc/prod/github/dockerhub
parse-json-secrets: true
- uses: actions/checkout@v4
- uses: docker/setup-qemu-action@v3
- uses: docker/setup-buildx-action@v3
with:
driver-opts: |
image=moby/buildkit:v0.21.1
network=host
- name: Set build date
run: |
echo "BUILT_AT=$(date --rfc-3339=date)" >> ${GITHUB_ENV}
- uses: docker/metadata-action@v5
id: docker_meta
with:
# list of Docker images to use as base name for tags
images: |
redpandadata/kminion
name=public.ecr.aws/l9j0i2e0/kminion,enable=${{ startsWith(github.ref, 'refs/tags/v') }}
# generate Docker tags based on the following events/attributes
# Semver type is only active on 'push tag' events,
# hence no enable condition required
tags: |
type=sha,prefix={{branch}}-,format=short,enable={{is_default_branch}}
type=semver,pattern={{raw}}
- uses: docker/login-action@v3
with:
username: ${{ env.DOCKERHUB_USER }}
password: ${{ env.DOCKERHUB_TOKEN }}
- uses: aws-actions/configure-aws-credentials@v4
if: ${{ startsWith(github.ref, 'refs/tags/v') }}
with:
aws-region: us-east-1
role-to-assume: arn:aws:iam::${{ secrets.RP_AWS_CRED_ACCOUNT_ID }}:role/${{ vars.RP_AWS_CRED_BASE_ROLE_NAME }}${{ github.event.repository.name }}
- uses: aws-actions/amazon-ecr-login@v2
if: ${{ startsWith(github.ref, 'refs/tags/v') }}
with:
registry-type: public
- uses: docker/build-push-action@v6
with:
provenance: false
push: true
platforms: linux/amd64,linux/arm64
tags: ${{ steps.docker_meta.outputs.tags }}
build-args: |
VERSION=${{ fromJSON(steps.docker_meta.outputs.json).labels['org.opencontainers.image.version'] }}
BUILT_AT=${{ env.BUILT_AT }}
COMMIT=${{ github.sha }}
cache-from: type=gha
cache-to: type=gha,mode=max
================================================
FILE: .github/workflows/goreleaser.yml
================================================
---
name: goreleaser
on:
push:
tags: ['*']
jobs:
goreleaser:
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- uses: actions/setup-go@v6
with:
go-version-file: 'go.mod'
- uses: goreleaser/goreleaser-action@v7
if: startsWith(github.ref, 'refs/tags/')
with:
version: latest
args: release --clean
workdir: .
env:
CGO_ENABLED: 0
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GORELEASER_CURRENT_TAG: ${{ github.ref_name }}
================================================
FILE: .gitignore
================================================
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
zk-single-kafka-single
zk-multiple-kafka-multiple
.vscode
.idea
config
/kminion
================================================
FILE: .goreleaser.yml
================================================
---
version: 2
release:
name_template: '{{.Version}} / {{time "2006-01-02"}}'
prerelease: auto
mode: append
footer: |
## Docker Image
Use the following command to pull this release's Docker image:
```sh
docker pull redpandadata/kminion:{{ .Tag }}
```
changelog:
disable: false
use: github
filters:
# Commit messages matching the regexp listed here will be removed from the changelog
exclude:
- '^docs:'
- '^test:'
- '^npm:'
- '^go.mod:'
- '^.github:'
- 'Merge branch'
builds:
- id: kminion
binary: kminion
goos:
- darwin
- linux
- windows
goarch:
- amd64
- arm64
ldflags:
- -s -w -X main.version={{.Version}} -X main.builtAt={{.Date}} -X main.commit={{.Commit}}
checksum:
name_template: 'checksums.txt'
================================================
FILE: Dockerfile
================================================
############################################################
# Build image
############################################################
FROM golang:1.26-alpine AS builder
ARG VERSION
ARG BUILT_AT
ARG COMMIT
RUN apk update && apk upgrade --no-cache && apk add --no-cache git ca-certificates && update-ca-certificates
WORKDIR /app
COPY go.mod .
COPY go.sum .
RUN go mod download
COPY . .
RUN CGO_ENABLED=0 go build \
-ldflags="-w -s \
-X main.version=$VERSION \
-X main.commit=$COMMIT \
-X main.builtAt=$BUILT_AT" \
-o ./bin/kminion
############################################################
# Runtime Image
############################################################
FROM alpine:3
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=builder /app/bin/kminion /app/kminion
RUN addgroup -S redpanda \
&& adduser -S redpanda -G redpanda \
&& chmod o+rx /app/kminion \
&& apk upgrade --no-cache
USER redpanda
ENTRYPOINT ["/app/kminion"]
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2021 CloudHut
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# Redpanda Minion (KMinion) - Prometheus Exporter via Kafka API
[Redpanda](https://www.redpanda.com) Minion is a feature-rich and flexible Prometheus Exporter to monitor your Redpanda or Kafka cluster.
All valuable information that is accessible via the Kafka protocol is meant to be accessible through
KMinion.
## 🚀 Features
- **Kafka versions:** Supports all Kafka versions v0.11+
- **Supported SASL mechanisms:** plain, scram-sha-256/512, gssapi/kerberos
- **TLS support:** TLS is supported, regardless of whether you need mTLS, a custom CA, encrypted keys or just the trusted
  root certs
- **Consumer Group Lags:** Number of messages a consumer group is lagging behind the latest offset
- **Log dir sizes:** Metric for log dir sizes either grouped by broker or by topic
- **Broker info:** Metric for each broker with its address, broker id, controller and rack id
- **Configurable granularity:** Export metrics (e.g. consumer group lags) either per partition or per topic. Helps to reduce the number of exported metric series.
- **End to End Monitoring:** Sends messages to its own topic and consumes them, measuring a message's real-world "roundtrip" latency. Also provides ack latency and offset-commit latency. [More Info](/docs/end-to-end.md)
- **Configurable targets:** You can configure what topics or groups you'd like to export using regular expressions
- **Multiple config parsers:** It's possible to configure KMinion using YAML, Environment variables or a mix of both
You can find a list of all exported metrics here: [/docs/metrics.md](/docs/metrics.md)
## Getting started
### 🐳 Docker image
All images are built on each push to master and for every new release. You can find an overview of all available tags
in our [DockerHub repository](https://hub.docker.com/r/redpandadata/kminion/tags).
```shell
docker pull redpandadata/kminion:latest
```
### ☸ Helm chart
A Helm chart will be maintained as part of Redpanda's [helm-charts](https://github.com/redpanda-data/helm-charts/tree/main/charts/kminion) repository.
### 🔧 Configuration
All options in KMinion can be configured via YAML or environment variables. Configuring some options via YAML and some
via environment variables is also possible. Environment variables take precedence in this case. You can find the
reference config with additional documentation in [/docs/reference-config.yaml](/docs/reference-config.yaml).
If you want to use a YAML config file, specify the path to the config file by setting the env variable
`CONFIG_FILEPATH`.
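As a minimal sketch (the broker address below is a placeholder, and the full set of options lives in the reference config), a YAML config could look like this:

```yaml
# config.yaml - point the CONFIG_FILEPATH env variable at this file
kafka:
  brokers:
    - localhost:9092
exporter:
  # Port the HTTP server serving /metrics binds to
  port: 8080
logger:
  level: info
```

Any of these options can also be set through environment variables, e.g. `KAFKA_BROKERS=localhost:9092`, which take precedence over the values from the YAML file.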
### 📊 Grafana Dashboards
I uploaded three separate Grafana dashboards that can be used as inspiration for creating your own. Note that these dashboards might not work for you out of the box due to different labeling in your Prometheus config.
Cluster Dashboard: https://grafana.com/grafana/dashboards/14012
Consumer Group Dashboard: https://grafana.com/grafana/dashboards/14014
Topic Dashboard: https://grafana.com/grafana/dashboards/14013
<p float="left">
<img src="/docs/screenshots/kminion-cluster.png" width="250" />
<img src="/docs/screenshots/kminion-groups.png" width="250" />
<img src="/docs/screenshots/kminion-topics.png" width="250" />
</p>
### ⚡ Testing locally
This repo contains a docker-compose file that you can run on your machine. It will spin up a Kafka & ZooKeeper cluster
and start KMinion on port 8080, which is exposed to your host machine:
```shell
# 1. Clone this repo
# 2. Browse to the repo's root directory and run:
docker-compose up
```
## Chat with us
We use Slack to communicate. If you are looking for more interactive discussions or support, you are invited to join
our Slack server: https://redpanda.com/slack
## License
KMinion is distributed under the [MIT License](https://github.com/cloudhut/kminion/blob/master/LICENSE).
================================================
FILE: charts/README.md
================================================
# Helm Chart
⚠️ This chart has been moved to https://github.com/redpanda-data/helm-charts/tree/main/charts/kminion. Please install the chart from that repository instead. The existing archives are still hosted here so that existing deployments don't break.
---
This chart is intentionally very light on input validation. The goal is to offer a flexible Helm chart that allows
users to deploy KMinion the way they want to. This flexibility comes at the cost of less input validation, so a
misconfiguration may only surface as a runtime error.
All available input is documented inside of the [values.yaml](./kminion/values.yaml) file.
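For example, a minimal custom values file (passed via `-f values.yaml` in the install command below) might only forward the KMinion config; the broker address is a placeholder, and everything under `kminion.config` follows the reference config:

```yaml
# values.yaml - minimal sketch, adapt to your cluster
kminion:
  config:
    kafka:
      brokers:
        - my-kafka-bootstrap:9092
    logger:
      level: info
```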
## Installing the Helm chart
```shell
helm repo add kminion https://raw.githubusercontent.com/cloudhut/kminion/master/charts/archives
helm repo update
helm install -f values.yaml kminion kminion/kminion
```
================================================
FILE: charts/archives/.gitkeep
================================================
================================================
FILE: charts/archives/index.yaml
================================================
apiVersion: v1
entries:
kminion:
- apiVersion: v2
appVersion: v2.2.5
created: "2023-07-03T16:38:22.568312+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 32e2ee36d0b0a045061d4e1490780fef905b4c85d7a23659819c5cb128aaa119
name: kminion
type: application
urls:
- kminion-0.12.0.tgz
version: 0.12.0
- apiVersion: v2
appVersion: v2.2.5
created: "2023-07-03T16:38:22.567922+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 8a7be130d57f6f8ead720277b69319ff4dcd364859e80f4750416abe5ed460c3
name: kminion
type: application
urls:
- kminion-0.11.3.tgz
version: 0.11.3
- apiVersion: v2
appVersion: v2.2.3
created: "2023-07-03T16:38:22.5675+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 42991a871f58b6d31a9e5b38539eb3d1e9cd35c0097a0fcf63f21f818fa7a999
name: kminion
type: application
urls:
- kminion-0.11.2.tgz
version: 0.11.2
- apiVersion: v2
appVersion: v2.2.3
created: "2023-07-03T16:38:22.566877+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 65d7231f1e8ee586bec42bc383b66726d596fe03e0f3183e14b688174a3a8112
name: kminion
type: application
urls:
- kminion-0.11.1.tgz
version: 0.11.1
- apiVersion: v2
appVersion: v2.2.0
created: "2023-07-03T16:38:22.575384+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 358bdd509f573049d4bfe77d2edb94c7ad3938f609aea11a8e2c2dc65cca2a9a
name: kminion
type: application
urls:
- kminion-0.9.0.tgz
version: 0.9.0
- apiVersion: v2
appVersion: v2.2.0
created: "2023-07-03T16:38:22.574906+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: be8f0047b345d3954fc7c7e7f8953a848c909ef253107d6e77ed747843ddd167
name: kminion
type: application
urls:
- kminion-0.8.3.tgz
version: 0.8.3
- apiVersion: v2
appVersion: v2.1.0
created: "2023-07-03T16:38:22.573746+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 888bc665cddc6b6b99af1ce6dd1dea0b107a2e928dff6bfe1c077bc741e20ef7
name: kminion
type: application
urls:
- kminion-0.8.2.tgz
version: 0.8.2
- apiVersion: v2
appVersion: v2.1.0
created: "2023-07-03T16:38:22.573271+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: e59c5d5574f162708bf1434c266acbfd9040a89aa7a4abd4a0db70885248e38d
name: kminion
type: application
urls:
- kminion-0.8.1.tgz
version: 0.8.1
- apiVersion: v2
appVersion: v2.1.0
created: "2023-07-03T16:38:22.572697+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: f54d8236f8cf03c863b53e077e1647164ffe2a7c34e1cf77101fa3312c589706
name: kminion
type: application
urls:
- kminion-0.8.0.tgz
version: 0.8.0
- apiVersion: v2
appVersion: v2.1.0
created: "2023-07-03T16:38:22.572269+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 4cc64cd9f78bd55673b00612579157e493020fb76440abbef10fe5152aef9acc
name: kminion
type: application
urls:
- kminion-0.7.0.tgz
version: 0.7.0
- apiVersion: v2
appVersion: v2.1.0
created: "2023-07-03T16:38:22.571852+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 0955e04fe9ef4b516fb0d9ed439ae79778ccdffcf817f09099790cb7e183e4d4
name: kminion
type: application
urls:
- kminion-0.6.0.tgz
version: 0.6.0
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.571391+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: d3eb64d05535e136802538662eef7e9fdfdb3f0b93b6a42dfdcc93ee7deeadbd
name: kminion
type: application
urls:
- kminion-0.5.0.tgz
version: 0.5.0
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.570618+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 6b4209352d1dffd7873791ee1573dc325eb08d67656b01b430729f45dea4c09a
name: kminion
type: application
urls:
- kminion-0.4.0.tgz
version: 0.4.0
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.570281+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: c51e3b45791e9fd51f33036916b0d36f7ac695e2fa916a9e99882ea83914ed97
name: kminion
type: application
urls:
- kminion-0.3.1.tgz
version: 0.3.1
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.569892+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: a2be2dd8a02dc5222ec7386195a0e25b2682a39bbdcf52b60793c171acac7653
name: kminion
type: application
urls:
- kminion-0.3.0.tgz
version: 0.3.0
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.569445+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 321b6d5ff95ce310d2a3257b3d55f9ced51de99af6519d6d91723d7bdb6456fa
name: kminion
type: application
urls:
- kminion-0.2.2.tgz
version: 0.2.2
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.569089+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: ed57df27158521a1eb33d215731fcc3248c71b3f36a4a029eb2d3a7b617ca519
name: kminion
type: application
urls:
- kminion-0.2.1.tgz
version: 0.2.1
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.568694+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 025661ee7cc574ad8dde7a68093a3b614fc92e26dd5dd398fc89d0b5308010e1
name: kminion
type: application
urls:
- kminion-0.2.0.tgz
version: 0.2.0
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.566269+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: e277e976d864b4bd2e505038dd865a9300486ae8c4323d3f0be40b84df75732b
name: kminion
type: application
urls:
- kminion-0.1.3.tgz
version: 0.1.3
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.565773+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 562937d3613624c55984e51adbc6765e7898d1cf8cc2d7d241b6d671bbc12303
name: kminion
type: application
urls:
- kminion-0.1.2.tgz
version: 0.1.2
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.562776+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 25e83d7c7cc92a63268d76b13ecc13077758b48be093490f281498a4f55ad3ca
name: kminion
type: application
urls:
- kminion-0.1.1.tgz
version: 0.1.1
- apiVersion: v2
appVersion: v2.0.0
created: "2023-07-03T16:38:22.562046+01:00"
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache
Kafka
digest: 7c10e9d9957e9752bc6f4b4a1fffb742d88cd57be06bf4f26ff7b5031645ccbd
name: kminion
type: application
urls:
- kminion-0.1.0.tgz
version: 0.1.0
generated: "2023-07-03T16:38:22.560328+01:00"
================================================
FILE: charts/kminion/.helmignore
================================================
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
================================================
FILE: charts/kminion/Chart.yaml
================================================
apiVersion: v2
name: kminion
description: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.12.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "v2.2.5"
================================================
FILE: charts/kminion/templates/NOTES.txt
================================================
1. Get the application URL by running these commands:
{{- if .Values.ingress.enabled }}
{{- range .Values.ingress.hosts }}
http://{{ . }}
{{- end }}
{{- else if contains "NodePort" .Values.service.type }}
export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "kminion.fullname" . }})
export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
echo http://$NODE_IP:$NODE_PORT
{{- else if contains "LoadBalancer" .Values.service.type }}
NOTE: It may take a few minutes for the LoadBalancer IP to be available.
You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "kminion.fullname" . }}'
export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "kminion.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
echo http://$SERVICE_IP:{{ .Values.service.port }}
{{- else if contains "ClusterIP" .Values.service.type }}
export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "kminion.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
echo "Visit http://127.0.0.1:8080 to use your application"
kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
{{- end }}
================================================
FILE: charts/kminion/templates/_helpers.tpl
================================================
{{/*
Expand the name of the chart.
*/}}
{{- define "kminion.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "kminion.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "kminion.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "kminion.labels" -}}
helm.sh/chart: {{ include "kminion.chart" . }}
{{ include "kminion.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- if .Values.customLabels}}
{{ toYaml .Values.customLabels }}
{{- end}}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "kminion.selectorLabels" -}}
app.kubernetes.io/name: {{ include "kminion.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "kminion.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "kminion.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
{{/*
Return the appropriate apiVersion for ingress.
*/}}
{{- define "kminion.ingress.apiVersion" -}}
{{- if and ($.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19-0" .Capabilities.KubeVersion.Version) }}
{{- print "networking.k8s.io/v1" }}
{{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" }}
{{- print "networking.k8s.io/v1beta1" }}
{{- else }}
{{- print "extensions/v1beta1" }}
{{- end }}
{{- end }}
{{/*
Return if ingress is stable.
*/}}
{{- define "kminion.ingress.isStable" -}}
{{- eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1" }}
{{- end }}
{{/*
Return if ingress supports ingressClassName.
*/}}
{{- define "kminion.ingress.supportsIngressClassName" -}}
{{- or (eq (include "kminion.ingress.isStable" .) "true") (and (eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) }}
{{- end }}
{{/*
Return if ingress supports pathType.
*/}}
{{- define "kminion.ingress.supportsPathType" -}}
{{- or (eq (include "kminion.ingress.isStable" .) "true") (and (eq (include "kminion.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) }}
{{- end }}
{{/*
Return the appropriate apiVersion for podDisruptionBudget.
*/}}
{{- define "kminion.podDisruptionBudget.apiVersion" -}}
{{- if $.Capabilities.APIVersions.Has "policy/v1/PodDisruptionBudget" }}
{{- print "policy/v1" }}
{{- else }}
{{- print "policy/v1beta1" }}
{{- end }}
{{- end }}
================================================
FILE: charts/kminion/templates/configmap.yaml
================================================
apiVersion: v1
kind: ConfigMap
metadata:
name: {{include "kminion.fullname" .}}
namespace: {{ .Release.Namespace | quote }}
labels:
{{- include "kminion.labels" . | nindent 4}}
data:
config.yaml: |
{{- toYaml .Values.kminion.config | nindent 4}}
================================================
FILE: charts/kminion/templates/daemonset.yaml
================================================
{{- if .Values.daemonset.enabled }}
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{include "kminion.fullname" .}}
namespace: {{ .Release.Namespace | quote }}
labels:
{{- include "kminion.labels" . | nindent 4}}
spec:
updateStrategy:
type: OnDelete
selector:
matchLabels:
{{- include "kminion.selectorLabels" . | nindent 6}}
template:
metadata:
{{- with .Values.podAnnotations}}
annotations:
{{- toYaml . | nindent 8}}
{{- end}}
labels:
{{- include "kminion.selectorLabels" . | nindent 8}}
{{- if .Values.customLabels}}
{{toYaml .Values.customLabels | nindent 8}}
{{- end}}
spec:
{{- with .Values.imagePullSecrets}}
imagePullSecrets:
{{- toYaml . | nindent 8}}
{{- end}}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8}}
serviceAccountName: {{ .Values.serviceAccount.name }}
volumes:
- name: config
configMap:
name: {{include "kminion.fullname" .}}
{{- range .Values.deployment.volumes.secrets}}
- name: {{.secretName}}
secret:
secretName: {{.secretName}}
{{- end}}
containers:
- name: {{.Chart.Name}}
securityContext:
{{- toYaml .Values.securityContext | nindent 12}}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{.Values.image.pullPolicy}}
ports:
- name: metrics
containerPort: {{.Values.service.port}}
protocol: TCP
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: CONFIG_FILEPATH
value: /etc/kminion/config.yaml
{{- range .Values.deployment.env.values}}
- name: {{.name}}
value: {{.value | quote}}
{{- end}}
{{- range .Values.deployment.env.secretKeyRefs}}
- name: {{.name}}
valueFrom:
secretKeyRef:
name: {{.secretName}}
key: {{.secretKey}}
{{- end}}
{{- range .Values.deployment.env.configMapKeyRefs}}
- name: {{.name}}
valueFrom:
configMapKeyRef:
name: {{.configMapName}}
key: {{.configMapKey}}
{{- end}}
volumeMounts:
- name: config
mountPath: /etc/kminion
{{- range .Values.deployment.volumes.secrets}}
- name: {{.secretName}}
mountPath: {{.mountPath}}
{{- end}}
resources:
{{- toYaml .Values.resources | nindent 12}}
livenessProbe:
failureThreshold: 3
httpGet:
path: /ready
port: metrics
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
failureThreshold: 3
httpGet:
path: /ready
port: metrics
scheme: HTTP
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
{{- with .Values.affinity}}
affinity:
{{- toYaml . | nindent 8}}
{{- end}}
{{- with .Values.tolerations}}
tolerations:
{{- toYaml . | nindent 8}}
{{- end}}
{{- end }}
================================================
FILE: charts/kminion/templates/deployment.yaml
================================================
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{include "kminion.fullname" .}}
namespace: {{ .Release.Namespace | quote }}
labels:
{{- include "kminion.labels" . | nindent 4}}
{{- with .Values.deployment.labels}}
{{- toYaml . | nindent 4}}
{{- end}}
{{- with .Values.deployment.annotations}}
annotations:
{{- toYaml . | nindent 4}}
{{- end}}
spec:
{{- if not .Values.autoscaling.enabled}}
replicas: {{.Values.replicaCount}}
{{- end}}
selector:
matchLabels:
{{- include "kminion.selectorLabels" . | nindent 6}}
template:
metadata:
{{- with .Values.podAnnotations}}
annotations:
{{- toYaml . | nindent 8}}
{{- end}}
labels:
{{- include "kminion.selectorLabels" . | nindent 8}}
{{- if .Values.customLabels}}
{{toYaml .Values.customLabels | nindent 8}}
{{- end}}
spec:
{{- with .Values.imagePullSecrets}}
imagePullSecrets:
{{- toYaml . | nindent 8}}
{{- end}}
serviceAccountName: {{include "kminion.serviceAccountName" .}}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8}}
volumes:
- name: config
configMap:
name: {{include "kminion.fullname" .}}
{{- range .Values.deployment.volumes.secrets}}
- name: {{.secretName}}
secret:
secretName: {{.secretName}}
{{- end}}
{{- with .Values.deployment.volumes.extra }}
{{- toYaml . | nindent 8 }}
{{- end }}
initContainers:
{{- with .Values.deployment.initContainers }}
{{- toYaml . | nindent 8 }}
{{- end }}
containers:
- name: {{.Chart.Name}}
securityContext:
{{- toYaml .Values.securityContext | nindent 12}}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{.Values.image.pullPolicy}}
ports:
- name: metrics
containerPort: {{.Values.service.port}}
protocol: TCP
env:
- name: CONFIG_FILEPATH
value: /etc/kminion/config.yaml
{{- range .Values.deployment.env.values}}
- name: {{.name}}
value: {{.value | quote}}
{{- end}}
{{- range .Values.deployment.env.secretKeyRefs}}
- name: {{.name}}
valueFrom:
secretKeyRef:
name: {{.secretName}}
key: {{.secretKey}}
{{- end}}
{{- range .Values.deployment.env.configMapKeyRefs}}
- name: {{.name}}
valueFrom:
configMapKeyRef:
name: {{.configMapName}}
key: {{.configMapKey}}
{{- end}}
volumeMounts:
- name: config
mountPath: /etc/kminion
{{- range .Values.deployment.volumes.secrets}}
- name: {{.secretName}}
mountPath: {{.mountPath}}
{{- end}}
resources:
{{- toYaml .Values.resources | nindent 12}}
{{- if .Values.deployment.readinessProbe.enabled }}
readinessProbe:
httpGet:
path: /ready
port: {{.Values.service.port}}
initialDelaySeconds: 10
{{- end }}
{{- with .Values.deployment.extraContainers }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector}}
nodeSelector:
{{- toYaml . | nindent 8}}
{{- end}}
{{- with .Values.affinity}}
affinity:
{{- toYaml . | nindent 8}}
{{- end}}
{{- with .Values.tolerations}}
tolerations:
{{- toYaml . | nindent 8}}
{{- end}}
================================================
FILE: charts/kminion/templates/hpa.yaml
================================================
{{- if .Values.autoscaling.enabled }}
apiVersion: {{ ternary "autoscaling/v2" "autoscaling/v2beta1" (.Capabilities.APIVersions.Has "autoscaling/v2") }}
kind: HorizontalPodAutoscaler
metadata:
name: {{ include "kminion.fullname" . }}
namespace: {{ .Release.Namespace | quote }}
labels:
{{- include "kminion.labels" . | nindent 4 }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ include "kminion.fullname" . }}
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
{{- if .Capabilities.APIVersions.Has "autoscaling/v2" }}
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{ else }}
targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
{{- if .Capabilities.APIVersions.Has "autoscaling/v2" }}
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{ else }}
targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}
{{- end }}
================================================
FILE: charts/kminion/templates/ingress.yaml
================================================
{{- if .Values.ingress.enabled -}}
{{- $fullName := include "kminion.fullname" . -}}
{{- $servicePort := .Values.service.port -}}
{{- $ingressApiIsStable := eq (include "kminion.ingress.isStable" .) "true" -}}
{{- $ingressSupportsIngressClassName := eq (include "kminion.ingress.supportsIngressClassName" .) "true" -}}
{{- $ingressSupportsPathType := eq (include "kminion.ingress.supportsPathType" .) "true" -}}
{{- $ingressPath := .Values.ingress.path -}}
{{- $ingressPathType := .Values.ingress.pathType -}}
{{- $extraPaths := .Values.ingress.extraPaths -}}
apiVersion: {{ include "kminion.ingress.apiVersion" . }}
kind: Ingress
metadata:
name: {{ $fullName }}
namespace: {{ .Release.Namespace | quote }}
labels:
{{- include "kminion.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and $ingressSupportsIngressClassName .Values.ingress.ingressClassName }}
ingressClassName: {{ .Values.ingress.ingressClassName }}
{{- end -}}
{{- with .Values.ingress.tls }}
tls:
{{- tpl (toYaml .) $ | nindent 4 }}
{{- end }}
rules:
{{- if .Values.ingress.hosts }}
{{- range .Values.ingress.hosts }}
- host: {{ tpl . $ }}
http:
paths:
{{- with $extraPaths }}
{{- toYaml . | nindent 10 }}
{{- end }}
- path: {{ $ingressPath }}
{{- if $ingressSupportsPathType }}
pathType: {{ $ingressPathType }}
{{- end }}
backend:
{{- if $ingressApiIsStable }}
service:
name: {{ $fullName }}
port:
number: {{ $servicePort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $servicePort }}
{{- end }}
{{- end }}
{{- else }}
- http:
paths:
- backend:
{{- if $ingressApiIsStable }}
service:
name: {{ $fullName }}
port:
number: {{ $servicePort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $servicePort }}
{{- end }}
{{- with $ingressPath }}
path: {{ . }}
{{- end }}
{{- if $ingressSupportsPathType }}
pathType: {{ $ingressPathType }}
{{- end }}
{{- end -}}
{{- end }}
================================================
FILE: charts/kminion/templates/poddisruptionbudget.yaml
================================================
{{- if .Values.podDisruptionBudget }}
apiVersion: {{ include "kminion.podDisruptionBudget.apiVersion" . }}
kind: PodDisruptionBudget
metadata:
name: {{ template "kminion.fullname" . }}
namespace: {{ .Release.Namespace | quote }}
labels:
{{- include "kminion.labels" . | nindent 4}}
spec:
{{- if .Values.podDisruptionBudget.minAvailable }}
minAvailable: {{ .Values.podDisruptionBudget.minAvailable }}
{{- end }}
{{- if .Values.podDisruptionBudget.maxUnavailable }}
maxUnavailable: {{ .Values.podDisruptionBudget.maxUnavailable }}
{{- end }}
selector:
matchLabels:
{{- include "kminion.selectorLabels" . | nindent 6}}
{{- end }}
================================================
FILE: charts/kminion/templates/service.yaml
================================================
apiVersion: v1
kind: Service
metadata:
name: {{ include "kminion.fullname" . }}
namespace: {{ .Release.Namespace | quote }}
labels:
{{- include "kminion.labels" . | nindent 4 }}
{{- if .Values.service.annotations }}
annotations:
{{- toYaml .Values.service.annotations | nindent 4 }}
{{- end }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
targetPort: metrics
protocol: TCP
name: metrics
{{- if .Values.service.extraPorts }}
{{- toYaml .Values.service.extraPorts | nindent 4 }}
{{- end }}
selector:
{{- include "kminion.selectorLabels" . | nindent 4 }}
================================================
FILE: charts/kminion/templates/serviceaccount.yaml
================================================
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "kminion.serviceAccountName" . }}
namespace: {{ .Release.Namespace | quote }}
labels:
{{- include "kminion.labels" . | nindent 4 }}
{{- with .Values.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
================================================
FILE: charts/kminion/templates/servicemonitor.yaml
================================================
{{- if .Values.serviceMonitor.create }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{include "kminion.fullname" .}}
namespace: {{ .Release.Namespace | quote }}
labels:
{{- include "kminion.labels" . | nindent 4}}
{{- if .Values.serviceMonitor.additionalLabels}}
{{toYaml .Values.serviceMonitor.additionalLabels | nindent 4}}
{{- end}}
spec:
selector:
matchLabels:
{{- include "kminion.labels" . | nindent 6}}
endpoints:
- port: metrics
path: /metrics
honorLabels: {{ .Values.serviceMonitor.honorLabels }}
scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }}
interval: {{ .Values.serviceMonitor.interval }}
{{- if .Values.serviceMonitor.relabelings }}
relabelings:
{{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }}
{{- end }}
{{- if .Values.serviceMonitor.targetLabels}}
targetLabels:
{{- toYaml .Values.serviceMonitor.targetLabels | nindent 4}}
{{- end}}
{{- if .Values.customLabels }}
podTargetLabels:
{{- (keys .Values.customLabels | sortAlpha) | toYaml | nindent 4 }}
{{- end}}
{{- end }}
================================================
FILE: charts/kminion/values.yaml
================================================
# Default values for kminion.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
image:
repository: redpandadata/kminion
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
tag: ""
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
serviceAccount:
# Specifies whether a service account should be created
create: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
podAnnotations: {}
# prometheus.io/scrape: "true"
# prometheus.io/port: "8080"
# prometheus.io/path: "/metrics"
podSecurityContext:
runAsUser: 99
fsGroup: 99
## See `kubectl explain poddisruptionbudget.spec` for more
## ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
podDisruptionBudget:
maxUnavailable: 1
# minAvailable: 1
securityContext:
allowPrivilegeEscalation: false
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsUser: 1000
service:
type: ClusterIP
port: 8080 # This port is also used as exposed container port
annotations: {} # # Annotations to add to the service
extraPorts: [] # when extraContainers expose additional metrics, add ports here to make
# them discoverable for ServiceMonitors
# - port: 8443
# targetPort: 8443
# protocol: TCP
# name: expose-x509-for-ttl-checks
ingress:
enabled: false
# For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
# See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
# ingressClassName: nginx
# Values can be templated
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
labels: {}
path: /
# pathType is only for k8s >= 1.18
pathType: Prefix
hosts:
- chart-example.local
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
extraPaths: []
# - path: /*
# backend:
# serviceName: ssl-redirect
# servicePort: use-annotation
## Or for k8s > 1.19
# - path: /*
# pathType: Prefix
# backend:
# service:
# name: ssl-redirect
# port:
# name: use-annotation
tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local
# ingressClassName:
resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
# resources, such as Minikube. If you do want to specify resources, uncomment the following
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 100
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
nodeSelector: {}
tolerations: []
affinity: {}
customLabels: {}
serviceMonitor:
create: false
additionalLabels: {}
honorLabels: false
scrapeTimeout: 10s
interval: 15s
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_label_my_label]
# separator: ;
# regex: (.*)
# targetLabel: my_label
# replacement: $1
# action: replace
# For DaemonSet mode you may set daemonset.enabled to "true" and replicaCount to 0.
daemonset:
enabled: false
deployment:
readinessProbe:
enabled: true
labels: {}
# Annotations to add to the Deployment resource
annotations: {}
volumes:
# Mount files from Kubernetes secrets into the container
secrets: []
# - secretName: vault-tls
# mountPath: /vault/tls
extra: []
# - name: example
# emptyDir: {}
# If you want to provide specific config settings like sensitive Kafka credentials via environment variables you can
# do so by making them available here. See the kminion reference config to figure out the expected variable names.
env:
# Configure environment variables which you want to make available
values: []
# - name: KAFKA_SASL_MECHANISM
# value: PLAIN
# Configure environment variables which you want to make available from a config map
configMapKeyRefs: []
# - name: KAFKA_SASL_USERNAME
# configMapName: kafka-user-team-xy
# key: username
# Configure environment variables which you want to make available from a secret
secretKeyRefs: []
# - name: KAFKA_SASL_PASSWORD
# secretName: kafka-credentials-team-xy
# secretKey: password
# Add additional containers, e.g. for oauth2-proxy
extraContainers: {}
# Add additional init containers, e.g. for oauth2-proxy
initContainers: {}
kminion:
# KMinion can be configured using environment variables and/or a YAML config. The yaml contents under config will
# end up in a YAML file which will be mounted into the kminion container.
# See reference config: https://github.com/cloudhut/kminion/blob/master/docs/reference-config.yaml
config: {}
# kafka:
# brokers: [ ]
# clientId: "kminion"
# rackId: ""
# tls:
# enabled: false
# caFilepath: ""
# certFilepath: ""
# keyFilepath: ""
# passphrase: ""
# insecureSkipTlsVerify: false
# sasl:
# # Whether or not SASL authentication will be used for authentication
# enabled: false
# # Username to use for PLAIN or SCRAM mechanism
# username: ""
# # Password to use for PLAIN or SCRAM mechanism
# password: ""
# # Mechanism to use for SASL Authentication. Valid values are PLAIN, SCRAM-SHA-256, SCRAM-SHA-512, GSSAPI
# mechanism: "PLAIN"
# # GSSAPI / Kerberos config properties
# gssapi:
# authType: ""
# keyTabPath: ""
# kerberosConfigPath: ""
# serviceName: ""
# username: ""
# password: ""
# realm: ""
# # Whether to retry the initial test connection to Kafka. False will exit with code 1 on error,
# # while true will retry until success.
# retryInitConnection: false
#
# minion:
# consumerGroups:
# # Enabled specifies whether consumer groups shall be scraped and exported or not.
# enabled: true
# # Mode specifies whether we export consumer group offsets using the Admin API or by consuming the internal
# # __consumer_offsets topic. Both modes have their advantages and disadvantages.
# scrapeMode: adminApi # Valid values: adminApi, offsetsTopic
# # Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and
# # you aren't interested in per partition lags you could choose "topic" where all partition lags will be summed
# # and only topic lags will be exported.
# granularity: partition
# # AllowedGroups are regex strings of group ids that shall be exported
# # You can specify allowed groups by providing literals like "my-consumergroup-name" or by providing regex expressions
# # like "/internal-.*/".
# allowedGroups: [ ]
# # IgnoredGroups are regex strings of group ids that shall be ignored/skipped when exporting metrics. Ignored groups
# # take precedence over allowed groups.
# ignoredGroups: [ ]
# topics:
# # Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and
# # you aren't interested in per partition metrics you could choose "topic".
# granularity: partition
# # AllowedTopics are regex strings of topic names whose topic metrics that shall be exported.
# # You can specify allowed topics by providing literals like "my-topic-name" or by providing regex expressions
# # like "/internal-.*/".
# allowedTopics: [ ]
#
# # IgnoredTopics are regex strings of topic names that shall be ignored/skipped when exporting metrics. Ignored topics
# # take precedence over allowed topics.
# ignoredTopics: [ ]
# # infoMetric is a configuration object for the kminion_kafka_topic_info metric
# infoMetric:
# # ConfigKeys are set of strings of Topic configs that you want to have exported as part of the metric
# configKeys: ["cleanup.policy"]
# logDirs:
# # Enabled specifies whether log dirs shall be scraped and exported or not. This should be disabled for clusters prior
# # to version 1.0.0 as describing log dirs was not supported back then.
# enabled: true
#
# exporter:
# # Namespace is the prefix for all exported Prometheus metrics
# namespace: "kminion"
# # Host that shall be used to bind the HTTP server on
# host: ""
# # Port that shall be used to bind the HTTP server on
# port: 8080
#
# logger:
# # Level is a logging priority. Higher levels are more important. Valid values are: debug, info, warn, error, fatal, panic
# level: info
================================================
FILE: config.go
================================================
package main
import (
"fmt"
"os"
"strings"
"github.com/cloudhut/kminion/v2/kafka"
"github.com/cloudhut/kminion/v2/logging"
"github.com/cloudhut/kminion/v2/minion"
"github.com/cloudhut/kminion/v2/prometheus"
"github.com/knadh/koanf"
"github.com/knadh/koanf/parsers/yaml"
"github.com/knadh/koanf/providers/env"
"github.com/knadh/koanf/providers/file"
"github.com/mitchellh/mapstructure"
"go.uber.org/zap"
)
type Config struct {
Kafka kafka.Config `koanf:"kafka"`
Minion minion.Config `koanf:"minion"`
Exporter prometheus.Config `koanf:"exporter"`
Logger logging.Config `koanf:"logger"`
}
func (c *Config) SetDefaults() {
c.Kafka.SetDefaults()
c.Minion.SetDefaults()
c.Exporter.SetDefaults()
c.Logger.SetDefaults()
}
func (c *Config) Validate() error {
err := c.Kafka.Validate()
if err != nil {
return fmt.Errorf("failed to validate kafka config: %w", err)
}
err = c.Minion.Validate()
if err != nil {
return fmt.Errorf("failed to validate minion config: %w", err)
}
err = c.Logger.Validate()
if err != nil {
return fmt.Errorf("failed to validate logger config: %w", err)
}
return nil
}
func newConfig(logger *zap.Logger) (Config, error) {
k := koanf.New(".")
var cfg Config
cfg.SetDefaults()
// 1. Check if a config filepath is set via flags. If there is one we'll try to load the file using a YAML Parser
envKey := "CONFIG_FILEPATH"
configFilepath := os.Getenv(envKey)
if configFilepath == "" {
logger.Info("the env variable '" + envKey + "' is not set, therefore no YAML config will be loaded")
} else {
err := k.Load(file.Provider(configFilepath), yaml.Parser())
if err != nil {
return Config{}, fmt.Errorf("failed to parse YAML config: %w", err)
}
}
// We could unmarshal the loaded koanf input after loading both providers, however we want to unmarshal the YAML
// config with `ErrorUnused` set to true, but unmarshal environment variables with `ErrorUnused` set to false (default).
// Rationale: Orchestrators like Kubernetes inject unrelated environment variables, which we still want to allow.
err := k.UnmarshalWithConf("", &cfg, koanf.UnmarshalConf{
Tag: "",
FlatPaths: false,
DecoderConfig: &mapstructure.DecoderConfig{
DecodeHook: mapstructure.ComposeDecodeHookFunc(
mapstructure.StringToTimeDurationHookFunc()),
Metadata: nil,
Result: &cfg,
WeaklyTypedInput: true,
ErrorUnused: true,
},
})
if err != nil {
return Config{}, err
}
err = k.Load(env.ProviderWithValue("", ".", func(s string, v string) (string, interface{}) {
// Lowercase the env variable name and convert underscores to dots, e.g. KAFKA_SASL_USERNAME becomes kafka.sasl.username
key := strings.Replace(strings.ToLower(s), "_", ".", -1)
// If there is a comma in the value, split the value into a slice by the comma.
if strings.Contains(v, ",") {
return key, strings.Split(v, ",")
}
// Otherwise return the new key with the unaltered value
return key, v
}), nil)
if err != nil {
return Config{}, err
}
err = k.Unmarshal("", &cfg)
if err != nil {
return Config{}, err
}
err = cfg.Validate()
if err != nil {
return Config{}, fmt.Errorf("failed to validate config: %w", err)
}
return cfg, nil
}
================================================
FILE: docker-compose.yml
================================================
---
version: '2.1'
services:
zookeeper:
image: confluentinc/cp-zookeeper:latest
ports:
- 2181:2181
environment:
ZOOKEEPER_CLIENT_PORT: 2181
ZOOKEEPER_TICK_TIME: 2000
container_name: zookeeper
hostname: zookeeper
kafka:
image: confluentinc/cp-kafka:latest
hostname: kafka
container_name: kafka
depends_on:
- zookeeper
ports:
- 9092:9092
environment:
KAFKA_BROKER_ID: 1
KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
kafka-minion:
build:
context: .
dockerfile: ./Dockerfile
hostname: kafka-minion
container_name: kafka-minion
depends_on:
- zookeeper
- kafka
ports:
- 8080:8080
environment:
KAFKA_BROKERS: kafka:29092
restart: unless-stopped
================================================
FILE: docs/end-to-end.md
================================================
# End-To-End Monitoring
This page describes the end-to-end monitoring feature in KMinion, how it works, and what metrics it provides.
## Motivation
> What is the issue? Why did we build this feature?
We can monitor metrics like CPU usage, free disk space, or even consumer group lag. However, these metrics don't give us
a good idea of the performance characteristics an actual, real-world client experiences when connected to the cluster.
With the "classic" metrics lots of questions go unanswered:
- Can a client produce messages to the cluster?
- Can clients produce & consume messages as well as commit group offsets with an acceptable latency?
- Is the cluster in a healthy state from a client's perspective?
## Approach & Implementation
> How do we solve those issues? How does the feature work?
The most reliable way to get real-world performance and availability metrics is to actually run a producer/consumer
ourselves. This is exactly what the end-to-end monitoring feature does!
## High Level Overview
To determine whether the cluster is fully operational and its performance is within acceptable limits, KMinion
continuously produces and consumes messages to/from the cluster. That way we can measure things like ack-latency,
commit-latency, and roundtrip-time.
KMinion creates and manages its own topic for the end-to-end test messages. The name of the topic can be configured.
**The first step** is to create a message and send it to the cluster.
- Every produced message is added to an internal tracker, so we can recognize messages being "lost". A message is
considered lost if it doesn't arrive back at the consumer within the configured time span.
**The second step** is to continuously consume the topic.
- As each message arrives, we calculate its roundtrip time (time from the point the message was created, until KMinion
received it again)
- Consumer group offsets are committed periodically, while also recording the time each commit takes.
### Topic Management
The topic KMinion uses is created and managed completely automatically (the topic name can be configured though).
KMinion continuously checks the topic and fixes issues/imbalances automatically:
- Adds partitions to the topic so that it has at least as many partitions as there are brokers.
- Reassigns partitions to ensure every broker leads at least one partition and that all partitions' replicas are
  distributed evenly across the brokers. Where possible, KMinion assigns each partition ID to the broker with the matching broker ID.
### Consumer Group Management
On startup, each KMinion instance generates a unique identifier (UUID) that is used to create its own consumer group. The
group name incorporates the shared prefix from the config (see the sketch at the end of this section).
That is necessary because:
- Offsets must not be shared among multiple instances.
- Each instance must always consume **all** partitions of the topic.
Each instance's UUID is also embedded in every message, so each instance can easily filter out messages it didn't
produce. That's why it is perfectly fine to run multiple KMinion instances against the same cluster, using the same
topic.
KMinion also monitors and deletes consumer groups that use its configured prefix. That way, when an instance
exits/restarts, previous consumer groups are cleaned up quickly (groups that have stayed empty for more than 20 seconds are deleted).
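Conceptually, the per-instance group name is just the configured prefix plus a freshly generated UUID. A minimal sketch of that idea; the separator and the UUID library used here are assumptions, not necessarily what KMinion itself uses:

```go
package main

import (
	"fmt"

	"github.com/google/uuid"
)

// uniqueGroupID builds a consumer group name from the shared prefix
// (consumer.groupIdPrefix) and a per-instance UUID, so every KMinion instance
// consumes all partitions of the topic with its own, non-shared offsets.
func uniqueGroupID(prefix string) string {
	return fmt.Sprintf("%s-%s", prefix, uuid.NewString())
}

func main() {
	// e.g. "kminion-end-to-end-6f1c..." (suffix differs per instance)
	fmt.Println(uniqueGroupID("kminion-end-to-end"))
}
```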
## Available Metrics
The end-to-end monitoring feature exports the following metrics.
### Counters
| Name | Description |
| --- | --- |
| `kminion_end_to_end_messages_produced_total` | Messages KMinion *tried* to send |
| `kminion_end_to_end_messages_received_total` | Number of messages received (only counts those that match, i.e. messages this instance actually produced itself) |
| `kminion_end_to_end_offset_commits_total` | Number of successful offset commits |
| `kminion_end_to_end_messages_lost_total` | Number of messages that have been produced successfully but not received within the configured SLA duration |
| `kminion_end_to_end_messages_produced_failed_total` | Number of messages that failed to produce to Kafka because of a timeout or failure |
### Histograms
| Name | Description |
| --- | --- |
| `kminion_end_to_end_produce_latency_seconds` | Duration until the cluster acknowledged a message |
| `kminion_end_to_end_offset_commit_latency_seconds` | Time Kafka took to respond to KMinion's offset commit |
| `kminion_end_to_end_roundtrip_latency_seconds` | Duration from the creation of a message until it was received/consumed again |
### Gauges
| Name | Description |
| --- | --- |
| `kminion_end_to_end_messages_produced_in_flight` | Number of messages that kminion's end-to-end test produced but has not received an answer for yet |
## Config Properties
All config properties related to this feature are located in `minion.endToEnd`.
```yaml
endToEnd:
enabled: true
probeInterval: 800ms # how often to send end-to-end test messages
topicManagement:
# You can disable topic management, without disabling the testing feature.
# Only makes sense if you have multiple kminion instances, and for some reason only want one of them to create/configure the topic.
# It is strongly recommended to leave this enabled.
enabled: true
# Name of the topic kminion uses to send its test messages
# You do *not* need to change this if you are running multiple kminion instances on the same cluster.
# Different instances are perfectly fine with sharing the same topic!
name: kminion-end-to-end
# How often kminion checks its topic to validate configuration, partition count, and partition assignments
reconciliationInterval: 10m
# Useful for monitoring the performance of acks (if >1 this is best combined with 'producer.requiredAcks' set to 'all')
replicationFactor: 1
# Rarely makes sense to change this, but maybe if you want some sort of cheap load test?
partitionsPerBroker: 1
producer:
# This defines the maximum time to wait for an ack response after producing a message,
# and the upper bound for histogram buckets in "produce_latency_seconds"
ackSla: 5s
# Can be to "all" (default) so kafka only reports an end-to-end test message as acknowledged if
# the message was written to all in-sync replicas of the partition.
# Or can be set to "leader" to only require to have written the message to its log.
requiredAcks: all
consumer:
# Prefix kminion uses when creating its consumer groups. Current kminion instance id will be appended automatically
groupIdPrefix: kminion-end-to-end
# Whether KMinion should try to delete empty consumer groups with the same prefix. This can be used if you want
# KMinion to clean up its old consumer groups. It should only be used if you use a unique prefix for KMinion.
deleteStaleConsumerGroups: false
# Defines the time limit beyond which a message is considered "lost" (failed the roundtrip),
# also used as the upper bound for histogram buckets in "roundtrip_latency"
roundtripSla: 20s
# Maximum time an offset commit is allowed to take before considering it failed,
# also used as the upper bound for histogram buckets in "commit_latency_seconds"
commitSla: 10s
```
================================================
FILE: docs/metrics.md
================================================
# Exported Metrics
This document lists all exported metrics with example values.
## Exporter Metrics
```
# HELP kminion_exporter_up Build info about this Prometheus Exporter. Gauge value is 0 if one or more scrapes have failed.
# TYPE kminion_exporter_up gauge
kminion_exporter_up{version="sha-0ab0dcdf862f7a34b06998cd2d980148e048151a"} 1
# HELP kminion_exporter_offset_consumer_records_consumed_total The number of offset records that have been consumed by the internal offset consumer
# TYPE kminion_exporter_offset_consumer_records_consumed_total counter
kminion_exporter_offset_consumer_records_consumed_total 5.058244883e+09
```
## Kafka Metrics
### General / Cluster Metrics
```
# HELP kminion_kafka_broker_info Kafka broker information
# TYPE kminion_kafka_broker_info gauge
kminion_kafka_broker_info{address="broker-9.analytics-prod.kafka.cloudhut.dev",broker_id="9",is_controller="false",port="9092",rack_id="europe-west1-b"} 1
# HELP kminion_kafka_cluster_info Kafka cluster information
# TYPE kminion_kafka_cluster_info gauge
kminion_kafka_cluster_info{broker_count="12",cluster_id="UYZJg8bhT_6SxhsdaQZEQ",cluster_version="v2.6",controller_id="6"} 1
```
### Log Dir Metrics
```
# HELP kminion_kafka_broker_log_dir_size_total_bytes The summed size in bytes of all log dirs for a given broker
# TYPE kminion_kafka_broker_log_dir_size_total_bytes gauge
kminion_kafka_broker_log_dir_size_total_bytes{address="broker-9.analytics-prod.kafka.cloudhut.dev",broker_id="9",port="9092",rack_id="europe-west1-b"} 8.32654935115e+11
# HELP kminion_kafka_topic_log_dir_size_total_bytes The summed size in bytes of partitions for a given topic. This includes the used space for replica partitions.
# TYPE kminion_kafka_topic_log_dir_size_total_bytes gauge
kminion_kafka_topic_log_dir_size_total_bytes{topic_name="__consumer_offsets"} 9.026554258e+09
```
### Topic & Partition Metrics
```
# HELP kminion_kafka_topic_info Info labels for a given topic
# TYPE kminion_kafka_topic_info gauge
kminion_kafka_topic_info{cleanup_policy="compact",partition_count="1",replication_factor="1",topic_name="_confluent-ksql-default__command_topic"} 1
# HELP kminion_kafka_topic_partition_low_water_mark Partition Low Water Mark
# TYPE kminion_kafka_topic_partition_low_water_mark gauge
kminion_kafka_topic_partition_low_water_mark{partition_id="0",topic_name="__consumer_offsets"} 0
# HELP kminion_kafka_topic_low_water_mark_sum Sum of all the topic's partition low water marks
# TYPE kminion_kafka_topic_low_water_mark_sum gauge
kminion_kafka_topic_low_water_mark_sum{topic_name="__consumer_offsets"} 0
# HELP kminion_kafka_topic_partition_high_water_mark Partition High Water Mark
# TYPE kminion_kafka_topic_partition_high_water_mark gauge
kminion_kafka_topic_partition_high_water_mark{partition_id="0",topic_name="__consumer_offsets"} 2.04952001e+08
# HELP kminion_kafka_topic_high_water_mark_sum Sum of all the topic's partition high water marks
# TYPE kminion_kafka_topic_high_water_mark_sum gauge
kminion_kafka_topic_high_water_mark_sum{topic_name="__consumer_offsets"} 1.512023846873e+12
```
### Consumer Group Metrics
```
# HELP kminion_kafka_consumer_group_info Consumer Group info metrics. It will report 1 if the group is in the stable state, otherwise 0.
# TYPE kminion_kafka_consumer_group_info gauge
kminion_kafka_consumer_group_info{coordinator_id="0",group_id="bigquery-sink",protocol="range",protocol_type="consumer",state="Stable"} 1
# HELP kminion_kafka_consumer_group_members Consumer Group member count metrics. It will report the number of members in the consumer group
# TYPE kminion_kafka_consumer_group_members gauge
kminion_kafka_consumer_group_members{group_id="bigquery-sink"} 2
# HELP kminion_kafka_consumer_group_empty_members Consumer Group Empty Members. It will report the number of members in the consumer group with no partition assigned
# TYPE kminion_kafka_consumer_group_empty_members gauge
kminion_kafka_consumer_group_empty_members{group_id="bigquery-sink"} 1
# HELP kminion_kafka_consumer_group_topic_members Consumer Group topic member count metrics. It will report the number of members in the consumer group assigned on a given topic
# TYPE kminion_kafka_consumer_group_topic_members gauge
kminion_kafka_consumer_group_topic_members{group_id="bigquery-sink",topic_name="shop-activity"} 4
# HELP kminion_kafka_consumer_group_topic_assigned_partitions Consumer Group topic partitions count metrics. It will report the number of partitions assigned in the consumer group for a given topic
# TYPE kminion_kafka_consumer_group_topic_assigned_partitions gauge
kminion_kafka_consumer_group_topic_assigned_partitions{group_id="bigquery-sink",topic_name="shop-activity"} 32
# HELP kminion_kafka_consumer_group_topic_offset_sum The sum of all committed group offsets across all partitions in a topic
# TYPE kminion_kafka_consumer_group_topic_offset_sum gauge
kminion_kafka_consumer_group_topic_offset_sum{group_id="bigquery-sink",topic_name="shop-activity"} 4.259513e+06
# HELP kminion_kafka_consumer_group_topic_partition_lag The number of messages a consumer group is lagging behind the latest offset of a partition
# TYPE kminion_kafka_consumer_group_topic_partition_lag gauge
kminion_kafka_consumer_group_topic_partition_lag{group_id="bigquery-sink",partition_id="10",topic_name="shop-activity"} 147481
# HELP kminion_kafka_consumer_group_topic_lag The number of messages a consumer group is lagging behind across all partitions in a topic
# TYPE kminion_kafka_consumer_group_topic_lag gauge
kminion_kafka_consumer_group_topic_lag{group_id="bigquery-sink",topic_name="shop-activity"} 147481
# HELP kminion_kafka_consumer_group_offset_commits_total The number of offsets committed by a group
# TYPE kminion_kafka_consumer_group_offset_commits_total counter
kminion_kafka_consumer_group_offset_commits_total{group_id="bigquery-sink"} 1098
```
### End-to-End Metrics
```
# HELP kminion_end_to_end_messages_produced_total Number of messages that kminion's end-to-end test has tried to send to kafka
# TYPE kminion_end_to_end_messages_produced_total counter
kminion_end_to_end_messages_produced_total 384
# HELP kminion_end_to_end_offset_commits_total Counts how many times kminions end-to-end test has committed messages
# TYPE kminion_end_to_end_offset_commits_total counter
kminion_end_to_end_offset_commits_total 18
# HELP kminion_end_to_end_messages_received_total Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. Kminion only considers a message a match if it arrives within the configured roundtrip SLA (and it matches the minionID)
# TYPE kminion_end_to_end_messages_received_total counter
kminion_end_to_end_messages_received_total 383
# HELP kminion_end_to_end_produce_latency_seconds Time until we received an ack for a produced message
# TYPE kminion_end_to_end_produce_latency_seconds histogram
kminion_end_to_end_produce_latency_seconds_bucket{partitionId="0",le="0.005"} 0
# HELP kminion_end_to_end_offset_commit_latency_seconds Time kafka took to respond to kminion's offset commit
# TYPE kminion_end_to_end_offset_commit_latency_seconds histogram
kminion_end_to_end_offset_commit_latency_seconds_bucket{groupCoordinatorBrokerId="0",le="0.005"} 0
# HELP kminion_end_to_end_roundtrip_latency_seconds Time it took between sending (producing) and receiving (consuming) a message
# TYPE kminion_end_to_end_roundtrip_latency_seconds histogram
kminion_end_to_end_roundtrip_latency_seconds_bucket{partitionId="0",le="0.005"} 0
# HELP kminion_end_to_end_messages_lost_total Number of messages that have been produced successfully but not received within the configured SLA duration
# TYPE kminion_end_to_end_messages_lost_total counter
kminion_end_to_end_messages_lost_total{partition_id="0"} 0
# HELP kminion_end_to_end_messages_produced_failed_total Number of messages failed to produce to Kafka because of a timeout or failure
# TYPE kminion_end_to_end_messages_produced_failed_total counter
kminion_end_to_end_messages_produced_failed_total{partition_id="0"} 0
# HELP kminion_end_to_end_messages_produced_in_flight Number of messages that kminion's end-to-end test produced but has not received an answer for yet
# TYPE kminion_end_to_end_messages_produced_in_flight gauge
kminion_end_to_end_messages_produced_in_flight{partition_id="0"} 0
```
================================================
FILE: docs/reference-config.yaml
================================================
#####################################################################################
# This file documents all the available config options and their default values.
#
# All config options can be configured via environment variables as well.
# If you specify both the env variable and yaml option for the same configuration
# the environment variable will take precedence. If you want to use a YAML config
# file, specify the path to the config file by setting the env variable
# CONFIG_FILEPATH.
#
# The env variable name is auto generated by upper casing everything and adding
# an underscore for each indentation/level. Some examples:
# kafka.rackId => KAFKA_RACKID
# kafka.tls.caFilepath => KAFKA_TLS_CAFILEPATH
# minion.consumerGroups.allowedGroups => MINION_CONSUMERGROUPS_ALLOWEDGROUPS
#
# Env variables that expect array values can be provided by separating them using
# a comma: KAFKA_BROKERS = "broker1:9092,broker2:9092,broker3:9092"
#####################################################################################
logger:
# Valid values are: debug, info, warn, error, fatal, panic
level: info
kafka:
brokers: [ ]
clientId: "kminion"
rackId: ""
tls:
enabled: false
caFilepath: ""
certFilepath: ""
keyFilepath: ""
# base64 encoded tls CA, cannot be set if 'caFilepath' is set
ca: ""
# base64 encoded tls cert, cannot be set if 'certFilepath' is set
cert: ""
# base64 encoded tls key, cannot be set if 'keyFilepath' is set
key: ""
passphrase: ""
insecureSkipTlsVerify: false
sasl:
# Whether or not SASL authentication will be used for authentication
enabled: false
# Username to use for PLAIN or SCRAM mechanism
username: ""
# Password to use for PLAIN or SCRAM mechanism
password: ""
# Mechanism to use for SASL Authentication. Valid values are PLAIN, SCRAM-SHA-256, SCRAM-SHA-512, GSSAPI, OAUTHBEARER
mechanism: "PLAIN"
# GSSAPI / Kerberos config properties
gssapi:
# Required. One of USER_AUTH or KEYTAB_AUTH
authType: ""
keyTabPath: ""
kerberosConfigPath: ""
serviceName: ""
username: ""
password: ""
realm: ""
enableFast: true
# OAUTHBEARER config properties
oauth:
tokenEndpoint: ""
clientId: ""
clientSecret: ""
scope: ""
minion:
consumerGroups:
# Enabled specifies whether consumer groups shall be scraped and exported or not.
enabled: true
# Mode specifies whether we export consumer group offsets using the Admin API or by consuming the internal
# __consumer_offsets topic. Both modes have their advantages and disadvantages.
# * adminApi:
# - Useful for managed kafka clusters that do not provide access to the offsets topic.
# * offsetsTopic
# - Enables kminion_kafka_consumer_group_offset_commits_total metrics.
# - Processing the offsetsTopic requires slightly more memory and cpu than using the adminApi. The amount depends on the
# size and throughput of the offsets topic.
scrapeMode: adminApi # Valid values: adminApi, offsetsTopic
# Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and
# you aren't interested in per partition lags you could choose "topic" where all partition lags will be summed
# and only topic lags will be exported.
granularity: partition
# AllowedGroups are regex strings of group ids that shall be exported
# You can specify allowed groups by providing literals like "my-consumergroup-name" or by providing regex expressions
# like "/internal-.*/".
allowedGroups: [ ".*" ]
# IgnoredGroups are regex strings of group ids that shall be ignored/skipped when exporting metrics. Ignored groups
# take precedence over allowed groups.
ignoredGroups: [ ]
topics:
# Enabled can be set to false in order to disable collecting any topic metrics.
enabled: true
# Granularity can be per topic or per partition. If you want to reduce the number of exported metric series and
# you aren't interested in per partition metrics you could choose "topic".
granularity: partition
# AllowedTopics are regex strings of topic names whose topic metrics that shall be exported.
# You can specify allowed topics by providing literals like "my-topic-name" or by providing regex expressions
# like "/internal-.*/".
allowedTopics: [ ".*" ]
# IgnoredTopics are regex strings of topic names that shall be ignored/skipped when exporting metrics. Ignored topics
# take precedence over allowed topics.
ignoredTopics: [ ]
# infoMetric is a configuration object for the kminion_kafka_topic_info metric
infoMetric:
# ConfigKeys are set of strings of Topic configs that you want to have exported as part of the metric
configKeys: [ "cleanup.policy" ]
logDirs:
# Enabled specifies whether log dirs shall be scraped and exported or not. This should be disabled for clusters prior
# to version 1.0.0 as describing log dirs was not supported back then.
enabled: true
# EndToEnd Metrics
# When enabled, kminion creates a topic which it produces to and consumes from, to measure various advanced metrics. See docs for more info
endToEnd:
enabled: false
# How often to send end-to-end test messages
probeInterval: 100ms
topicManagement:
# Controls whether kminion should create, validate, and alter the end-to-end topic.
#
# When enabled (true):
# - kminion creates the topic if it doesn't exist
# - Validates partition count and replica assignments on startup and periodically (based on reconciliationInterval)
# - Alters the topic (reassigns partitions, adds partitions) to ensure one partition leader per broker
# - Fails on startup if topic alteration operations fail
#
# When disabled (false):
# - kminion will NOT create the topic if it doesn't exist (startup will fail)
# - Will NOT validate or alter an existing topic (accepts current partition layout as-is)
# - Logs warnings if the topic configuration is suboptimal
# - Continues end-to-end testing with whatever partition layout exists
#
# Use disabled mode on managed Kafka platforms that restrict partition reassignment operations.
# In this mode, you must manually create the topic with appropriate partition distribution
# before starting kminion.
enabled: true
# Name of the topic kminion uses to send its test messages
# You do *not* need to change this if you are running multiple kminion instances on the same cluster.
# Different instances are perfectly fine with sharing the same topic!
name: kminion-end-to-end
# How often kminion checks its topic to validate configuration, partition count, and partition assignments
reconciliationInterval: 10m
# Depending on the desired monitoring (e.g. you want to alert on broker failure vs. cluster that is not writable)
# you may choose replication factor 1 or 3 most commonly.
replicationFactor: 1
# Rarely makes sense to change this, but maybe if you want some sort of cheap load test?
# By default (1) every broker gets one partition
partitionsPerBroker: 1
# Whether kminion should issue AlterPartitionAssignments requests to rebalance partition
# leaders across brokers. Disable this when running against Kafka-compatible clusters that
# reject AlterPartitionAssignments (e.g. Redpanda with autobalancer enabled, Confluent Cloud).
# Topic creation and partition count management remain active regardless of this setting.
rebalancePartitions: true
producer:
# This defines:
# - Maximum time to wait for an ack response after producing a message
# - Upper bound for histogram buckets in "produce_latency_seconds"
ackSla: 5s
# Can be to "all" (default) so kafka only reports an end-to-end test message as acknowledged if
# the message was written to all in-sync replicas of the partition.
# Or can be set to "leader" to only require to have written the message to its log.
requiredAcks: all
consumer:
# Prefix kminion uses when creating its consumer groups. Current kminion instance id will be appended automatically
groupIdPrefix: kminion-end-to-end
# Whether KMinion should try to delete empty consumer groups with the same prefix. This can be used if you want
# KMinion to clean up its old consumer groups. It should only be used if you use a unique prefix for KMinion.
deleteStaleConsumerGroups: false
# This defines:
# - Upper bound for histogram buckets in "roundtrip_latency"
# - Time limit beyond which a message is considered "lost" (failed the roundtrip)
roundtripSla: 20s
# - Upper bound for histogram buckets in "commit_latency_seconds"
# - Maximum time an offset commit is allowed to take before considering it failed
commitSla: 10s
exporter:
# Namespace is the prefix for all exported Prometheus metrics
namespace: "kminion"
# Host that shall be used to bind the HTTP server on
host: ""
# Port that shall be used to bind the HTTP server on
port: 8080
================================================
FILE: e2e/client_hooks.go
================================================
package e2e
import (
"net"
"sync/atomic"
"time"
"github.com/twmb/franz-go/pkg/kgo"
"github.com/twmb/franz-go/pkg/kmsg"
"go.uber.org/zap"
)
// clientHooks logs connect/disconnect events and keeps track of the broker that currently acts as the group coordinator (see OnBrokerRead)
type clientHooks struct {
logger *zap.Logger
lastCoordinatorUpdate time.Time
currentCoordinator *atomic.Value // kgo.BrokerMetadata
}
func newEndToEndClientHooks(logger *zap.Logger) *clientHooks {
return &clientHooks{
logger: logger.Named("e2e_hooks"),
currentCoordinator: &atomic.Value{},
}
}
func (c *clientHooks) OnBrokerConnect(meta kgo.BrokerMetadata, dialDur time.Duration, _ net.Conn, err error) {
if err != nil {
c.logger.Error("kafka connection failed", zap.String("broker_host", meta.Host), zap.Int32("broker_id", meta.NodeID), zap.Error(err))
return
}
c.logger.Debug("kafka connection succeeded",
zap.String("host", meta.Host), zap.Int32("broker_id", meta.NodeID),
zap.Int64("dial_duration_ms", dialDur.Milliseconds()))
}
func (c *clientHooks) OnDisconnect(meta kgo.BrokerMetadata, _ net.Conn) {
c.logger.Warn("kafka broker disconnected", zap.Int32("broker_id", meta.NodeID),
zap.String("host", meta.Host))
}
// OnBrokerWrite is passed the broker metadata, the key for the request that
// was written, the number of bytes written, how long the request
// waited before being written, how long it took to write the request,
// and any error.
//
// The bytes written does not count any tls overhead.
// OnWrite is called after a write to a broker.
//
// OnWrite(meta BrokerMetadata, key int16, bytesWritten int, writeWait, timeToWrite time.Duration, err error)
func (c *clientHooks) OnBrokerWrite(meta kgo.BrokerMetadata, key int16, bytesWritten int, writeWait, timeToWrite time.Duration, err error) {
keyName := kmsg.NameForKey(key)
if keyName != "OffsetCommit" {
return
}
// c.logger.Info("hooks onWrite",
// zap.Duration("timeToWrite", timeToWrite),
// zap.NamedError("err", err))
}
// OnBrokerRead is passed the broker metadata, the key for the response that
// was read, the number of bytes read, how long the Client waited
// before reading the response, how long it took to read the response,
// and any error.
//
// The bytes written does not count any tls overhead.
// OnRead is called after a read from a broker.
// OnRead(meta BrokerMetadata, key int16, bytesRead int, readWait, timeToRead time.Duration, err error)
func (c *clientHooks) OnBrokerRead(meta kgo.BrokerMetadata, key int16, bytesRead int, readWait, timeToRead time.Duration, err error) {
consumerGroupMsgKeys := []int16{
(&kmsg.OffsetCommitResponse{}).Key(),
(&kmsg.JoinGroupResponse{}).Key(),
(&kmsg.HeartbeatResponse{}).Key(),
(&kmsg.SyncGroupResponse{}).Key(),
}
isMessageFromGroupCoordinator := isInArray(key, consumerGroupMsgKeys)
if !isMessageFromGroupCoordinator {
return
}
if err == nil {
c.currentCoordinator.Store(meta)
c.lastCoordinatorUpdate = time.Now()
}
}
================================================
FILE: e2e/config.go
================================================
package e2e
import (
"fmt"
"time"
)
type Config struct {
Enabled bool `koanf:"enabled"`
TopicManagement EndToEndTopicConfig `koanf:"topicManagement"`
ProbeInterval time.Duration `koanf:"probeInterval"`
Producer EndToEndProducerConfig `koanf:"producer"`
Consumer EndToEndConsumerConfig `koanf:"consumer"`
}
func (c *Config) SetDefaults() {
c.Enabled = false
c.ProbeInterval = 100 * time.Millisecond
c.TopicManagement.SetDefaults()
c.Producer.SetDefaults()
c.Consumer.SetDefaults()
}
func (c *Config) Validate() error {
if !c.Enabled {
return nil
}
// If the duration is specified as 0s, 0ms, or any other variant of zero, it is parsed as 0
if c.ProbeInterval == 0 {
return fmt.Errorf("failed to validate probeInterval config, the duration can't be zero")
}
err := c.TopicManagement.Validate()
if err != nil {
return fmt.Errorf("failed to validate topicManagement config: %w", err)
}
_, err = time.ParseDuration(c.ProbeInterval.String())
if err != nil {
return fmt.Errorf("failed to parse '%s' to time.Duration: %v", c.ProbeInterval.String(), err)
}
err = c.Producer.Validate()
if err != nil {
return fmt.Errorf("failed to validate producer config: %w", err)
}
err = c.Consumer.Validate()
if err != nil {
return fmt.Errorf("failed to validate consumer config: %w", err)
}
return nil
}
================================================
FILE: e2e/config_consumer.go
================================================
package e2e
import (
"fmt"
"time"
)
type EndToEndConsumerConfig struct {
GroupIdPrefix string `koanf:"groupIdPrefix"`
DeleteStaleConsumerGroups bool `koanf:"deleteStaleConsumerGroups"`
// RoundtripSLA is the time duration from the moment where we try to produce until the moment where we consumed
// the message. Therefore this should always be higher than the produceTimeout / SLA.
RoundtripSla time.Duration `koanf:"roundtripSla"`
CommitSla time.Duration `koanf:"commitSla"`
}
func (c *EndToEndConsumerConfig) SetDefaults() {
c.GroupIdPrefix = "kminion-end-to-end"
c.DeleteStaleConsumerGroups = false
c.RoundtripSla = 20 * time.Second
c.CommitSla = 5 * time.Second
}
func (c *EndToEndConsumerConfig) Validate() error {
if len(c.GroupIdPrefix) < 3 {
return fmt.Errorf("kminion prefix should be at least 3 characters long")
}
if c.RoundtripSla <= 0 {
return fmt.Errorf("consumer.roundtripSla must be greater than zero")
}
if c.CommitSla <= 0 {
return fmt.Errorf("consumer.commitSla must be greater than zero")
}
return nil
}
================================================
FILE: e2e/config_producer.go
================================================
package e2e
import (
"fmt"
"time"
)
type EndToEndProducerConfig struct {
AckSla time.Duration `koanf:"ackSla"`
RequiredAcks string `koanf:"requiredAcks"`
}
func (c *EndToEndProducerConfig) SetDefaults() {
c.AckSla = 5 * time.Second
c.RequiredAcks = "all"
}
func (c *EndToEndProducerConfig) Validate() error {
if c.RequiredAcks != "all" && c.RequiredAcks != "leader" {
return fmt.Errorf("producer.requiredAcks must be 'all' or 'leader")
}
if c.AckSla <= 0 {
return fmt.Errorf("producer.ackSla must be greater than zero")
}
return nil
}
================================================
FILE: e2e/config_topic.go
================================================
package e2e
import (
"fmt"
"time"
)
type EndToEndTopicConfig struct {
Enabled bool `koanf:"enabled"`
Name string `koanf:"name"`
ReplicationFactor int `koanf:"replicationFactor"`
PartitionsPerBroker int `koanf:"partitionsPerBroker"`
ReconciliationInterval time.Duration `koanf:"reconciliationInterval"`
// RebalancePartitions controls whether kminion will issue AlterPartitionAssignments
// requests to rebalance partition leaders across brokers. Set to false when running
// against Kafka-compatible clusters (e.g. Redpanda with autobalancer enabled, Confluent
// Cloud) that reject AlterPartitionAssignments. Topic creation and partition count
// management remain active regardless of this setting.
RebalancePartitions bool `koanf:"rebalancePartitions"`
}
func (c *EndToEndTopicConfig) SetDefaults() {
c.Enabled = true
c.Name = "kminion-end-to-end"
c.ReplicationFactor = 1
c.PartitionsPerBroker = 1
c.ReconciliationInterval = 10 * time.Minute
c.RebalancePartitions = true
}
func (c *EndToEndTopicConfig) Validate() error {
if c.ReplicationFactor < 1 {
return fmt.Errorf("failed to parse replicationFactor, it should be more than 1, retrieved value %v", c.ReplicationFactor)
}
if c.PartitionsPerBroker < 1 {
return fmt.Errorf("failed to parse partitionsPerBroker, it should be more than 1, retrieved value %v", c.PartitionsPerBroker)
}
// If the duration is specified as 0s, 0ms, or any other variant of zero, it is parsed as 0
if c.ReconciliationInterval == 0 {
return fmt.Errorf("failed to validate topic.ReconciliationInterval config, the duration can't be zero")
}
return nil
}
================================================
FILE: e2e/config_topic_test.go
================================================
package e2e
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
)
func TestEndToEndTopicConfig_SetDefaults(t *testing.T) {
var cfg EndToEndTopicConfig
cfg.SetDefaults()
assert.True(t, cfg.Enabled)
assert.Equal(t, "kminion-end-to-end", cfg.Name)
assert.Equal(t, 1, cfg.ReplicationFactor)
assert.Equal(t, 1, cfg.PartitionsPerBroker)
assert.Equal(t, 10*time.Minute, cfg.ReconciliationInterval)
assert.True(t, cfg.RebalancePartitions, "RebalancePartitions should default to true for backward compatibility")
}
func TestEndToEndTopicConfig_Validate(t *testing.T) {
tests := []struct {
name string
cfg EndToEndTopicConfig
wantErr bool
}{
{
name: "valid config with rebalance enabled",
cfg: EndToEndTopicConfig{
Enabled: true,
Name: "test-topic",
ReplicationFactor: 3,
PartitionsPerBroker: 1,
ReconciliationInterval: 10 * time.Minute,
RebalancePartitions: true,
},
wantErr: false,
},
{
name: "valid config with rebalance disabled",
cfg: EndToEndTopicConfig{
Enabled: true,
Name: "test-topic",
ReplicationFactor: 3,
PartitionsPerBroker: 1,
ReconciliationInterval: 10 * time.Minute,
RebalancePartitions: false,
},
wantErr: false,
},
{
name: "invalid replication factor",
cfg: EndToEndTopicConfig{
ReplicationFactor: 0,
PartitionsPerBroker: 1,
ReconciliationInterval: 10 * time.Minute,
RebalancePartitions: false,
},
wantErr: true,
},
{
name: "invalid partitions per broker",
cfg: EndToEndTopicConfig{
ReplicationFactor: 1,
PartitionsPerBroker: 0,
ReconciliationInterval: 10 * time.Minute,
RebalancePartitions: false,
},
wantErr: true,
},
{
name: "zero reconciliation interval",
cfg: EndToEndTopicConfig{
ReplicationFactor: 1,
PartitionsPerBroker: 1,
ReconciliationInterval: 0,
RebalancePartitions: false,
},
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := tt.cfg.Validate()
if tt.wantErr {
require.Error(t, err)
} else {
require.NoError(t, err)
}
})
}
}
// TestPartitionPlanner_RebalancePartitionsDisabled verifies that when
// RebalancePartitions is false the planner still produces a valid plan
// (the caller in topic.go is responsible for not executing reassignments),
// and that partition creation assignments are still generated normally.
func TestPartitionPlanner_RebalancePartitionsDisabled(t *testing.T) {
// Three brokers, topic already exists with suboptimal leader distribution:
// all partitions led by broker 0.
meta := buildMeta("e2e",
map[int32]string{0: "", 1: "", 2: ""},
[][]int32{
{0, 1, 2},
{0, 2, 1},
{0, 1, 2},
},
)
cfg := EndToEndTopicConfig{
Enabled: true,
Name: "e2e",
ReplicationFactor: 3,
PartitionsPerBroker: 1,
ReconciliationInterval: 10 * time.Minute,
RebalancePartitions: false,
}
planner := NewPartitionPlanner(cfg, zap.NewNop())
plan, err := planner.Plan(meta)
require.NoError(t, err)
// The planner should still detect that reassignments are needed —
// it's the caller's responsibility to skip executing them.
assert.NotEmpty(t, plan.Reassignments, "planner should detect reassignments are needed")
for _, ra := range plan.Reassignments {
assertNoDuplicates(t, ra.Replicas)
assert.Len(t, ra.Replicas, cfg.ReplicationFactor)
}
// No new partitions should be created (3 brokers × 1 per broker = 3 already exist).
assert.Empty(t, plan.CreateAssignments)
assert.Equal(t, 3, plan.FinalPartitionCount)
}
// TestPartitionPlanner_RebalancePartitionsDisabled_Creates verifies that when
// RebalancePartitions is false and new partitions need to be created, Phase 3
// uses actual current leaders (not predicted leaders from staged reassignments)
// to pick the preferred leader for new partitions.
func TestPartitionPlanner_RebalancePartitionsDisabled_Creates(t *testing.T) {
// 4 brokers, 3 partitions all led by broker 0.
// PartitionsPerBroker=1 means desired = 4, so Phase 3 must create 1.
meta := buildMeta("e2e",
map[int32]string{0: "", 1: "", 2: "", 3: ""},
[][]int32{
{0, 1, 2},
{0, 2, 3},
{0, 1, 3},
},
)
cfg := EndToEndTopicConfig{
Enabled: true,
Name: "e2e",
ReplicationFactor: 3,
PartitionsPerBroker: 1,
ReconciliationInterval: 10 * time.Minute,
RebalancePartitions: false,
}
planner := NewPartitionPlanner(cfg, zap.NewNop())
plan, err := planner.Plan(meta)
require.NoError(t, err)
// Phase 3 should create exactly 1 partition (4 desired - 3 existing).
require.Len(t, plan.CreateAssignments, 1)
// The new partition's preferred leader should NOT be broker 0,
// because actual state shows broker 0 already leads 3 partitions.
// With rebalancePartitions=false, Phase 3 counts from actual leaders,
// so it should pick one of the under-represented brokers (1, 2, or 3).
newLeader := plan.CreateAssignments[0].Replicas[0]
assert.NotEqual(t, int32(0), newLeader,
"new partition should not be led by broker 0 (already leads 3 partitions in actual state)")
}
================================================
FILE: e2e/consumer.go
================================================
package e2e
import (
"context"
"encoding/json"
"strconv"
"time"
"github.com/twmb/franz-go/pkg/kgo"
"github.com/twmb/franz-go/pkg/kmsg"
"go.uber.org/zap"
)
func (s *Service) startConsumeMessages(ctx context.Context, initializedCh chan<- bool) {
client := s.client
s.logger.Info("starting to consume end-to-end topic",
zap.String("topic_name", s.config.TopicManagement.Name),
zap.String("group_id", s.groupId))
isInitialized := false
for {
fetches := client.PollFetches(ctx)
if !isInitialized {
isInitialized = true
initializedCh <- true
close(initializedCh)
}
// Log all errors and continue afterwards as we might get errors and still have some fetch results
errors := fetches.Errors()
for _, err := range errors {
s.logger.Error("kafka fetch error",
zap.String("topic", err.Topic),
zap.Int32("partition", err.Partition),
zap.Error(err.Err))
}
fetches.EachRecord(s.processMessage)
}
}
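// commitOffsets commits all uncommitted offsets and records the commit latency and
// commit count, labelled with the broker ID of the current group coordinator.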
func (s *Service) commitOffsets(ctx context.Context) {
client := s.client
uncommittedOffset := client.UncommittedOffsets()
if uncommittedOffset == nil {
return
}
startCommitTimestamp := time.Now()
childCtx, cancel := context.WithTimeout(ctx, s.config.Consumer.CommitSla)
client.CommitOffsets(childCtx, uncommittedOffset, func(_ *kgo.Client, req *kmsg.OffsetCommitRequest, r *kmsg.OffsetCommitResponse, err error) {
cancel()
coordinator := s.clientHooks.currentCoordinator.Load().(kgo.BrokerMetadata)
coordinatorID := strconv.Itoa(int(coordinator.NodeID))
latency := time.Since(startCommitTimestamp)
s.offsetCommitLatency.WithLabelValues(coordinatorID).Observe(latency.Seconds())
s.offsetCommitsTotal.WithLabelValues(coordinatorID).Inc()
// We do this to ensure that a series with that coordinator id is initialized
s.offsetCommitsTotal.WithLabelValues(coordinatorID).Add(0)
// If we have at least one error in our commit response we want to report it as an error with an appropriate
// reason as label.
if errCode := s.logCommitErrors(r, err); errCode != "" {
s.offsetCommitsFailedTotal.WithLabelValues(coordinatorID, errCode).Inc()
return
}
})
}
// processMessage:
// - deserializes the message
// - checks if it is from us, or from another kminion process running somewhere else
// - hands it off to the service, which then reports metrics on it
func (s *Service) processMessage(record *kgo.Record) {
if record.Value == nil {
// Init messages have nil values - we want to skip these. They are only used to make sure a consumer is ready.
return
}
var msg EndToEndMessage
if jerr := json.Unmarshal(record.Value, &msg); jerr != nil {
s.logger.Error("failed to unmarshal message value", zap.Error(jerr))
return // maybe older version
}
if msg.MinionID != s.minionID {
return // not from us
}
// restore partition, which is not serialized
msg.partition = int(record.Partition)
s.messageTracker.onMessageArrived(&msg)
}
================================================
FILE: e2e/endtoend_message.go
================================================
package e2e
import "time"
const (
_ = iota
EndToEndMessageStateCreated
EndToEndMessageStateProducedSuccessfully
)
type EndToEndMessage struct {
MinionID string `json:"minionID"` // unique for each running kminion instance
MessageID string `json:"messageID"` // unique for each message
Timestamp int64 `json:"createdUtcNs"` // when the message was created, unix nanoseconds
// The following properties are only used within the message tracker
partition int
state int
produceLatency float64
}
func (m *EndToEndMessage) creationTime() time.Time {
return time.Unix(0, m.Timestamp)
}
================================================
FILE: e2e/group_tracker.go
================================================
package e2e
import (
"context"
"strings"
"time"
"github.com/twmb/franz-go/pkg/kerr"
"github.com/twmb/franz-go/pkg/kgo"
"github.com/twmb/franz-go/pkg/kmsg"
"go.uber.org/zap"
)
const (
oldGroupCheckInterval = 5 * time.Second // how often to check for old kminion groups
oldGroupMaxAge = 20 * time.Second // maximum age after which an old group should be deleted
)
// groupTracker keeps checking for empty consumerGroups matching the kminion prefix.
// When a group was seen empty for some time, we delete it.
// Why?
// Whenever a kminion instance starts up it creates a consumer-group for itself in order to not "collide" with other kminion instances.
// When an instance restarts (for whatever reason), it creates a new group again, so we'd end up with a lot of unused groups.
type groupTracker struct {
cfg Config
logger *zap.Logger
client *kgo.Client // kafka client
groupId string // our own groupId
potentiallyEmptyGroups map[string]time.Time // groupName -> utc timestamp when the group was first seen
}
func newGroupTracker(cfg Config, logger *zap.Logger, client *kgo.Client, groupID string) *groupTracker {
return &groupTracker{
cfg: cfg,
logger: logger.Named("group_tracker"),
client: client,
groupId: groupID,
potentiallyEmptyGroups: make(map[string]time.Time),
}
}
func (g *groupTracker) start(ctx context.Context) {
g.logger.Debug("starting group tracker")
deleteOldGroupsTicker := time.NewTicker(oldGroupCheckInterval)
for {
select {
case <-ctx.Done():
g.logger.Debug("stopping group tracker, context was cancelled")
return
case <-deleteOldGroupsTicker.C:
childCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
err := g.checkAndDeleteOldConsumerGroups(childCtx)
if err != nil {
g.logger.Error("failed to check for old consumer groups: %w", zap.Error(err))
}
cancel()
}
}
}
func (g *groupTracker) checkAndDeleteOldConsumerGroups(ctx context.Context) error {
groupsRq := kmsg.NewListGroupsRequest()
groupsRq.StatesFilter = []string{"Empty"}
g.logger.Debug("checking for stale kminion consumer groups")
shardedResponse := g.client.RequestSharded(ctx, &groupsRq)
// find groups that start with the kminion prefix
matchingGroups := make([]string, 0)
for _, shard := range shardedResponse {
if shard.Err != nil {
g.logger.Error("error in response to ListGroupsRequest", zap.Int32("broker_id", shard.Meta.NodeID), zap.Error(shard.Err))
continue
}
r, ok := shard.Resp.(*kmsg.ListGroupsResponse)
if !ok {
g.logger.Error("cannot cast responseShard.Resp to kmsg.ListGroupsResponse")
continue
}
for _, group := range r.Groups {
name := group.Group
if name == g.groupId {
continue // skip our own consumer group
}
if strings.HasPrefix(name, g.cfg.Consumer.GroupIdPrefix) {
matchingGroups = append(matchingGroups, name)
}
}
}
// save new (previously unseen) groups to tracker
g.logger.Debug("checked for stale consumer groups", zap.Int("found_groups", len(matchingGroups)), zap.Strings("groups", matchingGroups))
for _, name := range matchingGroups {
_, exists := g.potentiallyEmptyGroups[name]
if !exists {
// add it with the current timestamp
g.potentiallyEmptyGroups[name] = time.Now()
g.logger.Debug("found new empty kminion group, adding it to the tracker", zap.String("group", name))
}
}
// go through saved groups:
// - don't track the ones we don't see anymore (bc they got deleted or are not empty anymore)
// - mark the ones that are too old (have been observed as empty for too long)
groupsToDelete := make([]string, 0)
for name, firstSeen := range g.potentiallyEmptyGroups {
exists, _ := containsStr(matchingGroups, name)
if exists {
// still there, check age and maybe delete it
age := time.Since(firstSeen)
if age > oldGroupMaxAge {
// group was unused for too long, delete it
groupsToDelete = append(groupsToDelete, name)
delete(g.potentiallyEmptyGroups, name)
}
} else {
// does not exist anymore, it must have been deleted, or is in use now (no longer empty)
// don't track it anymore
delete(g.potentiallyEmptyGroups, name)
}
}
// actually delete the groups we've decided to delete
if len(groupsToDelete) == 0 {
return nil
}
deleteRq := kmsg.NewDeleteGroupsRequest()
deleteRq.Groups = groupsToDelete
deleteResp := g.client.RequestSharded(ctx, &deleteRq)
// done, now just errors
// if we get a not authorized error we'll disable deleting groups
foundNotAuthorizedError := false
deletedGroups := make([]string, 0)
for _, shard := range deleteResp {
if shard.Err != nil {
g.logger.Error("sharded consumer group delete request failed", zap.Error(shard.Err))
continue
}
resp, ok := shard.Resp.(*kmsg.DeleteGroupsResponse)
if !ok {
g.logger.Error("failed to cast shard response to DeleteGroupsResponse while handling an error for deleting groups", zap.String("shard_host", shard.Meta.Host), zap.Int32("broker_id", shard.Meta.NodeID), zap.Error(shard.Err))
continue
}
for _, groupResp := range resp.Groups {
err := kerr.ErrorForCode(groupResp.ErrorCode)
if err != nil {
g.logger.Error("failed to delete consumer group", zap.String("shard", shard.Meta.Host), zap.Int32("broker_id", shard.Meta.NodeID), zap.String("group", groupResp.Group), zap.Error(err))
if groupResp.ErrorCode == kerr.GroupAuthorizationFailed.Code {
foundNotAuthorizedError = true
}
} else {
deletedGroups = append(deletedGroups, groupResp.Group)
}
}
}
g.logger.Info("deleted old consumer groups", zap.Strings("deleted_groups", deletedGroups))
if foundNotAuthorizedError {
g.logger.Info("disabling trying to delete old kminion consumer-groups since one of the last delete results had an 'GroupAuthorizationFailed' error")
}
return nil
}
================================================
FILE: e2e/message_tracker.go
================================================
package e2e
import (
"fmt"
"strconv"
"time"
"github.com/jellydator/ttlcache/v2"
"go.uber.org/zap"
)
// messageTracker keeps track of the messages' lifetime
//
// When we successfully send a message, it will be added to this tracker.
// Later, when we receive the message back in the consumer, the message is marked as completed and removed from the tracker.
// If the message does not arrive within the configured `consumer.roundtripSla`, it is counted as lost. Messages that
// failed to be produced are not considered lost messages.
//
// We use a dedicated counter to track messages that couldn't be produced to Kafka.
type messageTracker struct {
svc *Service
logger *zap.Logger
cache *ttlcache.Cache
}
func newMessageTracker(svc *Service) *messageTracker {
defaultExpirationDuration := svc.config.Consumer.RoundtripSla
cache := ttlcache.NewCache()
cache.SetTTL(defaultExpirationDuration)
t := &messageTracker{
svc: svc,
logger: svc.logger.Named("message_tracker"),
cache: cache,
}
t.cache.SetExpirationReasonCallback(func(key string, reason ttlcache.EvictionReason, value interface{}) {
t.onMessageExpired(key, reason, value.(*EndToEndMessage))
})
return t
}
func (t *messageTracker) addToTracker(msg *EndToEndMessage) {
t.cache.Set(msg.MessageID, msg)
}
// updateItemIfExists only updates a message if it still exists in the cache. The remaining time to live will not
// be refreshed.
// If it doesn't exist, a ttlcache.ErrNotFound error will be returned.
func (t *messageTracker) updateItemIfExists(msg *EndToEndMessage) error {
_, ttl, err := t.cache.GetWithTTL(msg.MessageID)
if err != nil {
if err == ttlcache.ErrNotFound {
return err
}
panic(err)
}
// Because the returned TTL is set to the original TTL duration (and not the remaining TTL) we have to calculate
// the remaining TTL now as we want to update the existing cache item without changing the remaining time to live.
expiryTimestamp := msg.creationTime().Add(ttl)
remainingTTL := expiryTimestamp.Sub(time.Now())
if remainingTTL < 0 {
// This entry should have been deleted already. Race condition.
return ttlcache.ErrNotFound
}
err = t.cache.SetWithTTL(msg.MessageID, msg, remainingTTL)
if err != nil {
panic(err)
}
return nil
}
// removeFromTracker removes an entry from the cache. If the key does not exist it will return a ttlcache.ErrNotFound error.
func (t *messageTracker) removeFromTracker(messageID string) error {
return t.cache.Remove(messageID)
}
func (t *messageTracker) onMessageArrived(arrivedMessage *EndToEndMessage) {
cm, err := t.cache.Get(arrivedMessage.MessageID)
if err != nil {
if err == ttlcache.ErrNotFound {
// message expired and was removed from the cache
// it arrived too late, nothing to do here...
return
} else {
panic(fmt.Errorf("failed to get message from cache: %w", err))
}
}
msg := cm.(*EndToEndMessage)
expireTime := msg.creationTime().Add(t.svc.config.Consumer.RoundtripSla)
arrivedInTime := time.Now().Before(expireTime)
latency := time.Now().Sub(msg.creationTime())
if !arrivedInTime {
// Message arrived late, but was still in cache. We don't increment the lost counter here because eventually
// it will be evicted from the cache. This case should only pop up if the sla time is exceeded, but if the
// item has not been evicted from the cache yet.
t.logger.Info("message arrived late, will be marked as a lost message",
zap.Int64("delay_ms", latency.Milliseconds()),
zap.String("id", msg.MessageID))
return
}
// message arrived early enough
pID := strconv.Itoa(msg.partition)
t.svc.messagesReceived.WithLabelValues(pID).Inc()
t.svc.roundtripLatency.WithLabelValues(pID).Observe(latency.Seconds())
// Remove message from cache, so that we don't track it any longer and won't mark it as lost when the entry expires.
t.cache.Remove(msg.MessageID)
}
func (t *messageTracker) onMessageExpired(_ string, reason ttlcache.EvictionReason, value interface{}) {
if reason == ttlcache.Removed {
// We are not interested in messages that have been removed by us!
return
}
msg := value.(*EndToEndMessage)
created := msg.creationTime()
age := time.Since(created)
t.svc.lostMessages.WithLabelValues(strconv.Itoa(msg.partition)).Inc()
t.logger.Debug("message expired/lost",
zap.Int64("age_ms", age.Milliseconds()),
zap.Int("partition", msg.partition),
zap.String("message_id", msg.MessageID),
zap.Bool("successfully_produced", msg.state == EndToEndMessageStateProducedSuccessfully),
zap.Float64("produce_latency_seconds", msg.produceLatency),
)
}
================================================
FILE: e2e/partition_planner.go
================================================
package e2e
import (
"fmt"
"sort"
"github.com/twmb/franz-go/pkg/kmsg"
"go.uber.org/zap"
)
// PartitionPlanner is a thin orchestrator around the three planning phases. It wires in
// configuration, logging, and the replica-selection strategy. The planning phases are:
// 1. Fix replication factor and rack diversity on existing partitions.
// 2. Ensure every broker is the preferred leader of at least 1 partition.
// 3. Ensure total partition count meets the configured lower bound.
//
// The probe topic is created to continuously test end-to-end availability by
// producing and consuming records per partition. The planner enforces:
//
// - Correct replication factor (RF == configured RF) and no duplicate brokers
// within a single partition's replica set.
// - Rack awareness: maximize the number of unique racks per partition (bounded
// by min(RF, #racks)). This reduces the blast radius of a rack failure.
// - Sufficient partition count: >= max(current, #brokers, partitionsPerBroker*#brokers).
// - Leader coverage: every broker must be the preferred leader (replicas[0]) of
// at least one partition, so a per-broker failure is observable.
// - Minimal movement: prefer to fix RF and rack issues first, then rotate/swap
// leaders to fill gaps, and only create partitions when necessary.
type PartitionPlanner struct {
cfg EndToEndTopicConfig
logger *zap.Logger
sel ReplicaSelector
}
// NewPartitionPlanner constructs a Planner with the given config & logger. The replica
// selector is chosen in Plan() once we have ClusterState ready.
func NewPartitionPlanner(cfg EndToEndTopicConfig, logger *zap.Logger) *PartitionPlanner {
return &PartitionPlanner{cfg: cfg, logger: logger}
}
// Plan produces an in-memory plan (reassignments + creations) for the probe
// topic based on current cluster metadata. See the package header for the
// invariants we enforce.
func (p *PartitionPlanner) Plan(meta *kmsg.MetadataResponse) (*Plan, error) {
if meta == nil || len(meta.Topics) == 0 {
return nil, fmt.Errorf("metadata response has no topics")
}
if len(meta.Brokers) == 0 {
return nil, fmt.Errorf("metadata response has no brokers")
}
if p.cfg.ReplicationFactor > len(meta.Brokers) {
return nil, fmt.Errorf("replication factor %d exceeds available brokers %d", p.cfg.ReplicationFactor, len(meta.Brokers))
}
// Build state required for the planning
state := BuildState(meta)
desired := ComputeDesired(state, p.cfg)
tracker := NewLoadTracker(state)
selector := NewRackAwareSelector(state, tracker)
p.sel = selector
b := NewPlanBuilder(state, desired, tracker, p.cfg.RebalancePartitions)
// Phase 1: normalize RF and racks (low movement first)
// Grow/trim replicas to configured RF and re-pick to maximize unique racks
// per partition. We avoid moving the leader when possible.
fixReplicationAndRack(b, selector, p.cfg.ReplicationFactor)
// Phase 2: ensure each broker is preferred leader for >= 1 partition
// This guarantees probe coverage: if a broker dies, some partition leader is
// unavailable and the probe trips.
ensureLeaderCoverage(b, selector)
// Phase 3: ensure desired partition count
// Enforce >= max(current, #brokers, partitionsPerBroker * #brokers).
ensurePartitionCount(b, selector)
// Build final plan and log the changes
plan := b.Build()
// Log with appropriate level based on whether changes are needed
totalChanges := len(plan.Reassignments) + len(plan.CreateAssignments)
if totalChanges == 0 {
p.logger.Info("e2e probe topic partition leadership and replica distribution check completed - optimal",
zap.String("topic", state.TopicName),
zap.Int("brokers", len(state.Brokers)),
zap.Int("partitions", len(state.Partitions)),
zap.String("status", "optimal"),
)
} else {
p.logger.Info("plan to change partition leadership and replica placements on e2e topic has been prepared",
zap.String("topic", state.TopicName),
zap.Int("brokers", len(state.Brokers)),
zap.Int("cur_partitions", len(state.Partitions)),
zap.Int("final_partitions", plan.FinalPartitionCount),
zap.Int("reassignments", len(plan.Reassignments)),
zap.Int("creates", len(plan.CreateAssignments)),
zap.Int("total_changes", totalChanges),
)
}
return plan, nil
}
// -----------------------------------------------------------------------------
// Planning phases
// -----------------------------------------------------------------------------
// PlanBuilder holds a predictive view of partition -> replicas after applying
// staged operations. We never mutate ClusterState.Partitions; instead, we write
// new assignments into PlanBuilder.view and record high-level operations to
// produce Kafka requests at the end (see Plan.ToRequests).
//
// This keeps the planning phases simple and side-effect free.
type PlanBuilder struct {
state ClusterState
desired Desired
tracker *LoadTracker
// rebalancePartitions indicates whether reassignments will actually be executed.
// When false, Phase 3 uses actual current leaders instead of predicted leaders
// from the view, since reassignments won't be applied.
rebalancePartitions bool
// view is our predictive map: partitionID -> replicas (preferred leader at idx 0)
view map[int32][]int32
reassignments []Reassignment // staged reassignments for existing partitions
creations []CreateAssignment // staged creations of new partitions
}
// Reassignment captures a single partition’s new replica list.
//
// The order of Replicas matters: index 0 will become the preferred leader after
// reassignment completes on the broker side.
type Reassignment struct {
Partition int32
Replicas []int32
}
// CreateAssignment captures the replica list for a *new* partition that will be
// appended to the topic during CreatePartitions.
//
// The order of Replicas matters: index 0 is the preferred leader for the new
// partition.
type CreateAssignment struct {
Replicas []int32
}
// Plan is the final, immutable result of planning. It can be turned into Kafka
// requests via ToRequests.
//
// FinalPartitionCount is the topic’s partition count after applying creations.
// (Reassignments do not change the count.)
type Plan struct {
Reassignments []Reassignment
CreateAssignments []CreateAssignment
FinalPartitionCount int
}
// NewPlanBuilder initializes a predictive view by cloning the current
// partition->replicas map. We avoid accidental mutation by copying slices.
func NewPlanBuilder(state ClusterState, desired Desired, tracker *LoadTracker, rebalancePartitions bool) *PlanBuilder {
view := make(map[int32][]int32, len(state.Partitions))
for pid, p := range state.Partitions {
view[pid] = append([]int32(nil), p.Replicas...)
}
return &PlanBuilder{state: state, desired: desired, tracker: tracker, rebalancePartitions: rebalancePartitions, view: view}
}
// Build freezes the current staged operations into a Plan. We compute the final
// partition count as current + number of creates.
func (b *PlanBuilder) Build() *Plan {
return &Plan{
Reassignments: b.reassignments,
CreateAssignments: b.creations,
FinalPartitionCount: len(b.state.Partitions) + len(b.creations),
}
}
// CommitReassignment records a reassignment and updates the predictive view.
func (b *PlanBuilder) CommitReassignment(pid int32, reps []int32) {
b.reassignments = append(b.reassignments, Reassignment{Partition: pid, Replicas: reps})
b.view[pid] = reps
}
// CommitCreate records a new-partition assignment. The final partition count is
// computed when building the Plan.
func (b *PlanBuilder) CommitCreate(reps []int32) {
b.creations = append(b.creations, CreateAssignment{Replicas: reps})
}
// fixReplicationAndRack enforces configured RF on each existing partition
// (growing or shrinking as needed) and re-picks replicas when rack diversity can
// be improved. We try to keep the current leader by always retaining replicas[0]
// when shrinking.
func fixReplicationAndRack(b *PlanBuilder, sel ReplicaSelector, rf int) {
// Sort partition IDs for deterministic iteration
pids := make([]int32, 0, len(b.view))
for pid := range b.view {
pids = append(pids, pid)
}
sort.Slice(pids, func(i, j int) bool { return pids[i] < pids[j] })
for _, pid := range pids {
replicas := b.view[pid]
desiredRF := rf
newReplicas := replicas
switch {
// Grow: re-pick the full set based on the current actual leader.
case len(replicas) < desiredRF:
// Use the actual current leader from the partition metadata
currentPartition, exists := b.state.Partitions[pid]
preferredLeader := replicas[0] // fallback to preferred leader
if exists && currentPartition.Leader != -1 {
preferredLeader = currentPartition.Leader
}
newReplicas = sel.ChooseReplicas(preferredLeader, desiredRF)
// Shrink: keep leader; then pick remaining replicas preferring
// new racks, then lower load, then lower broker ID.
case len(replicas) > desiredRF:
newReplicas = shrinkPreservingLeader(b, pid, replicas, desiredRF)
// Same RF: if rack diversity can be improved, re-pick.
default:
if violatesRackDiversity(replicas, b.state.NumRacks, b.state.Brokers) {
// Use the actual current leader from the partition metadata
currentPartition, exists := b.state.Partitions[pid]
preferredLeader := replicas[0] // fallback to preferred leader
if exists && currentPartition.Leader != -1 {
preferredLeader = currentPartition.Leader
}
newReplicas = sel.ChooseReplicas(preferredLeader, desiredRF)
}
}
if !equalInt32s(newReplicas, replicas) {
b.CommitReassignment(pid, newReplicas)
}
}
}
// shrinkPreservingLeader returns a replica set of size rf that keeps the
// current leader and greedily prefers candidates that add a new rack;
// among equals, chooses lower load, then lower broker ID.
func shrinkPreservingLeader(b *PlanBuilder, pid int32, replicas []int32, rf int) []int32 {
// Use the actual current leader from the partition metadata, not replicas[0]
currentPartition, exists := b.state.Partitions[pid]
leader := replicas[0] // fallback to preferred leader if no current leader found
if exists && currentPartition.Leader != -1 {
leader = currentPartition.Leader
}
keep := []int32{leader}
seen := map[string]struct{}{
b.state.Brokers[leader].Rack: {},
}
type cand struct {
id int32
rack string
load int
}
// Build the candidate pool from non-leader replicas.
pool := make([]cand, 0, len(replicas)-1)
for _, id := range replicas[1:] {
pool = append(pool, cand{
id: id,
rack: b.state.Brokers[id].Rack,
load: b.tracker.Load(id).Replicas,
})
}
// Greedy selection with dynamic "seen racks".
for len(keep) < rf && len(pool) > 0 {
best := 0
for i := 1; i < len(pool); i++ {
a, b2 := pool[i], pool[best]
_, aSeen := seen[a.rack]
_, bSeen := seen[b2.rack]
switch {
// Prefer a candidate that adds a new rack.
case aSeen != bSeen:
if !aSeen && bSeen {
best = i
}
// Then prefer lower load.
case a.load != b2.load:
if a.load < b2.load {
best = i
}
// Then prefer lower broker ID (stable tie-breaker).
case a.id < b2.id:
best = i
}
}
chosen := pool[best]
keep = append(keep, chosen.id)
seen[chosen.rack] = struct{}{}
// Remove chosen from pool (swap-delete).
pool[best] = pool[len(pool)-1]
pool = pool[:len(pool)-1]
}
return keep
}
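// For illustration of shrinkPreservingLeader: assuming brokers 1 and 2 on rack "a",
// broker 3 on rack "b", replicas [1, 2, 3] with actual leader 1, and rf=2, the
// leader 1 is kept and broker 3 wins over broker 2 because it adds a new rack,
// so the shrunk set becomes [1, 3].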
// ensureLeaderCoverage guarantees that each broker becomes preferred leader for
// at least one partition. We try the cheapest options first:
// 1. If the broker already hosts a replica of some partition where it is not
// leader, rotate it to index 0 (no RF change, minimal movement).
// 2. Otherwise, replace a non-unique-rack replica in some donor partition and
// rotate the target broker to index 0.
// 3. If neither is possible, create a new partition led by the target broker.
func ensureLeaderCoverage(b *PlanBuilder, sel ReplicaSelector) {
// Guard: if actual leaders already cover all brokers, skip preferred leader rebalancing
actualLeaders := make(map[int32][]int32, len(b.state.BrokerIDs))
for _, id := range b.state.BrokerIDs {
actualLeaders[id] = nil
}
for pid, part := range b.state.Partitions {
if part.Leader != -1 {
actualLeaders[part.Leader] = append(actualLeaders[part.Leader], pid)
}
}
if len(brokersMissingLeadership(b.state.BrokerIDs, actualLeaders)) == 0 {
return // Actual coverage is perfect - no need to rebalance preferred leaders
}
// Build "leadersByBroker": broker -> list of partition IDs it currently leads (preferred).
leadersByBroker := indexLeaders(b.state.BrokerIDs, b.view)
// Brokers that currently lead zero partitions (preferred).
// However, if a broker already has actual leadership (even if not preferred),
// we can skip it to minimize unnecessary reassignments.
missing := []int32{}
for _, broker := range brokersMissingLeadership(b.state.BrokerIDs, leadersByBroker) {
// Skip if this broker already has actual leadership
if len(actualLeaders[broker]) > 0 {
continue
}
missing = append(missing, broker)
}
if len(missing) == 0 {
return
}
// Local helpers that both perform the action and update leadersByBroker.
rotateIfReplica := func(target int32, donors []int32) bool {
for _, donor := range donors {
// Collect candidate partitions where target is already a replica
pids := append([]int32(nil), leadersByBroker[donor]...)
// Sort with preference: partitions where the donor is the ACTUAL leader first.
// This ensures we're actually freeing up leadership from the donor, rather than
// rotating a partition where the donor is only the preferred leader.
// Then by partition ID for determinism.
sort.Slice(pids, func(i, j int) bool {
pi, pj := pids[i], pids[j]
// Prefer partitions where the donor is the actual leader
iDonorIsActual := b.state.Partitions[pi].Leader == donor
jDonorIsActual := b.state.Partitions[pj].Leader == donor
if iDonorIsActual != jDonorIsActual {
return iDonorIsActual
}
// Then by partition ID for stability
return pi < pj
})
for _, pid := range pids {
reps := b.view[pid]
if !contains(reps, target) {
continue
}
newReps := putFirst(reps, target) // make target the leader
b.CommitReassignment(pid, newReps)
// Update bookkeeping: pid moves from donor to target.
leadersByBroker[donor] = remove(leadersByBroker[donor], pid)
leadersByBroker[target] = append(leadersByBroker[target], pid)
return true
}
}
return false
}
replaceDuplicateAndRotate := func(target int32, donors []int32) bool {
for _, donor := range donors {
// Sort partition IDs for deterministic iteration
pids := append([]int32(nil), leadersByBroker[donor]...)
sort.Slice(pids, func(i, j int) bool { return pids[i] < pids[j] })
for _, pid := range pids {
reps := b.view[pid]
if contains(reps, target) {
continue // covered by rotate path above
}
idx := victimIndex(reps, b.state.Brokers)
if idx < 0 {
continue // no safe replica to swap (unique racks already)
}
newReps := append([]int32{}, reps...)
newReps[idx] = target
newReps = putFirst(newReps, target)
b.CommitReassignment(pid, newReps)
leadersByBroker[donor] = remove(leadersByBroker[donor], pid)
leadersByBroker[target] = append(leadersByBroker[target], pid)
return true
}
}
return false
}
for _, target := range missing {
// Donors: brokers leading more partitions than their soft target,
// sorted by largest surplus first (tie-breaker: smaller broker ID).
donors := donorBrokers(b.state.BrokerIDs, leadersByBroker, b.desired.TargetLeaders)
// 1) Cheapest: rotate target to lead where it already is a replica.
if rotateIfReplica(target, donors) {
continue
}
// 2) Next-cheapest: replace a duplicate-rack replica, then rotate.
if replaceDuplicateAndRotate(target, donors) {
continue
}
// 3) Last resort: create a new partition led by target.
reps := sel.ChooseReplicas(target, b.desired.RF)
b.CommitCreate(reps)
// Track a synthetic partition ID so counts stay consistent within this loop.
newPID := int32(len(b.state.Partitions) + len(b.creations) - 1)
leadersByBroker[target] = append(leadersByBroker[target], newPID)
}
}
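// For illustration of ensureLeaderCoverage: assuming three brokers 1..3 without
// racks, a view with two partitions p0=[1,2] and p1=[1,3] both actually led by
// broker 1, and a soft target of one leader per broker, broker 2 gains coverage
// by rotating p0 to [2,1] (it is already a replica there). Broker 3 then finds
// no donor left above its target, so a new partition led by broker 3 is created
// as the last resort.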
// ensurePartitionCount adds partitions until we reach DesiredPartitions.
//
// Leader selection strategy per new partition:
// 1. Prefer a broker still below its soft target (TargetLeaders). Among those,
// pick the one closest to its target (smallest positive gap).
// 2. If all brokers are at/above target, pick the broker that currently leads
// the fewest partitions (stable tie-breaker via leastLoadedLeader).
func ensurePartitionCount(b *PlanBuilder, sel ReplicaSelector) {
desiredTotal := b.desired.DesiredPartitions
total := len(b.state.Partitions) + len(b.creations)
if total >= desiredTotal || len(b.state.BrokerIDs) == 0 {
return
}
// Count current leaders per broker.
leaderCount := make(map[int32]int, len(b.state.BrokerIDs))
if b.rebalancePartitions {
// Use predictive view (reassignments will be applied)
for _, reps := range b.view {
if len(reps) > 0 {
leaderCount[reps[0]]++
}
}
} else {
// Use actual current leaders (reassignments won't be applied)
for _, p := range b.state.Partitions {
if p.Leader != -1 {
leaderCount[p.Leader]++
}
}
}
// Always include leaders from staged creates (Phase 2 fallback creates are always executed)
for _, ca := range b.creations {
if len(ca.Replicas) > 0 {
leaderCount[ca.Replicas[0]]++
}
}
// Make sure every broker has an entry in leaderCount.
for _, id := range b.state.BrokerIDs {
if _, ok := leaderCount[id]; !ok {
leaderCount[id] = 0
}
}
for total < desiredTotal {
// Pick a preferred leader for the new partition.
preferred := pickLeader(b.state.BrokerIDs, leaderCount, b.desired.TargetLeaders)
// Materialize replicas and commit the create.
reps := sel.ChooseReplicas(preferred, b.desired.RF)
b.CommitCreate(reps)
leaderCount[preferred]++
total++
}
}
// pickLeader returns the broker to prefer as the new partition's leader.
//
// If any brokers are still below their soft target, it returns the one with the
// smallest positive remaining gap (i.e., the fewest leaderships it is still missing).
// Otherwise, it returns the least-loaded leader.
func pickLeader(brokerIDs []int32, leaderCount map[int32]int, target map[int32]int) int32 {
var (
chosen int32
bestGap int // smallest positive gap seen so far
found bool
)
for _, id := range brokerIDs {
gap := target[id] - leaderCount[id] // how many leaders this broker still “should” get
if gap <= 0 {
continue
}
if !found || gap < bestGap {
chosen = id
bestGap = gap
found = true
}
}
if found {
return chosen
}
// Everyone at/above target: fall back to least leaders.
return leastLoadedLeader(leaderCount, brokerIDs)
}
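// For illustration of pickLeader: with brokers 1..3, leader counts {1: 2, 2: 1, 3: 0}
// and targets {1: 2, 2: 2, 3: 2}, broker 1 has no positive gap, broker 2 has a gap
// of 1 and broker 3 a gap of 2, so broker 2 (smallest positive gap) is returned.
// If every broker were at or above its target, the least-loaded leader would be
// returned instead.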
// -----------------------------------------------------------------------------
// State & Desired
// -----------------------------------------------------------------------------
// Broker is an immutable snapshot of a broker’s ID and rack label at plan time.
// An empty rack value means the broker did not advertise one.
//
// We never mutate this during planning; it mirrors the Metadata response.
type Broker struct {
ID int32
Rack string // empty string if unknown
}
// Partition is an immutable snapshot of a partition's current replica list.
//
// The order of Replicas matters: index 0 is the *preferred leader* for this
// partition, i.e., where the controller will attempt to place leadership after
// changes. The Leader field contains the *actual current leader* as reported
// by Kafka metadata, which may differ from the preferred leader (replicas[0]).
//
// We do not mutate these structs; instead we work with a predictive "view"
// inside PlanBuilder (below).
type Partition struct {
ID int32
Leader int32 // actual current leader from metadata
Replicas []int32 // order matters; index 0 is preferred leader
}
// ClusterState is a convenience wrapper around the metadata we actually need in
// this planner. It is deliberately small to keep the code readable.
//
// - TopicName: the probe topic name (for logging and request building)
// - Brokers: map of broker id -> Broker
// - BrokerIDs: sorted slice of broker IDs for deterministic iteration
// - ByRack: rack -> broker IDs, to reason about rack diversity
// - NumRacks: number of distinct racks (empty rack counts as one)
// - Partitions: map of partition id -> Partition snapshot
//
// None of the members are mutated after construction.
type ClusterState struct {
TopicName string
Brokers map[int32]Broker
BrokerIDs []int32 // sorted
ByRack map[string][]int32
NumRacks int
Partitions map[int32]Partition
}
// BuildState converts Metadata response to ClusterState and copies slices so
// planning cannot accidentally mutate the input.
func BuildState(meta *kmsg.MetadataResponse) ClusterState {
brokers := make(map[int32]Broker, len(meta.Brokers))
byRack := make(map[string][]int32)
var ids []int32
for _, b := range meta.Brokers {
rack := ""
if b.Rack != nil {
rack = *b.Rack
}
brokers[b.NodeID] = Broker{ID: b.NodeID, Rack: rack}
byRack[rack] = append(byRack[rack], b.NodeID)
ids = append(ids, b.NodeID)
}
sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] })
parts := make(map[int32]Partition)
for _, p := range meta.Topics[0].Partitions {
rep := append([]int32(nil), p.Replicas...)
parts[p.Partition] = Partition{ID: p.Partition, Leader: p.Leader, Replicas: rep}
}
return ClusterState{
TopicName: pointerStrToStr(meta.Topics[0].Topic),
Brokers: brokers,
BrokerIDs: ids,
ByRack: byRack,
NumRacks: len(byRack),
Partitions: parts,
}
}
type Desired struct {
RF int
DesiredPartitions int
TargetLeaders map[int32]int // per broker (even split)
}
// ComputeDesired derives the minimal partition count and a soft target for
// preferred leader distribution. We require at least one partition per broker so
// everyone can lead, and we honor PartitionsPerBroker as an additional lower
// bound.
func ComputeDesired(state ClusterState, cfg EndToEndTopicConfig) Desired {
perBroker := cfg.PartitionsPerBroker
if perBroker < 1 {
perBroker = 1
}
cur := len(state.Partitions)
desiredPartitions := max(cur, perBroker*len(state.BrokerIDs))
target := evenSplit(desiredPartitions, state.BrokerIDs)
return Desired{RF: cfg.ReplicationFactor, DesiredPartitions: desiredPartitions, TargetLeaders: target}
}
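// For illustration of ComputeDesired: with three brokers, two existing partitions,
// PartitionsPerBroker=2 and ReplicationFactor=3, desiredPartitions becomes
// max(2, 2*3) = 6 and TargetLeaders is an even split of 6 across the three
// brokers, i.e. two preferred leaderships each.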
// -----------------------------------------------------------------------------
// Replica selection (rack-aware) & load tracking
// -----------------------------------------------------------------------------
// Load captures, per broker, how many times it appears as a replica and as a
// preferred leader across the predictive view of the topic. We use this to
// bias selection towards less-loaded brokers to avoid hot spots.
//
// Note: this is a transient view local to a single planning run. Nothing here
// is persisted and no attempt is made to perfectly balance replicas across the
// cluster—only to avoid obviously uneven choices.
type Load struct {
Replicas int
Leaders int
}
// LoadTracker is a tiny helper holding a per-broker Load map with convenience
// methods for reading and incrementing counts while the plan is being
// constructed.
//
// Why not compute loads on the fly? We do that initially (from current
// assignments) and then update incrementally while choosing replicas for new or
// changed partitions to keep subsequent choices informed by earlier ones.
// Keeping it explicit in a struct makes the intent obvious and testing easier.
type LoadTracker struct {
l map[int32]Load
}
// NewLoadTracker builds initial loads from the current assignments in the
// metadata (before any staged changes). The caller updates loads as it makes
// predictive choices so the next decision can see the latest picture.
func NewLoadTracker(state ClusterState) *LoadTracker {
l := make(map[int32]Load, len(state.BrokerIDs))
for _, id := range state.BrokerIDs {
l[id] = Load{}
}
for _, p := range state.Partitions {
for _, r := range p.Replicas {
ld := l[r]
ld.Replicas++
l[r] = ld
}
// Count the actual current leader separately
if p.Leader != -1 { // -1 indicates no leader (error state)
ld := l[p.Leader]
ld.Leaders++
l[p.Leader] = ld
}
}
return &LoadTracker{l: l}
}
// AddReplica increments replica and (optionally) leader counts for a broker in
// the predictive view. Call this after the planner decides to place a replica
// (e.g., in ChooseReplicas).
func (t *LoadTracker) AddReplica(id int32, leader bool) {
ld := t.l[id]
ld.Replicas++
if leader {
ld.Leaders++
}
t.l[id] = ld
}
// Load returns the current transient load counters for a broker.
func (t *LoadTracker) Load(id int32) Load { return t.l[id] }
// ReplicaSelector abstracts the heuristic used to pick a concrete replica set
// for a given (preferred) leader and RF. Keeping this as an interface makes it
// trivial to swap strategy in tests if you’d like to assert specific behaviors.
// The production strategy we use is RackAwareSelector.
type ReplicaSelector interface {
ChooseReplicas(preferredLeader int32, rf int) []int32
}
// RackAwareSelector is a simple greedy strategy that tries to:
// 1. Always include the requested preferred leader at index 0.
// 2. Maximize rack diversity by preferring brokers on new racks first.
// 3. Among candidates on equally novel racks, prefer lower replica load.
// 4. Use broker ID as a final tiebreaker for determinism.
//
// After selecting, we update the transient LoadTracker so subsequent decisions
// are informed by this choice.
//
// This is intentionally not perfect or globally optimal, just a pragmatic heuristic
// that produces good, stable results for the probe topic.
type RackAwareSelector struct {
state ClusterState
loads *LoadTracker
}
// NewRackAwareSelector constructs the default selection strategy.
func NewRackAwareSelector(state ClusterState, loads *LoadTracker) *RackAwareSelector {
return &RackAwareSelector{state: state, loads: loads}
}
// ChooseReplicas returns an ordered replica list of length rf where index 0 is
// the preferred leader. Candidates are scored by new rack first, then lower
// load, then lower broker ID.
func (s *RackAwareSelector) ChooseReplicas(preferredLeader int32, rf int) []int32 {
rf = min(rf, len(s.state.BrokerIDs))
res := make([]int32, 0, rf)
res = append(res, preferredLeader)
seen := map[int32]struct{}{preferredLeader: {}}
usedRack := map[string]struct{}{s.state.Brokers[preferredLeader].Rack: {}}
type cand struct {
id int32
rack string
load int
}
build := func() []cand {
out := make([]cand, 0, len(s.state.BrokerIDs))
for _, id := range s.state.BrokerIDs {
if _, ok := seen[id]; ok {
continue
}
out = append(out, cand{id: id, rack: s.state.Brokers[id].Rack, load: s.loads.Load(id).Replicas})
}
sort.Slice(out, func(i, j int) bool {
_, iu := usedRack[out[i].rack]
_, ju := usedRack[out[j].rack]
if iu != ju {
return !iu && ju
}
if out[i].load != out[j].load {
return out[i].load < out[j].load
}
return out[i].id < out[j].id
})
return out
}
for len(res) < rf {
cands := build()
if len(cands) == 0 {
break
}
c := cands[0]
res = append(res, c.id)
seen[c.id] = struct{}{}
usedRack[c.rack] = struct{}{}
}
// update transient loads
for i, id := range res {
s.loads.AddReplica(id, i == 0)
}
return res
}
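// For illustration of ChooseReplicas: assuming brokers 1 and 2 on rack "a",
// broker 3 on rack "b" and equal replica loads, ChooseReplicas(1, 3) starts with
// [1], prefers broker 3 next because it adds the unused rack "b", and finally
// adds broker 2, yielding [1, 3, 2].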
// -----------------------------------------------------------------------------
// Plan -> Kafka requests
// -----------------------------------------------------------------------------
// ToRequests converts a Plan to Kafka admin requests. Either result may be nil
// if the plan contains no operations of that type.
//
// rebalancePartitions controls whether explicit replica assignments are included
// in the CreatePartitions request. Set it to false for Redpanda Cloud, which
// disallows explicit partition assignments via the Kafka API and returns
// INVALID_REQUEST when they are present.
func (p *Plan) ToRequests(topic string, rebalancePartitions bool) (*kmsg.AlterPartitionAssignmentsRequest, *kmsg.CreatePartitionsRequest) {
var alter *kmsg.AlterPartitionAssignmentsRequest
var create *kmsg.CreatePartitionsRequest
if len(p.Reassignments) > 0 {
r := kmsg.NewAlterPartitionAssignmentsRequest()
t := kmsg.NewAlterPartitionAssignmentsRequestTopic()
t.Topic = topic
for _, ra := range p.Reassignments {
pr := kmsg.NewAlterPartitionAssignmentsRequestTopicPartition()
pr.Partition = ra.Partition
pr.Replicas = append([]int32(nil), ra.Replicas...)
t.Partitions = append(t.Partitions, pr)
}
r.Topics = []kmsg.AlterPartitionAssignmentsRequestTopic{t}
alter = &r
}
if len(p.CreateAssignments) > 0 {
r := kmsg.NewCreatePartitionsRequest()
t := kmsg.NewCreatePartitionsRequestTopic()
t.Topic = topic
t.Count = int32(p.FinalPartitionCount)
// Redpanda Cloud disallows explicit partition assignments via the Kafka API
// (returning INVALID_REQUEST), the same restriction that applies to
// AlterPartitionAssignments. Omit the Assignment list and let the broker
// auto-place new partitions when rebalancing is disabled.
if rebalancePartitions {
for _, ca := range p.CreateAssignments {
ta := kmsg.NewCreatePartitionsRequestTopicAssignment()
ta.Replicas = append([]int32(nil), ca.Replicas...)
t.Assignment = append(t.Assignment, ta)
}
}
r.Topics = []kmsg.CreatePartitionsRequestTopic{t}
create = &r
}
return alter, create
}
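// A minimal usage sketch of ToRequests (the topic name and dispatch are placeholders):
//
//	alter, create := plan.ToRequests("probe", true)
//	if alter != nil { /* send the AlterPartitionAssignments request via the admin client */ }
//	if create != nil { /* send the CreatePartitions request via the admin client */ }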
// -----------------------------------------------------------------------------
// Utils
// -----------------------------------------------------------------------------
// indexLeaders builds broker -> list of partition IDs it currently leads (from view).
func indexLeaders(brokerIDs []int32, view map[int32][]int32) map[int32][]int32 {
m := make(map[int32][]int32, len(brokerIDs))
for _, id := range brokerIDs {
m[id] = nil
}
for pid, reps := range view {
if len(reps) > 0 {
m[reps[0]] = append(m[reps[0]], pid)
}
}
return m
}
// brokersMissingLeadership returns brokers that lead zero partitions.
func brokersMissingLeadership(brokerIDs []int32, leadersByBroker map[int32][]int32) []int32 {
var out []int32
for _, id := range brokerIDs {
if len(leadersByBroker[id]) == 0 {
out = append(out, id)
}
}
return out
}
// donorBrokers returns brokers that currently lead more than their soft target,
// sorted by largest surplus first; ties broken by broker ID ascending.
func donorBrokers(brokerIDs []int32, leadersByBroker map[int32][]int32, target map[int32]int) []int32 {
var donors []int32
for _, id := range brokerIDs {
if len(leadersByBroker[id]) > target[id] {
donors = append(donors, id)
}
}
sort.Slice(donors, func(i, j int) bool {
surplusI := len(leadersByBroker[donors[i]]) - target[donors[i]]
surplusJ := len(leadersByBroker[donors[j]]) - target[donors[j]]
if surplusI != surplusJ {
return surplusI > surplusJ
}
return donors[i] < donors[j]
})
return donors
}
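// For illustration of donorBrokers: with leadersByBroker {1: [0, 1, 2], 2: [3, 4], 3: []}
// and targets {1: 2, 2: 1, 3: 2}, brokers 1 and 2 each have a surplus of one, so the
// result is [1, 2] (equal surplus, lower ID first); broker 3 is not a donor.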
// violatesRackDiversity returns true if a partition’s replicas do not use as
// many unique racks as they could (bounded by min(len(reps), numRacks)).
func violatesRackDiversity(reps []int32, numRacks int, brokers map[int32]Broker) bool {
if len(reps) <= 1 {
return false
}
seen := map[string]struct{}{}
for _, r := range reps {
seen[brokers[r].Rack] = struct{}{}
}
maxUnique := min(len(reps), numRacks)
return len(seen) < maxUnique
}
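// For illustration of violatesRackDiversity: replicas [1, 2] that both sit on rack "a"
// violate diversity when NumRacks=2 (they could span two racks but span one), yet do
// not violate it when NumRacks=1, because a single rack is the best they could do.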
// victimIndex returns the index of a replica that sits on a rack appearing more
// than once within the replica set (i.e., a duplicate-rack candidate). This is
// used when we need to swap in a new broker to keep/restore diversity. If none
// exists, we fall back to the last replica (a stable, simple choice that avoids
// touching the leader at index 0).
func victimIndex(reps []int32, brokers map[int32]Broker) int {
rc := map[string]int{}
for _, r := range reps {
rc[brokers[r].Rack]++
}
for i, r := range reps {
if rc[brokers[r].Rack] > 1 {
return i
}
}
return len(reps) - 1 // fallback: last (never the leader)
}
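// For illustration of victimIndex: with replicas [3, 1, 2], broker 3 on rack "b" and
// brokers 1 and 2 both on rack "a", rack "a" appears twice, so index 1 (broker 1) is
// returned. If every replica sat on a distinct rack, the last index would be returned.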
// remove filters v out of a slice in place (reusing the backing array) while keeping
// the relative order of the remaining elements. Used for maintaining the leaders map
// in ensureLeaderCoverage.
func remove(xs []int32, v int32) []int32 {
out := xs[:0]
for _, x := range xs {
if x != v {
out = append(out, x)
}
}
return out
}
// putFirst moves id to index 0 while preserving the relative order of the
// remaining elements. This models changing the preferred leader.
func putFirst(reps []int32, id int32) []int32 {
out := make([]int32, 0, len(reps))
out = append(out, id)
for _, r := range reps {
if r != id {
out = append(out, r)
}
}
return out
}
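// For illustration of putFirst: putFirst([]int32{3, 1, 2}, 2) returns [2, 3, 1].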
// contains reports whether v is present in xs.
func contains(xs []int32, v int32) bool {
for _, x := range xs {
if x == v {
return true
}
}
return false
}
// evenSplit returns a soft target leader count per broker such that totals sum
// to n. The remainder (+1) is assigned to the lowest broker IDs for stability.
func evenSplit(n int, ids []int32) map[int32]int {
m := make(map[int32]int, len(ids))
if len(ids) == 0 {
return m
}
base := n / len(ids)
rem := n % len(ids)
for i, id := range ids {
m[id] = base
if i < rem {
m[id]++
}
}
return m
}
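// For illustration of evenSplit: evenSplit(7, []int32{1, 2, 3}) returns
// map[int32]int{1: 3, 2: 2, 3: 2}; the remainder goes to the lowest broker IDs.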
// leastLoadedLeader returns the broker with the smallest number of preferred
// leader assignments (ties broken by smaller broker ID).
func leastLoadedLeader(leaders map[int32]int, ids []int32) int32 {
best, bestCnt := ids[0], 1<<30
for _, id := range ids {
if leaders[id] < bestCnt || (leaders[id] == bestCnt && id < best) {
bestCnt, best = leaders[id], id
}
}
return best
}
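// For illustration of leastLoadedLeader: with leader counts {1: 2, 2: 0, 3: 1} the
// result is broker 2.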
// equalInt32s reports whether a and b contain the same elements in the same order.
func equalInt32s(a, b []int32) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if a[i] != b[i] {
return false
}
}
return true
}
================================================
FILE: e2e/partition_planner_test.go
================================================
package e2e
import (
"sort"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/twmb/franz-go/pkg/kmsg"
"go.uber.org/zap"
)
// buildMeta constructs a MetadataResponse for tests.
// brokers: brokerID -> rack label ("" means no rack/unknown).
// partitions: list of replica lists where index 0 is the preferred leader.
func buildMeta(topic string, brokers map[int32]string, partitions [][]int32) *kmsg.MetadataResponse {
// Brokers
bs := make([]kmsg.MetadataResponseBroker, 0, len(brokers))
ids := make([]int32, 0, len(brokers))
for id := range brokers {
ids = append(ids, id)
}
sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] })
for _, id := range ids {
rack := brokers[id] // copy for address stability
bs = append(bs, kmsg.MetadataResponseBroker{
NodeID: id,
Rack: &rack, // empty string is allowed and treated as one "rack" bucket by planner
})
}
// Partitions
ps := make([]kmsg.MetadataResponseTopicPartition, 0, len(partitions))
for i, reps := range partitions {
cp := append([]int32(nil), reps...)
leader := int32(-1)
if len(reps) > 0 {
leader = reps[0] // In tests, assume preferred leader is actual leader
}
ps = append(ps, kmsg.MetadataResponseTopicPartition{
Partition: int32(i),
Leader: leader,
Replicas: cp,
})
}
return &kmsg.MetadataResponse{
Brokers: bs,
Topics: []kmsg.MetadataResponseTopic{
{
Topic: kmsg.StringPtr(topic),
Partitions: ps,
},
},
}
}
// applyPlan returns the final predictive assignments after applying the plan
// to the given metadata snapshot (without mutating meta).
func applyPlan(meta *kmsg.MetadataResponse, plan *Plan) map[int32][]int32 {
final := map[int32][]int32{}
for _, p := range meta.Topics[0].Partitions {
final[p.Partition] = append([]int32(nil), p.Replicas...)
}
for _, ra := range plan.Reassignments {
final[ra.Partition] = append([]int32(nil), ra.Replicas...)
}
nextID := int32(len(meta.Topics[0].Partitions))
for _, ca := range plan.CreateAssignments {
final[nextID] = append([]int32(nil), ca.Replicas...)
nextID++
}
return final
}
func countLeaders(assigns map[int32][]int32) map[int32]int {
m := map[int32]int{}
for _, reps := range assigns {
if len(reps) > 0 {
m[reps[0]]++
}
}
return m
}
func assertNoDuplicates(t *testing.T, reps []int32) {
t.Helper()
seen := map[int32]struct{}{}
for _, r := range reps {
_, dup := seen[r]
assert.Falsef(t, dup, "duplicate broker in replica set: %v", reps)
seen[r] = struct{}{}
}
}
func max(a, b int) int {
if a > b {
return a
}
return b
}
// --- tests -----------------------------------------------------------------
func TestPartitionPlanner_Plan(t *testing.T) {
type tc struct {
name string
brokers map[int32]string // brokerID -> rack ("" for no rack)
partitions [][]int32 // ordered replicas (index 0 = preferred leader)
cfg EndToEndTopicConfig // uses ReplicationFactor & PartitionsPerBroker
check func(t *testing.T, meta *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32)
}
tests := []tc{
{
name: "single broker creates one partition",
brokers: map[int32]string{1: ""},
partitions: nil, // empty topic
cfg: EndToEndTopicConfig{
ReplicationFactor: 1,
PartitionsPerBroker: 1,
},
check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) {
require.NotNil(t, plan)
assert.Equal(t, 1, len(plan.CreateAssignments), "should create exactly one partition")
assert.Equal(t, 1, plan.FinalPartitionCount)
for pid, reps := range final {
assert.Lenf(t, reps, 1, "pid %d must have RF=1", pid)
}
leaders := countLeaders(final)
assert.Equal(t, 1, leaders[1], "broker 1 should lead one partition")
},
},
{
name: "three brokers, no racks, RF grows to 3; ensure coverage and count",
brokers: map[int32]string{
1: "", 2: "", 3: "",
},
// start with 2 partitions at RF=2 both led by broker with id 1 (skewed)
partitions: [][]int32{
{1, 2},
{1, 2},
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 3,
PartitionsPerBroker: 1,
},
check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) {
assert.Equal(t, 3, plan.FinalPartitionCount, "desired should be max(cur=2, ppb*brokers=3)=3")
for pid, reps := range final {
assert.Lenf(t, reps, 3, "pid %d must have RF=3", pid)
assertNoDuplicates(t, reps)
}
leaders := countLeaders(final)
assert.GreaterOrEqual(t, leaders[1], 1)
assert.GreaterOrEqual(t, leaders[2], 1)
assert.GreaterOrEqual(t, leaders[3], 1)
},
},
{
name: "rack diversity improves on same-RF partition (2 racks, RF=2)",
brokers: map[int32]string{
1: "a", 2: "a", 3: "b",
},
// single partition with both replicas on rack "a" -> re-pick to include "b"
partitions: [][]int32{
{1, 2},
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 2,
PartitionsPerBroker: 1, // desired total becomes 3, but p0 should be improved first
},
check: func(t *testing.T, meta *kmsg.MetadataResponse, _ *Plan, final map[int32][]int32) {
reps := final[0]
require.Len(t, reps, 2)
assertNoDuplicates(t, reps)
state := BuildState(meta)
assert.False(t, violatesRackDiversity(reps, state.NumRacks, state.Brokers), "p0 should span both racks a/b")
},
},
{
name: "shrink RF preserves current leader",
brokers: map[int32]string{
1: "", 2: "", 3: "",
},
// RF=3 currently, leader is 2; configured RF=2 -> leader must remain 2
partitions: [][]int32{
{2, 1, 3},
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 2,
PartitionsPerBroker: 1,
},
check: func(t *testing.T, _ *kmsg.MetadataResponse, _ *Plan, final map[int32][]int32) {
reps := final[0]
require.Len(t, reps, 2)
assert.Equal(t, int32(2), reps[0], "leader 2 should be preserved after shrink")
assertNoDuplicates(t, reps)
},
},
{
name: "rotate-if-replica covers missing brokers without extra swaps",
brokers: map[int32]string{
1: "", 2: "", 3: "",
},
// 2 partitions, both led by 1; 2 and 3 are replicas only in separate partitions.
partitions: [][]int32{
{1, 2},
{1, 3},
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 2,
PartitionsPerBroker: 1, // desired total 3 -> one create expected
},
check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) {
assert.Equal(t, 1, len(plan.CreateAssignments), "one create due to partition target")
leaders := countLeaders(final)
assert.GreaterOrEqual(t, leaders[1], 1)
assert.GreaterOrEqual(t, leaders[2], 1)
assert.GreaterOrEqual(t, leaders[3], 1)
for pid, reps := range final {
assert.Lenf(t, reps, 2, "pid %d must have RF=2", pid)
assertNoDuplicates(t, reps)
}
},
},
{
name: "replace-duplicate-and-rotate when target broker is not a replica anywhere",
brokers: map[int32]string{
1: "a", 2: "a", 3: "b",
},
// 2 partitions both {1,2}; broker 3 is nowhere; donors exist (1 leads 2),
// and duplicates exist (rack "a" twice) so we can safely swap in 3 and rotate.
partitions: [][]int32{
{1, 2},
{1, 2},
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 2,
PartitionsPerBroker: 1, // desired total 3; coverage for 3 should be via swap+rotate
},
check: func(t *testing.T, _ *kmsg.MetadataResponse, _ *Plan, final map[int32][]int32) {
leaders := countLeaders(final)
assert.GreaterOrEqual(t, leaders[3], 1, "broker 3 should lead at least one partition")
for pid, reps := range final {
assert.Lenf(t, reps, 2, "pid %d RF=2", pid)
assertNoDuplicates(t, reps)
}
},
},
{
name: "even split leaders when scaling partitions (ppb=2)",
brokers: map[int32]string{
1: "", 2: "", 3: "",
},
// Start with 3 partitions, each broker already leads one
partitions: [][]int32{
{1, 2},
{2, 1},
{3, 1},
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 2,
PartitionsPerBroker: 2, // desired = 2 * 3 = 6
},
check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) {
assert.Equal(t, 6, plan.FinalPartitionCount)
leaders := countLeaders(final)
assert.Equal(t, 2, leaders[1])
assert.Equal(t, 2, leaders[2])
assert.Equal(t, 2, leaders[3])
},
},
{
name: "one defined rack for all brokers -> no diversity churn; no ops needed",
brokers: map[int32]string{
1: "a", 2: "a", 3: "a",
},
// Already meets RF and coverage and desired count == 3
partitions: [][]int32{
{1, 2},
{2, 3},
{3, 1},
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 2,
PartitionsPerBroker: 1,
},
check: func(t *testing.T, meta *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) {
assert.Equal(t, 0, len(plan.Reassignments), "no reassignments expected")
assert.Equal(t, 0, len(plan.CreateAssignments), "no creates expected")
state := BuildState(meta)
// Diversity can't be improved (NumRacks=1)
for pid, reps := range final {
assert.Falsef(t, violatesRackDiversity(reps, state.NumRacks, state.Brokers), "pid %d should not violate with NumRacks=1", pid)
}
},
},
{
name: "broker present but leading nothing (restarting?) gets coverage (via swap or create)",
brokers: map[int32]string{
1: "a", 2: "b", 3: "c",
},
// Two partitions led by 1 and 2; broker 3 has no replicas/leadership.
partitions: [][]int32{
{1, 2},
{2, 1},
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 2,
PartitionsPerBroker: 1, // desired = max(2, 3)=3 -> at least one create or a swap+rotate + create for count
},
check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) {
assert.Equal(t, 3, plan.FinalPartitionCount)
leaders := countLeaders(final)
assert.GreaterOrEqual(t, leaders[1], 1)
assert.GreaterOrEqual(t, leaders[2], 1)
assert.GreaterOrEqual(t, leaders[3], 1)
for pid, reps := range final {
assert.Lenf(t, reps, 2, "pid %d RF=2", pid)
assertNoDuplicates(t, reps)
}
},
},
{
name: "phase 3 accounts for leaders created in phase 2 - no over-assignment",
brokers: map[int32]string{
1: "a", 2: "b", 3: "c", 4: "d",
},
// Start with 2 partitions, both led by broker 1
// Brokers 2,3,4 have no leadership -> phase 2 will create partitions for them
// Then phase 3 should NOT over-assign broker 1 when creating additional partitions
partitions: [][]int32{
{1, 2}, // broker 1 leads
{1, 3}, // broker 1 leads (over-represented)
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 2,
PartitionsPerBroker: 2, // 4*2=8 total desired, currently have 2, so need 6 more
},
check: func(t *testing.T, _ *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) {
// Should create 6 new partitions (3 for coverage in phase 2, 3 more for count in phase 3)
assert.Equal(t, 8, plan.FinalPartitionCount, "should reach desired partition count of 8")
assert.Equal(t, 6, len(plan.CreateAssignments), "should create 6 new partitions")
leaders := countLeaders(final)
// Each broker should lead exactly 2 partitions (8 total / 4 brokers = 2 each)
// This test will fail if phase 3 doesn't account for leaders created in phase 2
// because it will see broker 1 as leading 2 partitions and think it needs 0 more,
// while it actually leads 2 from existing + potentially more from phase 2
for brokerID := int32(1); brokerID <= 4; brokerID++ {
assert.Equal(t, 2, leaders[brokerID],
"broker %d should lead exactly 2 partitions (even distribution), but leads %d",
brokerID, leaders[brokerID])
}
},
},
{
name: "uses actual leader not preferred leader when they differ",
brokers: map[int32]string{
1: "a", 2: "b", 3: "c",
},
// This case simulates when the actual leader differs from preferred leader (replicas[0])
// We'll manually construct metadata where leader != replicas[0]
partitions: [][]int32{
{1, 2, 3}, // preferred leader is 1, but we'll set actual leader to 2 in buildMetaWithLeader
{2, 3, 1}, // preferred leader is 2, but we'll set actual leader to 3 in buildMetaWithLeader
},
cfg: EndToEndTopicConfig{
ReplicationFactor: 3,
PartitionsPerBroker: 1, // 3*1=3 total desired, have 2, need 1 more
RebalancePartitions: true,
},
check: func(t *testing.T, meta *kmsg.MetadataResponse, plan *Plan, final map[int32][]int32) {
// With our manually set leaders (2, 3), broker 1 has no leadership
// The planner should recognize this and either rotate leadership to broker 1
// or create a new partition led by broker 1
leaders := countLeaders(final)
assert.GreaterOrEqual(t, leaders[1], 1, "broker 1 should lead at least one partition")
assert.GreaterOrEqual(t, leaders[2], 1, "broker 2 should lead at least one partition")
assert.GreaterOrEqual(t, leaders[3], 1, "broker 3 should lead at least one partition")
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var meta *kmsg.MetadataResponse
if tt.name == "uses actual leader not preferred leader when they differ" {
// Special case: manually set leaders to differ from preferred leaders
meta = buildMeta("probe", tt.brokers, tt.partitions)
// Override the leaders: partition 0 actual leader = 2, partition 1 actual leader = 3
meta.Topics[0].Partitions[0].Leader = 2 // preferred is 1, actual is 2
meta.Topics[0].Partitions[1].Leader = 3 // preferred is 2, actual is 3
} else {
meta = buildMeta("probe", tt.brokers, tt.partitions)
}
planner := NewPartitionPlanner(tt.cfg, zap.NewNop())
plan, err := planner.Plan(meta)
require.NoError(t, err, "Plan() should not error")
require.NotNil(t, plan, "Plan() returned nil plan")
// Sanity: final >= max(current, ppb*brokers)
expectedMin := max(len(meta.Topics[0].Partitions), tt.cfg.PartitionsPerBroker*len(tt.brokers))
assert.GreaterOrEqual(t, plan.FinalPartitionCount, expectedMin, "final partition count must meet lower bound")
// Apply and enforce universal invariants.
final := applyPlan(meta, plan)
for pid, reps := range final {
assert.Lenf(t, reps, tt.cfg.ReplicationFactor, "pid %d RF mismatch", pid)
assertNoDuplicates(t, reps)
}
// Scenario-specific checks.
tt.check(t, meta, plan, final)
})
}
}
func TestPartitionPlanner_Plan_Deterministic(t *testing.T) {
// Test that the same input produces identical plans across multiple runs
brokers := map[int32]string{
10: "rack1", 20: "rack2", 30: "rack3", 40: "rack1", 50: "rack2", 60: "rack3",
}
// Many partitions with suboptimal RF to force fixReplicationAndRack to iterate over map
partitions := [][]int32{
{10}, {20}, {30}, {40}, {50}, {60}, // RF=1, needs growth to 3
{10, 20}, {20, 30}, {30, 40}, {40, 50}, // RF=2, needs growth to 3
{50, 60, 10}, {60, 10, 20}, {10, 30, 40}, // RF=3, may need rack fixes
}
cfg := EndToEndTopicConfig{
ReplicationFactor: 3,
PartitionsPerBroker: 3, // 6*3=18 total desired, have 13, need 5 more
}
meta := buildMeta("probe", brokers, partitions)
var plans []*Plan
// Run the same plan many times to increase chance of hitting different map iteration orders
for i := 0; i < 10; i++ {
planner := NewPartitionPlanner(cfg, zap.NewNop())
plan, err := planner.Plan(meta)
require.NoError(t, err, "Plan() should not error on run %d", i)
require.NotNil(t, plan, "Plan() returned nil plan on run %d", i)
plans = append(plans, plan)
}
// All plans should be identical
firstPlan := plans[0]
for i := 1; i < len(plans); i++ {
assert.Equal(t, len(firstPlan.Reassignments), len(plans[i].Reassignments),
"run %d: reassignment count should be identical", i)
assert.Equal(t, len(firstPlan.CreateAssignments), len(plans[i].CreateAssignments),
"run %d: create count should be identical", i)
assert.Equal(t, firstPlan.FinalPartitionCount, plans[i].FinalPartitionCount,
"run %d: final partition count should be identical", i)
// Build maps for comparison
reassign1 := make(map[int32][]int32)
for _, r := range firstPlan.Reassignments {
reassign1[r.Partition] = r.Replicas
}
reassign2 := make(map[int32][]int32)
for _, r := range plans[i].Reassignments {
reassign2[r.Partition] = r.Replicas
}
assert.Equal(t, reassign1, reassign2, "run %d: reassignments should be identical", i)
// Create assignments order matters for determinism
assert.Equal(t, firstPlan.CreateAssignments, plans[i].CreateAssignments,
"run %d: create assignments should be identical", i)
}
}
// TestActualLeaderCoverageSkipsPreferredRebalancing tests the fix for the bug where
// ensureLeaderCoverage would trigger unnecessary reassignments when actual leader
// coverage was perfect but preferred leader coverage was unbalanced.
func TestActualLeaderCoverageSkipsPreferredRebalancing(t *testing.T) {
// Simulate the exact scenario from the bug report:
// - All brokers in same rack (no rack diversity benefit possible)
// - Actual leaders perfectly distributed: broker 0→p1, broker 1→p0, broker 2→p2
// - But preferred leaders (replicas[0]) unbalanced: broker 1→p0&p2, broker 0→p1, broker 2→none
brokers := map[int32]string{
0: "europe-west1-b", 1: "europe-west1-b", 2: "europe-west1-b",
}
// Build metadata with specific replica assignments matching the bug report
meta := buildMeta("probe", brokers, [][]int32{
{1, 2, 0}, // partition 0: preferred leader = 1
{0, 1, 2}, // partition 1: preferred leader = 0
{1, 2, 0}, // partition 2: preferred leader = 1
})
// Override actual leaders to match the bug report scenario
meta.Topics[0].Partitions[0].Leader = 1 // p0: preferred=1, actual=1 (same)
meta.Topics[0].Partitions[1].Leader = 0 // p1: preferred=0, actual=0 (same)
meta.Topics[0].Partitions[2].Leader = 2 // p2: preferred=1, actual=2 (DIFFERENT!)
cfg := EndToEndTopicConfig{
ReplicationFactor: 3,
PartitionsPerBroker: 1, // 3*1=3 total desired, have 3, perfect
}
planner := NewPartitionPlanner(cfg, zap.NewNop())
plan, err := planner.Plan(meta)
require.NoError(t, err, "Plan() should not error")
require.NotNil(t, plan, "Plan() returned nil plan")
// This is the key assertion: should have ZERO reassignments because:
// 1. All brokers in same rack → no rack diversity violations possible
// 2. Actual leader coverage already perfect (each broker leads exactly 1 partition)
// 3. RF and partition count already correct
assert.Equal(t, 0, len(plan.Reassignments), "should have no reassignments when actual coverage is perfect")
assert.Equal(t, 0, len(plan.CreateAssignments), "should have no creates when partition count is perfect")
assert.Equal(t, 3, plan.FinalPartitionCount, "should maintain existing partition count")
// Verify the fix: no changes to replica assignments
final := applyPlan(meta, plan)
assert.Equal(t, []int32{1, 2, 0}, final[0], "partition 0 replicas should be unchanged")
assert.Equal(t, []int32{0, 1, 2}, final[1], "partition 1 replicas should be unchanged")
assert.Equal(t, []int32{1, 2, 0}, final[2], "partition 2 replicas should be unchanged")
// Verify all partitions still have correct RF and no duplicates
for pid, reps := range final {
assert.Lenf(t, reps, 3, "pid %d must have RF=3", pid)
assertNoDuplicates(t, reps)
}
}
// TestPlan_ToRequests_RebalancePartitions verifies that ToRequests omits explicit
// replica assignments from the CreatePartitions request when RebalancePartitions
// is false, but still sets the correct total Count.
func TestPlan_ToRequests_RebalancePartitions(t *testing.T) {
brokers := map[int32]string{1: "a", 2: "b", 3: "c", 4: "d", 5: "e", 6: "f"}
// Topic currently has 3 partitions; 6 brokers → planner will want 6 total.
partitions := [][]int32{
{1, 2, 3},
{2, 3, 4},
{3, 4, 5},
}
meta := buildMeta("probe", brokers, partitions)
t.Run("rebalancePartitions=true includes explicit assignments", func(t *testing.T) {
cfg := EndToEndTopicConfig{
ReplicationFactor: 3,
PartitionsPerBroker: 1,
RebalancePartitions: true,
}
plan, err := NewPartitionPlanner(cfg, zap.NewNop()).Plan(meta)
require.NoError(t, err)
_, createReq := plan.ToRequests("probe", true)
require.NotNil(t, createReq, "should have a CreatePartitions request")
topic := createReq.Topics[0]
assert.Equal(t, int32(plan.FinalPartitionCount), topic.Count)
assert.NotEmpty(t, topic.Assignment, "assignments must be present when rebalancePartitions=true")
assert.Equal(t, len(plan.CreateAssignments), len(topic.Assignment),
"one assignment entry per new partition")
})
t.Run("rebalancePartitions=false omits assignments", func(t *testing.T) {
cfg := EndToEndTopicConfig{
ReplicationFactor: 3,
PartitionsPerBroker: 1,
RebalancePartitions: false,
}
plan, err := NewPartitionPlanner(cfg, zap.NewNop()).Plan(meta)
require.NoError(t, err)
// Planner should still compute create assignments (used for logging / count),
// but ToRequests must NOT include them in the wire request.
require.NotEmpty(t, plan.CreateAssignments, "planner should still compute assignments for count tracking")
_, createReq := plan.ToRequests("probe", false)
require.NotNil(t, createReq, "should still produce a CreatePartitions request")
topic := createReq.Topics[0]
assert.Equal(t, int32(plan.FinalPartitionCount), topic.Count,
"Count must reflect the desired total even without explicit assignments")
assert.Empty(t, topic.Assignment,
"assignments must be absent when rebalancePartitions=false")
})
t.Run("no creates needed produces nil create request regardless of flag", func(t *testing.T) {
// Already-optimal topic: 3 brokers, 3 partitions, each broker leads one.
optMeta := buildMeta("probe",
map[int32]string{1: "", 2: "", 3: ""},
[][]int32{{1, 2, 3}, {2, 3, 1}, {3, 1, 2}},
)
for _, rebalance := range []bool{true, false} {
cfg := EndToEndTopicConfig{
ReplicationFactor: 3,
PartitionsPerBroker: 1,
RebalancePartitions: rebalance,
}
plan, err := NewPartitionPlanner(cfg, zap.NewNop()).Plan(optMeta)
require.NoError(t, err)
assert.Empty(t, plan.CreateAssignments)
_, createReq := plan.ToRequests("probe", rebalance)
assert.Nil(t, createReq, "no CreatePartitions request when nothing to create (rebalance=%v)", rebalance)
}
})
}
func TestMinimalReassignmentsWhenActualLeadersDivergeFromPreferred(t *testing.T) {
// Scenario: all partitions have same preferred leader (broker 0), but actual
// leaders are distributed. Algorithm should recognize brokers with actual
// leadership and only fix gaps, not realign everything to preferred.
brokers := map[int32]string{
0: "rack-a", 1: "rack-b", 2: "rack-c",
}
meta := buildMeta("_redpanda_e2e_probe", brokers, [][]int32{
{0, 1, 2}, // partition 0: preferred leader = 0
{0, 1, 2}, // partition 1: preferred leader = 0
{0, 1, 2}, // partition 2: preferred leader = 0
})
// Set actual leaders to show divergence from preferred
meta.Topics[0].Partitions[0].Leader = 0 // p0: preferred=0, actual=0 (match)
meta.Topics[0].Partitions[1].Leader = 1 // p1: preferred=0, actual=1 (DIVERGED)
meta.Topics[0].Partitions[2].Leader = 0 // p2: preferred=0, actual=0 (match)
cfg := EndToEndTopicConfig{
ReplicationFactor: 3,
PartitionsPerBroker: 1,
}
planner := NewPartitionPlanner(cfg, zap.NewNop())
plan, err := planner.Plan(meta)
require.NoError(t, err)
require.NotNil(t, plan)
// Key assertion: should have ONLY ONE reassignment
// Broker 1 already has actual leadership (p1), even though preferred leader of
// p1 is broker 0. Only broker 2 is missing from actual leadership, so we only
// need to fix that one gap.
assert.Equal(t, 1, len(plan.Reassignments), "should need only ONE reassignment since broker 1 already has actual leadership")
assert.Equal(t, 0, len(plan.CreateAssignments), "should have no creates")
// Verify the reassignment gives broker 2 preferred leadership
require.Len(t, plan.Reassignments, 1)
reassignment := plan.Reassignments[0]
assert.Equal(t, int32(2), reassignment.Replicas[0], "reassignment should give broker 2 preferred leadership")
// Verify all partitions still have correct RF and no duplicates
final := applyPlan(meta, plan)
for pid, reps := range final {
assert.Lenf(t, reps, 3, "pid %d must have RF=3", pid)
assertNoDuplicates(t, reps)
}
}
================================================
FILE: e2e/producer.go
================================================
package e2e
import (
"context"
"encoding/json"
"strconv"
"time"
"github.com/google/uuid"
"github.com/twmb/franz-go/pkg/kgo"
"go.uber.org/zap"
)
// produceMessagesToAllPartitions sends an EndToEndMessage to every partition on the given topic
func (s *Service) produceMessagesToAllPartitions(ctx context.Context) {
for i := 0; i < s.partitionCount; i++ {
s.produceMessage(ctx, i)
}
}
// produceMessage produces an end-to-end record to a single given partition. The message is added to the message
// tracker before producing; if producing fails it is removed from the tracker again, the failure is logged and
// the respective metrics are incremented.
func (s *Service) produceMessage(ctx context.Context, partition int) {
topicName := s.config.TopicManagement.Name
record, msg := createEndToEndRecord(s.minionID, topicName, partition)
startTime := time.Now()
// This childCtx will ensure that we will abort our efforts to produce (including retries) when we exceed
// the SLA for producers.
childCtx, cancel := context.WithTimeout(ctx, s.config.Producer.AckSla+2*time.Second)
pID := strconv.Itoa(partition)
s.messagesProducedInFlight.WithLabelValues(pID).Inc()
s.messageTracker.addToTracker(msg)
s.client.TryProduce(childCtx, record, func(r *kgo.Record, err error) {
defer cancel()
ackDuration := time.Since(startTime)
s.messagesProducedInFlight.WithLabelValues(pID).Dec()
s.messagesProducedTotal.WithLabelValues(pID).Inc()
// We add 0 in order to ensure that the "failed" metric series for that partition id are initialized as well.
s.messagesProducedFailed.WithLabelValues(pID).Add(0)
s.lostMessages.WithLabelValues(pID).Add(0)
if err != nil {
s.messagesProducedFailed.WithLabelValues(pID).Inc()
_ = s.messageTracker.removeFromTracker(msg.MessageID)
s.logger.Info("failed to produce message to end-to-end topic",
zap.String("topic_name", r.Topic),
zap.Int32("partition", r.Partition),
zap.Error(err))
return
} else {
// Update the message's state. If this message expires and is marked as successfully produced we will
// report this as a lost message, which would indicate that the producer was told that the message got
// produced successfully, but it got lost somewhere.
// We need to use updateItemIfExists() because it's possible that the message has already been consumed
// before we have received the message here (because we were awaiting the produce ack).
msg.state = EndToEndMessageStateProducedSuccessfully
msg.produceLatency = ackDuration.Seconds()
// TODO: Enable again as soon as https://github.com/ReneKroon/ttlcache/issues/60 is fixed
// Because we cannot update cache items in an atomic fashion we currently can't use this method
// as this would cause a race condition which ends up in records being reported as lost/expired.
// s.messageTracker.updateItemIfExists(msg)
}
s.produceLatency.WithLabelValues(pID).Observe(ackDuration.Seconds())
})
}
func createEndToEndRecord(minionID string, topicName string, partition int) (*kgo.Record, *EndToEndMessage) {
message := &EndToEndMessage{
MinionID: minionID,
MessageID: uuid.NewString(),
Timestamp: time.Now().UnixNano(),
partition: partition,
state: EndToEndMessageStateCreated,
}
mjson, err := json.Marshal(message)
if err != nil {
// Should never happen since the struct is so simple,
// but if it does, something is completely broken anyway
panic("cannot serialize EndToEndMessage")
}
record := &kgo.Record{
Topic: topicName,
Value: mjson,
Partition: int32(partition), // we set partition for producing so our customPartitioner can make use of it
}
return record, message
}
================================================
FILE: e2e/service.go
================================================
package e2e
import (
"context"
"fmt"
"strings"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/twmb/franz-go/pkg/kgo"
"go.uber.org/zap"
"github.com/cloudhut/kminion/v2/kafka"
)
type Service struct {
// General
config Config
logger *zap.Logger
kafkaSvc *kafka.Service // creates kafka client for us
client *kgo.Client
// Service
minionID string // unique identifier, reported in metrics, in case multiple instances run at the same time
groupId string // our own consumer group
groupTracker *groupTracker // tracks consumer groups starting with the kminion prefix and deletes them if they are unused for some time
messageTracker *messageTracker // tracks successfully produced messages
clientHooks *clientHooks // logs broker events, tracks the coordinator (i.e. which broker last responded to our offset commit)
partitionCount int // number of partitions of our test topic, used to send messages to all partitions
// Metrics
messagesProducedInFlight *prometheus.GaugeVec
messagesProducedTotal *prometheus.CounterVec
messagesProducedFailed *prometheus.CounterVec
messagesReceived *prometheus.CounterVec
offsetCommitsTotal *prometheus.CounterVec
offsetCommitsFailedTotal *prometheus.CounterVec
lostMessages *prometheus.CounterVec
produceLatency *prometheus.HistogramVec
roundtripLatency *prometheus.HistogramVec
offsetCommitLatency *prometheus.HistogramVec
}
// NewService creates a new instance of the e2e monitoring service (wow)
func NewService(ctx context.Context, cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, promRegisterer prometheus.Registerer) (*Service, error) {
minionID := uuid.NewString()
groupID := fmt.Sprintf("%v-%v", cfg.Consumer.GroupIdPrefix, minionID)
// Producer options
kgoOpts := []kgo.Opt{
kgo.ProduceRequestTimeout(3 * time.Second),
kgo.RecordRetries(3),
// We use the manual partitioner so that the records' partition id will be used as target partition
kgo.RecordPartitioner(kgo.ManualPartitioner()),
}
if cfg.Producer.RequiredAcks == "all" {
kgoOpts = append(kgoOpts, kgo.RequiredAcks(kgo.AllISRAcks()))
} else {
kgoOpts = append(kgoOpts, kgo.RequiredAcks(kgo.LeaderAck()))
kgoOpts = append(kgoOpts, kgo.DisableIdempotentWrite())
}
// Consumer configs
kgoOpts = append(kgoOpts,
kgo.ConsumerGroup(groupID),
kgo.ConsumeTopics(cfg.TopicManagement.Name),
kgo.Balancers(kgo.CooperativeStickyBalancer()),
kgo.DisableAutoCommit(),
kgo.ConsumeResetOffset(kgo.NewOffset().AtEnd()),
)
// Prepare hooks
hooks := newEndToEndClientHooks(logger)
kgoOpts = append(kgoOpts, kgo.WithHooks(hooks))
// Create kafka service and check if client can successfully connect to Kafka cluster
logger.Info("connecting to Kafka seed brokers, trying to fetch cluster metadata",
zap.String("seed_brokers", strings.Join(kafkaSvc.Brokers(), ",")))
client, err := kafkaSvc.CreateAndTestClient(ctx, logger, kgoOpts)
if err != nil {
return nil, fmt.Errorf("failed to create kafka client for e2e: %w", err)
}
logger.Info("successfully connected to kafka cluster")
svc := &Service{
config: cfg,
logger: logger.Named("e2e"),
kafkaSvc: kafkaSvc,
client: client,
minionID: minionID,
groupId: groupID,
clientHooks: hooks,
}
svc.groupTracker = newGroupTracker(cfg, logger, client, groupID)
svc.messageTracker = newMessageTracker(svc)
makeCounterVec := func(name string, labelNames []string, help string) *prometheus.CounterVec {
cv := prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: "end_to_end",
Name: name,
Help: help,
}, labelNames)
promRegisterer.MustRegister(cv)
return cv
}
makeGaugeVec := func(name string, labelNames []string, help string) *prometheus.GaugeVec {
gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: "end_to_end",
Name: name,
Help: help,
}, labelNames)
promRegisterer.MustRegister(gv)
return gv
}
makeHistogramVec := func(name string, maxLatency time.Duration, labelNames []string, help string) *prometheus.HistogramVec {
hv := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: "end_to_end",
Name: name,
Help: help,
Buckets: createHistogramBuckets(maxLatency),
}, labelNames)
promRegisterer.MustRegister(hv)
return hv
}
// Low-level info
// Users can construct alerts like "can't produce messages" themselves from these metrics
svc.messagesProducedInFlight = makeGaugeVec("messages_produced_in_flight", []string{"partition_id"}, "Number of messages that kminion's end-to-end test produced but has not received an answer for yet")
svc.messagesProducedTotal = makeCounterVec("messages_produced_total", []string{"partition_id"}, "Number of all messages produced to Kafka. This counter will be incremented when we receive a response (failure/timeout or success) from Kafka")
svc.messagesProducedFailed = makeCounterVec("messages_produced_failed_total", []string{"partition_id"}, "Number of messages failed to produce to Kafka because of a timeout or failure")
svc.messagesReceived = makeCounterVec("messages_received_total", []string{"partition_id"}, "Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. Kminion only considers a message a match if it arrives within the configured roundtrip SLA (and it matches the minionID)")
svc.offsetCommitsTotal = makeCounterVec("offset_commits_total", []string{"coordinator_id"}, "Counts how many times kminions end-to-end test has committed offsets")
svc.offsetCommitsFailedTotal = makeCounterVec("offset_commits_failed_total", []string{"coordinator_id", "reason"}, "Number of offset commits that returned an error or timed out")
svc.lostMessages = makeCounterVec("messages_lost_total", []string{"partition_id"}, "Number of messages that have been produced successfully but not received within the configured SLA duration")
// Latency Histograms
// More detailed info about how long stuff took
// Since histograms also have an 'infinite' bucket, they can be used to detect small hiccups and "lost" messages
svc.produceLatency = makeHistogramVec("produce_latency_seconds", cfg.Producer.AckSla, []string{"partition_id"}, "Time until we received an ack for a produced message")
svc.roundtripLatency = makeHistogramVec("roundtrip_latency_seconds", cfg.Consumer.RoundtripSla, []string{"partition_id"}, "Time it took between sending (producing) and receiving (consuming) a message")
svc.offsetCommitLatency = makeHistogramVec("offset_commit_latency_seconds", cfg.Consumer.CommitSla, []string{"coordinator_id"}, "Time kafka took to respond to kminion's offset commit")
return svc, nil
}
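// --- illustrative sketch (not part of e2e/service.go) ---
// createHistogramBuckets is defined in e2e/utils.go and is only referenced above.
// A minimal sketch of how such a helper could derive bucket boundaries, assuming
// exponential spacing from 5ms up to the configured SLA; the repository's actual
// implementation may differ. Prometheus adds the +Inf bucket itself, which is
// what makes "slower than SLA" observations visible.
func createHistogramBucketsSketch(maxLatency time.Duration) []float64 {
    var buckets []float64
    for upperBound := 0.005; upperBound <= maxLatency.Seconds(); upperBound *= 2 {
        buckets = append(buckets, upperBound)
    }
    return buckets
}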
// Start starts the service (wow)
func (s *Service) Start(ctx context.Context) error {
// Ensure topic exists and is configured correctly
if err := s.validateManagementTopic(ctx); err != nil {
return fmt.Errorf("could not validate end-to-end topic: %w", err)
}
// finally start everything else (producing, consuming, continuous validation, consumer group tracking)
go s.startReconciliation(ctx)
// Start consumer and wait until we've received a response for the first poll
// which would indicate that the consumer is ready. Only if the consumer is
// ready we want to start the e2e producer to ensure that we will not miss
// messages because the consumer wasn't ready. However, if this initialization
// does not succeed within 30s we have to assume that something is wrong on the
// consuming or producing side. KMinion is supposed to report these kinds of
// issues and therefore this should not block KMinion from starting.
initCh := make(chan bool, 1)
s.logger.Info("initializing consumer and waiting until it has received the first record batch")
go s.startConsumeMessages(ctx, initCh)
// Produce an init message until the consumer received at least one fetch
initTicker := time.NewTicker(1 * time.Second)
isInitialized := false
// We send a first message immediately, but we'll keep sending more messages later
// since the consumers start at the latest offset and may have missed this message.
initCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
s.sendInitMessage(initCtx, s.client, s.config.TopicManagement.Name)
for !isInitialized {
select {
case <-initTicker.C:
s.sendInitMessage(initCtx, s.client, s.config.TopicManagement.Name)
case <-initCh:
isInitialized = true
s.logger.Info("consumer has been successfully initialized")
case <-initCtx.Done():
// At this point we just assume the consumers are running fine.
// The entire cluster may be down, or producing may be failing.
s.logger.Warn("initializing the consumers timed out, proceeding with the startup")
isInitialized = true
case <-ctx.Done():
return nil
}
}
go s.startOffsetCommits(ctx)
go s.startProducer(ctx)
// keep track of groups, delete old unused groups
if s.config.Consumer.DeleteStaleConsumerGroups {
go s.groupTracker.start(ctx)
}
return nil
}
func (s *Service) sendInitMessage(ctx context.Context, client *kgo.Client, topicName string) {
// Try to produce one record into each partition. This is important because
// one or more partitions may be offline, while others may still be writable.
for i := 0; i < s.partitionCount; i++ {
client.TryProduce(ctx, &kgo.Record{
Key: []byte("init-message"),
Value: nil,
Topic: topicName,
Partition: int32(i),
}, nil)
}
}
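// Note on the kgo semantics assumed in sendInitMessage above: client.TryProduce
// behaves like Produce, but fails the record immediately if the client's record
// buffer is already full instead of blocking. The init probes pass a nil promise,
// so delivery failures of these throwaway records are intentionally ignored.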
func (s *Service) startReconciliation(ctx context.Context) {
if !s.config.TopicManagement.Enabled {
return
}
validateTopicTicker := time.NewTicker(s.config.TopicManagement.ReconciliationInterval)
for {
select {
case <-ctx.Done():
return
case <-validateTopicTicker.C:
err := s.validateManagementTopic(ctx)
if err != nil {
s.logger.Error("failed to validate end-to-end topic", zap.Error(err))
}
}
}
}
func (s *Service) startProducer(ctx context.Context) {
produceTicker := time.NewTicker(s.config.ProbeInterval)
for {
select {
case <-ctx.Done():
return
case <-produceTicker.C:
s.produceMessagesToAllPartitions(ctx)
}
}
}
func (s *Service) startOffsetCommits(ctx context.Context) {
commitTicker := time.NewTicker(5 * time.Second)
for {
select {
case <-ctx.Done():
return
case <-commitTicker.C:
s.commitOffsets(ctx)
}
}
}
================================================
FILE: e2e/topic.go
================================================
package e2e
import (
"context"
"errors"
"fmt"
"sort"
"time"
"github.com/twmb/franz-go/pkg/kerr"
"github.com/twmb/franz-go/pkg/kmsg"
"go.uber.org/zap"
)
// Check our end-to-end test topic and adapt accordingly if something does not match our expectations.
//   - does it exist?
//   - is it configured correctly?
//     - does it have enough partitions?
//     - is the replicationFactor correct?
//   - are assignments good?
//     - is each broker leading at least one partition?
//     - are replicas distributed correctly?
func (s *Service) validateManagementTopic(ctx context.Context) error {
s.logger.Debug("validating end-to-end topic...")
meta, err := s.getTopicMetadata(ctx)
if err != nil {
return fmt.Errorf("validateManagementTopic cannot get metadata of e2e topic: %w", err)
}
typedErr := kerr.TypedErrorForCode(meta.Topics[0].ErrorCode)
topicExists := false
switch {
case typedErr == nil:
topicExists = true
case errors.Is(typedErr, kerr.UnknownTopicOrPartition):
// UnknownTopicOrPartition (Error code 3) means that the topic does not exist.
// When the topic doesn't exist, continue to create it further down in the code.
topicExists = false
default:
// If the topic (possibly) exists, but there's an error, then this should result in a fail
return fmt.Errorf("failed to get metadata for end-to-end topic: %w", err)
}
// Create topic if it doesn't exist
if !topicExists {
if !s.config.TopicManagement.Enabled {
return fmt.Errorf("the configured end to end topic does not exist. The topic will not be created " +
"because topic management is disabled")
}
if err = s.createManagementTopic(ctx, meta); err != nil {
return err
}
// Topic was just created with optimal assignments from the partition planner.
// We can skip the validation/planning phase since the topic already has the correct
// partition count and optimal replica assignments. We only need to update our
// internal partition count tracking for KMinion's e2e monitoring operations.
return s.updatePartitionCount(ctx)
}
// If topic management is disabled, skip validation and alteration of the existing topic.
// This allows kminion to work on managed Kafka platforms (e.g., Confluent Cloud) that
// block partition reassignment operations.
if !s.config.TopicManagement.Enabled {
topicMeta := meta.Topics[0]
brokerIDs := make([]int32, len(meta.Brokers))
for i, broker := range meta.Brokers {
brokerIDs[i] = broker.NodeID
}
s.logger.Info("topic management is disabled, skipping validation and alteration of existing topic",
zap.String("topic", s.config.TopicManagement.Name),
zap.Int("current_partitions", len(topicMeta.Partitions)),
zap.Int("replication_factor", len(topicMeta.Partitions[0].Replicas)))
// Log warnings if the topic configuration differs from expectations
expectedPartitions := s.config.TopicManagement.PartitionsPerBroker * len(brokerIDs)
if len(topicMeta.Partitions) != expectedPartitions {
s.logger.Warn("topic partition count differs from expected configuration",
zap.Int("current_partitions", len(topicMeta.Partitions)),
zap.Int("expected_partitions", expectedPartitions),
zap.Int("brokers", len(brokerIDs)),
zap.Int("partitions_per_broker_config", s.config.TopicManagement.PartitionsPerBroker),
zap.String("reason", "topic management is disabled, will not alter"))
}
// Check if each broker is leading at least one partition
leaderCounts := make(map[int32]int)
for _, partition := range topicMeta.Partitions {
leaderCounts[partition.Leader]++
}
brokersWithoutLeader := []int32{}
for _, brokerID := range brokerIDs {
if leaderCounts[brokerID] == 0 {
brokersWithoutLeader = append(brokersWithoutLeader, brokerID)
}
}
if len(brokersWithoutLeader) > 0 {
s.logger.Warn("some brokers are not leading any partitions on the e2e topic",
zap.Int32s("brokers_without_leader", brokersWithoutLeader),
zap.String("reason", "topic management is disabled, will not alter"),
zap.String("impact", "end-to-end monitoring may not cover all brokers"))
}
return s.updatePartitionCount(ctx)
}
// Topic already exists - use partition planner to validate and potentially fix assignments
planner := NewPartitionPlanner(s.config.TopicManagement, s.logger)
plan, err := planner.Plan(meta)
if err != nil {
return fmt.Errorf("failed to create partition plan: %w", err)
}
// Convert the plan to Kafka requests
topicName := pointerStrToStr(meta.Topics[0].Topic)
alterReq, createReq := plan.ToRequests(topicName, s.config.TopicManagement.RebalancePartitions)
if s.config.TopicManagement.RebalancePartitions {
if len(plan.Reassignments) > 0 {
s.logPlannedReassignments(meta, plan, topicName)
}
err = s.executeAlterPartitionAssignments(ctx, alterReq)
if err != nil {
return fmt.Errorf("failed to alter partition assignments: %w", err)
}
} else if len(plan.Reassignments) > 0 {
s.logger.Info("skipping partition reassignment because rebalancePartitions is disabled",
zap.String("topic", topicName),
zap.Int("skipped_reassignments", len(plan.Reassignments)),
)
}
if len(plan.CreateAssignments) > 0 {
s.logPlannedCreations(meta, plan, topicName, s.config.TopicManagement.RebalancePartitions)
}
err = s.executeCreatePartitions(ctx, createReq)
if err != nil {
return fmt.Errorf("failed to create partitions: %w", err)
}
return s.updatePartitionCount(ctx)
}
// updatePartitionCount retrieves metadata to inform kminion about the updated
// partition count of its e2e topic. It must be updated after topic validation
// because the validation process may lead to the creation of new partitions.
// This can occur when new brokers are added to the cluster.
func (s *Service) updatePartitionCount(ctx context.Context) error {
retryTicker := time.NewTicker(1 * time.Second)
defer retryTicker.Stop()
for {
select {
case <-ctx.Done():
return ctx.Err()
case <-retryTicker.C:
meta, err := s.getTopicMetadata(ctx)
if err != nil {
return fmt.Errorf("could not get topic metadata while updating partition count: %w", err)
}
typedErr := kerr.TypedErrorForCode(meta.Topics[0].ErrorCode)
if typedErr == nil {
s.partitionCount = len(meta.Topics[0].Partitions)
s.logger.Debug("updatePartitionCount: successfully updated partition count", zap.Int("partition_count", s.partitionCount))
return nil
}
if !errors.Is(typedErr, kerr.UnknownTopicOrPartition) {
return fmt.Errorf("unexpected error while updating partition count: %w", typedErr)
}
s.logger.Warn("updatePartitionCount: received UNKNOWN_TOPIC_OR_PARTITION error, possibly due to timing issue. Retrying...")
// The UNKNOWN_TOPIC_OR_PARTITION error occurs occasionally even though the topic is created
// in the validateManagementTopic function. It appears to be a timing issue where the topic metadata
// is not immediately available after creation. In practice, waiting for a short period and then retrying
// the operation resolves the issue.
}
}
}
func (s *Service) executeCreatePartitions(ctx context.Context, req *kmsg.CreatePartitionsRequest) error {
if req == nil {
return nil
}
res, err := req.RequestWith(ctx, s.client)
if err != nil {
return err
}
for _, topic := range res.Topics {
err := kerr.ErrorForCode(topic.ErrorCode)
if err != nil {
return fmt.Errorf("inner Kafka error: %w", err)
}
}
return nil
}
func (s *Service) executeAlterPartitionAssignments(ctx context.Context, req *kmsg.AlterPartitionAssignmentsRequest) error {
if req == nil {
return nil
}
res, err := req.RequestWith(ctx, s.client)
if err != nil {
return err
}
typedErr := kerr.TypedErrorForCode(res.ErrorCode)
if typedErr != nil {
s.logger.Error("alter partition assignments: failed to alter partition assignments", zap.Any("request_topics", req.Topics))
return fmt.Errorf("inner Kafka error: %w", typedErr)
}
for _, topic := range res.Topics {
for _, partition := range topic.Partitions {
typedErr = kerr.TypedErrorForCode(partition.ErrorCode)
if typedErr != nil {
return fmt.Errorf("inner Kafka partition error on partition '%v': %w", partition.Partition, typedErr)
}
}
}
return nil
}
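// --- illustrative sketch (not part of e2e/topic.go) ---
// Plan.ToRequests (defined in e2e/partition_planner.go, not shown in this excerpt)
// builds the alter/create requests consumed above. Purely as a hypothetical sketch
// of how such an AlterPartitionAssignmentsRequest could be assembled from a list
// of planned reassignments:
func buildAlterRequestSketch(topicName string, reassignments []Reassignment) *kmsg.AlterPartitionAssignmentsRequest {
    reqTopic := kmsg.NewAlterPartitionAssignmentsRequestTopic()
    reqTopic.Topic = topicName
    for _, r := range reassignments {
        part := kmsg.NewAlterPartitionAssignmentsRequestTopicPartition()
        part.Partition = r.Partition
        part.Replicas = r.Replicas
        reqTopic.Partitions = append(reqTopic.Partitions, part)
    }
    req := kmsg.NewPtrAlterPartitionAssignmentsRequest()
    req.Topics = []kmsg.AlterPartitionAssignmentsRequestTopic{reqTopic}
    return req
}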
// logPlannedReassignments logs current partition state and planned reassignment details.
func (s *Service) logPlannedReassignments(meta *kmsg.MetadataResponse, plan *Plan, topicName string) {
topicMeta := meta.Topics[0]
// Log current partition state
s.logger.Info("current partition assignments for e2e topic",
zap.String("topic", topicName),
zap.Int("current_partitions", len(topicMeta.Partitions)),
zap.Int("brokers_available", len(meta.Brokers)),
)
// Log each current partition assignment (sorted by partition ID)
sortedPartitions := make([]kmsg.MetadataResponseTopicPartition, len(topicMeta.Partitions))
copy(sortedPartitions, topicMeta.Partitions)
sort.Slice(sortedPartitions, func(i, j int) bool {
return sortedPartitions[i].Partition < sortedPartitions[j].Partition
})
for _, partition := range sortedPartitions {
s.logger.Info("current partition assignment",
zap.String("topic", topicName),
zap.Int32("partition", partition.Partition),
zap.Int32s("replicas", partition.Replicas),
zap.Int32("leader", partition.Leader),
)
}
s.logger.Info("planned partition reassignments",
zap.String("topic", topicName),
zap.Int("reassignment_count", len(plan.Reassignments)),
)
// Sort reassignments by partition ID for consistent logging
sortedReassignments := make([]Reassignment, len(plan.Reassignments))
copy(sortedReassignments, plan.Reassignments)
sort.Slice(sortedReassignments, func(i, j int) bool {
return sortedReassignments[i].Partition < sortedReassignments[j].Partition
})
for _, reassignment := range sortedReassignments {
// Find current assignment for this partition
var currentReplicas []int32
var currentLeader int32 = -1
for _, partition := range topicMeta.Partitions {
if partition.Partition == reassignment.Partition {
currentReplicas = partition.Replicas
currentLeader = partition.Leader
break
}
}
s.logger.Info("partition reassignment",
zap.String("topic", topicName),
zap.Int32("partition", reassignment.Partition),
zap.Int32s("current_replicas", currentReplicas),
zap.Int32s("new_replicas", reassignment.Replicas),
zap.Int32("current_leader", currentLeader),
zap.Int32("new_leader", reassignment.Replicas[0]),
)
}
}
// logPlannedCreations logs planned partition creation details.
func (s *Service) logPlannedCreations(meta *kmsg.MetadataResponse, plan *Plan, topicName string, rebalancePartitions bool) {
topicMeta := meta.Topics[0]
s.logger.Info("planned partition creations",
zap.String("topic", topicName),
zap.Int("creation_count", len(plan.CreateAssignments)),
zap.Int("current_partitions", len(topicMeta.Partitions)),
zap.Int("final_partitions", plan.FinalPartitionCount),
)
if !rebalancePartitions {
s.logger.Info("partition assignments will be auto-placed by broker (rebalancePartitions is disabled)",
zap.String("topic", topicName),
)
return
}
nextPartitionID := int32(len(topicMeta.Partitions))
for i, creation := range plan.CreateAssignments {
s.logger.Info("new partition creation",
zap.String("topic", topicName),
zap.Int32("new_partition", nextPartitionID+int32(i)),
zap.Int32s("replicas", creation.Replicas),
zap.Int32("leader", creation.Replicas[0]),
)
}
}
func (s *Service) createManagementTopic(ctx context.Context, allMeta *kmsg.MetadataResponse) error {
topicCfg := s.config.TopicManagement
brokerCount := len(allMeta.Brokers)
totalPartitions := brokerCount * topicCfg.PartitionsPerBroker
s.logger.Info("e2e topic does not exist, creating it...",
zap.String("topic_name", topicCfg.Name),
zap.Int("partitions_per_broker", topicCfg.PartitionsPerBroker),
zap.Int("replication_factor", topicCfg.ReplicationFactor),
zap.Int("broker_count", brokerCount),
zap.Int("total_partitions", totalPartitions),
)
// Use partition planner to determine optimal assignments for the new topic.
// The metadata already contains broker info, and since the topic doesn't exist,
// meta.Topics[0].Partitions will be empty, which is exactly what we want.
planner := NewPartitionPlanner(topicCfg, s.logger)
plan, err := planner.Plan(allMeta)
if err != nil {
return fmt.Errorf("failed to create partition plan for new topic: %w", err)
}
// Create topic with specific replica assignments from the planner
topic := kmsg.NewCreateTopicsRequestTopic()
topic.Topic = topicCfg.Name
topic.NumPartitions = -1 // Must be -1 when using ReplicaAssignment
topic.ReplicationFactor = -1 // Must be -1 when using ReplicaAssignment
topic.Configs = createTopicConfig(topicCfg)
// Convert planner's CreateAssignments to Kafka's ReplicaAssignment format
for i, assignment := range plan.CreateAssignments {
replica := kmsg.NewCreateTopicsRequestTopicReplicaAssignment()
replica.Partition = int32(i)
replica.Replicas = append([]int32(nil), assignment.Replicas...)
topic.ReplicaAssignment = append(topic.ReplicaAssignment, replica)
}
req := kmsg.NewCreateTopicsRequest()
req.Topics = []kmsg.CreateTopicsRequestTopic{topic}
res, err := req.RequestWith(ctx, s.client)
if err != nil {
return fmt.Errorf("failed to create e2e topic: %w", err)
}
if len(res.Topics) > 0 {
err := kerr.ErrorForCode(res.Topics[0].ErrorCode)
if err != nil {
return fmt.Errorf("failed to create e2e topic: %w", err)
}
}
return nil
}
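// --- illustrative sketch (not part of e2e/topic.go) ---
// For contrast with the explicit ReplicaAssignment used in createManagementTopic
// above: when the brokers should place replicas themselves, NumPartitions and
// ReplicationFactor are set directly and ReplicaAssignment is left empty. A
// minimal, hypothetical variant assuming the same kmsg API:
func createTopicAutoPlacedSketch(name string, partitions int32, replicationFactor int16) kmsg.CreateTopicsRequest {
    topic := kmsg.NewCreateTopicsRequestTopic()
    topic.Topic = name
    topic.NumPartitions = partitions // brokers pick the replica placement
    topic.ReplicationFactor = replicationFactor
    req := kmsg.NewCreateTopicsRequest()
    req.Topics = []kmsg.CreateTopicsRequestTopic{topic}
    return req
}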
func (s *Service) getTopicMetadata(ctx context.Context) (*kmsg.MetadataResponse, error) {
topicReq := kmsg.NewMetadataRequestTopic()
topicName := s.config.TopicManagement.Name
topicReq.Topic = &topicName
req := kmsg.NewMetadataRequest()
req.Topics = []kmsg.MetadataRequestTopic{topicReq}
return req.RequestWith(ctx, s.client)
}
func (s *Service) getTopicsConfigs(ctx context.Context, configNames []string) (*kmsg.DescribeConfigsResponse, error) {
req := kmsg.NewDescribeConfigsRequest()
req.IncludeDocumentation = false
req.IncludeSynonyms = false
req.Resources = []kmsg.DescribeConfigsRequestResource{
{
ResourceType: kmsg.ConfigResourceTypeTopic,
ResourceName: s.config.TopicManagement.Name,
ConfigNames: configNames,
},
}
return req.RequestWith(ctx, s.client)
}
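// --- illustrative sketch (not part of e2e/topic.go) ---
// getTopicsConfigs above returns the raw DescribeConfigs response; a caller still
// has to check per-resource error codes and dereference the nullable config values.
// A minimal, hypothetical consumer of that response:
func topicConfigsToMapSketch(res *kmsg.DescribeConfigsResponse) (map[string]string, error) {
    configs := make(map[string]string)
    for _, resource := range res.Resources {
        if err := kerr.ErrorForCode(resource.ErrorCode); err != nil {
            return nil, fmt.Errorf("failed to describe configs for %q: %w", resource.ResourceName, err)
        }
        for _, cfg := range resource.Configs {
            if cfg.Value != nil {
                configs[cfg.Name] = *cfg.Value
            }
        }
    }
    return configs, nil
}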
func createTopicConfig(cfgTopic EndToEndTopicConfig) []kmsg.CreateTopicsRequestTopicConfig {
topicConfig := func(name string, value interface{}) kmsg.CreateTopicsRequestTopicConfig {
prop := kmsg.NewCreateTopicsRequestTopicConfig()
prop.Name = name
valStr := fmt.Sprintf("%v", value)
prop.Value = &valStr
return prop
}
minISR := 1
if cfgTopic.ReplicationFactor >= 3
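// --- illustrative sketch (not part of e2e/topic.go, whose content ends above) ---
// Purely as a hypothetical sketch of how a helper like createTopicConfig typically
// concludes: min.insync.replicas is derived from the replication factor and the
// desired topic configs are returned, e.g.
//
//	if cfgTopic.ReplicationFactor >= 3 {
//		minISR = 2 // only require acks from two in-sync replicas when there are enough replicas
//	}
//	return []kmsg.CreateTopicsRequestTopicConfig{
//		topicConfig("cleanup.policy", "delete"),
//		topicConfig("min.insync.replicas", minISR),
//	}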
================================================
SYMBOL INDEX (261 symbols across 54 files)
================================================
FILE: config.go
type Config (line 20) | type Config struct
method SetDefaults (line 27) | func (c *Config) SetDefaults() {
method Validate (line 34) | func (c *Config) Validate() error {
function newConfig (line 53) | func newConfig(logger *zap.Logger) (Config, error) {
FILE: e2e/client_hooks.go
type clientHooks (line 14) | type clientHooks struct
method OnBrokerConnect (line 28) | func (c *clientHooks) OnBrokerConnect(meta kgo.BrokerMetadata, dialDur...
method OnDisconnect (line 38) | func (c *clientHooks) OnDisconnect(meta kgo.BrokerMetadata, _ net.Conn) {
method OnBrokerWrite (line 52) | func (c *clientHooks) OnBrokerWrite(meta kgo.BrokerMetadata, key int16...
method OnBrokerRead (line 71) | func (c *clientHooks) OnBrokerRead(meta kgo.BrokerMetadata, key int16,...
function newEndToEndClientHooks (line 21) | func newEndToEndClientHooks(logger *zap.Logger) *clientHooks {
FILE: e2e/config.go
type Config (line 8) | type Config struct
method SetDefaults (line 16) | func (c *Config) SetDefaults() {
method Validate (line 24) | func (c *Config) Validate() error {
FILE: e2e/config_consumer.go
type EndToEndConsumerConfig (line 8) | type EndToEndConsumerConfig struct
method SetDefaults (line 18) | func (c *EndToEndConsumerConfig) SetDefaults() {
method Validate (line 25) | func (c *EndToEndConsumerConfig) Validate() error {
FILE: e2e/config_producer.go
type EndToEndProducerConfig (line 8) | type EndToEndProducerConfig struct
method SetDefaults (line 13) | func (c *EndToEndProducerConfig) SetDefaults() {
method Validate (line 18) | func (c *EndToEndProducerConfig) Validate() error {
FILE: e2e/config_topic.go
type EndToEndTopicConfig (line 8) | type EndToEndTopicConfig struct
method SetDefaults (line 22) | func (c *EndToEndTopicConfig) SetDefaults() {
method Validate (line 31) | func (c *EndToEndTopicConfig) Validate() error {
FILE: e2e/config_topic_test.go
function TestEndToEndTopicConfig_SetDefaults (line 12) | func TestEndToEndTopicConfig_SetDefaults(t *testing.T) {
function TestEndToEndTopicConfig_Validate (line 24) | func TestEndToEndTopicConfig_Validate(t *testing.T) {
function TestPartitionPlanner_RebalancePartitionsDisabled (line 102) | func TestPartitionPlanner_RebalancePartitionsDisabled(t *testing.T) {
function TestPartitionPlanner_RebalancePartitionsDisabled_Creates (line 144) | func TestPartitionPlanner_RebalancePartitionsDisabled_Creates(t *testing...
FILE: e2e/consumer.go
method startConsumeMessages (line 14) | func (s *Service) startConsumeMessages(ctx context.Context, initializedC...
method commitOffsets (line 43) | func (s *Service) commitOffsets(ctx context.Context) {
method processMessage (line 78) | func (s *Service) processMessage(record *kgo.Record) {
FILE: e2e/endtoend_message.go
constant _ (line 6) | _ = iota
constant EndToEndMessageStateCreated (line 7) | EndToEndMessageStateCreated
constant EndToEndMessageStateProducedSuccessfully (line 8) | EndToEndMessageStateProducedSuccessfully
type EndToEndMessage (line 11) | type EndToEndMessage struct
method creationTime (line 22) | func (m *EndToEndMessage) creationTime() time.Time {
FILE: e2e/group_tracker.go
constant oldGroupCheckInterval (line 15) | oldGroupCheckInterval = 5 * time.Second
constant oldGroupMaxAge (line 16) | oldGroupMaxAge = 20 * time.Second
type groupTracker (line 24) | type groupTracker struct
method start (line 42) | func (g *groupTracker) start(ctx context.Context) {
method checkAndDeleteOldConsumerGroups (line 62) | func (g *groupTracker) checkAndDeleteOldConsumerGroups(ctx context.Con...
function newGroupTracker (line 32) | func newGroupTracker(cfg Config, logger *zap.Logger, client *kgo.Client,...
FILE: e2e/message_tracker.go
type messageTracker (line 22) | type messageTracker struct
method addToTracker (line 45) | func (t *messageTracker) addToTracker(msg *EndToEndMessage) {
method updateItemIfExists (line 52) | func (t *messageTracker) updateItemIfExists(msg *EndToEndMessage) error {
method removeFromTracker (line 79) | func (t *messageTracker) removeFromTracker(messageID string) error {
method onMessageArrived (line 83) | func (t *messageTracker) onMessageArrived(arrivedMessage *EndToEndMess...
method onMessageExpired (line 120) | func (t *messageTracker) onMessageExpired(_ string, reason ttlcache.Ev...
function newMessageTracker (line 28) | func newMessageTracker(svc *Service) *messageTracker {
FILE: e2e/partition_planner.go
type PartitionPlanner (line 29) | type PartitionPlanner struct
method Plan (line 44) | func (p *PartitionPlanner) Plan(meta *kmsg.MetadataResponse) (*Plan, e...
function NewPartitionPlanner (line 37) | func NewPartitionPlanner(cfg EndToEndTopicConfig, logger *zap.Logger) *P...
type PlanBuilder (line 114) | type PlanBuilder struct
method Build (line 172) | func (b *PlanBuilder) Build() *Plan {
method CommitReassignment (line 181) | func (b *PlanBuilder) CommitReassignment(pid int32, reps []int32) {
method CommitCreate (line 188) | func (b *PlanBuilder) CommitCreate(reps []int32) {
type Reassignment (line 135) | type Reassignment struct
type CreateAssignment (line 145) | type CreateAssignment struct
type Plan (line 154) | type Plan struct
method ToRequests (line 792) | func (p *Plan) ToRequests(topic string, rebalancePartitions bool) (*km...
function NewPlanBuilder (line 162) | func NewPlanBuilder(state ClusterState, desired Desired, tracker *LoadTr...
function fixReplicationAndRack (line 196) | func fixReplicationAndRack(b *PlanBuilder, sel ReplicaSelector, rf int) {
function shrinkPreservingLeader (line 247) | func shrinkPreservingLeader(b *PlanBuilder, pid int32, replicas []int32,...
function ensureLeaderCoverage (line 320) | func ensureLeaderCoverage(b *PlanBuilder, sel ReplicaSelector) {
function ensurePartitionCount (line 451) | func ensurePartitionCount(b *PlanBuilder, sel ReplicaSelector) {
function pickLeader (line 506) | func pickLeader(brokerIDs []int32, leaderCount map[int32]int, target map...
type Broker (line 539) | type Broker struct
type Partition (line 553) | type Partition struct
type ClusterState (line 570) | type ClusterState struct
function BuildState (line 581) | func BuildState(meta *kmsg.MetadataResponse) ClusterState {
type Desired (line 612) | type Desired struct
function ComputeDesired (line 622) | func ComputeDesired(state ClusterState, cfg EndToEndTopicConfig) Desired {
type Load (line 644) | type Load struct
type LoadTracker (line 657) | type LoadTracker struct
method AddReplica (line 688) | func (t *LoadTracker) AddReplica(id int32, leader bool) {
method Load (line 698) | func (t *LoadTracker) Load(id int32) Load { return t.l[id] }
function NewLoadTracker (line 664) | func NewLoadTracker(state ClusterState) *LoadTracker {
type ReplicaSelector (line 704) | type ReplicaSelector interface
type RackAwareSelector (line 719) | type RackAwareSelector struct
method ChooseReplicas (line 732) | func (s *RackAwareSelector) ChooseReplicas(preferredLeader int32, rf i...
function NewRackAwareSelector (line 725) | func NewRackAwareSelector(state ClusterState, loads *LoadTracker) *RackA...
function indexLeaders (line 837) | func indexLeaders(brokerIDs []int32, view map[int32][]int32) map[int32][...
function brokersMissingLeadership (line 851) | func brokersMissingLeadership(brokerIDs []int32, leadersByBroker map[int...
function donorBrokers (line 863) | func donorBrokers(brokerIDs []int32, leadersByBroker map[int32][]int32, ...
function violatesRackDiversity (line 883) | func violatesRackDiversity(reps []int32, numRacks int, brokers map[int32...
function victimIndex (line 900) | func victimIndex(reps []int32, brokers map[int32]Broker) int {
function remove (line 915) | func remove(xs []int32, v int32) []int32 {
function putFirst (line 927) | func putFirst(reps []int32, id int32) []int32 {
function contains (line 939) | func contains(xs []int32, v int32) bool {
function evenSplit (line 950) | func evenSplit(n int, ids []int32) map[int32]int {
function leastLoadedLeader (line 968) | func leastLoadedLeader(leaders map[int32]int, ids []int32) int32 {
function equalInt32s (line 978) | func equalInt32s(a, b []int32) bool {
FILE: e2e/partition_planner_test.go
function buildMeta (line 16) | func buildMeta(topic string, brokers map[int32]string, partitions [][]in...
function applyPlan (line 60) | func applyPlan(meta *kmsg.MetadataResponse, plan *Plan) map[int32][]int32 {
function countLeaders (line 76) | func countLeaders(assigns map[int32][]int32) map[int32]int {
function assertNoDuplicates (line 86) | func assertNoDuplicates(t *testing.T, reps []int32) {
function max (line 96) | func max(a, b int) int {
function TestPartitionPlanner_Plan (line 105) | func TestPartitionPlanner_Plan(t *testing.T) {
function TestPartitionPlanner_Plan_Deterministic (line 425) | func TestPartitionPlanner_Plan_Deterministic(t *testing.T) {
function TestActualLeaderCoverageSkipsPreferredRebalancing (line 483) | func TestActualLeaderCoverageSkipsPreferredRebalancing(t *testing.T) {
function TestPlan_ToRequests_RebalancePartitions (line 538) | func TestPlan_ToRequests_RebalancePartitions(t *testing.T) {
function TestMinimalReassignmentsWhenActualLeadersDivergeFromPreferred (line 609) | func TestMinimalReassignmentsWhenActualLeadersDivergeFromPreferred(t *te...
FILE: e2e/producer.go
method produceMessagesToAllPartitions (line 15) | func (s *Service) produceMessagesToAllPartitions(ctx context.Context) {
method produceMessage (line 24) | func (s *Service) produceMessage(ctx context.Context, partition int) {
function createEndToEndRecord (line 74) | func createEndToEndRecord(minionID string, topicName string, partition i...
FILE: e2e/service.go
type Service (line 17) | type Service struct
method Start (line 152) | func (s *Service) Start(ctx context.Context) error {
method sendInitMessage (line 209) | func (s *Service) sendInitMessage(ctx context.Context, client *kgo.Cli...
method startReconciliation (line 222) | func (s *Service) startReconciliation(ctx context.Context) {
method startProducer (line 241) | func (s *Service) startProducer(ctx context.Context) {
method startOffsetCommits (line 253) | func (s *Service) startOffsetCommits(ctx context.Context) {
function NewService (line 48) | func NewService(ctx context.Context, cfg Config, logger *zap.Logger, kaf...
FILE: e2e/topic.go
method validateManagementTopic (line 25) | func (s *Service) validateManagementTopic(ctx context.Context) error {
method updatePartitionCount (line 153) | func (s *Service) updatePartitionCount(ctx context.Context) error {
method executeCreatePartitions (line 185) | func (s *Service) executeCreatePartitions(ctx context.Context, req *kmsg...
method executeAlterPartitionAssignments (line 205) | func (s *Service) executeAlterPartitionAssignments(ctx context.Context, ...
method logPlannedReassignments (line 233) | func (s *Service) logPlannedReassignments(meta *kmsg.MetadataResponse, p...
method logPlannedCreations (line 295) | func (s *Service) logPlannedCreations(meta *kmsg.MetadataResponse, plan ...
method createManagementTopic (line 323) | func (s *Service) createManagementTopic(ctx context.Context, allMeta *km...
method getTopicMetadata (line 377) | func (s *Service) getTopicMetadata(ctx context.Context) (*kmsg.MetadataR...
method getTopicsConfigs (line 388) | func (s *Service) getTopicsConfigs(ctx context.Context, configNames []st...
function createTopicConfig (line 403) | func createTopicConfig(cfgTopic EndToEndTopicConfig) []kmsg.CreateTopics...
FILE: e2e/utils.go
function createHistogramBuckets (line 16) | func createHistogramBuckets(maxLatency time.Duration) []float64 {
function containsStr (line 28) | func containsStr(ar []string, x string) (bool, int) {
method logCommitErrors (line 38) | func (s *Service) logCommitErrors(r *kmsg.OffsetCommitResponse, err erro...
function brokerMetadataByBrokerID (line 70) | func brokerMetadataByBrokerID(meta []kmsg.MetadataResponseBroker) map[in...
function brokerMetadataByRackID (line 79) | func brokerMetadataByRackID(meta []kmsg.MetadataResponseBroker) map[stri...
function pointerStrToStr (line 91) | func pointerStrToStr(str *string) string {
function safeUnwrap (line 98) | func safeUnwrap(err error) string {
function isInArray (line 105) | func isInArray(num int16, arr []int16) bool {
FILE: kafka/client_config_helper.go
function NewKgoConfig (line 29) | func NewKgoConfig(cfg Config, logger *zap.Logger) ([]kgo.Opt, error) {
FILE: kafka/client_logger.go
type KgoZapLogger (line 8) | type KgoZapLogger struct
method Level (line 14) | func (k KgoZapLogger) Level() kgo.LogLevel {
method Log (line 19) | func (k KgoZapLogger) Log(level kgo.LogLevel, msg string, keyvals ...i...
FILE: kafka/config.go
type Config (line 5) | type Config struct
method SetDefaults (line 17) | func (c *Config) SetDefaults() {
method Validate (line 24) | func (c *Config) Validate() error {
FILE: kafka/config_sasl.go
constant SASLMechanismPlain (line 6) | SASLMechanismPlain = "PLAIN"
constant SASLMechanismScramSHA256 (line 7) | SASLMechanismScramSHA256 = "SCRAM-SHA-256"
constant SASLMechanismScramSHA512 (line 8) | SASLMechanismScramSHA512 = "SCRAM-SHA-512"
constant SASLMechanismGSSAPI (line 9) | SASLMechanismGSSAPI = "GSSAPI"
constant SASLMechanismOAuthBearer (line 10) | SASLMechanismOAuthBearer = "OAUTHBEARER"
type SASLConfig (line 14) | type SASLConfig struct
method SetDefaults (line 26) | func (c *SASLConfig) SetDefaults() {
method Validate (line 33) | func (c *SASLConfig) Validate() error {
FILE: kafka/config_sasl_gssapi.go
type SASLGSSAPIConfig (line 4) | type SASLGSSAPIConfig struct
method SetDefaults (line 19) | func (s *SASLGSSAPIConfig) SetDefaults() {
FILE: kafka/config_sasl_oauthbearer.go
type OAuthBearerConfig (line 13) | type OAuthBearerConfig struct
method Validate (line 20) | func (c *OAuthBearerConfig) Validate() error {
method getToken (line 31) | func (c *OAuthBearerConfig) getToken(ctx context.Context) (string, err...
FILE: kafka/config_tls.go
type TLSConfig (line 6) | type TLSConfig struct
method SetDefaults (line 18) | func (c *TLSConfig) SetDefaults() {
method Validate (line 22) | func (c *TLSConfig) Validate() error {
FILE: kafka/service.go
type Service (line 15) | type Service struct
method CreateAndTestClient (line 29) | func (s *Service) CreateAndTestClient(ctx context.Context, l *zap.Logg...
method Brokers (line 64) | func (s *Service) Brokers() []string {
method testConnection (line 70) | func (s *Service) testConnection(client *kgo.Client, ctx context.Conte...
function NewService (line 20) | func NewService(cfg Config, logger *zap.Logger) *Service {
FILE: logging/config.go
type Config (line 8) | type Config struct
method SetDefaults (line 12) | func (c *Config) SetDefaults() {
method Validate (line 16) | func (c *Config) Validate() error {
FILE: logging/logger.go
function NewLogger (line 14) | func NewLogger(cfg Config, metricsNamespace string) *zap.Logger {
function prometheusHook (line 35) | func prometheusHook(metricsNamespace string) func(zapcore.Entry) error {
FILE: main.go
function main (line 36) | func main() {
FILE: minion/client_hooks.go
type clientHooks (line 15) | type clientHooks struct
method OnBrokerConnect (line 57) | func (c clientHooks) OnBrokerConnect(meta kgo.BrokerMetadata, dialDur ...
method OnBrokerDisconnect (line 67) | func (c clientHooks) OnBrokerDisconnect(meta kgo.BrokerMetadata, _ net...
method OnBrokerRead (line 79) | func (c clientHooks) OnBrokerRead(_ kgo.BrokerMetadata, _ int16, bytes...
method OnBrokerWrite (line 91) | func (c clientHooks) OnBrokerWrite(_ kgo.BrokerMetadata, _ int16, byte...
function newMinionClientHooks (line 25) | func newMinionClientHooks(logger *zap.Logger, metricsNamespace string) *...
FILE: minion/config.go
type Config (line 9) | type Config struct
method SetDefaults (line 16) | func (c *Config) SetDefaults() {
method Validate (line 23) | func (c *Config) Validate() error {
FILE: minion/config_consumer_group.go
constant ConsumerGroupScrapeModeOffsetsTopic (line 8) | ConsumerGroupScrapeModeOffsetsTopic string = "offsetsTopic"
constant ConsumerGroupScrapeModeAdminAPI (line 9) | ConsumerGroupScrapeModeAdminAPI string = "adminApi"
constant ConsumerGroupGranularityTopic (line 11) | ConsumerGroupGranularityTopic string = "topic"
constant ConsumerGroupGranularityPartition (line 12) | ConsumerGroupGranularityPartition string = "partition"
type ConsumerGroupConfig (line 15) | type ConsumerGroupConfig struct
method SetDefaults (line 36) | func (c *ConsumerGroupConfig) SetDefaults() {
method Validate (line 43) | func (c *ConsumerGroupConfig) Validate() error {
FILE: minion/config_log_dirs.go
type LogDirsConfig (line 3) | type LogDirsConfig struct
method Validate (line 10) | func (c *LogDirsConfig) Validate() error {
method SetDefaults (line 15) | func (c *LogDirsConfig) SetDefaults() {
FILE: minion/config_topic_config.go
constant TopicGranularityTopic (line 8) | TopicGranularityTopic string = "topic"
constant TopicGranularityPartition (line 9) | TopicGranularityPartition string = "partition"
type TopicConfig (line 12) | type TopicConfig struct
method Validate (line 39) | func (c *TopicConfig) Validate() error {
method SetDefaults (line 65) | func (c *TopicConfig) SetDefaults() {
type InfoMetricConfig (line 31) | type InfoMetricConfig struct
FILE: minion/consumer_group_offsets.go
method ListAllConsumerGroupOffsetsInternal (line 15) | func (s *Service) ListAllConsumerGroupOffsetsInternal() map[string]map[s...
method ListAllConsumerGroupOffsetsAdminAPI (line 20) | func (s *Service) ListAllConsumerGroupOffsetsAdminAPI(ctx context.Contex...
method listConsumerGroupOffsetsBulk (line 34) | func (s *Service) listConsumerGroupOffsetsBulk(ctx context.Context, grou...
method listConsumerGroupOffsets (line 69) | func (s *Service) listConsumerGroupOffsets(ctx context.Context, group st...
FILE: minion/describe_consumer_groups.go
type DescribeConsumerGroupsResponse (line 14) | type DescribeConsumerGroupsResponse struct
method listConsumerGroupsCached (line 19) | func (s *Service) listConsumerGroupsCached(ctx context.Context) (*kmsg.L...
method listConsumerGroups (line 42) | func (s *Service) listConsumerGroups(ctx context.Context) (*kmsg.ListGro...
method DescribeConsumerGroups (line 56) | func (s *Service) DescribeConsumerGroups(ctx context.Context) ([]Describ...
FILE: minion/describe_topic_config.go
method GetTopicConfigs (line 11) | func (s *Service) GetTopicConfigs(ctx context.Context) (*kmsg.DescribeCo...
FILE: minion/list_offsets.go
method ListEndOffsetsCached (line 13) | func (s *Service) ListEndOffsetsCached(ctx context.Context) (kadm.Listed...
method ListStartOffsetsCached (line 17) | func (s *Service) ListStartOffsetsCached(ctx context.Context) (kadm.List...
method listOffsetsCached (line 21) | func (s *Service) listOffsetsCached(ctx context.Context, offsetType stri...
method ListEndOffsets (line 57) | func (s *Service) ListEndOffsets(ctx context.Context) (kadm.ListedOffset...
method ListStartOffsets (line 62) | func (s *Service) ListStartOffsets(ctx context.Context) (kadm.ListedOffs...
type listOffsetsFunc (line 66) | type listOffsetsFunc
method listOffsetsInternal (line 68) | func (s *Service) listOffsetsInternal(ctx context.Context, listFunc list...
FILE: minion/log_dirs.go
type LogDirResponseShard (line 10) | type LogDirResponseShard struct
method DescribeLogDirs (line 16) | func (s *Service) DescribeLogDirs(ctx context.Context) []LogDirResponseS...
FILE: minion/metadata.go
method GetMetadataCached (line 11) | func (s *Service) GetMetadataCached(ctx context.Context) (*kmsg.Metadata...
method GetMetadata (line 36) | func (s *Service) GetMetadata(ctx context.Context) (*kmsg.MetadataRespon...
FILE: minion/offset_consumer.go
method startConsumingOffsets (line 17) | func (s *Service) startConsumingOffsets(ctx context.Context) {
method checkIfConsumerLagIsCaughtUp (line 56) | func (s *Service) checkIfConsumerLagIsCaughtUp(ctx context.Context) {
method decodeOffsetRecord (line 164) | func (s *Service) decodeOffsetRecord(record *kgo.Record) error {
method decodeOffsetMetadata (line 193) | func (s *Service) decodeOffsetMetadata(record *kgo.Record) error {
method decodeOffsetCommit (line 226) | func (s *Service) decodeOffsetCommit(record *kgo.Record) error {
method GetNumberOfOffsetRecordsConsumed (line 255) | func (s *Service) GetNumberOfOffsetRecordsConsumed() float64 {
FILE: minion/service.go
type Service (line 23) | type Service struct
method Start (line 96) | func (s *Service) Start(ctx context.Context) error {
method isReady (line 109) | func (s *Service) isReady() bool {
method HandleIsReady (line 117) | func (s *Service) HandleIsReady() http.HandlerFunc {
method ensureCompatibility (line 136) | func (s *Service) ensureCompatibility(ctx context.Context) error {
method getCachedItem (line 159) | func (s *Service) getCachedItem(key string) (interface{}, bool) {
method setCachedItem (line 167) | func (s *Service) setCachedItem(key string, val interface{}, timeout t...
method deleteCachedItem (line 179) | func (s *Service) deleteCachedItem(key string) {
function NewService (line 42) | func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service,...
FILE: minion/storage.go
type Storage (line 16) | type Storage struct
method isReady (line 55) | func (s *Storage) isReady() bool {
method setReadyState (line 59) | func (s *Storage) setReadyState(isReady bool) {
method markRecordConsumed (line 66) | func (s *Storage) markRecordConsumed(rec *kgo.Record) {
method addOffsetCommit (line 72) | func (s *Storage) addOffsetCommit(key kmsg.OffsetCommitKey, value kmsg...
method getConsumedOffsets (line 95) | func (s *Storage) getConsumedOffsets() map[int32]int64 {
method getNumberOfConsumedRecords (line 107) | func (s *Storage) getNumberOfConsumedRecords() float64 {
method getGroupOffsets (line 111) | func (s *Storage) getGroupOffsets() map[string]map[string]map[int32]Of...
method deleteOffsetCommit (line 138) | func (s *Storage) deleteOffsetCommit(key kmsg.OffsetCommitKey) {
type OffsetCommit (line 34) | type OffsetCommit struct
function newStorage (line 45) | func newStorage(logger *zap.Logger) (*Storage, error) {
function encodeOffsetCommitKey (line 143) | func encodeOffsetCommitKey(key kmsg.OffsetCommitKey) string {
FILE: minion/utils.go
method IsGroupAllowed (line 9) | func (s *Service) IsGroupAllowed(groupName string) bool {
method IsTopicAllowed (line 27) | func (s *Service) IsTopicAllowed(topicName string) bool {
function compileRegex (line 45) | func compileRegex(expr string) (*regexp.Regexp, error) {
function compileRegexes (line 64) | func compileRegexes(expr []string) ([]*regexp.Regexp, error) {
FILE: minion/versions.go
method GetClusterVersion (line 12) | func (s *Service) GetClusterVersion(ctx context.Context) (string, error) {
method GetAPIVersions (line 22) | func (s *Service) GetAPIVersions(ctx context.Context) (*kmsg.ApiVersions...
FILE: prometheus/collect_broker_info.go
method collectBrokerInfo (line 10) | func (e *Exporter) collectBrokerInfo(ctx context.Context, ch chan<- prom...
FILE: prometheus/collect_cluster_info.go
method collectClusterInfo (line 10) | func (e *Exporter) collectClusterInfo(ctx context.Context, ch chan<- pro...
FILE: prometheus/collect_consumer_group_lags.go
type waterMark (line 16) | type waterMark struct
method collectConsumerGroupLags (line 23) | func (e *Exporter) collectConsumerGroupLags(ctx context.Context, ch chan...
method collectConsumerGroupLagsOffsetTopic (line 52) | func (e *Exporter) collectConsumerGroupLagsOffsetTopic(_ context.Context...
method collectConsumerGroupLagsAdminAPI (line 128) | func (e *Exporter) collectConsumerGroupLagsAdminAPI(ctx context.Context,...
method waterMarksByTopic (line 214) | func (e *Exporter) waterMarksByTopic(lowMarks kadm.ListedOffsets, highMa...
FILE: prometheus/collect_consumer_groups.go
method collectConsumerGroups (line 14) | func (e *Exporter) collectConsumerGroups(ctx context.Context, ch chan<- ...
function decodeMemberAssignments (line 144) | func decodeMemberAssignments(protocolType string, member kmsg.DescribeGr...
FILE: prometheus/collect_exporter_metrics.go
method collectExporterMetrics (line 8) | func (e *Exporter) collectExporterMetrics(_ context.Context, ch chan<- p...
FILE: prometheus/collect_log_dirs.go
method collectLogDirs (line 12) | func (e *Exporter) collectLogDirs(ctx context.Context, ch chan<- prometh...
FILE: prometheus/collect_topic_info.go
method collectTopicInfo (line 12) | func (e *Exporter) collectTopicInfo(ctx context.Context, ch chan<- prome...
function getOrDefault (line 90) | func getOrDefault(m map[string]string, key string, defaultValue string) ...
FILE: prometheus/collect_topic_partition_offsets.go
method collectTopicPartitionOffsets (line 13) | func (e *Exporter) collectTopicPartitionOffsets(ctx context.Context, ch ...
FILE: prometheus/config.go
type Config (line 3) | type Config struct
method SetDefaults (line 9) | func (c *Config) SetDefaults() {
FILE: prometheus/exporter.go
type Exporter (line 16) | type Exporter struct
method InitializeMetrics (line 57) | func (e *Exporter) InitializeMetrics() {
method Describe (line 218) | func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
method Collect (line 223) | func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
function NewExporter (line 53) | func NewExporter(cfg Config, logger *zap.Logger, minionSvc *minion.Servi...
================================================
Condensed preview — 107 files, each showing path, character count, and a content snippet
================================================
[
{
"path": ".github/ai-opt-out",
"chars": 14,
"preview": "opt-out: true\n"
},
{
"path": ".github/workflows/docker-image.yml",
"chars": 2705,
"preview": "---\nname: docker-image\non:\n push:\n tags: ['*']\n branches: ['master']\n paths-ignore: ['charts/**']\npermissions:"
},
{
"path": ".github/workflows/goreleaser.yml",
"chars": 638,
"preview": "---\nname: goreleaser\non:\n push:\n tags: ['*']\njobs:\n goreleaser:\n runs-on: ubuntu-latest\n permissions:\n c"
},
{
"path": ".gitignore",
"chars": 273,
"preview": "# Binaries for programs and plugins\n*.exe\n*.exe~\n*.dll\n*.so\n*.dylib\n\n# Test binary, build with `go test -c`\n*.test\n\n# Ou"
},
{
"path": ".goreleaser.yml",
"chars": 837,
"preview": "---\nversion: 2\nrelease:\n name_template: '{{.Version}} / {{time \"2006-01-02\"}}'\n prerelease: auto\n mode: append\n foot"
},
{
"path": "Dockerfile",
"chars": 1005,
"preview": "############################################################\n# Build image\n#############################################"
},
{
"path": "LICENSE",
"chars": 1065,
"preview": "MIT License\n\nCopyright (c) 2021 CloudHut\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\no"
},
{
"path": "README.md",
"chars": 3835,
"preview": "# Redpanda Minion (KMinion) - Prometheus Exporter via Kafka API\n\n[Redpanda](https://www.redpanda.com) Minion is a featur"
},
{
"path": "charts/README.md",
"chars": 841,
"preview": "# Helm Chart\n\n⚠️ This chart has been moved to https://github.com/redpanda-data/helm-charts/tree/main/charts/kminion . Pl"
},
{
"path": "charts/archives/.gitkeep",
"chars": 0,
"preview": ""
},
{
"path": "charts/archives/index.yaml",
"chars": 7897,
"preview": "apiVersion: v1\nentries:\n kminion:\n - apiVersion: v2\n appVersion: v2.2.5\n created: \"2023-07-03T16:38:22.568312+01"
},
{
"path": "charts/kminion/.helmignore",
"chars": 349,
"preview": "# Patterns to ignore when building packages.\n# This supports shell glob matching, relative path matching, and\n# negation"
},
{
"path": "charts/kminion/Chart.yaml",
"chars": 1188,
"preview": "apiVersion: v2\nname: kminion\ndescription: A Helm chart to deploy KMinion - a Prometheus Exporter for Apache Kafka\n\n# A c"
},
{
"path": "charts/kminion/templates/NOTES.txt",
"chars": 1657,
"preview": "1. Get the application URL by running these commands:\n{{- if .Values.ingress.enabled }}\n {{- range .Values.ingress.h"
},
{
"path": "charts/kminion/templates/_helpers.tpl",
"chars": 3402,
"preview": "{{/*\nExpand the name of the chart.\n*/}}\n{{- define \"kminion.name\" -}}\n{{- default .Chart.Name .Values.nameOverride | tru"
},
{
"path": "charts/kminion/templates/configmap.yaml",
"chars": 262,
"preview": "apiVersion: v1\nkind: ConfigMap\nmetadata:\n name: {{include \"kminion.fullname\" .}}\n namespace: {{ .Release.Namespace | q"
},
{
"path": "charts/kminion/templates/daemonset.yaml",
"chars": 3731,
"preview": "{{- if .Values.daemonset.enabled }}\napiVersion: apps/v1\nkind: DaemonSet\nmetadata:\n name: {{include \"kminion.fullname\" ."
},
{
"path": "charts/kminion/templates/deployment.yaml",
"chars": 3830,
"preview": "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: {{include \"kminion.fullname\" .}}\n namespace: {{ .Release.Namespa"
},
{
"path": "charts/kminion/templates/hpa.yaml",
"chars": 1512,
"preview": "{{- if .Values.autoscaling.enabled }}\napiVersion: {{ ternary \"autoscaling/v2\" \"autoscaling/v2beta1\" (.Capabilities.APIVe"
},
{
"path": "charts/kminion/templates/ingress.yaml",
"chars": 2548,
"preview": "{{- if .Values.ingress.enabled -}}\n{{- $fullName := include \"kminion.fullname\" . -}}\n{{- $svcPort := .Values.service.por"
},
{
"path": "charts/kminion/templates/poddisruptionbudget.yaml",
"chars": 661,
"preview": "{{- if .Values.podDisruptionBudget }}\napiVersion: {{ include \"kminion.podDisruptionBudget.apiVersion\" . }}\nkind: PodDisr"
},
{
"path": "charts/kminion/templates/service.yaml",
"chars": 648,
"preview": "apiVersion: v1\nkind: Service\nmetadata:\n name: {{ include \"kminion.fullname\" . }}\n namespace: {{ .Release.Namespace | q"
},
{
"path": "charts/kminion/templates/serviceaccount.yaml",
"chars": 366,
"preview": "{{- if .Values.serviceAccount.create -}}\napiVersion: v1\nkind: ServiceAccount\nmetadata:\n name: {{ include \"kminion.servi"
},
{
"path": "charts/kminion/templates/servicemonitor.yaml",
"chars": 1147,
"preview": "{{- if .Values.serviceMonitor.create }}\napiVersion: monitoring.coreos.com/v1\nkind: ServiceMonitor\nmetadata:\n name: {{in"
},
{
"path": "charts/kminion/values.yaml",
"chars": 9262,
"preview": "# Default values for kminion.\n# This is a YAML-formatted file.\n# Declare variables to be passed into your templates.\n\nre"
},
{
"path": "config.go",
"chars": 3281,
"preview": "package main\n\nimport (\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n\n\t\"github.com/cloudhut/kminion/v2/kafka\"\n\t\"github.com/cloudhut/kminion/v2"
},
{
"path": "docker-compose.yml",
"chars": 1169,
"preview": "---\nversion: '2.1'\n\nservices:\n\n zookeeper:\n image: confluentinc/cp-zookeeper:latest\n ports:\n - 2181:2181\n "
},
{
"path": "docs/end-to-end.md",
"chars": 7394,
"preview": "# End-To-End Monitoring\n\nThis page describes the end-to-end monitoring feature in KMinion, how it works, and what metric"
},
{
"path": "docs/metrics.md",
"chars": 8396,
"preview": "# Exported Metrics\n\nThis document lists all exported metrics in an exemplary way.\n\n## Exporter Metrics\n\n```\n# HELP kmini"
},
{
"path": "docs/reference-config.yaml",
"chars": 9312,
"preview": "#####################################################################################\n# This file documents all the avai"
},
{
"path": "e2e/client_hooks.go",
"chars": 2955,
"preview": "package e2e\n\nimport (\n\t\"net\"\n\t\"sync/atomic\"\n\t\"time\"\n\n\t\"github.com/twmb/franz-go/pkg/kgo\"\n\t\"github.com/twmb/franz-go/pkg/"
},
{
"path": "e2e/config.go",
"chars": 1383,
"preview": "package e2e\n\nimport (\n\t\"fmt\"\n\t\"time\"\n)\n\ntype Config struct {\n\tEnabled bool `koanf:\"enabled\"`\n\t"
},
{
"path": "e2e/config_consumer.go",
"chars": 1075,
"preview": "package e2e\n\nimport (\n\t\"fmt\"\n\t\"time\"\n)\n\ntype EndToEndConsumerConfig struct {\n\tGroupIdPrefix string `koanf:\"g"
},
{
"path": "e2e/config_producer.go",
"chars": 570,
"preview": "package e2e\n\nimport (\n\t\"fmt\"\n\t\"time\"\n)\n\ntype EndToEndProducerConfig struct {\n\tAckSla time.Duration `koanf:\"ackSla\""
},
{
"path": "e2e/config_topic.go",
"chars": 1694,
"preview": "package e2e\n\nimport (\n\t\"fmt\"\n\t\"time\"\n)\n\ntype EndToEndTopicConfig struct {\n\tEnabled bool `koanf:\""
},
{
"path": "e2e/config_topic_test.go",
"chars": 5344,
"preview": "package e2e\n\nimport (\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n\t"
},
{
"path": "e2e/consumer.go",
"chars": 2933,
"preview": "package e2e\n\nimport (\n\t\"context\"\n\t\"encoding/json\"\n\t\"strconv\"\n\t\"time\"\n\n\t\"github.com/twmb/franz-go/pkg/kgo\"\n\t\"github.com/t"
},
{
"path": "e2e/endtoend_message.go",
"chars": 618,
"preview": "package e2e\n\nimport \"time\"\n\nconst (\n\t_ = iota\n\tEndToEndMessageStateCreated\n\tEndToEndMessageStateProducedSuccessfully\n)\n\n"
},
{
"path": "e2e/group_tracker.go",
"chars": 5974,
"preview": "package e2e\n\nimport (\n\t\"context\"\n\t\"strings\"\n\t\"time\"\n\n\t\"github.com/twmb/franz-go/pkg/kerr\"\n\t\"github.com/twmb/franz-go/pkg"
},
{
"path": "e2e/message_tracker.go",
"chars": 4603,
"preview": "package e2e\n\nimport (\n\t\"fmt\"\n\t\"strconv\"\n\t\"time\"\n\n\t\"github.com/jellydator/ttlcache/v2\"\n\n\t\"go.uber.org/zap\"\n)\n\n// messageT"
},
{
"path": "e2e/partition_planner.go",
"chars": 34191,
"preview": "package e2e\n\nimport (\n\t\"fmt\"\n\t\"sort\"\n\n\t\"github.com/twmb/franz-go/pkg/kmsg\"\n\t\"go.uber.org/zap\"\n)\n\n// PartitionPlanner is "
},
{
"path": "e2e/partition_planner_test.go",
"chars": 24633,
"preview": "package e2e\n\nimport (\n\t\"sort\"\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n\t"
},
{
"path": "e2e/producer.go",
"chars": 3687,
"preview": "package e2e\n\nimport (\n\t\"context\"\n\t\"encoding/json\"\n\t\"strconv\"\n\t\"time\"\n\n\t\"github.com/google/uuid\"\n\t\"github.com/twmb/franz-"
},
{
"path": "e2e/service.go",
"chars": 10476,
"preview": "package e2e\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"strings\"\n\t\"time\"\n\n\t\"github.com/google/uuid\"\n\t\"github.com/prometheus/client_gol"
},
{
"path": "e2e/topic.go",
"chars": 15503,
"preview": "package e2e\n\nimport (\n\t\"context\"\n\t\"errors\"\n\t\"fmt\"\n\t\"sort\"\n\t\"time\"\n\n\t\"github.com/twmb/franz-go/pkg/kerr\"\n\t\"github.com/twm"
},
{
"path": "e2e/utils.go",
"chars": 3052,
"preview": "package e2e\n\nimport (\n\t\"context\"\n\t\"math\"\n\t\"time\"\n\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"github.com/twmb/fr"
},
{
"path": "go.mod",
"chars": 1923,
"preview": "module github.com/cloudhut/kminion/v2\n\ngo 1.26\n\nrequire (\n\tgithub.com/google/uuid v1.6.0\n\tgithub.com/jcmturner/gokrb5/v8"
},
{
"path": "go.sum",
"chars": 48790,
"preview": "cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=\ncloud.google.com/go v0.34.0/go.mod h1"
},
{
"path": "kafka/client_config_helper.go",
"chars": 6254,
"preview": "package kafka\n\nimport (\n\t\"context\"\n\t\"crypto/tls\"\n\t\"crypto/x509\"\n\t\"encoding/pem\"\n\t\"fmt\"\n\t\"io/ioutil\"\n\t\"net\"\n\t\"time\"\n\n\t\"gi"
},
{
"path": "kafka/client_logger.go",
"chars": 765,
"preview": "package kafka\n\nimport (\n\t\"github.com/twmb/franz-go/pkg/kgo\"\n\t\"go.uber.org/zap\"\n)\n\ntype KgoZapLogger struct {\n\tlogger *za"
},
{
"path": "kafka/config.go",
"chars": 780,
"preview": "package kafka\n\nimport \"fmt\"\n\ntype Config struct {\n\t// General\n\tBrokers []string `koanf:\"brokers\"`\n\tClientID string `k"
},
{
"path": "kafka/config_sasl.go",
"chars": 1188,
"preview": "package kafka\n\nimport \"fmt\"\n\nconst (\n\tSASLMechanismPlain = \"PLAIN\"\n\tSASLMechanismScramSHA256 = \"SCRAM-SHA-256\"\n\tSA"
},
{
"path": "kafka/config_sasl_gssapi.go",
"chars": 805,
"preview": "package kafka\n\n// SASLGSSAPIConfig represents the Kafka Kerberos config\ntype SASLGSSAPIConfig struct {\n\tAuthType "
},
{
"path": "kafka/config_sasl_oauthbearer.go",
"chars": 2053,
"preview": "package kafka\n\nimport (\n\t\"context\"\n\t\"encoding/base64\"\n\t\"encoding/json\"\n\t\"fmt\"\n\t\"net/http\"\n\t\"net/url\"\n\t\"strings\"\n)\n\ntype "
},
{
"path": "kafka/config_tls.go",
"chars": 1141,
"preview": "package kafka\n\nimport \"fmt\"\n\n// TLSConfig to connect to Kafka via TLS\ntype TLSConfig struct {\n\tEnabled boo"
},
{
"path": "kafka/service.go",
"chars": 2901,
"preview": "package kafka\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"time\"\n\n\t\"github.com/twmb/franz-go/pkg/kerr\"\n\t\"github.com/twmb/franz-go/pkg/k"
},
{
"path": "logging/config.go",
"chars": 371,
"preview": "package logging\n\nimport (\n\t\"fmt\"\n\t\"go.uber.org/zap\"\n)\n\ntype Config struct {\n\tLevel string `koanf:\"level\"`\n}\n\nfunc (c *Co"
},
{
"path": "logging/logger.go",
"chars": 1785,
"preview": "package logging\n\nimport (\n\t\"os\"\n\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"github.com/prometheus/client_golang"
},
{
"path": "main.go",
"chars": 3879,
"preview": "package main\n\nimport (\n\t\"context\"\n\t\"errors\"\n\t\"fmt\"\n\t\"net\"\n\t\"net/http\"\n\t\"os\"\n\t\"os/signal\"\n\t\"strconv\"\n\n\t\"github.com/cloudh"
},
{
"path": "minion/client_hooks.go",
"chars": 3071,
"preview": "package minion\n\nimport (\n\t\"net\"\n\t\"time\"\n\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"github.com/prometheus/clien"
},
{
"path": "minion/config.go",
"chars": 980,
"preview": "package minion\n\nimport (\n\t\"fmt\"\n\n\t\"github.com/cloudhut/kminion/v2/e2e\"\n)\n\ntype Config struct {\n\tConsumerGroups ConsumerG"
},
{
"path": "minion/config_consumer_group.go",
"chars": 2540,
"preview": "package minion\n\nimport (\n\t\"fmt\"\n)\n\nconst (\n\tConsumerGroupScrapeModeOffsetsTopic string = \"offsetsTopic\"\n\tConsumerGroupSc"
},
{
"path": "minion/config_log_dirs.go",
"chars": 470,
"preview": "package minion\n\ntype LogDirsConfig struct {\n\t// Enabled specifies whether log dirs shall be scraped and exported or not."
},
{
"path": "minion/config_topic_config.go",
"chars": 2147,
"preview": "package minion\n\nimport (\n\t\"fmt\"\n)\n\nconst (\n\tTopicGranularityTopic string = \"topic\"\n\tTopicGranularityPartition string"
},
{
"path": "minion/consumer_group_offsets.go",
"chars": 2314,
"preview": "package minion\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"sync\"\n\n\t\"github.com/twmb/franz-go/pkg/kmsg\"\n\t\"go.uber.org/zap\"\n\t\"golang.org"
},
{
"path": "minion/describe_consumer_groups.go",
"chars": 2362,
"preview": "package minion\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"time\"\n\n\t\"github.com/twmb/franz-go/pkg/kerr\"\n\t\"github.com/twmb/franz-go/pkg/"
},
{
"path": "minion/describe_topic_config.go",
"chars": 772,
"preview": "package minion\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\n\t\"github.com/pkg/errors\"\n\t\"github.com/twmb/franz-go/pkg/kmsg\"\n)\n\nfunc (s *Se"
},
{
"path": "minion/list_offsets.go",
"chars": 3704,
"preview": "package minion\n\nimport (\n\t\"context\"\n\t\"errors\"\n\t\"fmt\"\n\t\"time\"\n\n\t\"github.com/twmb/franz-go/pkg/kadm\"\n\t\"go.uber.org/zap\"\n)\n"
},
{
"path": "minion/log_dirs.go",
"chars": 805,
"preview": "package minion\n\nimport (\n\t\"context\"\n\n\t\"github.com/twmb/franz-go/pkg/kgo\"\n\t\"github.com/twmb/franz-go/pkg/kmsg\"\n)\n\ntype Lo"
},
{
"path": "minion/metadata.go",
"chars": 960,
"preview": "package minion\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"time\"\n\n\t\"github.com/twmb/franz-go/pkg/kmsg\"\n)\n\nfunc (s *Service) GetMetadat"
},
{
"path": "minion/offset_consumer.go",
"chars": 8582,
"preview": "package minion\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"time\"\n\n\t\"github.com/twmb/franz-go/pkg/kbin\"\n\t\"github.com/twmb/franz-go/pkg/"
},
{
"path": "minion/service.go",
"chars": 5092,
"preview": "package minion\n\nimport (\n\t\"context\"\n\t\"encoding/json\"\n\t\"fmt\"\n\t\"net/http\"\n\t\"regexp\"\n\t\"strings\"\n\t\"sync\"\n\t\"time\"\n\n\t\"github.c"
},
{
"path": "minion/storage.go",
"chars": 4579,
"preview": "package minion\n\nimport (\n\t\"fmt\"\n\t\"strconv\"\n\t\"time\"\n\n\tcmap \"github.com/orcaman/concurrent-map\"\n\t\"github.com/twmb/franz-go"
},
{
"path": "minion/utils.go",
"chars": 1586,
"preview": "package minion\n\nimport (\n\t\"fmt\"\n\t\"regexp\"\n\t\"strings\"\n)\n\nfunc (s *Service) IsGroupAllowed(groupName string) bool {\n\tisAll"
},
{
"path": "minion/versions.go",
"chars": 931,
"preview": "package minion\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\n\t\"github.com/twmb/franz-go/pkg/kerr\"\n\t\"github.com/twmb/franz-go/pkg/kmsg\"\n\t\""
},
{
"path": "prometheus/collect_broker_info.go",
"chars": 783,
"preview": "package prometheus\n\nimport (\n\t\"context\"\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"go.uber.org/zap\"\n\t\"strconv\"\n"
},
{
"path": "prometheus/collect_cluster_info.go",
"chars": 856,
"preview": "package prometheus\n\nimport (\n\t\"context\"\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"go.uber.org/zap\"\n\t\"strconv\"\n"
},
{
"path": "prometheus/collect_consumer_group_lags.go",
"chars": 8322,
"preview": "package prometheus\n\nimport (\n\t\"context\"\n\t\"math\"\n\t\"strconv\"\n\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"github.c"
},
{
"path": "prometheus/collect_consumer_groups.go",
"chars": 4601,
"preview": "package prometheus\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"strconv\"\n\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"github.co"
},
{
"path": "prometheus/collect_exporter_metrics.go",
"chars": 394,
"preview": "package prometheus\n\nimport (\n\t\"context\"\n\t\"github.com/prometheus/client_golang/prometheus\"\n)\n\nfunc (e *Exporter) collectE"
},
{
"path": "prometheus/collect_log_dirs.go",
"chars": 2194,
"preview": "package prometheus\n\nimport (\n\t\"context\"\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"github.com/twmb/franz-go/pkg"
},
{
"path": "prometheus/collect_topic_info.go",
"chars": 2655,
"preview": "package prometheus\n\nimport (\n\t\"context\"\n\t\"strconv\"\n\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"github.com/twmb/"
},
{
"path": "prometheus/collect_topic_partition_offsets.go",
"chars": 2623,
"preview": "package prometheus\n\nimport (\n\t\"context\"\n\t\"strconv\"\n\n\t\"github.com/prometheus/client_golang/prometheus\"\n\t\"go.uber.org/zap\""
},
{
"path": "prometheus/config.go",
"chars": 223,
"preview": "package prometheus\n\ntype Config struct {\n\tHost string `koanf:\"host\"`\n\tPort int `koanf:\"port\"`\n\tNamespace st"
},
{
"path": "prometheus/exporter.go",
"chars": 8879,
"preview": "package prometheus\n\nimport (\n\t\"context\"\n\t\"os\"\n\t\"strings\"\n\t\"time\"\n\n\t\"github.com/cloudhut/kminion/v2/minion\"\n\tuuid2 \"githu"
}
]
// ... and 22 more files