Repository: samber/awesome-prometheus-alerts Branch: master Commit: 9d9c648cddd1 Files: 133 Total size: 873.6 KB Directory structure: gitextract_epx1dxil/ ├── .github/ │ ├── FUNDING.yml │ ├── dependabot.yml │ └── workflows/ │ ├── dist.yml │ └── test.yml ├── .gitignore ├── .travis.yml ├── CLAUDE.md ├── CONTRIBUTING.md ├── Gemfile ├── LICENSE ├── README.md ├── _config.yml ├── _data/ │ └── rules.yml ├── _layouts/ │ └── default.html ├── alertmanager.md ├── assets/ │ ├── css/ │ │ └── app.css │ └── js/ │ └── app.js ├── blackbox-exporter.md ├── dist/ │ ├── rules/ │ │ ├── apache/ │ │ │ └── lusitaniae-apache-exporter.yml │ │ ├── apache-flink/ │ │ │ └── flink-prometheus-reporter.yml │ │ ├── apache-spark/ │ │ │ └── spark-prometheus.yml │ │ ├── apc-ups/ │ │ │ └── apcupsd_exporter.yml │ │ ├── argocd/ │ │ │ └── embedded-exporter.yml │ │ ├── aws-cloudwatch/ │ │ │ └── prometheus-cloudwatch-exporter.yml │ │ ├── azure/ │ │ │ └── azure-metrics-exporter.yml │ │ ├── blackbox/ │ │ │ └── blackbox-exporter.yml │ │ ├── caddy/ │ │ │ └── embedded-exporter.yml │ │ ├── cassandra/ │ │ │ ├── criteo-cassandra-exporter.yml │ │ │ └── instaclustr-cassandra-exporter.yml │ │ ├── ceph/ │ │ │ └── embedded-exporter.yml │ │ ├── cert-manager/ │ │ │ └── embedded-exporter.yml │ │ ├── cilium/ │ │ │ └── embedded-exporter.yml │ │ ├── clickhouse/ │ │ │ └── embedded-exporter.yml │ │ ├── cloudflare/ │ │ │ └── lablabs-cloudflare-exporter.yml │ │ ├── consul/ │ │ │ └── consul-exporter.yml │ │ ├── coredns/ │ │ │ └── embedded-exporter.yml │ │ ├── cortex/ │ │ │ └── embedded-exporter.yml │ │ ├── couchdb/ │ │ │ └── gesellix-couchdb-prometheus-exporter.yml │ │ ├── digitalocean/ │ │ │ └── digitalocean-exporter.yml │ │ ├── docker-containers/ │ │ │ └── google-cadvisor.yml │ │ ├── ebpf/ │ │ │ └── ebpf-exporter.yml │ │ ├── elasticsearch/ │ │ │ └── prometheus-community-elasticsearch-exporter.yml │ │ ├── envoy/ │ │ │ └── embedded-exporter.yml │ │ ├── etcd/ │ │ │ └── embedded-exporter.yml │ │ ├── fluxcd/ │ │ │ └── 
embedded-exporter.yml │ │ ├── freeswitch/ │ │ │ └── znerol-freeswitch-exporter.yml │ │ ├── gitlab-ci/ │ │ │ ├── gitaly.yml │ │ │ ├── gitlab-built-in-exporter.yml │ │ │ └── workhorse.yml │ │ ├── golang/ │ │ │ └── golang-exporter.yml │ │ ├── google-cloud-stackdriver/ │ │ │ └── stackdriver-exporter.yml │ │ ├── grafana-alloy/ │ │ │ └── embedded-exporter.yml │ │ ├── grafana-mimir/ │ │ │ └── embedded-exporter.yml │ │ ├── grafana-tempo/ │ │ │ └── embedded-exporter.yml │ │ ├── graph-node/ │ │ │ └── embedded-exporter.yml │ │ ├── hadoop/ │ │ │ └── jmx_exporter.yml │ │ ├── haproxy/ │ │ │ ├── embedded-exporter-v2.yml │ │ │ └── haproxy-exporter-v1.yml │ │ ├── hashicorp-vault/ │ │ │ └── embedded-exporter.yml │ │ ├── host-and-hardware/ │ │ │ └── node-exporter.yml │ │ ├── ipmi/ │ │ │ └── ipmi-exporter.yml │ │ ├── istio/ │ │ │ └── embedded-exporter.yml │ │ ├── jaeger/ │ │ │ └── embedded-exporter.yml │ │ ├── jenkins/ │ │ │ └── metric-plugin.yml │ │ ├── juniper/ │ │ │ └── czerwonk-junos-exporter.yml │ │ ├── jvm/ │ │ │ └── jvm-exporter.yml │ │ ├── kafka/ │ │ │ ├── danielqsj-kafka-exporter.yml │ │ │ └── linkedin-kafka-exporter.yml │ │ ├── keycloak/ │ │ │ └── aerogear-keycloak-metrics-spi.yml │ │ ├── kubernetes/ │ │ │ └── kubestate-exporter.yml │ │ ├── linkerd/ │ │ │ └── embedded-exporter.yml │ │ ├── loki/ │ │ │ └── embedded-exporter.yml │ │ ├── meilisearch/ │ │ │ └── embedded-exporter.yml │ │ ├── memcached/ │ │ │ └── memcached-exporter.yml │ │ ├── minio/ │ │ │ └── embedded-exporter.yml │ │ ├── mongodb/ │ │ │ ├── dcu-mongodb-exporter.yml │ │ │ ├── percona-mongodb-exporter.yml │ │ │ └── stefanprodan-mgob-exporter.yml │ │ ├── mysql/ │ │ │ └── mysqld-exporter.yml │ │ ├── nats/ │ │ │ └── nats-exporter.yml │ │ ├── netdata/ │ │ │ └── embedded-exporter.yml │ │ ├── nginx/ │ │ │ └── knyar-nginx-exporter.yml │ │ ├── nomad/ │ │ │ └── embedded-exporter.yml │ │ ├── openebs/ │ │ │ └── embedded-exporter.yml │ │ ├── openstack/ │ │ │ └── openstack-exporter.yml │ │ ├── opentelemetry-collector/ │ │ │ └── 
embedded-exporter.yml │ │ ├── oracle-database/ │ │ │ └── iamseth-oracledb-exporter.yml │ │ ├── patroni/ │ │ │ └── embedded-exporter-patroni.yml │ │ ├── pgbouncer/ │ │ │ └── spreaker-pgbouncer-exporter.yml │ │ ├── php-fpm/ │ │ │ └── bakins-fpm-exporter.yml │ │ ├── postgresql/ │ │ │ └── postgres-exporter.yml │ │ ├── process-exporter/ │ │ │ └── process-exporter.yml │ │ ├── prometheus-self-monitoring/ │ │ │ └── embedded-exporter.yml │ │ ├── promtail/ │ │ │ └── embedded-exporter.yml │ │ ├── proxmox-ve/ │ │ │ └── prometheus-pve-exporter.yml │ │ ├── pulsar/ │ │ │ └── embedded-exporter.yml │ │ ├── python/ │ │ │ └── python-exporter.yml │ │ ├── rabbitmq/ │ │ │ ├── kbudde-rabbitmq-exporter.yml │ │ │ └── rabbitmq-exporter.yml │ │ ├── redis/ │ │ │ └── oliver006-redis-exporter.yml │ │ ├── ruby/ │ │ │ └── ruby-exporter.yml │ │ ├── s.m.a.r.t-device-monitoring/ │ │ │ └── smartctl-exporter.yml │ │ ├── sidekiq/ │ │ │ └── strech-sidekiq-exporter.yml │ │ ├── snmp/ │ │ │ └── snmp-exporter.yml │ │ ├── solr/ │ │ │ └── embedded-exporter.yml │ │ ├── speedtest/ │ │ │ └── nlamirault-speedtest-exporter.yml │ │ ├── spinnaker/ │ │ │ └── embedded-exporter.yml │ │ ├── sql-server/ │ │ │ └── ozarklake-mssql-exporter.yml │ │ ├── ssl/ │ │ │ └── tls/ │ │ │ └── ribbybibby-ssl-exporter.yml │ │ ├── systemd/ │ │ │ └── systemd-exporter.yml │ │ ├── thanos/ │ │ │ ├── thanos-bucket-replicate.yml │ │ │ ├── thanos-compactor.yml │ │ │ ├── thanos-component-absent.yml │ │ │ ├── thanos-query.yml │ │ │ ├── thanos-receiver.yml │ │ │ ├── thanos-ruler.yml │ │ │ ├── thanos-sidecar.yml │ │ │ └── thanos-store.yml │ │ ├── traefik/ │ │ │ ├── embedded-exporter-v1.yml │ │ │ └── embedded-exporter-v2.yml │ │ ├── vmware/ │ │ │ └── pryorda-vmware-exporter.yml │ │ ├── windows-server/ │ │ │ └── windows-exporter.yml │ │ ├── wireguard/ │ │ │ └── mindflavor-prometheus-wireguard-exporter.yml │ │ ├── zfs/ │ │ │ ├── node-exporter.yml │ │ │ └── zfs_exporter.yml │ │ └── zookeeper/ │ │ ├── cloudflare-kafka-zookeeper-exporter.yml │ │ └── 
dabealu-zookeeper-exporter.yml │ └── template.yml ├── docker-compose.yml ├── index.md ├── package.json ├── rules.md └── sleep-peacefully.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/FUNDING.yml ================================================ github: [samber] ko_fi: samuelberthe ================================================ FILE: .github/dependabot.yml ================================================ --- version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "monthly" ================================================ FILE: .github/workflows/dist.yml ================================================ name: Publish on: workflow_dispatch: push: branches: - master permissions: contents: write jobs: publish: name: Publish # Check if the PR is not from a fork if: github.repository_owner == 'samber' runs-on: ubuntu-latest steps: - name: Checkout Repo uses: actions/checkout@v6 - name: Set up Ruby uses: ruby/setup-ruby@v1 with: ruby-version: 3.4 - name: Set up yq uses: mikefarah/yq@v4 - name: Install liquid run: | gem install liquid -v 5.5.1 gem install liquid-cli - name: Build rule configuration run: | cat _data/rules.yml | yq -I 0 -o json > _data/rules.json rm -rf dist/rules for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do subdir=dist/rules/$(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")') mkdir -p "${subdir}" # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')) for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do 
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug') cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml echo ${subdir}/${exporterName}.yml done done rm _data/rules.json # https://peterevans.dev/posts/github-actions-how-to-automate-code-formatting-in-pull-requests/ - name: Check for modified files id: git-check run: echo "modified=$(git status -s --porcelain | wc -l | awk '{$1=$1};1')" >> $GITHUB_OUTPUT - name: Push changes if: steps.git-check.outputs.modified != '0' run: | git config --global user.name 'samber' git config --global user.email 'samber@users.noreply.github.com' git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }} git add . git commit -m "Publish" git push ================================================ FILE: .github/workflows/test.yml ================================================ name: Promtool check on: pull_request: push: branches: - master jobs: promtool-check: name: Check alert rules syntax runs-on: ubuntu-latest steps: - name: Checkout Repo uses: actions/checkout@v6 - name: Set up Ruby uses: ruby/setup-ruby@v1 with: ruby-version: 3.4 - name: Set up yq uses: mikefarah/yq@v4 - name: Install liquid run: gem install liquid-cli - name: Build rule configuration run: | cat _data/rules.yml | yq -I 0 -o json > _data/rules.json for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do subdir=test/rules/$(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")') mkdir -p "${subdir}" # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')) for exporter in $(echo ${service} | base64 --decode | jq -r 
'.exporters[] | @base64'); do exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug') cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml echo ${subdir}/${exporterName}.yml done done rm _data/rules.json - name: Check Prometheus alert rules uses: peimanja/promtool-github-actions@master with: promtool_actions_subcommand: 'rules' promtool_actions_files: 'test/rules/*/*.yml' promtool_actions_comment: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .gitignore ================================================ _site/ .sass-cache/ .jekyll-cache/ .jekyll-metadata _data/rules.json test/rules/ /node_modules .worktrees/ ================================================ FILE: .travis.yml ================================================ language: node_js node_js: - 'node' ================================================ FILE: CLAUDE.md ================================================ # CLAUDE.md This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. ## Project Overview A curated collection of ~940 Prometheus alerting rules covering 90+ services across 100+ exporters, organized in 7 categories: basic resource monitoring (Prometheus, host/hardware, SMART, Docker, Blackbox, Windows, VMware, Netdata), databases and brokers (MySQL, PostgreSQL, Redis, MongoDB, RabbitMQ, Elasticsearch, Cassandra, Clickhouse, Kafka, etc.), reverse proxies and load balancers (Nginx, Apache, HaProxy, Traefik, Caddy), runtimes (PHP-FPM, JVM, Sidekiq), orchestrators (Kubernetes, Nomad, Consul, Etcd, Istio, ArgoCD, FluxCD), network/security/storage (Ceph, ZFS, Minio, SSL/TLS, CoreDNS, Vault, Cloudflare), and observability tools (Thanos, Loki, Cortex, OpenTelemetry Collector, Jenkins). 
All rules are stored in a single YAML data file (`_data/rules.yml`) and rendered as a Jekyll-based GitHub Pages site at https://samber.github.io/awesome-prometheus-alerts. The site provides copy-pasteable Prometheus alert snippets and downloadable rule files per exporter. The project is community-driven. Most contributions are PRs adding or updating rules in `_data/rules.yml`. Files in `dist/rules/` are auto-generated on merge — never edit them manually. ## Architecture - **`_data/rules.yml`** — The single source of truth for all alerting rules. This is the main file contributors edit. It is NOT a valid Prometheus config; the site renders each rule into copy-pasteable Prometheus alert format. - **`rules.md`** — Jekyll template that iterates over `_data/rules.yml` and renders the rules page with copy buttons and formatted YAML blocks. - **`alertmanager.md`** — Static page with Prometheus/AlertManager configuration examples. - **`_layouts/default.html`** — Site layout (Jekyll theme: cayman). - **`_config.yml`** — Jekyll configuration. - **`dist/rules/`** — Pre-built downloadable rule files organized by service/exporter (referenced in the site for `wget` commands). ## Rules YAML Structure Services are listed in README.md. `_data/rules.yml` hierarchy: ``` groups: - name: "" # e.g. "Basic resource monitoring" services: - name: "" # e.g. "Host and hardware" exporters: - name: "" slug: "" # used for download URLs doc_url: "" # optional link to exporter docs comments: # optional, exporter-level multiline notes rendered before rules "" rules: - name: "" description: "" query: "" severity: warning|critical|info for: "" # optional, defaults to 0m comments: # optional, rendered as multiline YAML comments "" ``` Services are grouped by category. If you are not sure about the classification, ask the developer. 
## Running Locally ```bash # With Ruby/Bundler gem install bundler bundle install jekyll serve # With Docker Compose docker compose up -d # With Docker directly docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve ``` Site serves at http://localhost:4000/awesome-prometheus-alerts. ## Contributing Rules All rule changes go in `_data/rules.yml`. Each rule needs: `name`, `description`, `query` (valid PromQL), and `severity`. The `for` field is optional. Descriptions should be factual ("what") and include root cause hints ("why"). Queries must be tested against the latest exporter version. Never modify files in `dist/` — they are auto-generated on merge. ## Query Validation - When adding or updating an alert, verify that the PromQL query references metric series that actually exist in the related exporter. Check the exporter's documentation or source code to confirm series names. - If a metric series has been deprecated or removed in a newer version of the exporter, update the query to use the replacement series, or remove the rule if no replacement exists. Known examples: `kube_hpa_*` renamed to `kube_horizontalpodautoscaler_*` in kube-state-metrics 2.x; `node_hwmon_temp_alarm` does not exist (correct: `node_hwmon_temp_crit_alarm_celsius`); node-exporter CLI flags get renamed across versions. - When writing or reviewing a query, search the internet (exporter docs, GitHub issues, changelogs) to validate correctness and catch outdated series names. When you are not sure about a metric name, always search the internet to confirm it exists and is spelled correctly before using it. - Pay special attention to metric naming conventions: many exporters add `_total` suffixes for counters and `_seconds_total` for time-based counters. Verify the exact name from source code, not just docs. 
Known examples: Spark's PrometheusResource adds `_total` and `_seconds_total` suffixes (e.g., `metrics_executor_failedTasks_total`, not `metrics_executor_failedTasks`); Oracle's `oracledb_sessions_value` not `oracledb_sessions_activity`. - Verify that label names used in `{{ $labels.xxx }}` template variables actually exist on the metric. Check the exporter source code for the exact label names. Known examples: cloudflare/ebpf_exporter uses `id` not `name` for programs, and `config` not `name` for decoder errors. - When a metric uses info-style patterns (value always 1, information carried in labels), `== 0` will never be true — the metric simply won't exist. Use `absent()` instead. Known example: `ebpf_exporter_enabled_configs`. - Some metrics are version-dependent. When a metric was renamed or removed in a newer version, add a comment noting the version requirement. Known examples: `go_memstats_gc_cpu_fraction` removed in client_golang v1.12+; cert-manager renamed `certmanager_http_acme_client_request_count` to `certmanager_acme_client_request_count` in v1.19+. - Verify the unit of a metric before setting thresholds. Some metrics use milliseconds while descriptions assume seconds. Known example: Keycloak's `keycloak_request_duration` is in milliseconds, so `> 2` means 2ms not 2s. - Some exporters expose labels that differ between services even within the same ecosystem. Known example: OpenStack Neutron uses `adminState="up"` while Nova and Cinder use `adminState="enabled"`. - When an official mixin exists for a service, compare thresholds and time windows against it. Known deviations to watch for: Mimir store-gateway sync uses 1800s (not 600s), Mimir compactor skipped blocks uses `[24h]` (not `[5m]`), Tempo normalizes outstanding blocks per worker. ## Common Review Pitfalls (learned from PR history) These are the most frequent issues raised during code review on this repo: ### Severity levels - `critical` = requires immediate human attention. 
Do not use for informational/security notifications. - `warning` = needs attention soon but not urgent. - `info` = awareness only (e.g., config changes, underutilized resources). - Authentication failures, security notifications, and config-change detections are typically `info`, not `critical`. ### `for` duration - Omit `for` when the default (0m) is intentional and appropriate — do not add `for: 0m` explicitly. - Add a `for` duration (e.g., `for: 2m` or `for: 5m`) to tolerate brief unavailability from restarts or transient spikes. Most "service down" rules should have at least `for: 1m`–`2m`. - Do not blanket-change all `for: 0m` to `for: 1m` — it depends on the alert's semantics and the range window used in `increase()`/`rate()`. ### Query design - Prefer symptom-based alerts over cause-based alerts to reduce alert fatigue. Example: "service is unreachable" is better than "specific internal counter changed". Metrics like heap object count, allocation rate, or free heap slots are causes, not symptoms — prefer GC duration, latency, or error rate alerts instead. - Don't add unnecessary aggregation (`avg()`, `avg_over_time()`) on metrics that are local to a single node/instance. Only aggregate when the alert is cluster-wide. - Don't combine `min_over_time()[1m]` with `for: 2m` redundantly — pick one mechanism for smoothing. Same applies to `avg_over_time()[5m]` with `for: 5m`. - Remove unnecessary label filters (e.g., `job="cassandra"` or `cluster=~".*"`) that add noise without value. - Verify comparison operators match the intent — e.g., "high snapshot count" must use `> N`, not `< N`. - When dividing counters (e.g., error rate = errors / total), guard against division by zero with `and total > 0` or filter appropriately. This is the most common issue in new PRs — check every ratio query. - Filter out system/template databases explicitly in DB queries (e.g., PostgreSQL: add `datid!="0"` alongside `datname!~"template.*|postgres"`). 
- Never use `rate()` on a gauge metric — use `deriv()` instead. `rate()` is for monotonically increasing counters only. - When using `increase()` for ratio calculations, prefer `rate()` instead — `increase()` can produce incorrect results when counters reset mid-window. - When filtering gRPC error codes, don't use `grpc_code!="OK"` — this includes normal application responses like `NotFound`, `AlreadyExists`, and `Cancelled`. Filter to actual errors: `grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"`. - When computing ratios with `rate()` on a metric that is itself already a normalized rate (e.g., Oracle's `v$waitclassmetric`), applying `rate()` computes the rate-of-change of a rate, which is not meaningful. - When a multi-label metric is used in a binary operation with a metric that has fewer labels, use `ignoring(extra_label)` to avoid join failures. Known example: `systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max`. - When a query groups by labels (e.g., `by (le, worker)`), consider the cardinality impact — hundreds of label values means hundreds of independent alerts. - Ensure `{{ $value | humanizeDuration }}` is only used on values in seconds. If the metric is in milliseconds, divide by 1000 first or use `{{ $value | humanize }}ms`. - Avoid using `up{job=~"exporter-name"} == 0` or `absent(up{job=~"exporter-name"})` to detect whether a service is down. When targets are managed via service discovery or a job reaches multiple targets, a disappeared target causes the `up` series to become stale and vanish rather than drop to 0, so the alert never fires. Prefer application-level or cluster-level metrics instead (e.g., "number of consul cluster members < 3", "PostgreSQL primary node absent"). ### Thresholds - Alert thresholds are inherently arbitrary and depend on workload. Use `comments:` to note this when a threshold is a rough default. 
- When threshold values in a PR seem unreasonable (too high or too low), challenge them with real-world reasoning or exporter docs. - Watch for thresholds that are so high they only catch catastrophic scenarios and miss real problems. Examples: Go goroutine spike at 100/s (misses gradual leaks), Ruby major GC at 5/s (only fires if app is non-functional), Python gen2 GC at >1/s (extremely rare). - Watch for thresholds that will fire on normal healthy operation. Examples: Memcached at 90% memory is desired (it's a cache), Flink TaskManager at 90% JVM heap is normal, cache hit rate < 80% is common for cold caches. - For SNMP bandwidth utilization, `ifSpeed` (Gauge32) maxes at ~4.29 Gbps. For 10G+ interfaces, use `ifHighSpeed * 1000000` instead. - For alerts using `> 0` on counters with `rate()` or `increase()`, consider whether a single event truly warrants alerting. In most cases, a small threshold (e.g., `> 0.05` for rate, `> 3` for increase) better distinguishes real problems from transient noise. ### Comments - When an alert or its query needs explanation (e.g., non-obvious PromQL logic, threshold rationale, edge cases), use the rule-level `comments:` field. Use multiline comments when needed. - Use the exporter-level `comments:` field for notes that apply to all rules under that exporter (e.g., exporter version requirements, known quirks, setup prerequisites). - Comments are rendered as YAML `#` comments in the output, so they are visible to users who copy-paste the rules. ### Descriptions - Keep descriptions short, factual, and actionable. - Include what is happening ("Disk is almost full") and why it matters or what to check. - Use `{{ $labels.instance }}`, `{{ $value }}`, and other template variables in descriptions when useful. - If the description says "average" but the query uses `histogram_quantile(0.95, ...)`, fix the description to say "p95" (or vice versa). 
- When alerting on rates or ratios that may not be intuitive, include `{{ $value }}` in the description so operators can see the actual number. ### Structure - Some services have multiple exporters (e.g., MongoDB has `percona/mongodb_exporter` and `dcu/mongodb_exporter`). Place rules under the correct exporter. - Search for duplicates before adding a new rule — a similar alert may already exist under a different exporter or with different thresholds. - The `slug` field must be unique per exporter and is used for download URLs. ## Reference Sources for Cross-Checking Alerts Use these sources to critique and validate PromQL queries, compare thresholds, and find inspiration for new rules. Every time you consume an external resource to change a PromQL query, please compare before/after and explain why you think the external source is right. ### Official project mixins (alerts maintained by the project itself) - https://github.com/prometheus/node_exporter/tree/master/docs/node-mixin/alerts - https://github.com/prometheus/prometheus/tree/main/documentation/prometheus-mixin - https://github.com/prometheus/alertmanager/tree/main/doc/alertmanager-mixin - https://github.com/prometheus/snmp_exporter/tree/main/snmp-mixin - https://github.com/prometheus/mysqld_exporter/tree/main/mysqld-mixin - https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin - https://github.com/prometheus-community/elasticsearch_exporter (mixin via Grafana docs) - https://github.com/etcd-io/etcd/tree/main/contrib/mixin - https://github.com/thanos-io/thanos/tree/main/mixin (also: examples/alerts/) - https://github.com/grafana/loki/tree/main/production/loki-mixin (also: promtail-mixin/) - https://github.com/grafana/mimir/tree/main/operations/mimir-mixin - https://github.com/grafana/tempo/tree/main/operations/tempo-mixin - https://github.com/grafana/grafana/tree/main/grafana-mixin - https://github.com/ceph/ceph/tree/main/monitoring/ceph-mixin (in-tree; also 
https://github.com/ceph/ceph-mixins) - https://github.com/jaegertracing/jaeger/tree/main/monitoring/jaeger-mixin - https://github.com/kubernetes-monitoring/kubernetes-mixin (includes runbook.md) - https://github.com/kubernetes/kube-state-metrics/tree/main/jsonnet/kube-state-metrics-mixin - https://github.com/prometheus-operator/prometheus-operator/tree/main/jsonnet/mixin - https://github.com/prometheus-operator/kube-prometheus - https://github.com/cortexproject/cortex-jsonnet - https://github.com/gluster/gluster-mixins ### Standalone mixin repositories - https://github.com/povilasv/coredns-mixin - https://github.com/adinhodovic/rabbitmq-mixin - https://github.com/adinhodovic/blackbox-exporter-mixin - https://github.com/adinhodovic/django-mixin - https://github.com/adinhodovic/argo-cd-mixin - https://github.com/adinhodovic/ingress-nginx-mixin - https://github.com/adinhodovic/kubernetes-autoscaling-mixin - https://github.com/metalmatze/kube-cockroachdb (CockroachDB on Kubernetes) - https://github.com/bitnami-labs/sealed-secrets (sealed-secrets mixin) - https://github.com/lukas-vlcek/elasticsearch-mixin (includes runbook.md) - https://github.com/adinhodovic/postgresql-mixin - https://github.com/imusmanmalik/cert-manager-mixin - https://gitlab.com/uneeq-oss/cert-manager-mixin (alternative cert-manager mixin) - https://github.com/uneeq-oss/spinnaker-mixin - https://github.com/metalmatze/slo-libsonnet (SLO alerting/recording rules generation library) ### Grafana jsonnet-libs (93 mixins — browse for specific services) - https://github.com/grafana/jsonnet-libs - Notable mixins with alerts: consul, memcached, elasticsearch, haproxy, clickhouse, opensearch, redis, mongodb, kafka, nginx, rabbitmq, jvm, vault, envoy, istio, jenkins, caddy, cloudflare, docker, traefik, windows, snmp, argocd, nomad, pgbouncer, minio, ceph, and 60+ more. 
### Mixin aggregators - https://monitoring.mixins.dev/ (central registry of all monitoring mixins) - https://github.com/monitoring-mixins/website/blob/master/mixins.json (machine-readable list of all mixins with source URLs) - https://github.com/nlamirault/monitoring-mixins (hub aggregating many mixins) ### GitLab monitoring & infrastructure - https://gitlab.com/gitlab-com/runbooks (GitLab.com SRE runbooks — production alert rules, runbook docs, alertmanager config) - https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules (production Mimir alerting rules organized by tenant/environment) - https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules-jsonnet (jsonnet sources for GitLab alerting rules) - https://gitlab.com/gitlab-org/omnibus-gitlab/-/tree/master/files/gitlab-cookbooks/monitoring/templates/rules (default Prometheus rules shipped with GitLab Omnibus) ### Community alert collections - https://github.com/jpweber/prometheus-alert-rules - https://github.com/bdossantos/prometheus-alert-rules - https://github.com/giantswarm/prometheus-rules - https://github.com/last9/awesome-prometheus-toolkit - https://github.com/warpnet/awesome-prometheus (meta-list of Prometheus resources) ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing ## Adding an alerting rule If you don't have time to write a PR, just copy and paste some alerts into an issue. We will format it accordingly. Rules are here: `_data/rules.yml`. ### Guidelines Please ensure your pull request adheres to the following guidelines: - Search previous suggestions before making a new one, as yours may be a duplicate. - Keep descriptions short and simple, but descriptive. - Description must be factual (the "what?") and should provide root cause suggestions (the "why?"), for faster resolution. - Queries must be tested on the latest exporter version. 
## Improving the GitHub page ### Run locally ``` gem install bundler bundle install jekyll serve ``` Or with Docker: ``` docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve ``` Or with Docker Compose: ``` docker compose up -d ``` ================================================ FILE: Gemfile ================================================ source 'https://rubygems.org' gem 'github-pages', '>= 232', group: :jekyll_plugins gem 'webrick', '~> 1.8' ================================================ FILE: LICENSE ================================================ Creative Commons Attribution 4.0 International License (CC BY 4.0) http://creativecommons.org/licenses/by/4.0/ ================================================ FILE: README.md ================================================ # 👋 Awesome Prometheus Alerts [![Awesome](https://awesome.re/badge-flat.svg)](https://awesome.re) > Most alerting rules are common to every Prometheus setup. We need a place to find them all. 
🤘 🚨 📊 Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)** ## ✨ Contents - [Rules](#-rules) - [Contributing](#-contributing) - [Improvements](#-improvements) - [Help us](#-show-your-support) - [License](#-license) ## 🚨 Rules #### Basic resource monitoring - [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals) - [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware) - [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart) - [IPMI](https://samber.github.io/awesome-prometheus-alerts/rules#ipmi) - [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers) - [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox) - [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server) - [VMWare](https://samber.github.io/awesome-prometheus-alerts/rules#vmware) - [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve) - [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata) - [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf) - [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter) - [Systemd](https://samber.github.io/awesome-prometheus-alerts/rules#systemd) #### Databases - [MySQL](https://samber.github.io/awesome-prometheus-alerts/rules#mysql) - [PostgreSQL](https://samber.github.io/awesome-prometheus-alerts/rules#postgresql) - [SQL Server](https://samber.github.io/awesome-prometheus-alerts/rules#sql-server) - [Oracle Database](https://samber.github.io/awesome-prometheus-alerts/rules#oracle-database) - [Patroni](https://samber.github.io/awesome-prometheus-alerts/rules#patroni) - [PGBouncer](https://samber.github.io/awesome-prometheus-alerts/rules#pgbouncer) - 
[Redis](https://samber.github.io/awesome-prometheus-alerts/rules#redis) - [Memcached](https://samber.github.io/awesome-prometheus-alerts/rules#memcached) - [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb) - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch) - [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch) - [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra) - [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse) - [CouchDB](https://samber.github.io/awesome-prometheus-alerts/rules#couchdb) - [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr) #### Message brokers - [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq) - [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper) - [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka) - [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar) - [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats) #### Proxies, load balancers and service meshes - [Nginx](https://samber.github.io/awesome-prometheus-alerts/rules#nginx) - [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache) - [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy) - [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik) - [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy) - [Envoy](https://samber.github.io/awesome-prometheus-alerts/rules#envoy) - [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd) - [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio) #### Runtimes - [PHP-FPM](https://samber.github.io/awesome-prometheus-alerts/rules#php-fpm) - [JVM](https://samber.github.io/awesome-prometheus-alerts/rules#jvm) - 
[Golang](https://samber.github.io/awesome-prometheus-alerts/rules#golang) - [Ruby](https://samber.github.io/awesome-prometheus-alerts/rules#ruby) - [Python](https://samber.github.io/awesome-prometheus-alerts/rules#python) - [Sidekiq](https://samber.github.io/awesome-prometheus-alerts/rules#sidekiq) #### Data engineering - [Apache Flink](https://samber.github.io/awesome-prometheus-alerts/rules#apache-flink) - [Apache Spark](https://samber.github.io/awesome-prometheus-alerts/rules#apache-spark) - [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop) #### Orchestrators - [Kubernetes](https://samber.github.io/awesome-prometheus-alerts/rules#kubernetes) - [Nomad](https://samber.github.io/awesome-prometheus-alerts/rules#nomad) - [Consul](https://samber.github.io/awesome-prometheus-alerts/rules#consul) - [Etcd](https://samber.github.io/awesome-prometheus-alerts/rules#etcd) - [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack) #### CI/CD - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins) - [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd) - [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd) - [GitLab CI](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab-ci) - [Spinnaker](https://samber.github.io/awesome-prometheus-alerts/rules#spinnaker) #### Network and security - [SpeedTest](https://samber.github.io/awesome-prometheus-alerts/rules#speedtest) - [SSL/TLS](https://samber.github.io/awesome-prometheus-alerts/rules#ssl/tls) - [cert-manager](https://samber.github.io/awesome-prometheus-alerts/rules#cert-manager) - [Juniper](https://samber.github.io/awesome-prometheus-alerts/rules#juniper) - [CoreDNS](https://samber.github.io/awesome-prometheus-alerts/rules#coredns) - [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch) - [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault) - 
[Keycloak](https://samber.github.io/awesome-prometheus-alerts/rules#keycloak) - [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare) - [SNMP](https://samber.github.io/awesome-prometheus-alerts/rules#snmp) - [Cilium](https://samber.github.io/awesome-prometheus-alerts/rules#cilium) - [WireGuard](https://samber.github.io/awesome-prometheus-alerts/rules#wireguard) #### Storage - [Ceph](https://samber.github.io/awesome-prometheus-alerts/rules#ceph) - [ZFS](https://samber.github.io/awesome-prometheus-alerts/rules#zfs) - [OpenEBS](https://samber.github.io/awesome-prometheus-alerts/rules#openebs) - [Minio](https://samber.github.io/awesome-prometheus-alerts/rules#minio) #### Cloud providers - [AWS CloudWatch](https://samber.github.io/awesome-prometheus-alerts/rules#aws-cloudwatch) - [Google Cloud Stackdriver](https://samber.github.io/awesome-prometheus-alerts/rules#google-cloud-stackdriver) - [DigitalOcean](https://samber.github.io/awesome-prometheus-alerts/rules#digitalocean) - [Azure](https://samber.github.io/awesome-prometheus-alerts/rules#azure) #### Observability - [Thanos](https://samber.github.io/awesome-prometheus-alerts/rules#thanos) - [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki) - [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail) - [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex) - [Grafana Tempo](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-tempo) - [Grafana Mimir](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-mimir) - [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy) - [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector) - [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger) #### Other - [APC UPS](https://samber.github.io/awesome-prometheus-alerts/rules#apc-ups) - [Graph 
Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node) ## 🤝 Contributing Contributions from community (you!) are most welcome! There are many ways to contribute: writing code, alerting rules, documentation, reporting issues, discussing better error tracking... [Instructions here](CONTRIBUTING.md) ## 🏋️ Improvements - Create an alert rule builder in Jekyll for custom alerts (severity, thresholds, instances...) - Add resolution suggestions to rule descriptions, for faster incident resolution ([#85](https://github.com/samber/awesome-prometheus-alerts/issues/85)). ## 💫 Show your support Give a ⭐️ if this project helped you! [![support us](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/samber) ## 📝 License [![CC4](https://mirrors.creativecommons.org/presskit/cc.srr.primary.svg)](https://creativecommons.org/licenses/by/4.0/legalcode) Licensed under the Creative Commons 4.0 License, see LICENSE file for more detail. ================================================ FILE: _config.yml ================================================ theme: jekyll-theme-cayman title: Awesome Prometheus alerts description: Collection of alerting rules repository: samber/awesome-prometheus-alerts baseurl: /awesome-prometheus-alerts ================================================ FILE: _data/rules.yml ================================================ # # The following yaml cannot be copy-pasted to Prometheus configuration. # Please navigate to https://samber.github.io/awesome-prometheus-alerts/rules instead. 
# # Contributing guidelines: # https://github.com/samber/awesome-prometheus-alerts/blob/master/CONTRIBUTING.md # groups: - name: Basic resource monitoring services: - name: Prometheus self-monitoring exporters: - slug: embedded-exporter rules: - name: Prometheus job missing description: A Prometheus job has disappeared query: 'absent(up{job="prometheus"})' severity: warning - name: Prometheus target missing description: A Prometheus target has disappeared. An exporter might have crashed. query: "up == 0 unless on(job) (sum by (job) (up) == 0)" severity: critical for: 1m comments: | Only fire if at least one target in the job is still up. If all targets are down, PrometheusJobMissing or PrometheusAllTargetsMissing will fire instead. - name: Prometheus all targets missing description: A Prometheus job no longer has any live targets. query: "sum by (job) (up) == 0" severity: critical for: 1m - name: Prometheus target missing with warmup time description: "Allow a job time to start up (10 minutes) before alerting that it's down." query: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))" severity: critical for: 1m - name: Prometheus configuration reload failure description: Prometheus configuration reload error query: "prometheus_config_last_reload_successful != 1" severity: warning - name: Prometheus too many restarts description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2' severity: warning - name: Prometheus AlertManager job missing description: A Prometheus AlertManager job has disappeared query: 'absent(up{job="alertmanager"})' severity: warning - name: Prometheus AlertManager configuration reload failure description: AlertManager configuration reload error query: "alertmanager_config_last_reload_successful != 1" severity: warning - name: Prometheus AlertManager config not synced description: Configurations of AlertManager cluster instances are out of sync query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1' severity: warning - name: Prometheus AlertManager E2E dead man switch description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager." query: "vector(1)" severity: critical - name: Prometheus not connected to alertmanager description: Prometheus cannot connect to the Alertmanager query: "prometheus_notifications_alertmanagers_discovered < 1" severity: critical - name: Prometheus rule evaluation failures description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts." query: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0" severity: critical - name: Prometheus template text expansion failures description: "Prometheus encountered {{ $value }} template text expansion failures" query: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0" severity: critical - name: Prometheus rule evaluation slow description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates slower storage backend access or an overly complex query."
query: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds" severity: warning for: 5m - name: Prometheus notifications backlog description: The Prometheus notification queue has not been empty for 10 minutes query: "min_over_time(prometheus_notifications_queue_length[10m]) > 0" severity: warning - name: Prometheus AlertManager notification failing description: "Alertmanager is failing sending notifications ({{ $value }} notifications/s)" query: "rate(alertmanager_notifications_failed_total[1m]) > 0" severity: critical - name: Prometheus target empty description: Prometheus has no target in service discovery query: "prometheus_sd_discovered_targets == 0" severity: critical - name: Prometheus target scraping slow description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned. query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05' severity: warning for: 5m - name: Prometheus large scrape description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)" query: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10" severity: warning for: 5m - name: Prometheus target scrape duplicate description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)" query: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3" severity: warning - name: Prometheus TSDB checkpoint creation failures description: "Prometheus encountered {{ $value }} checkpoint creation failures" query: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0" severity: critical - name: Prometheus TSDB checkpoint deletion failures description: "Prometheus encountered {{ $value }} checkpoint deletion failures" query: 
"increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0" severity: critical - name: Prometheus TSDB compactions failed description: "Prometheus encountered {{ $value }} TSDB compactions failures" query: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0" severity: critical - name: Prometheus TSDB head truncations failed description: "Prometheus encountered {{ $value }} TSDB head truncation failures" query: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0" severity: critical - name: Prometheus TSDB reload failures description: "Prometheus encountered {{ $value }} TSDB reload failures" query: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0" severity: critical - name: Prometheus TSDB WAL corruptions description: "Prometheus encountered {{ $value }} TSDB WAL corruptions" query: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0" severity: critical - name: Prometheus TSDB WAL truncations failed description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures" query: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0" severity: critical - name: Prometheus timeseries cardinality description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}' query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000' severity: warning - name: Host and hardware exporters: - name: node-exporter slug: node-exporter doc_url: https://github.com/prometheus/node_exporter rules: - name: Host out of memory description: Node memory is filling up (< 10% left) query: "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)" severity: warning for: 2m - name: Host memory under memory pressure description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s)." 
query: "(rate(node_vmstat_pgmajfault[5m]) > 1000)" severity: warning - name: Host Memory is underutilized description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})" query: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8" severity: info comments: | You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - name: Host unusual network throughput in description: Host receive bandwidth is high (>80%). query: "((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0" severity: warning - name: Host unusual network throughput out description: Host transmit bandwidth is high (>80%) query: "((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0" severity: warning - name: Host disk IO utilization high description: Disk utilization is high (> 80%) query: "(rate(node_disk_io_time_seconds_total[5m]) > .80)" severity: warning - name: Host out of disk space description: Disk is almost full (< 10% left) query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' severity: critical comments: | Please add ignored mountpoints in node_exporter parameters like "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. for: 2m - name: Host disk may fill in 24 hours description: Filesystem will likely run out of space within the next 24 hours. 
query: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0' severity: warning comments: | Please add ignored mountpoints in node_exporter parameters like "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. for: 2m - name: Host out of inodes description: Disk is almost running out of available inodes (< 10% left) query: "(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0" severity: critical for: 2m - name: Host filesystem device error description: "Error stat-ing the {{ $labels.mountpoint }} filesystem" query: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1' severity: critical for: 2m - name: Host inodes may fill in 24 hours description: Filesystem will likely run out of inodes within the next 24 hours at current write rate query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0' severity: warning for: 2m - name: Host unusual disk read latency description: Disk latency is growing (read operations > 100ms) query: "(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)" severity: warning for: 2m - name: Host unusual disk write latency description: Disk latency is growing (write operations > 100ms) query: "(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)" severity: warning for: 2m - name: Host high CPU load description: CPU load is > 80% query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' severity: warning for: 10m - name: Host CPU is underutilized 
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs." query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' severity: info for: 1w comments: | You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - name: Host CPU steal noisy neighbor description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit. query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' severity: warning - name: Host CPU high iowait description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond. query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' severity: warning - name: Host unusual disk IO description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities." query: "rate(node_disk_io_time_seconds_total[5m]) > 0.8" severity: warning for: 5m - name: Host context switching high description: Context switching is growing on the node (twice the daily average during the last 15m) query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' severity: warning comments: | x2 context switches is an arbitrary number. The alert threshold depends on the nature of the application.
Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 - name: Host swap is filling up description: Swap is filling up (>80%) query: "((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0" severity: warning for: 2m - name: Host systemd service crashed description: "systemd service {{ $labels.name }} crashed" query: '(node_systemd_unit_state{state="failed"} == 1)' severity: warning - name: Host physical component too hot description: "Physical hardware component too hot" query: "node_hwmon_temp_celsius > node_hwmon_temp_max_celsius" severity: warning for: 5m - name: Host node overtemperature alarm description: "Physical node temperature alarm triggered" query: "((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))" severity: critical - name: Host software RAID insufficient drives description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining." query: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)' comments: | Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin. severity: critical - name: Host software RAID disk failure description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention." query: '(node_md_disks{state="failed"} > 0)' severity: warning for: 2m - name: Host kernel version deviations description: Kernel version for {{ $labels.instance }} has changed. query: "changes(node_uname_info[1h]) > 0" severity: info - name: Host OOM kill detected description: OOM kill detected query: "(increase(node_vmstat_oom_kill[30m]) > 0)" severity: warning comments: | When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger. 
- name: Host EDAC Correctable Errors detected description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.' query: "(increase(node_edac_correctable_errors_total[1m]) > 0)" severity: info - name: Host EDAC Uncorrectable Errors detected description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.' query: "(node_edac_uncorrectable_errors_total > 0)" severity: warning - name: Host Network Receive Errors description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' query: "(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0" severity: warning for: 2m - name: Host Network Transmit Errors description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' query: "(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0" severity: warning for: 2m - name: Host Network Bond Degraded description: 'Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".' query: "((node_bonding_active - node_bonding_slaves) != 0)" severity: warning for: 2m - name: Host conntrack limit description: "The number of conntrack entries is approaching the limit" query: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0" severity: warning for: 5m - name: Host clock skew description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host."
query: "((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))" severity: warning for: 10m - name: Host clock not synchronising description: "Clock not synchronising. Ensure NTP is configured on this host." query: "(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)" severity: warning for: 2m - name: S.M.A.R.T Device Monitoring exporters: - name: smartctl-exporter slug: smartctl-exporter doc_url: https://github.com/prometheus-community/smartctl_exporter rules: - name: SMART device temperature warning description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C query: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60' severity: warning - name: SMART device temperature critical description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70' severity: critical - name: SMART device temperature over trip value description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }} query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}' severity: critical - name: SMART device temperature nearing trip value description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }} query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)' severity: warning -
name: SMART status description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_smart_status != 1" severity: critical - name: SMART critical warning description: Disk controller has a critical warning on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_critical_warning > 0" severity: critical - name: SMART media errors description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_media_errors > 0" severity: critical - name: SMART Wearout Indicator description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_available_spare < smartctl_device_available_spare_threshold" severity: critical - name: IPMI exporters: - name: prometheus-community/ipmi_exporter slug: ipmi-exporter doc_url: https://github.com/prometheus-community/ipmi_exporter rules: - name: IPMI collector down description: "IPMI collector {{ $labels.collector }} on {{ $labels.instance }} failed to scrape sensor data. Check FreeIPMI tools and BMC connectivity." query: 'ipmi_up == 0' severity: warning for: 5m comments: | The ipmi_up metric is per-collector. A value of 0 means the collector could not retrieve data from the BMC. - name: IPMI temperature sensor warning description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_temperature_state == 1' severity: warning for: 5m comments: | State values: 0=nominal, 1=warning, 2=critical. Thresholds are defined in the BMC firmware. - name: IPMI temperature sensor critical description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Immediate attention required to prevent hardware damage."
query: 'ipmi_temperature_state == 2' severity: critical - name: IPMI fan speed sensor warning description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_fan_speed_state == 1' severity: warning for: 5m - name: IPMI fan speed sensor critical description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. A fan may have failed." query: 'ipmi_fan_speed_state == 2' severity: critical - name: IPMI fan speed zero description: "IPMI fan {{ $labels.name }} on {{ $labels.instance }} reports 0 RPM. The fan may have failed." query: 'ipmi_fan_speed_rpm == 0' severity: critical for: 5m - name: IPMI voltage sensor warning description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_voltage_state == 1' severity: warning for: 5m - name: IPMI voltage sensor critical description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Power supply or motherboard issue possible." query: 'ipmi_voltage_state == 2' severity: critical - name: IPMI current sensor warning description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_current_state == 1' severity: warning for: 5m - name: IPMI current sensor critical description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state." query: 'ipmi_current_state == 2' severity: critical - name: IPMI power sensor warning description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_power_state == 1' severity: warning for: 5m - name: IPMI power sensor critical description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state." 
query: 'ipmi_power_state == 2' severity: critical - name: IPMI generic sensor critical description: "IPMI sensor {{ $labels.name }} (type={{ $labels.type }}) on {{ $labels.instance }} is in critical state." query: 'ipmi_sensor_state == 2' severity: critical for: 5m comments: | Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts. - name: IPMI chassis power off description: "IPMI reports chassis power is off on {{ $labels.instance }}. The server may have shut down unexpectedly." query: 'ipmi_chassis_power_state == 0' severity: critical - name: IPMI chassis drive fault description: "IPMI reports a drive fault on {{ $labels.instance }}. Check disk health." query: 'ipmi_chassis_drive_fault_state == 0' severity: critical comments: | The metric uses inverted logic: 1=no fault, 0=fault detected. - name: IPMI chassis cooling fault description: "IPMI reports a cooling/fan fault on {{ $labels.instance }}. Check fans and airflow." query: 'ipmi_chassis_cooling_fault_state == 0' severity: critical comments: | The metric uses inverted logic: 1=no fault, 0=fault detected. - name: IPMI SEL almost full description: "IPMI System Event Log on {{ $labels.instance }} has only {{ printf \"%.0f\" $value }} bytes free. Clear the SEL to prevent loss of new events." query: 'ipmi_sel_free_space_bytes < 512' severity: warning for: 5m comments: | SEL storage is typically very limited (e.g., 16KB). When full, new events may be dropped. - name: Docker containers exporters: - name: google/cAdvisor slug: google-cadvisor doc_url: https://github.com/google/cadvisor rules: - name: Container killed description: A container has disappeared query: "time() - container_last_seen > 60" severity: warning comments: | This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. 
- name: Container absent description: A container is absent for 5 min query: "absent(container_last_seen)" severity: warning for: 5m comments: | This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. - name: Container High CPU utilization description: 'Container CPU utilization is above 80% (current: {{ $value | printf "%.2f" }}%)' query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0' comments: | Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard. severity: warning for: 2m - name: Container High Memory usage description: Container Memory usage is above 80% query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80' severity: warning comments: See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d for: 2m - name: Container Volume usage description: Container Volume usage is above 80% query: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total) BY (instance) > 0' severity: warning for: 2m - name: Container high throttle rate description: "Container is being throttled ({{ $value | humanizePercentage }})" query: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0' severity: warning for: 5m - name: Container high low 
change CPU usage description: This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%. query: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25' severity: info - name: Container Low CPU utilization description: 'Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf "%.2f" }}%)' query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20' severity: info for: 7d - name: Container Low Memory usage description: Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory. 
query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20' severity: info for: 7d - name: Blackbox exporters: - name: prometheus/blackbox_exporter slug: blackbox-exporter doc_url: https://github.com/prometheus/blackbox_exporter rules: - name: Blackbox probe failed description: Probe failed query: probe_success == 0 severity: critical - name: Blackbox configuration reload failure description: Blackbox configuration reload failure query: "blackbox_exporter_config_last_reload_successful != 1" severity: warning - name: Blackbox slow probe description: Blackbox probe took more than 1s to complete query: "probe_duration_seconds > 1" severity: warning for: 1m - name: Blackbox probe HTTP failure description: HTTP status code is not 200-399 query: "probe_http_status_code <= 199 OR probe_http_status_code >= 400" severity: critical - name: Blackbox SSL certificate will expire soon description: SSL certificate expires in less than 20 days query: "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20" severity: warning - name: Blackbox SSL certificate will expire very soon description: SSL certificate expires in less than 3 days query: "0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3" severity: critical - name: Blackbox SSL certificate expired description: SSL certificate has expired already query: "round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0" severity: critical comments: | For probe_ssl_earliest_cert_expiry to be exposed after expiration, you need to enable insecure_skip_verify. Note that this will disable certificate validation. 
See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config - name: Blackbox probe slow HTTP description: HTTP request took more than 1s query: "probe_http_duration_seconds > 1" severity: warning for: 1m - name: Blackbox probe slow ping description: Blackbox ping took more than 1s query: "probe_icmp_duration_seconds > 1" severity: warning for: 1m - name: Windows Server exporters: - name: prometheus-community/windows_exporter slug: windows-exporter doc_url: https://github.com/prometheus-community/windows_exporter rules: - name: Windows Server collector Error description: "Collector {{ $labels.collector }} was not successful" query: "windows_exporter_collector_success == 0" severity: critical - name: Windows Server service Status description: Windows Service state is not OK query: 'windows_service_status{status="ok"} != 1' severity: critical for: 1m - name: Windows Server CPU Usage description: CPU Usage is more than 80% query: '100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80' severity: warning - name: Windows Server memory Usage description: Memory usage is more than 90% query: "100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90" severity: warning for: 2m - name: Windows Server disk Space Usage description: Disk usage is more than 80% query: "100 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80 and windows_logical_disk_size_bytes > 0" severity: critical for: 2m - name: VMware exporters: - name: pryorda/vmware_exporter slug: pryorda-vmware-exporter doc_url: https://github.com/pryorda/vmware_exporter rules: - name: Virtual Machine Memory Warning description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' query: "vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90" severity: warning for: 5m - name: Virtual Machine Memory Critical description: 'High memory usage on {{ 
$labels.instance }}: {{ $value | printf "%.2f"}}%' query: "vmware_vm_mem_usage_average / 100 >= 90" severity: critical for: 1m - name: High Number of Snapshots description: "High snapshots number on {{ $labels.instance }}: {{ $value }}" query: "vmware_vm_snapshots > 3" severity: warning for: 30m - name: Outdated Snapshots description: 'Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days' query: "(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3" severity: warning for: 5m - name: Proxmox VE exporters: - name: prometheus-pve/prometheus-pve-exporter slug: prometheus-pve-exporter doc_url: https://github.com/prometheus-pve/prometheus-pve-exporter rules: - name: PVE node down description: 'Proxmox VE node {{ $labels.id }} is down.' query: 'pve_up{id=~"node/.*"} == 0' severity: critical for: 2m - name: PVE VM/CT down description: 'Proxmox VE guest {{ $labels.id }} is not running.' query: 'pve_up{id=~"(qemu|lxc)/.*"} == 0' severity: warning for: 5m comments: | This alert triggers for all VMs and containers that are not running. You may want to filter by specific guests using the `id` label, or exclude intentionally stopped guests with additional label matchers. - name: PVE high CPU usage description: 'Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%' query: 'pve_cpu_usage_ratio * 100 > 90' severity: warning for: 5m - name: PVE high memory usage description: 'Proxmox VE memory usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%' query: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0' severity: warning for: 5m - name: PVE storage filling up description: 'Proxmox VE storage {{ $labels.id }} is above 80% used. 
Current value: {{ $value | printf "%.2f" }}%' query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 80 and pve_disk_size_bytes{id=~"storage/.*"} > 0' severity: warning for: 5m - name: PVE storage almost full description: 'Proxmox VE storage {{ $labels.id }} is above 95% used. Current value: {{ $value | printf "%.2f" }}%' query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 95 and pve_disk_size_bytes{id=~"storage/.*"} > 0' severity: critical for: 2m - name: PVE guest not backed up description: '{{ $value }} Proxmox VE guest(s) are not covered by any backup job.' query: 'pve_not_backed_up_total > 0' severity: warning - name: PVE replication failed description: 'Proxmox VE replication for {{ $labels.id }} has {{ $value }} failed sync(s).' query: 'pve_replication_failed_syncs > 0' severity: warning - name: PVE cluster not quorate description: 'Proxmox VE cluster has lost quorum.' query: 'pve_cluster_info{quorate="0"} == 1' severity: critical comments: | Loss of quorum means the cluster cannot make decisions about VM placement and fencing. This requires immediate attention. - name: Netdata exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://github.com/netdata/netdata/blob/master/backends/prometheus/README.md rules: - name: Netdata high cpu usage description: Netdata high CPU usage (> 80%) query: 'netdata_cpu_cpu_percentage_average{dimension="idle"} < 20' severity: warning for: 5m comments: | This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%. - name: Netdata CPU steal noisy neighbor description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit. 
query: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10' severity: warning for: 5m - name: Netdata high memory usage description: Netdata high memory usage (> 80%) query: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20 and netdata_system_ram_MiB_average > 0' severity: warning for: 5m - name: Netdata low disk space description: Netdata low disk space (> 80% used) query: '100 / netdata_disk_space_GB_average * netdata_disk_space_GB_average{dimension=~"avail|cached"} < 20 and netdata_disk_space_GB_average > 0' severity: warning for: 5m - name: Netdata predicted disk full description: Netdata predicted disk full in 24 hours query: 'predict_linear(netdata_disk_space_GB_average{dimension=~"avail|cached"}[3h], 24 * 3600) < 0' severity: warning - name: Netdata MD mismatch cnt unsynchronized blocks description: RAID array has unsynchronized blocks query: "netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024" severity: warning for: 2m - name: Netdata disk reallocated sectors description: "Disk reallocated sectors detected ({{ $value }} sectors)" query: "increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0" severity: info - name: Netdata disk current pending sector description: Disk current pending sector query: "netdata_smartd_log_current_pending_sector_count_sectors_average > 0" severity: warning - name: Netdata reported uncorrectable disk sectors description: "Reported uncorrectable disk sectors ({{ $value }} sectors)" query: "increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0" severity: warning - name: eBPF exporters: - name: cloudflare/ebpf_exporter slug: ebpf-exporter doc_url: https://github.com/cloudflare/ebpf_exporter rules: - name: eBPF exporter program not attached description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. 
(instance {{ $labels.instance }})" query: 'ebpf_exporter_ebpf_program_attached == 0' severity: warning for: 5m comments: | The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running. - name: eBPF exporter decoder errors description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})" query: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0' severity: warning for: 5m - name: eBPF exporter no enabled configs description: "eBPF exporter has no enabled configurations. No eBPF programs are being run. (instance {{ $labels.instance }})" query: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)' severity: warning for: 5m - name: Process Exporter exporters: - name: ncabatoff/process-exporter slug: process-exporter doc_url: https://github.com/ncabatoff/process-exporter rules: - name: Process exporter group down description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})" query: 'namedprocess_namegroup_num_procs == 0' severity: warning for: 5m - name: Process exporter high memory usage description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})" query: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09' severity: warning for: 5m comments: | Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group. - name: Process exporter high CPU usage description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})" query: 'rate(namedprocess_namegroup_cpu_seconds_total[5m]) * 100 > 80' severity: warning for: 5m comments: | Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. 
Threshold of 80% is per-core. Adjust based on expected workload. - name: Process exporter high file descriptor usage description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})" query: 'namedprocess_namegroup_worst_fd_ratio > 0.8' severity: warning for: 5m - name: Process exporter file descriptors exhausted description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})" query: 'namedprocess_namegroup_worst_fd_ratio > 0.95' severity: critical for: 2m - name: Process exporter high swap usage description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})" query: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06' severity: warning for: 5m comments: | Threshold of 512MB is arbitrary. Adjust per group and environment. - name: Process exporter zombie processes description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})" query: 'namedprocess_namegroup_states{state="Zombie"} > 5' severity: warning for: 5m - name: Process exporter high context switching description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})" query: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000' severity: warning for: 5m comments: | Filters to voluntary switches only — involuntary switches are normal under CPU contention. Threshold of 50000/s is a rough default. Adjust based on workload. - name: Process exporter high disk write IO description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. 
(instance {{ $labels.instance }})" query: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06' severity: warning for: 5m comments: | Threshold of 100MB/s is arbitrary. Adjust per group. - name: Process exporter process restarting description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})" query: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0' severity: info comments: | Detects restarts by watching for changes in the oldest process start time within the group. - name: Systemd exporters: - name: prometheus-community/systemd_exporter slug: systemd-exporter doc_url: https://github.com/prometheus-community/systemd_exporter rules: - name: Systemd unit failed description: "Systemd unit {{ $labels.name }} has entered failed state. (instance {{ $labels.instance }})" query: 'systemd_unit_state{state="failed"} == 1' severity: warning for: 5m - name: Systemd unit inactive description: "Systemd unit {{ $labels.name }} is inactive. (instance {{ $labels.instance }})" query: 'systemd_unit_state{state="inactive", type="service", name=~"your-critical-service.+"} == 1' severity: warning for: 5m comments: | Many units are legitimately inactive. You must adjust the name=~ filter to match your critical services. - name: Systemd service crash looping description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})" query: 'increase(systemd_service_restart_total[1h]) > 5' severity: critical for: 5m - name: Systemd unit tasks near limit description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. 
(instance {{ $labels.instance }})" query: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0' severity: warning for: 5m - name: Systemd socket refused connections description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})" query: 'increase(systemd_socket_refused_connections_total[5m]) > 0' severity: warning for: 2m - name: Systemd socket high connections description: "Systemd socket {{ $labels.name }} has {{ $value }} active connections. (instance {{ $labels.instance }})" query: 'systemd_socket_current_connections > 100' severity: warning for: 2m comments: | Threshold of 100 connections is arbitrary. Adjust to your workload. - name: Systemd timer missed trigger description: "Systemd timer {{ $labels.name }} has not triggered for over 24 hours. (instance {{ $labels.instance }})" query: '(time() - systemd_timer_last_trigger_seconds) / 3600 > 24 and systemd_timer_last_trigger_seconds > 0' severity: warning for: 5m comments: | Triggers if timer hasn't fired in 24 hours. Adjust threshold per timer schedule. - name: Databases services: - name: MySQL exporters: - name: prometheus/mysqld_exporter slug: mysqld-exporter doc_url: https://github.com/prometheus/mysqld_exporter rules: - name: MySQL down description: MySQL instance is down on {{ $labels.instance }} query: "mysql_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. 
- name: MySQL too many connections (> 80%) description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}" query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0" severity: warning for: 2m - name: MySQL high prepared statements utilization (> 80%) description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}" query: "max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0" severity: warning for: 2m - name: MySQL high threads running description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}" query: "max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0" severity: warning for: 2m - name: MySQL Slave IO thread not running description: "MySQL Slave IO thread not running on {{ $labels.instance }}" query: "( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: MySQL Slave SQL thread not running description: "MySQL Slave SQL thread not running on {{ $labels.instance }}" query: "( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. 
- name: MySQL Slave replication lag description: "MySQL replication lag on {{ $labels.instance }}" query: "( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30" severity: critical for: 1m - name: MySQL slow queries description: "MySQL server has new slow queries ({{ $value }} in the last minute)." query: increase(mysql_global_status_slow_queries[1m]) > 0 severity: warning for: 2m - name: MySQL InnoDB log waits description: "MySQL innodb log writes stalling ({{ $value }} waits/s)" query: rate(mysql_global_status_innodb_log_waits[15m]) > 10 severity: warning - name: MySQL restarted description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}. query: "mysql_global_status_uptime < 60" severity: info - name: MySQL High QPS description: MySQL is being overloaded with unusual QPS (> 10k QPS). query: "irate(mysql_global_status_questions[1m]) > 10000" severity: info for: 2m - name: MySQL too many open files description: MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}. 
query: "mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0" severity: warning for: 2m - name: MySQL InnoDB Force Recovery is enabled description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}" query: "mysql_global_variables_innodb_force_recovery != 0" severity: warning for: 2m - name: MySQL InnoDB history_len too long description: "MySQL history_len (undo log) too long on {{ $labels.instance }}" query: "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000" severity: warning for: 2m - name: PostgreSQL exporters: - name: prometheus-community/postgres_exporter slug: postgres-exporter doc_url: https://github.com/prometheus-community/postgres_exporter rules: - name: Postgresql down description: Postgresql instance is down query: "pg_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: Postgresql restarted description: Postgresql restarted query: "time() - pg_postmaster_start_time_seconds < 60" severity: critical - name: Postgresql exporter error description: Postgresql exporter is showing errors. 
A query may be buggy in query.yaml query: "pg_exporter_last_scrape_error > 0" severity: critical - name: Postgresql table not auto vacuumed description: Table {{ $labels.relname }} has not been auto vacuumed for 10 days query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10" severity: warning - name: Postgresql table not auto analyzed description: Table {{ $labels.relname }} has not been auto analyzed for 10 days query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10" severity: warning - name: Postgresql too many connections description: PostgreSQL instance has too many connections (> 80%). query: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)" severity: warning for: 2m - name: Postgresql not enough connections description: PostgreSQL instance should have more connections (> 5) query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' severity: critical for: 2m - name: Postgresql dead locks description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)" query: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' severity: warning - name: Postgresql high rollback rate description: Ratio of transactions being aborted compared to committed is > 2 % query: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02' severity: warning - name: Postgresql commit 
rate low description: Postgresql seems to be processing very few transactions query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5' severity: critical for: 2m - name: Postgresql low XID consumption description: Postgresql seems to be consuming transaction IDs very slowly query: "rate(pg_txid_current[1m]) < 5" severity: warning for: 2m - name: Postgresql unused replication slot description: Unused Replication Slots query: "(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)" severity: warning for: 1m - name: Postgresql too many dead tuples description: PostgreSQL dead tuples is too large query: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 and (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0" severity: warning for: 2m - name: Postgresql configuration changed description: Postgres Database configuration change has occurred query: '{__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} != ON(__name__, instance) {__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} OFFSET 5m' severity: info - name: Postgresql SSL compression active description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`. query: "sum by (instance) (pg_stat_ssl_compression) > 0" severity: warning - name: Postgresql too many locks acquired description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction. 
query: "((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0" severity: critical for: 2m - name: Postgresql bloat index high (> 80%) description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`" query: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)" severity: warning for: 1h comments: | See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - name: Postgresql bloat table high (> 80%) description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`" query: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)" severity: warning for: 1h comments: | See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - name: Postgresql invalid index description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`" query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' severity: warning for: 6h comments: | See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - name: Postgresql replication lag description: The PostgreSQL replication lag is high (> 5s) query: "pg_replication_lag_seconds > 5" severity: warning for: 30s - name: SQL Server exporters: - name: Ozarklake/prometheus-mssql-exporter slug: ozarklake-mssql-exporter doc_url: https://github.com/Ozarklake/prometheus-mssql-exporter rules: - name: SQL Server down description: SQL server instance is down query: mssql_up == 0 severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. 
- name: SQL Server deadlock description: SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s) query: mssql_deadlocks > 5 severity: warning for: 1m - name: Oracle Database exporters: - name: iamseth/oracledb_exporter slug: iamseth-oracledb-exporter doc_url: https://github.com/iamseth/oracledb_exporter rules: - name: Oracle DB down description: Oracle Database instance is down on {{ $labels.instance }} query: "oracledb_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: Oracle DB sessions reaching limit (> 85%) description: "Oracle Database session utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)" query: "oracledb_resource_current_utilization{resource_name=\"sessions\"} / oracledb_resource_limit_value{resource_name=\"sessions\"} * 100 > 85 and oracledb_resource_limit_value{resource_name=\"sessions\"} > 0" severity: warning for: 5m comments: | Threshold is workload-dependent. Adjust 85% to suit your environment. - name: Oracle DB processes reaching limit (> 85%) description: "Oracle Database process utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)" query: "oracledb_resource_current_utilization{resource_name=\"processes\"} / oracledb_resource_limit_value{resource_name=\"processes\"} * 100 > 85 and oracledb_resource_limit_value{resource_name=\"processes\"} > 0" severity: warning for: 5m comments: | Threshold is workload-dependent. Adjust 85% to suit your environment. 
- name: Oracle DB tablespace reaching capacity (> 85%) description: "Oracle Database tablespace {{ $labels.tablespace }} is above 85% usage on {{ $labels.instance }} (current value: {{ $value }}%)" query: "oracledb_tablespace_used_percent > 85" severity: warning for: 5m - name: Oracle DB tablespace full (> 95%) description: "Oracle Database tablespace {{ $labels.tablespace }} is critically full on {{ $labels.instance }} (current value: {{ $value }}%)" query: "oracledb_tablespace_used_percent > 95" severity: critical for: 5m - name: Oracle DB high user rollbacks description: "Oracle Database on {{ $labels.instance }} has a high rollback rate ({{ $value }}% of transactions are rolled back)" query: "rate(oracledb_activity_user_rollbacks[5m]) / (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) * 100 > 20 and (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) > 0" severity: warning for: 5m comments: | A high rollback rate (>20%) often indicates application-level issues such as deadlocks, constraint violations, or poorly designed transactions. - name: Oracle DB too many active sessions description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})" query: "oracledb_sessions_value{status=\"ACTIVE\", type=\"USER\"} > 200" severity: warning for: 5m comments: | Threshold is highly workload-dependent. Adjust 200 to suit your environment. - name: Oracle DB high wait time (user I/O) description: "Oracle Database on {{ $labels.instance }} is experiencing high user I/O wait time" query: "oracledb_wait_time_user_io > 300" severity: warning for: 5m comments: | The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time. 
- name: Patroni exporters: - name: Embedded exporter (Patroni >= 2.1.0) slug: embedded-exporter-patroni doc_url: https://patroni.readthedocs.io/en/latest/rest_api.html?highlight=prometheus#monitoring-endpoint rules: - name: Patroni has no Leader description: A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }} query: (max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1) severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: PGBouncer exporters: - name: spreaker/prometheus-pgbouncer-exporter slug: spreaker-pgbouncer-exporter doc_url: https://github.com/spreaker/prometheus-pgbouncer-exporter rules: - name: PGBouncer active connections description: PGBouncer pools are filling up query: "pgbouncer_pools_server_active_connections > 200" severity: warning for: 2m - name: PGBouncer errors description: PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console. query: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10' severity: warning - name: PGBouncer max connections description: The number of PGBouncer client connections has reached max_client_conn. query: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0' severity: critical - name: Redis exporters: - name: oliver006/redis_exporter slug: oliver006-redis-exporter doc_url: https://github.com/oliver006/redis_exporter rules: - name: Redis down description: Redis instance is down query: "redis_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: Redis missing master description: Redis cluster has no node marked as master. query: '(count(redis_instance_info{role="master"}) or vector(0)) < 1' severity: critical - name: Redis too many masters description: Redis cluster has too many nodes marked as master. 
query: 'count(redis_instance_info{role="master"}) > 1' severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: Redis disconnected slaves description: Redis not replicating for all slaves. Consider reviewing the redis replication status. query: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0" severity: critical - name: Redis replication broken description: Redis instance lost a slave query: "delta(redis_connected_slaves[1m]) < 0" severity: critical - name: Redis cluster flapping description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping). query: "changes(redis_connected_slaves[1m]) > 1" severity: critical for: 2m - name: Redis missing backup description: Redis has not been backed up for 48 hours query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48" severity: critical - name: Redis out of system memory description: Redis is running out of system memory (> 90%) query: "redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 and redis_total_system_memory_bytes > 0" severity: warning for: 2m comments: | The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. 
- name: Redis out of configured maxmemory description: Redis is running out of configured maxmemory (> 90%) query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0" severity: warning for: 2m - name: Redis too many connections description: Redis is running out of connections (> 90% used) query: "redis_connected_clients / redis_config_maxclients * 100 > 90 and redis_config_maxclients > 0" severity: warning for: 2m - name: Redis not enough connections description: Redis instance should have more connections (> 5) query: "redis_connected_clients < 5" severity: warning for: 2m - name: Redis rejected connections description: Some connections to Redis have been rejected query: "increase(redis_rejected_connections_total[1m]) > 5" severity: warning - name: Memcached exporters: - name: prometheus/memcached_exporter slug: memcached-exporter doc_url: https://github.com/prometheus/memcached_exporter rules: - name: Memcached down description: Memcached instance is down on {{ $labels.instance }} query: "memcached_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert.
- name: Memcached connection limit approaching (> 80%) description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)" query: "(memcached_current_connections / memcached_max_connections * 100) > 80 and memcached_max_connections > 0" severity: warning for: 2m - name: Memcached connection limit approaching (> 95%) description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)" query: "(memcached_current_connections / memcached_max_connections * 100) > 95 and memcached_max_connections > 0" severity: critical for: 2m - name: Memcached out of memory errors description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}" query: "sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0" severity: warning for: 5m - name: Memcached memory usage high (> 90%) description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)" query: "(memcached_current_bytes / memcached_limit_bytes * 100) > 90 and memcached_limit_bytes > 0" severity: warning for: 5m comments: | High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions. - name: Memcached high eviction rate description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)" query: "rate(memcached_items_evicted_total[5m]) > 10" severity: warning for: 5m comments: | A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage. Threshold of 10 evictions/s is a rough default — adjust based on your workload. 
- name: Memcached low cache hit rate (< 80%) description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)" query: '(rate(memcached_commands_total{command="get", status="hit"}[5m]) / (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0' severity: warning for: 10m comments: | A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. Threshold of 80% is a rough default — adjust based on your workload and access patterns. - name: Memcached connections rejected description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)" query: "increase(memcached_connections_rejected_total[5m]) > 0" severity: warning for: 5m - name: Memcached items too large description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)" query: "increase(memcached_item_too_large_total[5m]) > 0" severity: info for: 5m - name: MongoDB exporters: - name: percona/mongodb_exporter slug: percona-mongodb-exporter doc_url: https://github.com/percona/mongodb_exporter rules: - name: MongoDB Down description: MongoDB instance is down query: "mongodb_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: Mongodb replica member unhealthy description: MongoDB replica member is not healthy query: "mongodb_rs_members_health == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. 
- name: MongoDB replication lag (Percona) description: Mongodb replication lag is more than 10s query: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10' severity: critical - name: MongoDB replication headroom description: MongoDB replication headroom is <= 0 query: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0' severity: critical comments: | This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both. - name: MongoDB number cursors open (Percona) description: Too many cursors opened by MongoDB for clients (> 10k) query: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000' severity: warning for: 2m - name: MongoDB cursors timeouts (Percona) description: "Too many cursors are timing out ({{ $value }} in the last minute)" query: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100" severity: warning for: 2m - name: MongoDB too many connections (Percona) description: Too many connections (> 80%) query: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) > 0' severity: warning for: 2m - name: dcu/mongodb_exporter slug: dcu-mongodb-exporter doc_url: https://github.com/dcu/mongodb_exporter rules: - name: MongoDB replication lag (DCU) description: Mongodb replication lag is more than 10s query: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10' severity: critical - name: MongoDB replication Status 3 
description: MongoDB Replication set member is either performing startup self-checks, or transitioning from completing a rollback or resync query: "mongodb_replset_member_state == 3" severity: critical - name: MongoDB replication Status 6 description: MongoDB Replication set member, as seen from another member of the set, is not yet known query: "mongodb_replset_member_state == 6" severity: critical - name: MongoDB replication Status 8 description: MongoDB Replication set member, as seen from another member of the set, is unreachable query: "mongodb_replset_member_state == 8" severity: critical - name: MongoDB replication Status 9 description: MongoDB Replication set member is actively performing a rollback. Data is not available for reads query: "mongodb_replset_member_state == 9" severity: critical - name: MongoDB replication Status 10 description: MongoDB Replication set member was once in a replica set but was subsequently removed query: "mongodb_replset_member_state == 10" severity: critical - name: MongoDB number cursors open (DCU) description: Too many cursors opened by MongoDB for clients (> 10k) query: 'mongodb_metrics_cursor_open{state="total_open"} > 10000' severity: warning for: 2m - name: MongoDB cursors timeouts (DCU) description: "Too many cursors are timing out ({{ $value }} in the last minute)" query: "increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100" severity: warning for: 2m - name: MongoDB too many connections (DCU) description: Too many connections (> 80%) query: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0' severity: warning for: 2m - name: stefanprodan/mgob slug: stefanprodan-mgob-exporter doc_url: https://github.com/stefanprodan/mgob rules: - name: Mgob backup failed description: MongoDB backup has failed query: 'changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0'
severity: critical - name: Elasticsearch exporters: - name: prometheus-community/elasticsearch_exporter slug: prometheus-community-elasticsearch-exporter doc_url: https://github.com/prometheus-community/elasticsearch_exporter rules: - name: Elasticsearch Heap Usage Too High description: "The heap usage is over 90%" query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0' severity: critical for: 2m - name: Elasticsearch Heap Usage warning description: "The heap usage is over 80%" query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0' severity: warning for: 2m - name: Elasticsearch disk out of space description: The disk usage is over 90% query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0" severity: critical - name: Elasticsearch disk space low description: The disk usage is over 80% query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0" severity: warning for: 2m - name: Elasticsearch Cluster Red description: Elastic Cluster Red status query: 'elasticsearch_cluster_health_status{color="red"} == 1' severity: critical - name: Elasticsearch Cluster Yellow description: Elastic Cluster Yellow status query: 'elasticsearch_cluster_health_status{color="yellow"} == 1' severity: warning - name: Elasticsearch Healthy Nodes description: "Missing node in Elasticsearch cluster" query: "elasticsearch_cluster_health_number_of_nodes < 3" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. 
- name: Elasticsearch Healthy Data Nodes description: "Missing data node in Elasticsearch cluster" query: "elasticsearch_cluster_health_number_of_data_nodes < 3" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: Elasticsearch relocating shards description: "Elasticsearch is relocating shards" query: "elasticsearch_cluster_health_relocating_shards > 0" severity: info - name: Elasticsearch relocating shards too long description: "Elasticsearch has been relocating shards for 15min" query: "elasticsearch_cluster_health_relocating_shards > 0" severity: warning for: 15m - name: Elasticsearch initializing shards description: "Elasticsearch is initializing shards" query: "elasticsearch_cluster_health_initializing_shards > 0" severity: info - name: Elasticsearch initializing shards too long description: "Elasticsearch has been initializing shards for 15 min" query: "elasticsearch_cluster_health_initializing_shards > 0" severity: warning for: 15m - name: Elasticsearch unassigned shards description: "Elasticsearch has unassigned shards" query: "elasticsearch_cluster_health_unassigned_shards > 0" severity: critical for: 2m - name: Elasticsearch pending tasks description: "Elasticsearch has pending tasks. Cluster works slowly." query: "elasticsearch_cluster_health_number_of_pending_tasks > 0" severity: warning for: 15m - name: Elasticsearch no new documents description: "No new documents for 10 min!" query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1' severity: warning - name: Elasticsearch High Indexing Latency description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)." 
query: "rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0" severity: warning for: 10m - name: Elasticsearch High Indexing Rate description: "The indexing rate on Elasticsearch cluster is higher than the threshold." query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000" severity: warning for: 5m - name: Elasticsearch High Query Rate description: "The query rate on Elasticsearch cluster is higher than the threshold." query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100" severity: warning for: 5m - name: Elasticsearch High Query Latency description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)." query: "rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0" severity: warning for: 5m - name: Meilisearch exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://github.com/orgs/meilisearch/discussions/625 rules: - name: Meilisearch index is empty description: Meilisearch index {{ $labels.index }} has zero documents query: "meilisearch_index_docs_count == 0" severity: warning - name: Meilisearch http response time description: Meilisearch http response time is too high query: "meilisearch_http_response_time_seconds > 0.5" severity: warning - name: Cassandra exporters: - name: instaclustr/cassandra-exporter slug: instaclustr-cassandra-exporter doc_url: https://github.com/instaclustr/cassandra-exporter rules: - name: "Cassandra Node is unavailable" description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}" query: "cassandra_endpoint_active < 1" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. 
- name: "Cassandra many compaction tasks are pending" description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}" query: "cassandra_table_estimated_pending_compactions > 100" severity: warning - name: "Cassandra commitlog pending tasks (Instaclustr)" description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}" query: "cassandra_commit_log_pending_tasks > 15" for: 2m severity: warning - name: "Cassandra compaction executor blocked tasks (Instaclustr)" description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}" query: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15' for: 2m severity: warning - name: "Cassandra flush writer blocked tasks (Instaclustr)" description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}" query: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15' for: 2m severity: warning - name: "Cassandra connection timeouts total (Instaclustr)" description: "Some connections between nodes are ending in timeout - {{ $labels.cassandra_cluster }}" query: "sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5" for: 2m severity: critical - name: "Cassandra storage exceptions (Instaclustr)" description: "Something is going wrong with Cassandra storage - {{ $labels.cassandra_cluster }}" query: "changes(cassandra_storage_exceptions_total[1m]) > 1" severity: critical - name: "Cassandra tombstone dump (Instaclustr)" description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}" query: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100' for: 2m severity: critical - name: "Cassandra client request unavailable write (Instaclustr)" description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}" query:
'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0' for: 2m severity: critical - name: "Cassandra client request unavailable read (Instaclustr)" description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}" query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0' for: 2m severity: critical - name: "Cassandra client request write failure (Instaclustr)" description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}" query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0' for: 2m severity: critical - name: "Cassandra client request read failure (Instaclustr)" description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}" query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0' for: 2m severity: critical - name: criteo/cassandra_exporter slug: criteo-cassandra-exporter doc_url: https://github.com/criteo/cassandra_exporter rules: - name: Cassandra hints count description: Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3' severity: critical - name: Cassandra compaction task pending description: Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster. 
query: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100' severity: warning for: 2m - name: Cassandra viewwrite latency description: High viewwrite latency on {{ $labels.instance }} cassandra node query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000' severity: warning for: 2m - name: Cassandra authentication failures description: Increase of Cassandra authentication failures query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5' severity: warning for: 2m - name: Cassandra node down description: Cassandra node down query: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0' severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: Cassandra commitlog pending tasks (Criteo) description: Unexpected number of Cassandra commitlog pending tasks query: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15' severity: warning for: 2m - name: Cassandra compaction executor blocked tasks (Criteo) description: Some Cassandra compaction executor tasks are blocked query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0' severity: warning for: 2m - name: Cassandra flush writer blocked tasks (Criteo) description: Some Cassandra flush writer tasks are blocked query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0' severity: warning for: 2m - name: Cassandra repair pending tasks description: Some Cassandra repair tasks are pending query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2' severity: warning for: 2m - name: Cassandra repair blocked tasks description: 
Some Cassandra repair tasks are blocked query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0' severity: warning for: 2m - name: Cassandra connection timeouts total (Criteo) description: Some connections between nodes are ending in timeout query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5' severity: critical for: 2m - name: Cassandra storage exceptions (Criteo) description: Something is going wrong with Cassandra storage query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1' severity: critical - name: Cassandra tombstone dump (Criteo) description: Too many tombstones scanned in queries query: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000' severity: critical - name: Cassandra client request unavailable write (Criteo) description: Write failures have occurred because too many nodes are unavailable query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0' severity: critical - name: Cassandra client request unavailable read (Criteo) description: Read failures have occurred because too many nodes are unavailable query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0' severity: critical - name: Cassandra client request write failure (Criteo) description: A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.
query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0' severity: critical - name: Cassandra client request read failure (Criteo) description: A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large. query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0' severity: critical - name: Cassandra cache hit rate key cache description: Key cache hit rate is below 85% query: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85' severity: critical for: 2m - name: Clickhouse exporters: - name: Embedded Exporter slug: embedded-exporter doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics rules: - name: ClickHouse node down description: "No metrics received from ClickHouse exporter for over 2 minutes." query: 'up{job="clickhouse"} == 0' severity: critical for: 2m comments: | Adjust the job label to match your Prometheus configuration. - name: ClickHouse Memory Usage Critical description: "Memory usage is critically high, over 90%." query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0" severity: critical for: 5m - name: ClickHouse Memory Usage Warning description: "Memory usage is over 80%." query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0" severity: warning for: 5m - name: ClickHouse Disk Space Low on Default description: "Disk space on default disk is below 20%."
query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0" severity: warning for: 2m - name: ClickHouse Disk Space Critical on Default description: "Disk space on default disk is critically low, below 10%." query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0" severity: critical for: 2m - name: ClickHouse Disk Space Low on Backups description: "Disk space on backups is below 20%." query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0" severity: warning for: 2m - name: ClickHouse Replica Errors description: "Critical replica errors detected, either all replicas are stale or lost." query: "ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1" severity: critical - name: ClickHouse No Available Replicas description: "No available replicas in ClickHouse." query: "ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1" severity: critical - name: ClickHouse No Live Replicas description: "There are too few live replicas available, risking data loss and service disruption." query: "ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1" severity: critical - name: ClickHouse High TCP Connections description: "High number of TCP connections, indicating heavy client or inter-cluster communication." 
query: "ClickHouseMetrics_TCPConnection > 400" severity: warning for: 5m comments: | Please replace the threshold with an appropriate value - name: ClickHouse Interserver Connection Issues description: "High number of interserver connections may indicate replication or distributed query handling issues." query: "ClickHouseMetrics_InterserverConnection > 50" severity: warning for: 5m comments: | Adjust the threshold based on your cluster size and expected replication traffic. - name: ClickHouse ZooKeeper Connection Issues description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination." query: "ClickHouseMetrics_ZooKeeperSession != 1" severity: warning for: 3m - name: ClickHouse Authentication Failures description: "Authentication failures detected, indicating potential security issues or misconfiguration." query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3" severity: info - name: ClickHouse Access Denied Errors description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts." query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3" severity: info - name: ClickHouse rejected insert queries description: "INSERTs rejected due to too many active data parts. Reduce insert frequency." query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0" severity: warning for: 1m - name: ClickHouse delayed insert queries description: "INSERTs delayed due to high number of active parts." query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0" severity: warning for: 2m - name: ClickHouse zookeeper hardware exception description: "Zookeeper hardware exception: network issues communicating with ZooKeeper" query: "increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0" severity: critical for: 1m - name: ClickHouse high network usage description: High network usage. 
ClickHouse network usage exceeds 100MB/s. query: "rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024" severity: warning for: 2m comments: | Please replace the threshold with an appropriate value - name: ClickHouse distributed rejected inserts description: "INSERTs into Distributed tables rejected due to pending bytes limit." query: "increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0" severity: critical for: 2m - name: CouchDB exporters: - name: gesellix/couchdb-prometheus-exporter slug: gesellix-couchdb-prometheus-exporter doc_url: https://github.com/gesellix/couchdb-prometheus-exporter rules: - name: CouchDB node down description: CouchDB node is not responding (node_up metric is 0) for more than 2 minutes query: "couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0" severity: critical for: 2m - name: CouchDB atom memory usage critical description: Atom memory usage is above 90% of limit query: "couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom" severity: critical for: 5m - name: CouchDB open databases critical description: Number of open databases exceeds 90% of node capacity query: "couchdb_httpd_open_databases > 0.9 * 1000" severity: critical for: 5m - name: CouchDB open OS files critical description: CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files query: "couchdb_httpd_open_os_files > 0.9 * 65535" severity: critical for: 5m - name: CouchDB 5xx error ratio high description: More than 5% of HTTP requests are returning 5xx errors query: "rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0" severity: critical for: 5m - name: CouchDB temporary view read rate critical description: Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation query: "rate(couchdb_httpd_temporary_view_reads[5m]) > 100" 
severity: critical for: 5m - name: CouchDB Mango queries scanning too many docs description: Some Mango queries are scanning too many documents, consider adding indexes query: "rate(couchdb_mango_too_many_docs_scanned[5m]) > 50" severity: warning for: 5m - name: CouchDB Mango queries failed due to invalid index description: Some Mango queries failed to execute because the index was missing or invalid query: "rate(couchdb_mango_query_invalid_index[5m]) > 5" severity: warning for: 5m - name: CouchDB Mango docs examined high description: High number of documents examined per Mango queries, consider indexing query: "rate(couchdb_mango_docs_examined[5m]) > 1000" severity: warning for: 5m - name: CouchDB Replicator manager died description: Replication manager process has crashed query: "increase(couchdb_replicator_changes_manager_deaths[5m]) > 0" severity: critical for: 1m - name: CouchDB Replicator queue process died description: Replication queue process has crashed query: "increase(couchdb_replicator_changes_queue_deaths[5m]) > 0" severity: critical for: 1m - name: CouchDB Replicator reader process died description: Replication reader process has crashed query: "increase(couchdb_replicator_changes_reader_deaths[5m]) > 0" severity: critical for: 1m - name: CouchDB Replicator failed to start description: One or more replication tasks failed to start query: "increase(couchdb_replicator_failed_starts[5m]) > 0" severity: critical for: 1m - name: CouchDB replication cluster unstable description: The replication cluster is unstable, replication may be interrupted query: "couchdb_replicator_cluster_is_stable == 0" severity: critical for: 2m - name: CouchDB replication read failures description: Replication changes feed has failed reads more than 5 times in 5 minutes query: "increase(couchdb_replicator_changes_read_failures[5m]) > 5" severity: warning for: 5m - name: CouchDB file descriptors high description: Process is using more than 85% of allowed file descriptors query: 
"process_open_fds / process_max_fds > 0.85 and process_max_fds > 0" severity: warning for: 5m - name: CouchDB process restarted description: CouchDB process has restarted recently query: "changes(process_start_time_seconds[1h]) > 0" severity: info for: 1m - name: CouchDB critical log entries description: Critical or error log entries detected in the last 5 minutes query: "increase(couchdb_server_couch_log{level=~\"error|critical\"}[5m]) > 0" severity: critical for: 1m - name: Solr exporters: - name: embedded exporter slug: embedded-exporter doc_url: https://solr.apache.org/guide/8_11/monitoring-solr-with-prometheus-and-grafana.html rules: - name: Solr update errors description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}. query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1" severity: critical - name: Solr query errors description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}. query: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1' severity: warning for: 5m - name: Solr replication errors description: Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}. query: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1' severity: critical - name: Solr low live node count description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}. 
query: "solr_collections_live_nodes < 2" severity: critical - name: Message brokers services: - name: RabbitMQ exporters: - name: rabbitmq/rabbitmq-prometheus slug: rabbitmq-exporter doc_url: https://github.com/rabbitmq/rabbitmq-prometheus rules: - name: RabbitMQ node down description: Less than 3 nodes running in RabbitMQ cluster query: "sum(rabbitmq_build_info) < 3" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: RabbitMQ node not distributed description: Distribution link state is not 'up' query: "erlang_vm_dist_node_state < 3" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: RabbitMQ instances different versions description: Running different version of RabbitMQ in the same cluster, can lead to failure. query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1" severity: warning for: 1h - name: RabbitMQ memory high description: A node use more than 90% of allocated RAM query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0" severity: warning for: 2m - name: RabbitMQ file descriptors usage description: A node use more than 90% of file descriptors query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0" severity: warning for: 2m - name: RabbitMQ too many ready messages description: RabbitMQ too many ready messages in queue {{ $labels.queue }} query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000" severity: warning for: 1m comments: | The query aggregates by queue, so queue is the only label left for the description template. - name: RabbitMQ too many unack messages description: Too many unacknowledged messages query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000" severity: warning for: 1m - name: RabbitMQ too many connections description: The total connections of a node is too high query: "rabbitmq_connections > 1000" severity: warning for: 2m - name: RabbitMQ no queue consumer description: A queue has
less than 1 consumer query: "rabbitmq_queue_consumers < 1" severity: warning for: 1m # allows a short service restart - name: RabbitMQ unroutable messages description: A queue has unroutable messages ({{ $value }} in the last 1m) query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0" severity: warning for: 2m - name: kbudde/rabbitmq-exporter slug: kbudde-rabbitmq-exporter doc_url: https://github.com/kbudde/rabbitmq_exporter rules: - name: RabbitMQ down description: RabbitMQ node down query: "rabbitmq_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: RabbitMQ cluster down description: Less than 3 nodes running in RabbitMQ cluster query: "sum(rabbitmq_running) < 3" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: RabbitMQ cluster partition description: Cluster partition query: "rabbitmq_partitions > 0" severity: critical - name: RabbitMQ out of memory description: Memory available for RabbitMQ is low (< 10%) query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0" severity: warning for: 2m - name: RabbitMQ instance too many connections description: RabbitMQ instance has too many connections (> 1000) query: "rabbitmq_connectionsTotal > 1000" severity: warning for: 2m - name: RabbitMQ dead letter queue filling up description: Dead letter queue is filling up (> 10 msgs) query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10' severity: warning for: 1m comments: | Indicate the queue name in dedicated label. - name: RabbitMQ too many messages in queue description: Queue is filling up (> 1000 msgs) query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000' severity: warning for: 2m comments: | Indicate the queue name in dedicated label. 
- name: RabbitMQ slow queue consuming description: Queue messages are consumed slowly (> 60s) query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60' severity: warning for: 2m comments: | Indicate the queue name in dedicated label. - name: RabbitMQ no consumer description: Queue has no consumer query: "rabbitmq_queue_consumers == 0" severity: critical for: 5m comments: | Allows a short service restart. - name: RabbitMQ too many consumers description: Queue should have only 1 consumer query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1' severity: critical comments: | Indicate the queue name in dedicated label. - name: RabbitMQ inactive exchange description: Exchange receive less than 5 msgs per second query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' severity: warning comments: | Indicate the exchange name in dedicated label. for: 2m - name: Zookeeper exporters: - name: cloudflare/kafka_zookeeper_exporter slug: cloudflare-kafka-zookeeper-exporter doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter rules: - name: dabealu/zookeeper-exporter slug: dabealu-zookeeper-exporter doc_url: https://github.com/dabealu/zookeeper-exporter rules: - name: Zookeeper Down description: "Zookeeper down on instance {{ $labels.instance }}" query: "zk_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. 
- name: Zookeeper missing leader description: "Zookeeper cluster has no node marked as leader" query: "sum(zk_server_leader) == 0" severity: critical - name: Zookeeper Too Many Leaders description: "Zookeeper cluster has too many nodes marked as leader" query: "sum(zk_server_leader) > 1" severity: critical - name: Zookeeper Not Ok description: "Zookeeper instance is not ok" query: "zk_ruok == 0" severity: warning for: 3m - name: Kafka exporters: - name: danielqsj/kafka_exporter slug: danielqsj-kafka-exporter doc_url: https://github.com/danielqsj/kafka_exporter rules: - name: Kafka topics replicas description: Kafka topic in-sync partition query: "min(kafka_topic_partition_in_sync_replica) by (topic) < 3" severity: critical - name: Kafka consumer group lag description: Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages) query: "sum(kafka_consumergroup_lag) by (consumergroup) > 10000" severity: warning for: 1m - name: linkedin/Burrow slug: linkedin-kafka-exporter doc_url: https://github.com/linkedin/Burrow rules: - name: Kafka topic offset decreased description: Kafka topic offset has decreased query: "delta(kafka_burrow_partition_current_offset[1m]) < 0" severity: warning - name: Kafka consumer lag description: Kafka consumer has a 30 minutes and increasing lag query: "kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0" severity: warning for: 15m - name: Pulsar exporters: - name: embedded exporter slug: embedded-exporter doc_url: https://pulsar.apache.org/docs/reference-metrics/ rules: - name: Pulsar subscription high number of backlog entries description: "The number of subscription 
backlog entries is over 5k" query: sum(pulsar_subscription_back_log) by (subscription) > 5000 for: 1h severity: warning - name: Pulsar subscription very high number of backlog entries description: "The number of subscription backlog entries is over 100k" query: sum(pulsar_subscription_back_log) by (subscription) > 100000 for: 1h severity: critical - name: Pulsar topic large backlog storage size description: "The topic backlog storage size is over 5 GB" query: sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024 for: 1h severity: warning - name: Pulsar topic very large backlog storage size description: "The topic backlog storage size is over 20 GB" query: sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024 for: 1h severity: critical - name: Pulsar high write latency description: "Messages cannot be written in a timely fashion" query: sum(pulsar_storage_write_latency_overflow > 0) by (topic) for: 1h severity: critical - name: Pulsar large message payload description: "Observing large message payload (> 1MB)" query: sum(pulsar_entry_size_overflow > 0) by (topic) for: 1h severity: warning - name: Pulsar high ledger disk usage description: "Observing Ledger Disk Usage (> 75%)" query: sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75 for: 1h severity: critical - name: Pulsar read only bookies description: "Observing Readonly Bookies" query: count(bookie_SERVER_STATUS{} == 0) by (pod) for: 5m severity: critical - name: Pulsar high number of function errors description: "Observing more than 10 Function errors per minute" query: sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10 for: 1m severity: critical - name: Pulsar high number of sink errors description: "Observing more than 10 Sink errors per minute" query: sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10 for: 1m severity: critical - name: Nats exporters: - name: 
nats-io/prometheus-nats-exporter slug: nats-exporter doc_url: https://github.com/nats-io/prometheus-nats-exporter rules: - name: Nats high routes count description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }} query: "gnatsd_varz_routes > 10" severity: warning for: 3m - name: Nats high memory usage description: NATS server memory usage is above 200MB for {{ $labels.instance }} query: "gnatsd_varz_mem > 200 * 1024 * 1024" severity: warning for: 5m - name: Nats slow consumers description: There are slow consumers in NATS for {{ $labels.instance }} query: "gnatsd_varz_slow_consumers > 0" severity: critical for: 3m - name: Nats server down description: NATS server has been down for more than 5 minutes query: 'absent(up{job="nats"})' severity: critical for: 5m - name: Nats high CPU usage description: NATS server is using more than 80% CPU for the last 5 minutes query: "gnatsd_varz_cpu > 80" severity: warning for: 5m comments: | gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale). 
- name: Nats high number of connections description: NATS server has more than 1000 active connections query: "gnatsd_connz_num_connections > 1000" severity: warning for: 5m - name: Nats high JetStream store usage description: JetStream store usage is over 80% query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0" severity: warning for: 5m - name: Nats high JetStream memory usage description: JetStream memory usage is over 80% query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0" severity: warning for: 5m - name: Nats high number of subscriptions description: NATS server has more than 1000 active subscriptions query: "gnatsd_connz_subscriptions > 1000" severity: warning for: 5m - name: Nats high pending bytes description: NATS server has more than 100,000 pending bytes query: "gnatsd_connz_pending_bytes > 100000" severity: warning for: 5m - name: Nats too many errors description: NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0" severity: warning for: 5m - name: Nats JetStream accounts exceeded description: JetStream has more than 100 active accounts query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100" severity: warning for: 5m - name: Nats leaf node connection issue description: No leaf node connections on {{ $labels.instance }} query: "gnatsd_varz_leafnodes == 0" severity: warning for: 5m - name: Proxies, load balancers and service meshes services: - name: Nginx exporters: - name: knyar/nginx-lua-prometheus slug: knyar-nginx-exporter doc_url: https://github.com/knyar/nginx-lua-prometheus rules: - name: Nginx high HTTP 4xx error rate description: Too many HTTP requests with status 4xx (> 5%) query: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / 
sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0' severity: critical for: 1m - name: Nginx high HTTP 5xx error rate description: Too many HTTP requests with status 5xx (> 5%) query: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0' severity: critical for: 1m - name: Nginx latency high description: Nginx p99 latency is higher than 3 seconds query: "histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3" severity: warning for: 2m - name: Apache exporters: - name: Lusitaniae/apache_exporter slug: lusitaniae-apache-exporter doc_url: https://github.com/Lusitaniae/apache_exporter rules: - name: Apache down description: Apache down query: "apache_up == 0" severity: critical - name: Apache workers load description: Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }} query: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0' severity: warning for: 2m - name: Apache restart description: Apache has just been restarted. 
query: "apache_uptime_seconds_total / 60 < 1" severity: warning - name: HaProxy exporters: - name: Embedded exporter (HAProxy >= v2) slug: embedded-exporter-v2 doc_url: https://github.com/haproxy/haproxy/tree/master/contrib/prometheus-exporter rules: - name: HAProxy high HTTP 4xx error rate backend description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }} query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m comments: | The query aggregates by proxy, so proxy is the only label left for the description template. - name: HAProxy high HTTP 5xx error rate backend description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }} query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m comments: | The query aggregates by proxy, so proxy is the only label left for the description template. - name: HAProxy high HTTP 4xx error rate server description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }} query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate server description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }} query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m - name: HAProxy server response errors description: Too many response errors to {{ $labels.server }} server (>
5%). query: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m - name: HAProxy backend connection errors description: Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high. query: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100 severity: critical for: 1m comments: | The query aggregates by proxy, so proxy is the only label left for the description template. - name: HAProxy server connection errors description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high. query: (sum by (server) (rate(haproxy_server_connection_errors_total[1m]))) > 100 severity: critical comments: | Aggregated by server so the alert is raised per server and the server label used in the description is kept. - name: HAProxy backend max active session > 80% description: Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf "%.2f"}}% query: ((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80 severity: warning for: 2m - name: HAProxy pending requests description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}} query: sum by (proxy) (haproxy_backend_current_queue) > 0 comments: | haproxy_backend_current_queue is a gauge (current queue depth), not a counter.
severity: warning for: 2m - name: HAProxy HTTP slowing down description: Average request time is increasing - {{ $value | printf "%.2f"}} query: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1 severity: warning for: 1m - name: HAProxy retry high description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}} query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10 severity: warning for: 2m - name: HAproxy has no alive backends description: HAProxy has no alive active or backup backends for {{ $labels.proxy }} query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0 severity: critical - name: HAProxy frontend security blocked requests description: HAProxy is blocking requests for security reason query: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10 severity: warning for: 2m - name: HAProxy server healthcheck failure description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m) query: increase(haproxy_server_check_failures_total[1m]) > 0 severity: warning for: 1m - name: prometheus/haproxy_exporter (HAProxy < v2) slug: haproxy-exporter-v1 doc_url: https://github.com/prometheus/haproxy_exporter rules: - name: HAProxy down description: HAProxy down query: "haproxy_up == 0" severity: critical - name: HAProxy high HTTP 4xx error rate backend (v1) description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m comments: | The numerator is multiplied by 100 so the ratio is a percentage, matching the 5% threshold. - name: HAProxy high HTTP 5xx error rate backend (v1) description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} query: 'sum by (backend)
(rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m comments: | The numerator is multiplied by 100 so the ratio is a percentage, matching the 5% threshold. - name: HAProxy high HTTP 4xx error rate server (v1) description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }} query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate server (v1) description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }} query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m - name: HAProxy server response errors (v1) description: Too many response errors to {{ $labels.server }} server (> 5%). query: "sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0" severity: critical for: 1m - name: HAProxy backend connection errors (v1) description: Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high. query: "sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100" severity: critical for: 1m - name: HAProxy server connection errors (v1) description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
query: "sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100" severity: critical - name: HAProxy backend max active session description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%). query: "((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0" severity: warning for: 2m - name: HAProxy pending requests (v1) description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend query: "sum by (backend) (haproxy_backend_current_queue) > 0" severity: warning for: 2m - name: HAProxy HTTP slowing down (v1) description: Average request time is increasing query: "avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1" severity: warning for: 1m - name: HAProxy retry high (v1) description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend query: "sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10" severity: warning for: 2m - name: HAProxy backend down description: HAProxy backend is down query: "haproxy_backend_up == 0" severity: critical - name: HAProxy server down description: HAProxy server is down query: "haproxy_server_up == 0" severity: critical - name: HAProxy frontend security blocked requests (v1) description: HAProxy is blocking requests for security reason query: "sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10" severity: warning for: 2m - name: HAProxy server healthcheck failure (v1) description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m) query: "increase(haproxy_server_check_failures_total[1m]) > 0" severity: warning for: 1m - name: Traefik exporters: - name: Embedded exporter v2 slug: embedded-exporter-v2 doc_url: https://docs.traefik.io/observability/metrics/prometheus/ rules: - name: Traefik service down description: 
All Traefik services are down query: "sum(traefik_service_server_up) by (service) == 0" severity: critical comments: | count() only counts series and never returns 0, so sum() of the 0/1 gauge is used to detect that every server of a service is down. - name: Traefik high HTTP 4xx error rate service description: Traefik service 4xx error rate is above 5% query: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0' severity: critical for: 1m - name: Traefik high HTTP 5xx error rate service description: Traefik service 5xx error rate is above 5% query: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0' severity: critical for: 1m - name: Embedded exporter v1 slug: embedded-exporter-v1 doc_url: https://docs.traefik.io/observability/metrics/prometheus/ rules: - name: Traefik backend down description: All Traefik backends are down query: "sum(traefik_backend_server_up) by (backend) == 0" severity: critical comments: | count() only counts series and never returns 0, so sum() of the 0/1 gauge is used to detect that every server of a backend is down. - name: Traefik high HTTP 4xx error rate backend description: Traefik backend 4xx error rate is above 5% query: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0' severity: critical for: 1m - name: Traefik high HTTP 5xx error rate backend description: Traefik backend 5xx error rate is above 5% query: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0' severity: critical for: 1m - name: Caddy exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://caddyserver.com/docs/metrics rules: - name: Caddy Reverse Proxy Down description: "All Caddy reverse proxies are
down" query: "sum(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0" severity: critical comments: | count() only counts series and never returns 0, so sum() of the 0/1 health gauge is used to detect an unhealthy upstream. - name: Caddy high HTTP 4xx error rate service description: "Caddy service 4xx error rate is above 5%" query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0' severity: critical for: 1m - name: Caddy high HTTP 5xx error rate service description: "Caddy service 5xx error rate is above 5%" query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0' severity: critical for: 1m - name: Envoy exporters: - name: Built-in metrics slug: embedded-exporter doc_url: https://www.envoyproxy.io/docs/envoy/latest/configuration/observability/statistics rules: - name: Envoy server not live description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}" query: "envoy_server_live != 1" severity: critical for: 1m - name: Envoy high memory usage description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}" query: "envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0" severity: warning for: 5m - name: Envoy high downstream HTTP 5xx error rate description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)" query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0' severity: critical for: 1m - name: Envoy high downstream HTTP 4xx error rate
description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)" query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0' severity: warning for: 5m - name: Envoy downstream connections overflowing description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_listener_downstream_cx_overflow[5m]) > 5" severity: warning - name: Envoy cluster membership empty description: "Envoy cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} has no healthy members" query: "envoy_cluster_membership_healthy == 0" severity: critical for: 1m - name: Envoy cluster membership degraded description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy" query: "envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0" severity: warning for: 5m - name: Envoy high cluster upstream connection failures description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10" severity: warning for: 5m - name: Envoy high cluster upstream request timeout rate description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}" query: "rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0" severity: warning for: 5m - name: Envoy high cluster upstream 5xx error rate description: "More than 5% of upstream requests 
return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}" query: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0' severity: critical for: 1m - name: Envoy cluster health check failures description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_cluster_health_check_failure[5m]) > 5" severity: warning for: 5m - name: Envoy cluster outlier detection ejections active description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}" query: "envoy_cluster_outlier_detection_ejections_active > 0" severity: info for: 5m - name: Envoy listener SSL connection errors description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_listener_ssl_connection_error[5m]) > 5" severity: warning - name: Envoy global downstream connections overflowing description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5" severity: critical - name: Envoy SSL certificate expiring soon description: "SSL certificate loaded by Envoy on {{ $labels.instance }} expires in less than 7 days" query: "envoy_server_days_until_first_cert_expiring < 7" severity: warning - name: Envoy SSL certificate expired description: "SSL certificate loaded by Envoy on {{ $labels.instance }} has expired" query: "envoy_server_days_until_first_cert_expiring < 0" severity: critical - name: Envoy cluster circuit breaker tripped description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}" query: 
"envoy_cluster_circuit_breakers_default_cx_open == 1 or envoy_cluster_circuit_breakers_default_rq_open == 1" severity: critical - name: Envoy no healthy upstream description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0" severity: critical - name: Envoy high downstream request timeout rate description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_http_downstream_rq_timeout[5m]) > 5" severity: warning for: 5m - name: Linkerd exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://linkerd.io/2/tasks/exporting-metrics/ rules: - name: Linkerd high error rate description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%" query: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0' comments: | Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}. severity: warning for: 1m - name: Istio exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/ rules: - name: Istio Kubernetes gateway availability drop description: Gateway pods have dropped. Inbound traffic will likely be affected. query: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2' severity: warning for: 1m - name: Istio Pilot high total request rate description: Number of Istio Pilot push errors is too high (> 5%). 
Envoy sidecars might have outdated configuration. query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0" severity: warning for: 1m - name: Istio Mixer Prometheus dispatches low description: Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly. query: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180' severity: warning for: 1m - name: Istio high total request rate description: Global request rate in the service mesh is unusually high. query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000' severity: warning for: 2m - name: Istio low total request rate description: Global request rate in the service mesh is unusually low. query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100' severity: warning for: 2m - name: Istio high 4xx error rate description: High percentage of HTTP 4xx responses in Istio (> 5%). query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' severity: warning for: 1m - name: Istio high 5xx error rate description: High percentage of HTTP 5xx responses in Istio (> 5%). query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' severity: warning for: 1m - name: Istio high request latency description: Istio average requests execution is longer than 100ms. 
query: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0' severity: warning for: 1m - name: Istio latency 99 percentile description: Istio 1% slowest requests are longer than 1000ms. query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000" severity: warning for: 1m - name: Istio Pilot Duplicate Entry description: Istio pilot duplicate entry error. query: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0" severity: critical - name: Runtimes services: - name: PHP-FPM exporters: - name: bakins/php-fpm-exporter slug: bakins-fpm-exporter doc_url: https://github.com/bakins/php-fpm-exporter rules: - name: PHP-FPM max-children reached description: PHP-FPM reached max children on {{ $labels.instance }} ({{ $value }} times in the last 5m) query: "sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 3" severity: warning - name: JVM exporters: - name: java-client slug: jvm-exporter doc_url: https://github.com/prometheus/client_java rules: - name: JVM memory filling up description: JVM memory is filling up (> 80%) query: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80 and sum by (instance)(jvm_memory_max_bytes{area="heap"}) > 0' severity: warning for: 2m - name: JVM non-heap memory filling up description: JVM non-heap memory (metaspace/code cache) is filling up (> 80%) query: '(sum by (instance)(jvm_memory_used_bytes{area="nonheap"}) / (sum by (instance)(jvm_memory_max_bytes{area="nonheap"}) > 0)) * 100 > 80' severity: warning for: 2m comments: | Many JVM configurations leave metaspace unbounded, in which case 
jvm_memory_max_bytes{area="nonheap"} is -1 and this alert will not fire. The query filters out max_bytes <= 0 so that an unbounded or zero limit never produces a meaningless ratio. - name: JVM GC time too high description: JVM is spending too much time in garbage collection (> 5% of wall clock time) query: 'sum by (instance)(rate(jvm_gc_collection_seconds_sum[5m])) > 0.05' severity: warning for: 5m - name: JVM threads deadlocked description: JVM has deadlocked threads query: 'jvm_threads_deadlocked > 0' severity: critical for: 1m - name: JVM thread count high description: JVM thread count is high (> 300), potential thread leak query: 'jvm_threads_current > 300' severity: warning for: 5m - name: JVM threads BLOCKED description: JVM has high number of BLOCKED threads, indicating lock contention query: 'jvm_threads_state{state="BLOCKED"} > 50' severity: warning for: 5m - name: JVM old gen GC frequency description: Frequent old/major GC cycles, indicating memory pressure query: 'rate(jvm_gc_collection_seconds_count{gc=~"(?i).*(old|major|marksweep).*"}[5m]) > 0.3' severity: warning for: 5m comments: | The match is case-insensitive because the collector names reported by the JVM are capitalized (e.g. "G1 Old Generation", "PS MarkSweep", "ConcurrentMarkSweep"). It may not match ZGC or Shenandoah cycle names. Adjust the gc label filter if you use a different collector.
- name: JVM direct buffer pool filling up description: JVM direct buffer pool is filling up (> 90%) query: '(jvm_buffer_pool_used_bytes / jvm_buffer_pool_capacity_bytes) * 100 > 90 and jvm_buffer_pool_capacity_bytes > 0' severity: warning for: 5m - name: JVM objects pending finalization description: JVM has objects pending finalization, potential memory leak query: 'jvm_memory_objects_pending_finalization > 1000' severity: warning for: 5m - name: JVM file descriptors exhaustion description: JVM process is running out of file descriptors (> 90% used) query: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0' severity: warning for: 5m comments: | process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not JVM-specific. This alert will also fire for Go, Python, or any process exposing these metrics. - name: JVM class loading anomaly description: Rapid class loading detected, potential classloader leak query: 'rate(jvm_classes_loaded_total[5m]) > 100' severity: warning for: 5m - name: JVM compilation time spike description: Excessive JIT compilation time consuming CPU query: 'rate(jvm_compilation_time_seconds_total[5m]) > 0.1' severity: warning for: 5m - name: Golang exporters: - name: client_golang slug: golang-exporter doc_url: https://github.com/prometheus/client_golang rules: - name: Go goroutine count high description: Go application has too many goroutines (> 1000), potential goroutine leak query: 'go_goroutines > 1000' severity: warning for: 5m comments: | Threshold is a rough default. High-concurrency servers may legitimately run thousands of goroutines. Adjust to match your baseline. - name: Go GC duration high description: Go GC pause duration is too high (max > 1s) query: 'go_gc_duration_seconds{quantile="1"} > 1' severity: warning for: 5m comments: | quantile="1" is the maximum observed GC pause in the current summary window, not p99. A single outlier pause can push this above 1s. 
The for: 5m ensures the max stays elevated. - name: Go memory usage high description: Go heap allocation is using most of the runtime's reserved memory (> 90%), indicating the process may need more memory or has a leak query: '(go_memstats_heap_alloc_bytes / go_memstats_sys_bytes) * 100 > 90' severity: warning for: 5m comments: | go_memstats_sys_bytes is the total memory obtained from the OS by the Go runtime, not total host memory. This ratio measures Go-internal memory utilization, not system-level memory pressure. - name: Go thread count high description: Go OS thread count is high (> 500), potential blocking syscall or CGo leak query: 'go_threads > 500' severity: warning for: 5m comments: | Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline. - name: Go heap objects count high description: Go heap has too many live objects (> 10M), high GC pressure query: 'go_memstats_heap_objects > 10000000' severity: warning for: 5m comments: | Threshold is a rough default. Adjust based on your application's normal object count. - name: Go GC CPU fraction high description: Go GC is consuming too much CPU (> 5%) query: 'go_memstats_gc_cpu_fraction > 0.05' severity: warning for: 5m comments: | go_memstats_gc_cpu_fraction is deprecated since Go 1.20 and may return 0 in newer versions. Consider using runtime/metrics-based alternatives if running Go >= 1.20. 
- name: Go goroutine spike description: Go goroutine count is growing rapidly query: 'deriv(go_goroutines[5m]) > 100' severity: warning for: 5m - name: Go heap fragmentation description: Go heap has high idle ratio (> 90%), indicating memory fragmentation query: 'go_memstats_heap_idle_bytes / go_memstats_heap_sys_bytes > 0.9' severity: warning for: 5m - name: Go memory leak description: Go application has sustained high allocation rate (> 1GB/s), potential memory leak query: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9' severity: warning for: 5m - name: Go stack memory high description: Go stack memory usage is high (> 1GB), likely excessive goroutines or deep recursion query: 'go_memstats_stack_inuse_bytes > 1e9' severity: warning for: 5m - name: Ruby exporters: - name: prometheus_exporter slug: ruby-exporter doc_url: https://github.com/discourse/prometheus_exporter rules: - name: Ruby heap live slots high description: Ruby heap has too many live slots (> 500k), heap bloat query: 'ruby_heap_live_slots > 500000' severity: warning for: 5m comments: | Threshold is a rough default. Adjust based on your application's normal heap size. - name: Ruby heap free slots high description: Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations query: 'ruby_heap_free_slots > 500000' severity: warning for: 5m - name: Ruby major GC rate high description: Ruby is performing too many major GC cycles, indicating memory pressure query: 'rate(ruby_major_gc_ops_total[5m]) > 5' severity: warning for: 5m comments: | Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection. 
- name: Ruby RSS high description: Ruby process RSS is high (> 1GB) query: 'ruby_rss > 1e9' severity: warning for: 5m - name: Ruby allocated objects spike description: Ruby is allocating objects at a high rate query: 'rate(ruby_allocated_objects_total[5m]) > 100000' severity: warning for: 5m - name: Python exporters: - name: client_python slug: python-exporter doc_url: https://github.com/prometheus/client_python rules: - name: Python GC objects uncollectable description: Python has uncollectable objects, potential memory leak via reference cycles query: 'increase(python_gc_objects_uncollectable_total[5m]) > 0' severity: warning for: 5m - name: Python GC collections high description: Python GC is collecting too many objects (> 10k/s), high allocation pressure query: 'rate(python_gc_objects_collected_total[5m]) > 10000' severity: warning for: 5m - name: Python file descriptors exhaustion description: Python process is running out of file descriptors (> 90% used) query: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0' severity: warning for: 5m comments: | process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not Python-specific. - name: Python GC generation 2 collections high description: Python full GC (generation 2) is running too frequently, indicating memory pressure query: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1' severity: warning for: 5m comments: | Gen2 collection rate > 1/s is very high. In most applications, gen2 runs are infrequent. Adjust threshold based on your workload. - name: Python virtual memory high description: Python process virtual memory is high (> 4GB) query: 'process_virtual_memory_bytes > 4e9' severity: warning for: 5m comments: | Threshold is a rough default. Adjust based on your application's expected memory footprint. 
- name: Sidekiq exporters: - name: Strech/sidekiq-prometheus-exporter slug: strech-sidekiq-exporter doc_url: https://github.com/Strech/sidekiq-prometheus-exporter rules: - name: Sidekiq queue size description: Sidekiq queue {{ $labels.name }} is growing query: "sidekiq_queue_size > 100" severity: warning for: 1m - name: Sidekiq scheduling latency too high description: Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing. query: "max(sidekiq_queue_latency) > 60" severity: critical - name: Data engineering services: - name: Apache Flink exporters: - name: Built-in Prometheus reporter slug: flink-prometheus-reporter doc_url: https://nightlies.apache.org/flink/flink-docs-stable/docs/deployment/metric_reporters/ rules: - name: Flink job is not running description: "No Flink jobs are currently running. All jobs may have failed or been cancelled." query: "flink_jobmanager_numRunningJobs == 0" severity: critical for: 1m - name: Flink no TaskManagers registered description: "No TaskManagers are registered with the JobManager. The cluster has no processing capacity." query: "flink_jobmanager_numRegisteredTaskManagers == 0" severity: critical for: 1m - name: Flink all task slots used description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled." query: "flink_jobmanager_taskSlotsAvailable == 0" severity: warning for: 5m comments: | This alert fires when there are no available task slots. Adjust the threshold if your cluster is expected to run at full capacity. - name: Flink job restart increasing description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes." query: "increase(flink_jobmanager_job_numRestarts[5m]) > 1" severity: warning for: 5m comments: | A single restart may be normal during deployments. Adjust threshold based on restart tolerance. 
- name: Flink checkpoint failures description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes." query: "increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1" severity: warning for: 5m - name: Flink checkpoint duration high description: "Flink job {{ $labels.job_name }} last checkpoint took {{ $value | humanizeDuration }} to complete." query: "flink_jobmanager_job_lastCheckpointDuration / 1000 > 60" severity: warning for: 5m comments: | Value is converted from milliseconds to seconds for correct humanizeDuration display. Threshold is 60 seconds. Adjust based on your checkpoint interval and state size. - name: Flink task backpressured description: "Flink task {{ $labels.task_name }} in job {{ $labels.job_name }} is backpressured." query: "flink_taskmanager_job_task_isBackPressured == 1" severity: warning for: 5m - name: Flink task high backpressure time description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure." query: "flink_taskmanager_job_task_backPressuredTimeMsPerSecond > 500" severity: warning for: 5m comments: | Fires when a task spends more than 500ms/sec backpressured. This indicates the task cannot keep up with upstream data rate. - name: Flink TaskManager heap memory high description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%." query: "flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9" severity: warning for: 5m - name: Flink JobManager heap memory high description: "Flink JobManager {{ $labels.instance }} heap memory usage is above 90%." query: "flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9" severity: warning for: 5m - name: Flink TaskManager GC time high description: "Flink TaskManager {{ $labels.instance }} is spending more than 10% of time in garbage collection." 
query: "rate(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100" severity: warning for: 5m comments: | Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload. - name: Flink no records processed description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes." query: "rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0" severity: warning for: 5m comments: | Only fires for tasks that have previously received records, to avoid false positives during startup. - name: Apache Spark exporters: - name: Built-in Prometheus (PrometheusServlet + PrometheusResource) slug: spark-prometheus doc_url: https://spark.apache.org/docs/latest/monitoring.html comments: | Spark exposes metrics via two built-in endpoints: - PrometheusServlet: master/worker/driver metrics at /metrics/prometheus/ (ports 8080, 8081, 4040) - PrometheusResource: executor metrics at /metrics/executors/prometheus/ (port 4040, requires spark.ui.prometheus.enabled=true in Spark 3.x) Metric names from PrometheusServlet include a dynamic namespace (application ID), making static PromQL queries challenging. Configuration: spark.metrics.conf.*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet rules: - name: Spark no alive workers description: "No Spark workers are alive. The cluster has no processing capacity." query: "metrics_master_aliveWorkers_Value == 0" severity: critical for: 1m - name: Spark too many waiting apps description: "Spark has {{ $value }} applications waiting for resources." query: "metrics_master_waitingApps_Value > 10" severity: warning for: 5m comments: | Adjust the threshold based on your cluster's typical queuing behavior. - name: Spark worker memory exhausted description: "Spark worker {{ $labels.instance }} has no free memory ({{ $value }}MB free)." 
query: "metrics_worker_memFree_MB_Value == 0" severity: warning for: 2m - name: Spark worker cores exhausted description: "Spark worker {{ $labels.instance }} has no free cores." query: "metrics_worker_coresFree_Value == 0" severity: warning for: 5m comments: | Fires when a worker has no free cores. This may be normal under high load but can indicate capacity issues. - name: Spark executor high GC time description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC." query: "metrics_executor_totalGCTime_seconds_total / metrics_executor_totalDuration > 0.1 and metrics_executor_totalDuration > 0" severity: warning for: 5m comments: | Fires when more than 10% of executor time is spent in garbage collection. This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/). - name: Spark executor all tasks failing description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed)." query: "metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0" severity: critical for: 5m - name: Spark executor high task failure rate description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%." query: "metrics_executor_failedTasks_total / metrics_executor_totalTasks_total > 0.1 and metrics_executor_totalTasks_total > 0" severity: warning for: 5m - name: Spark executor high disk spill description: "Spark executor {{ $labels.executor_id }} is spilling data to disk. Consider increasing executor memory." query: "metrics_executor_diskUsed_bytes > 1e9" severity: warning for: 5m comments: | diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default. Disk spilling indicates insufficient memory for the workload. 
- name: Hadoop exporters: - name: hadoop/jmx_exporter slug: jmx_exporter doc_url: https://github.com/prometheus/jmx_exporter rules: # Alert rule for NameNode availability - name: Hadoop Name Node Down query: up{job="hadoop-namenode"} == 0 for: 5m severity: critical description: "The Hadoop NameNode service is unavailable." # Alert rule for ResourceManager availability - name: Hadoop Resource Manager Down query: up{job="hadoop-resourcemanager"} == 0 for: 5m severity: critical description: "The Hadoop ResourceManager service is unavailable." # Alert rule for DataNode status - name: Hadoop Data Node Out Of Service query: hadoop_datanode_last_heartbeat == 0 for: 10m severity: warning description: "The Hadoop DataNode is not sending heartbeats." # Alert rule for low HDFS disk space - name: Hadoop HDFS Disk Space Low query: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 and hadoop_hdfs_bytes_total > 0 for: 15m severity: warning description: "Available HDFS disk space is running low." # Alert rule for excessive MapReduce task failures - name: Hadoop Map Reduce Task Failures query: increase(hadoop_mapreduce_task_failures_total[1h]) > 100 for: 10m severity: critical description: "There is an unusually high number of MapReduce task failures." # Alert rule for high ResourceManager memory usage - name: Hadoop Resource Manager Memory High query: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 for: 15m severity: warning description: "The Hadoop ResourceManager is approaching its memory limit." # Alert rule for high YARN container allocation failures - name: Hadoop YARN Container Allocation Failures query: increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10 for: 10m severity: warning description: "There is a significant number of YARN container allocation failures." 
# Alert rule for excessive HBase region server region count - name: Hadoop HBase Region Count High query: hadoop_hbase_region_count > 5000 for: 15m severity: warning description: "The HBase cluster has an unusually high number of regions." # Alert rule for low HBase region server heap space - name: Hadoop HBase Region Server Heap Low query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 for: 10m severity: warning description: "HBase Region Servers are running low on heap space." # Alert rule for high HBase Write Requests latency - name: Hadoop HBase Write Requests Latency High query: hadoop_hbase_write_requests_latency_seconds > 0.5 for: 10m severity: warning description: "HBase Write Requests are experiencing high latency." - name: Orchestrators services: - name: Kubernetes exporters: - name: kube-state-metrics slug: kubestate-exporter doc_url: https://github.com/kubernetes/kube-state-metrics/tree/master/docs rules: - name: Kubernetes Node not ready description: Node {{ $labels.node }} has been unready for a long time query: 'kube_node_status_condition{condition="Ready",status="true"} == 0' severity: critical for: 10m - name: Kubernetes Node scheduling disabled description: Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes. query: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1' severity: warning for: 30m comments: | Kubernetes nodes with scheduling disabled are fine in themselves. This alert is useful to get warned about nodes that stay unschedulable for a long time.
- name: Kubernetes Node memory pressure description: "Node {{ $labels.node }} has MemoryPressure condition" query: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1' severity: critical for: 2m - name: Kubernetes Node disk pressure description: "Node {{ $labels.node }} has DiskPressure condition" query: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1' severity: critical for: 2m - name: Kubernetes Node network unavailable description: "Node {{ $labels.node }} has NetworkUnavailable condition" query: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1' severity: critical for: 2m - name: Kubernetes Node out of pod capacity description: "Node {{ $labels.node }} is out of pod capacity" query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' severity: warning for: 2m - name: Kubernetes Container oom killer description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes." 
query: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1' severity: warning - name: Kubernetes Job failed description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete" query: "kube_job_status_failed > 0" severity: warning - name: Kubernetes Job not starting description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes" query: "kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600" severity: warning - name: Kubernetes CronJob failing description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing" query: "(kube_cronjob_status_last_schedule_time > kube_cronjob_status_last_successful_time) AND (kube_cronjob_status_active == 0) AND (kube_cronjob_spec_suspend == 0)" severity: critical - name: Kubernetes CronJob suspended description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended" query: "kube_cronjob_spec_suspend != 0" severity: warning - name: Kubernetes PersistentVolumeClaim pending description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending" query: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1' severity: warning for: 2m - name: Kubernetes Volume out of disk space description: Volume is almost full (< 10% left) query: "kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 and kubelet_volume_stats_capacity_bytes > 0" severity: warning for: 2m - name: Kubernetes Volume full in four days description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days, based on a linear extrapolation of the available space over the last 6h."
query: "predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0" severity: critical - name: Kubernetes PersistentVolume error description: "Persistent volume {{ $labels.persistentvolume }} is in bad state" query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0' severity: critical - name: Kubernetes StatefulSet down description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down query: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0" severity: critical for: 1m - name: Kubernetes HPA scale inability description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale query: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0' severity: warning for: 2m - name: Kubernetes HPA metrics unavailability description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1' severity: warning - name: Kubernetes HPA scale maximum description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods query: "(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)" severity: info for: 2m - name: Kubernetes HPA underutilized description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here. 
query: "max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3" # allow minimum 3 replicas running severity: info - name: Kubernetes Pod not healthy description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes. query: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0' severity: critical for: 15m - name: Kubernetes pod crash looping description: Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping query: "increase(kube_pod_container_status_restarts_total[1m]) > 3" severity: warning for: 2m - name: Kubernetes ReplicaSet replicas mismatch description: ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch query: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas" severity: warning for: 10m - name: Kubernetes Deployment replicas mismatch description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch query: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available" severity: warning for: 10m - name: Kubernetes StatefulSet replicas mismatch description: StatefulSet does not match the expected number of replicas. query: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas" severity: warning for: 10m - name: Kubernetes Deployment generation mismatch description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back. query: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation" severity: critical for: 10m - name: Kubernetes StatefulSet generation mismatch description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back. 
query: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation" severity: critical for: 10m - name: Kubernetes StatefulSet update not rolled out description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. query: "max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)" severity: warning for: 10m - name: Kubernetes DaemonSet rollout stuck description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready query: "(kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 and kube_daemonset_status_desired_number_scheduled > 0) or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0" severity: warning for: 10m - name: Kubernetes DaemonSet misscheduled description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run query: "kube_daemonset_status_number_misscheduled > 0" severity: critical for: 1m - name: Kubernetes CronJob too long description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 1h to complete. query: "(kube_job_status_start_time > 0 and (time() - kube_job_status_start_time) > 3600) unless kube_job_status_completion_time" severity: warning comments: | Threshold should be customized for each cronjob name. Jobs that already report kube_job_status_completion_time are excluded with `unless`; absent() cannot be used here because it returns a label-less vector that never matches per-job series. - name: Kubernetes Job slow completion description: Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.
query: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0" severity: critical for: 12h - name: Kubernetes API server errors description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate" query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3 and sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) > 0' severity: critical for: 2m - name: Kubernetes API client errors description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate" query: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0' severity: critical for: 2m - name: Kubernetes client certificate expires next week description: A client certificate used to authenticate to the apiserver is expiring next week. query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60' severity: warning - name: Kubernetes client certificate expires soon description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60' severity: critical - name: Kubernetes API server latency description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}." 
query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1' severity: warning for: 2m - name: Nomad exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Nomad job failed description: Nomad job failed query: "nomad_nomad_job_summary_failed > 0" severity: warning - name: Nomad job lost description: Nomad job lost query: "nomad_nomad_job_summary_lost > 0" severity: warning - name: Nomad job queued description: Nomad job queued query: "nomad_nomad_job_summary_queued > 0" severity: warning for: 2m - name: Nomad blocked evaluation description: Nomad blocked evaluation query: "nomad_nomad_blocked_evals_total_blocked > 0" severity: warning - name: Consul exporters: - name: prometheus/consul_exporter slug: consul-exporter doc_url: https://github.com/prometheus/consul_exporter rules: - name: Consul service healthcheck failed description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`" query: "consul_catalog_service_node_healthy == 0" severity: critical for: 1m # allows a short service restart - name: Consul missing master node description: Number of consul raft peers should be 3, in order to preserve quorum.
query: "consul_raft_peers < 3" severity: critical - name: Consul agent unhealthy description: A Consul agent is down query: 'consul_health_node_status{status="critical"} == 1' severity: critical - name: Etcd exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Etcd insufficient Members description: Etcd cluster should have an odd number of members query: "count(etcd_server_id) % 2 == 0" severity: critical - name: Etcd no Leader description: Etcd cluster has no leader query: "etcd_server_has_leader == 0" severity: critical - name: Etcd high number of leader changes description: "Etcd leader changed {{ $value }} times during 10 minutes" query: "increase(etcd_server_leader_changes_seen_total[10m]) > 2" severity: warning - name: Etcd high number of failed GRPC requests warning description: More than 1% GRPC request failure detected in Etcd query: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0' severity: warning for: 2m comments: | Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled. - name: Etcd high number of failed GRPC requests critical description: More than 5% GRPC request failure detected in Etcd query: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0' severity: critical for: 2m comments: | Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
- name: Etcd GRPC requests slow description: GRPC requests slowing down, 99th percentile is over 0.15s query: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15' severity: warning for: 2m - name: Etcd high number of failed HTTP requests warning description: More than 1% HTTP failure detected in Etcd query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0" severity: warning for: 2m - name: Etcd high number of failed HTTP requests critical description: More than 5% HTTP failure detected in Etcd query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0" severity: critical for: 2m - name: Etcd HTTP requests slow description: HTTP requests slowing down, 99th percentile is over 0.15s query: "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15" severity: warning for: 2m - name: Etcd member communication slow description: Etcd member communication slowing down, 99th percentile is over 0.15s query: "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15" severity: warning for: 2m - name: Etcd high number of failed proposals description: "Etcd server got {{ $value }} failed proposals in the past hour" query: "increase(etcd_server_proposals_failed_total[1h]) > 5" severity: warning for: 2m - name: Etcd high fsync durations description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s query: "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5" severity: warning for: 2m - name: Etcd high commit durations description: Etcd commit duration increasing, 99th percentile is over 0.25s query: "histogram_quantile(0.99, 
rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25" severity: warning for: 2m - name: OpenStack exporters: - name: openstack-exporter/openstack-exporter slug: openstack-exporter doc_url: https://github.com/openstack-exporter/openstack-exporter rules: - name: OpenStack exporter down description: The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected. query: 'up{job=~".*openstack.*"} == 0' severity: critical for: 2m - name: OpenStack Nova agent down description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}" query: 'openstack_nova_agent_state{adminState="enabled"} == 0' severity: critical for: 2m - name: OpenStack Neutron agent down description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down" query: 'openstack_neutron_agent_state{adminState="up"} == 0' severity: critical for: 2m - name: OpenStack Cinder agent down description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}" query: 'openstack_cinder_agent_state{adminState="enabled"} == 0' severity: critical for: 2m - name: OpenStack hypervisor high vCPU usage description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%" query: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0' severity: warning for: 5m comments: | The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns. - name: OpenStack hypervisor high memory usage description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%" query: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0' severity: warning for: 5m comments: | The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns. 
- name: OpenStack hypervisor high disk usage description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%" query: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0' severity: warning for: 5m - name: OpenStack Nova tenant vCPU quota nearly exhausted description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota" query: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0' severity: warning comments: | A value of -1 for limits_vcpus_max means unlimited quota (no limit set). - name: OpenStack Nova tenant memory quota nearly exhausted description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota" query: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0' severity: warning - name: OpenStack Nova tenant instance quota nearly exhausted description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota" query: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0' severity: warning - name: OpenStack Cinder tenant volume quota nearly exhausted description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota" query: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0' severity: warning - name: OpenStack Cinder pool low free capacity description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity" query: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0' severity: warning for: 5m - name: OpenStack Neutron floating IPs associated but not active description: "{{ $value }} floating IPs are associated to a private IP 
but are not in ACTIVE state" query: 'openstack_neutron_floating_ips_associated_not_active > 0' severity: warning for: 5m - name: OpenStack Neutron routers not active description: "{{ $value }} Neutron routers are not in ACTIVE state" query: 'openstack_neutron_routers_not_active > 0' severity: warning for: 5m - name: OpenStack Neutron subnet IP pool exhaustion description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool" query: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0' severity: warning - name: OpenStack Neutron ports without IPs description: "{{ $value }} active ports have no IP addresses assigned" query: 'openstack_neutron_ports_no_ips > 0' severity: warning for: 5m - name: OpenStack load balancer not online description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}" query: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0' severity: warning for: 5m - name: OpenStack Nova instances in ERROR state description: "{{ $value }} Nova instances are in ERROR state" query: 'sum(openstack_nova_server_status{status="ERROR"}) > 0' severity: warning for: 5m - name: OpenStack Cinder volumes in error state description: "{{ $value }} Cinder volumes are in an error state" query: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0' severity: warning for: 5m - name: OpenStack placement resource high usage description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation" query: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0' severity: warning for: 5m comments: | This alert factors in the allocation ratio to compute effective capacity. 
The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns. - name: CI/CD services: - name: Jenkins exporters: - name: Metric plugin slug: metric-plugin doc_url: https://plugins.jenkins.io/prometheus/ rules: - name: Jenkins node offline description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "jenkins_node_offline_value > 0" severity: critical for: 5m - name: Jenkins no node online description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "jenkins_node_online_value == 0" severity: critical - name: Jenkins healthcheck description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "jenkins_health_check_score < 1" severity: critical - name: Jenkins outdated plugins description: "{{ $value }} plugins need update" query: "sum(jenkins_plugins_withUpdate) by (instance) > 3" severity: warning for: 1d - name: Jenkins builds health score description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "default_jenkins_builds_health_score < 1" severity: critical - name: Jenkins run failure total description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "delta(jenkins_runs_failure_total[1h]) > 100" severity: warning - name: Jenkins build tests failing description: "Last build tests failed: {{$labels.jenkins_job}}. 
Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})" query: "default_jenkins_builds_last_build_tests_failing > 0" severity: warning - name: Jenkins last build failed description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})" query: "default_jenkins_builds_last_build_result_ordinal == 2" severity: warning comments: | * RUNNING -1 true - The build is still in progress. * SUCCESS 0 true - The build had no errors. * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed. * FAILURE 2 false - The build had a fatal error. * NOT_BUILT 3 false - The module was not built. * ABORTED 4 false - The build was manually aborted. - name: ArgoCD exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://argo-cd.readthedocs.io/en/stable/operator-manual/metrics/ rules: - name: ArgoCD service not synced description: Service {{ $labels.name }} run by argo is currently not in sync. query: 'argocd_app_info{sync_status!="Synced"} != 0' severity: warning for: 15m - name: ArgoCD service unhealthy description: Service {{ $labels.name }} run by argo is currently not healthy. query: 'argocd_app_info{health_status!="Healthy"} != 0' severity: warning for: 15m - name: FluxCD exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://fluxcd.io/flux/monitoring/metrics/ rules: - name: Flux Kustomization Failure description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready. query: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0' severity: warning for: 15m - name: Flux HelmRelease Failure description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.
query: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0' severity: warning for: 15m - name: Flux Source Issue description: Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s). query: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0' severity: warning for: 15m - name: Flux Image Issue description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready. query: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0' severity: warning for: 15m - name: GitLab CI exporters: - name: GitLab built-in exporter slug: gitlab-built-in-exporter doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/ rules: # Puma web server - name: GitLab Puma high queued connections description: "GitLab Puma has {{ $value }} queued connections on {{ $labels.instance }}. Requests are waiting for an available worker thread." query: "puma_queued_connections > 5" severity: warning for: 5m comments: | Queued connections indicate Puma workers are saturated. Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb. - name: GitLab Puma no available pool capacity description: "GitLab Puma pool capacity on {{ $labels.instance }} has been at 0 for 5 minutes. All threads are busy." query: "puma_pool_capacity == 0" severity: critical for: 5m - name: GitLab Puma workers not running description: "GitLab Puma on {{ $labels.instance }} has {{ $value }} running workers out of expected total." query: "puma_running_workers < puma_workers" severity: warning for: 5m # HTTP request handling - name: GitLab high HTTP error rate description: "GitLab is returning more than 5% HTTP 5xx errors on {{ $labels.instance }}." 
query: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5 and sum(rate(http_requests_total[5m])) > 0' severity: critical for: 5m comments: | Threshold is 5% of all requests returning server errors. Check GitLab logs at /var/log/gitlab/ for root cause. - name: GitLab high HTTP request latency description: "GitLab p95 HTTP request latency on {{ $labels.instance }} is above 10 seconds." query: "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 10" severity: warning for: 5m comments: | Threshold of 10s may need adjustment based on your instance size and workload. # Sidekiq background jobs - name: GitLab Sidekiq jobs failing description: "GitLab Sidekiq jobs are failing at a rate of {{ $value }} per second on {{ $labels.instance }}." query: "rate(sidekiq_jobs_failed_total[5m]) > 0.1" severity: warning for: 10m comments: | This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. A sustained failure rate indicates background processing issues. - name: GitLab Sidekiq queue too large description: "GitLab Sidekiq has {{ $value }} running jobs, approaching concurrency limit on {{ $labels.instance }}." query: "sum(sidekiq_running_jobs) >= sum(sidekiq_concurrency) * 0.9" severity: warning for: 10m comments: | When running jobs approach the concurrency limit, new jobs will queue up. Consider scaling Sidekiq workers or increasing concurrency. - name: GitLab Sidekiq high job completion time description: "GitLab Sidekiq job p95 completion time on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }})." query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_completion_seconds_bucket[5m])) by (le, worker)) > 300" severity: warning for: 10m comments: | This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. 
- name: GitLab Sidekiq high queue latency description: "GitLab Sidekiq jobs on {{ $labels.instance }} are waiting more than 60 seconds before being processed." query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_queue_duration_seconds_bucket[5m])) by (le)) > 60" severity: warning for: 5m comments: | This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes. # Database connection pool - name: GitLab database connection pool saturation description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) is {{ $value }}% busy." query: "gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90 and gitlab_database_connection_pool_size > 0" severity: warning for: 5m comments: | When the pool is near saturation, requests may block waiting for a connection. Increase db_pool_size in gitlab.rb or investigate slow queries. - name: GitLab database connection pool dead connections description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) has {{ $value }} dead connections." query: "gitlab_database_connection_pool_dead > 0" severity: warning for: 5m - name: GitLab database connection pool waiting description: "GitLab on {{ $labels.instance }} has {{ $value }} threads waiting for a database connection." query: "gitlab_database_connection_pool_waiting > 0" severity: warning for: 5m # CI/CD pipelines - name: GitLab CI pipeline creation slow description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds." query: "histogram_quantile(0.95, sum(rate(gitlab_ci_pipeline_creation_duration_seconds_bucket[5m])) by (le)) > 30" severity: warning for: 5m - name: GitLab CI pipeline failures increasing description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s)." 
query: "rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0" severity: warning for: 10m comments: | This metric may not exist in all GitLab versions. Verify against your GitLab installation. - name: GitLab CI runner authentication failures description: "GitLab CI runners are experiencing authentication failures on {{ $labels.instance }} ({{ $value }} failures)." query: "increase(gitlab_ci_runner_authentication_failure_total[5m]) > 5" severity: warning for: 5m comments: | Frequent runner auth failures may indicate expired tokens or misconfigured runners. # Ruby process health - name: GitLab high memory usage description: "GitLab process on {{ $labels.instance }} is using {{ $value | humanize1024 }}B of RSS memory." query: "process_resident_memory_bytes{job=~\".*gitlab.*\"} > 2e+9" severity: warning for: 10m comments: | Threshold of 2GB may need adjustment based on your instance size. High memory usage can lead to OOM kills and service disruptions. - name: GitLab Ruby heap fragmentation description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory." query: "ruby_gc_stat_ext_heap_fragmentation{job=~\".*gitlab.*\"} > 0.5" severity: warning for: 15m comments: | Heap fragmentation above 50% means a significant amount of memory is wasted. A Puma worker restart may help reclaim memory. # Uncaught errors - name: GitLab rack uncaught errors description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s)." query: "rate(rack_uncaught_errors_total[5m]) > 0" severity: warning for: 5m # Application version / deployment - name: GitLab version mismatch description: "Multiple GitLab versions are running across the fleet." query: 'count(count by (version) (gitlab_build_info)) > 1' severity: warning comments: | This may happen during a rolling deployment. If it persists, investigate incomplete upgrades. 
# File descriptors - name: GitLab high file descriptor usage description: "GitLab on {{ $labels.instance }} is using {{ $value }}% of available file descriptors." query: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80 and process_max_fds > 0' severity: warning for: 5m # Ruby threads - name: GitLab Ruby threads saturated description: "GitLab running threads on {{ $labels.instance }} have exceeded the expected maximum ({{ $value }})." query: "sum by (instance) (gitlab_ruby_threads_running_threads) > on(instance) gitlab_ruby_threads_max_expected_threads * 1.5" severity: warning for: 10m - name: Workhorse slug: workhorse doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/#gitlab-workhorse rules: - name: GitLab Workhorse high error rate description: "GitLab Workhorse on {{ $labels.instance }} is returning more than 10% HTTP 5xx errors." query: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0' severity: critical for: 5m comments: | Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying. Threshold from GitLab Omnibus default rules: 10% for high-traffic instances. - name: GitLab Workhorse high latency description: "GitLab Workhorse on {{ $labels.instance }} p95 request latency is above 10 seconds." query: "histogram_quantile(0.95, sum(rate(gitlab_workhorse_http_request_duration_seconds_bucket[5m])) by (le)) > 10" severity: warning for: 5m - name: GitLab Workhorse high in-flight requests description: "GitLab Workhorse on {{ $labels.instance }} has {{ $value }} in-flight requests." query: "gitlab_workhorse_http_in_flight_requests > 100" severity: warning for: 5m comments: | Threshold of 100 may need adjustment based on instance size. 
- name: Gitaly slug: gitaly doc_url: https://docs.gitlab.com/administration/gitaly/monitoring/ rules: - name: GitLab Gitaly high gRPC error rate description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors." query: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' severity: warning for: 5m comments: | grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise. - name: GitLab Gitaly resource exhausted description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%)." query: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' severity: critical for: 5m comments: | ResourceExhausted errors from Gitaly mean Git operations are being rejected due to concurrency limits. This directly impacts users trying to push, pull, or clone. This alert is derived from the GitLab Omnibus default rules. - name: GitLab Gitaly high RPC latency description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s)." query: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le)) > 1' severity: warning for: 5m - name: GitLab Gitaly CPU throttled description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups." query: "rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0" severity: warning for: 5m - name: GitLab Gitaly authentication failures description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }})." 
query: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0' severity: warning - name: GitLab Gitaly circuit breaker tripped description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing." query: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0' severity: critical comments: | When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail. Check Gitaly service health and logs. - name: Spinnaker exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://spinnaker.io/docs/setup/other_config/monitoring/ rules: - name: Spinnaker circuit breaker open description: "Circuit breaker {{ $labels.name }} is open on {{ $labels.instance }}, indicating repeated downstream failures." query: 'resilience4j_circuitbreaker_state{state="open"} == 1' severity: warning for: 5m - name: Spinnaker Orca queue backing up description: "Orca work queue has {{ $value }} messages ready for delivery but not yet picked up. Pipeline executions may be delayed." query: 'queue_ready_depth > 0' severity: warning for: 5m comments: | In a healthy Spinnaker, queue_ready_depth should stay at or near 0. Sustained non-zero values indicate Orca cannot keep up with incoming work. - name: Spinnaker Orca queue message lag high description: "Orca queue message lag is {{ $value }}s. Pipeline stages are waiting too long before being processed." query: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30 and rate(queue_message_lag_seconds_count[5m]) > 0' severity: warning for: 5m comments: | The 30s threshold is a rough default. Adjust based on your pipeline SLOs. - name: Spinnaker dead messages description: "Orca is producing dead-lettered messages ({{ $value }} per second). These are tasks that exhausted all retries and will not be executed." 
query: 'rate(queue_dead_messages_total[5m]) > 0' severity: critical for: 2m - name: Spinnaker zombie executions description: "{{ $value }} zombie pipeline executions detected. These are executions with no corresponding queue messages." query: 'rate(queue_zombies_total[5m]) > 0' severity: warning for: 5m comments: | Zombies are pipeline executions that are running but have lost their queue entry. See https://spinnaker.io/docs/guides/runbooks/orca-zombie-executions/ - name: Spinnaker thread pool exhaustion description: "Orca message handler thread pool has {{ $value }} blocked threads on {{ $labels.instance }}. Pipeline execution throughput is degraded." query: 'threadpool_blockingQueueSize > 0' severity: warning for: 5m - name: Spinnaker polling monitor items over threshold description: "Igor polling monitor {{ $labels.monitor }} for {{ $labels.partition }} has exceeded its item threshold, preventing pipeline triggers." query: 'sum by (monitor, partition) (pollingMonitor_itemsOverThreshold) > 0' severity: critical for: 5m comments: | When this threshold is exceeded, Igor stops triggering pipelines for the affected monitor. See https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds - name: Spinnaker polling monitor failures description: "Igor polling monitor is experiencing failures ({{ $value }} per second). CI/SCM integrations may not trigger pipelines." query: 'rate(pollingMonitor_failed_total[5m]) > 0' severity: warning for: 5m - name: Spinnaker high API error rate description: "Spinnaker API 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}." query: 'sum by (instance) (rate(controller_invocations_total{status="5xx"}[5m])) / sum by (instance) (rate(controller_invocations_total[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total[5m])) > 0' severity: warning for: 5m comments: | The 5% threshold is a rough default. Adjust based on your traffic patterns. 
- name: Spinnaker API rate limit throttling description: "Gate is actively throttling API requests on {{ $labels.instance }} ({{ $value }} throttled requests per second)." query: 'rate(rateLimitThrottling_total[5m]) > 0' severity: warning for: 2m - name: Spinnaker Clouddriver high error rate description: "Clouddriver 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}. Cloud operations may be failing." query: 'sum by (instance) (rate(controller_invocations_total{status="5xx", job=~".*clouddriver.*"}[5m])) / sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0' severity: warning for: 5m - name: Spinnaker AWS rate limiting description: "Clouddriver is being rate-limited by AWS on {{ $labels.instance }} ({{ $value }}ms delay). Cloud operations will be slower." query: 'amazonClientProvider_rateLimitDelayMil > 1000' severity: warning for: 5m comments: | This metric is specific to AWS cloud providers in Clouddriver. The 1000ms threshold is a rough default. Adjust based on your AWS usage patterns. - name: Network and security services: - name: SpeedTest exporters: - name: Speedtest exporter slug: nlamirault-speedtest-exporter doc_url: https://github.com/nlamirault/speedtest_exporter rules: - name: SpeedTest Slow Internet Download description: Internet download speed is currently {{humanize $value}} Mbps. query: "avg_over_time(speedtest_download[10m]) < 100" severity: warning - name: SpeedTest Slow Internet Upload description: Internet upload speed is currently {{humanize $value}} Mbps. 
query: "avg_over_time(speedtest_upload[10m]) < 20" severity: warning - name: SSL/TLS exporters: - name: ssl_exporter slug: ribbybibby-ssl-exporter doc_url: https://github.com/ribbybibby/ssl_exporter rules: - name: SSL certificate probe failed description: Failed to fetch SSL information {{ $labels.instance }} query: ssl_probe_success == 0 severity: critical - name: SSL certificate OSCP status unknown description: Failed to get the OCSP status {{ $labels.instance }} query: ssl_ocsp_response_status == 2 severity: warning - name: SSL certificate revoked description: SSL certificate revoked {{ $labels.instance }} query: ssl_ocsp_response_status == 1 severity: critical - name: SSL certificate expiry (< 7 days) description: "{{ $labels.instance }} Certificate is expiring in 7 days" query: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7 severity: warning - name: cert-manager exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://cert-manager.io/docs/devops-tips/prometheus-metrics/ rules: - name: Cert-Manager absent description: Cert-Manager has disappeared from Prometheus service discovery. New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back. query: 'absent(up{job="cert-manager"})' severity: critical for: 10m - name: Cert-Manager certificate expiring soon description: The certificate {{ $labels.name }} is expiring in less than 21 days. query: 'avg by (exported_namespace, namespace, name) (certmanager_certificate_expiration_timestamp_seconds - time()) < (21 * 24 * 3600)' severity: warning for: 1h comments: | Threshold of 21 days is a rough default. ACME certificates are typically renewed 30 days before expiry, so expiring within 21 days may indicate issuer misconfiguration. - name: Cert-Manager certificate not ready description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic."
query: 'max by (name, exported_namespace, namespace, condition) (certmanager_certificate_ready_status{condition!="True"} == 1)' severity: critical for: 10m - name: Cert-Manager hitting ACME rate limits description: Cert-Manager is being rate-limited by the ACME provider. Certificate issuance and renewal may be blocked for up to a week. query: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0' severity: critical for: 5m comments: | In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version. - name: Juniper exporters: - name: czerwonk/junos_exporter slug: czerwonk-junos-exporter doc_url: https://github.com/czerwonk/junos_exporter rules: - name: Juniper switch down description: The switch appears to be down query: junos_up == 0 severity: critical - name: Juniper critical Bandwidth Usage 1GiB description: Interface is highly saturated. (> 0.90 Gbit/s) query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90" severity: critical for: 1m - name: Juniper warning Bandwidth Usage 1GiB description: Interface is getting saturated.
(> 0.80 Gbit/s) query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80" severity: warning for: 1m - name: CoreDNS exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: CoreDNS Panic Count description: Number of CoreDNS panics encountered query: "increase(coredns_panics_total[1m]) > 0" severity: critical - name: Freeswitch exporters: - name: znerol/prometheus-freeswitch-exporter slug: znerol-freeswitch-exporter doc_url: https://pypi.org/project/prometheus-freeswitch-exporter rules: - name: Freeswitch down description: Freeswitch is unresponsive query: "freeswitch_up == 0" severity: critical - name: Freeswitch Sessions Warning description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0" severity: warning for: 10m - name: Freeswitch Sessions Critical description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 90 and freeswitch_session_limit > 0" severity: critical for: 5m - name: Hashicorp Vault exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://github.com/hashicorp/vault/blob/master/website/content/docs/configuration/telemetry.mdx#prometheus rules: - name: Vault sealed description: "Vault instance is sealed on {{ $labels.instance }}" query: "vault_core_unsealed == 0" severity: critical - name: Vault too many pending tokens description: 'Too many pending tokens {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' query: "avg(vault_token_create_count - vault_token_store_count) > 0" severity: warning for: 5m - name: Vault too many infinity tokens description: 'Too many infinity tokens {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' query: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3' severity: warning for: 5m - name: Vault cluster health description: 'Vault
cluster is not healthy {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' query: "sum(vault_core_active) / count(vault_core_active) <= 0.5" severity: critical - name: Keycloak exporters: - name: aerogear/keycloak-metrics-spi slug: aerogear-keycloak-metrics-spi doc_url: https://github.com/aerogear/keycloak-metrics-spi rules: - name: Keycloak high login failure rate description: "More than 5% of login attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)." query: '(sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])) / (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])))) * 100 > 5 and (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m]))) > 0' severity: warning for: 5m comments: | Threshold of 5% is a rough default. Adjust based on your user base and expected error rates. A spike in failed logins may indicate a brute-force attack or misconfigured client. - name: Keycloak no successful logins description: "No successful logins in realm {{ $labels.realm }} for the last 15 minutes." query: 'sum by (realm) (rate(keycloak_logins_total[15m])) == 0 and (sum by (realm) (rate(keycloak_logins_total[15m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[15m]))) > 0' severity: critical for: 5m comments: Only fires when login attempts exist but none succeed — may indicate an authentication outage. - name: Keycloak high token refresh error rate description: "More than 10% of token refresh attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)." query: '(sum by (realm) (rate(keycloak_refresh_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_refresh_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_refresh_tokens_total[5m])) > 0' severity: warning for: 5m comments: Threshold of 10% is a rough default. 
High refresh token errors may indicate expired sessions or token store issues. - name: Keycloak high code-to-token exchange error rate description: "More than 10% of code-to-token exchanges are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)." query: '(sum by (realm) (rate(keycloak_code_to_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_code_to_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_code_to_tokens_total[5m])) > 0' severity: warning for: 5m comments: Threshold of 10% is a rough default. Code-to-token failures may indicate misconfigured OAuth clients or replay attacks. - name: Keycloak high registration failure rate description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)." query: '(sum by (realm) (rate(keycloak_registrations_errors_total[5m])) / sum by (realm) (rate(keycloak_registrations_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_registrations_total[5m])) > 0' severity: warning for: 5m comments: Threshold of 10% is a rough default. - name: Keycloak slow request response time description: "Keycloak {{ $labels.method }} requests are taking more than 2 seconds on average." query: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0' severity: warning for: 5m comments: | keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default. 
- name: Cloudflare exporters: - name: lablabs/cloudflare-exporter slug: lablabs-cloudflare-exporter doc_url: https://github.com/lablabs/cloudflare-exporter rules: - name: Cloudflare http 4xx error rate description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})" query: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[15m])) > 0' severity: warning - name: Cloudflare http 5xx error rate description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})" query: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[5m])) > 0' severity: critical - name: SNMP exporters: - name: prometheus/snmp_exporter slug: snmp-exporter doc_url: https://github.com/prometheus/snmp_exporter comments: | These rules use standard IF-MIB and SNMPv2-MIB metrics. Metric names depend on your snmp.yml module configuration. Thresholds for bandwidth and error rates are rough defaults - adjust to your environment. rules: - name: SNMP target down description: "SNMP device {{ $labels.instance }} is unreachable." query: 'up{job=~"snmp.*"} == 0' severity: critical for: 5m comments: From the official snmp-mixin. - name: SNMP interface down description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up." query: '(ifOperStatus{job=~"snmp.*"} == 2) and on(instance, job, ifIndex) (ifAdminStatus{job=~"snmp.*"} == 1)' severity: critical for: 2m - name: SNMP interface high inbound error rate description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an inbound error rate above 5%." 
query: 'rate(ifInErrors{job=~"snmp.*"}[5m]) / (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0' severity: warning for: 5m comments: Threshold is a rough default. Adjust based on your network environment. - name: SNMP interface high outbound error rate description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%." query: 'rate(ifOutErrors{job=~"snmp.*"}[5m]) / (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0' severity: warning for: 5m comments: Threshold is a rough default. Adjust based on your network environment. - name: SNMP interface high bandwidth usage inbound description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%." query: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0' severity: warning for: 15m comments: | Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead. - name: SNMP interface high bandwidth usage outbound description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} outbound utilization is above 80%." query: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0' severity: warning for: 15m comments: | Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead. 
- name: SNMP device restarted description: "SNMP device {{ $labels.instance }} has restarted (uptime < 5 minutes)." query: "sysUpTime / 100 < 300" severity: info comments: sysUpTime is in centiseconds (hundredths of a second). - name: Cilium exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://docs.cilium.io/en/stable/observability/metrics/ rules: # Agent health - name: Cilium agent unreachable nodes description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health." query: "sum(cilium_unreachable_nodes{}) by (pod) > 0" severity: warning for: 15m comments: | Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+). - name: Cilium agent unreachable health endpoints description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing." query: "sum(cilium_unreachable_health_endpoints{}) by (pod) > 0" severity: warning for: 15m comments: | Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+). - name: Cilium agent failing controllers description: "Cilium agent {{ $labels.pod }} has {{ $value }} failing controller(s). Check cilium-agent logs for details." query: "sum(cilium_controllers_failing{}) by (pod) > 0" severity: warning for: 5m comments: | Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+). # Endpoints - name: Cilium agent endpoint failures description: "Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state." query: 'sum(cilium_endpoint_state{endpoint_state="invalid"}) by (pod) > 0' severity: warning for: 5m - name: Cilium agent endpoint regeneration failures description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale." 
query: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0' severity: warning for: 5m - name: Cilium agent endpoint update failure description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }})." query: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0' severity: warning for: 5m - name: Cilium agent endpoint create failure description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking." query: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0' severity: info for: 5m # BPF maps - name: Cilium agent map operation failures description: "Cilium agent {{ $labels.pod }} has eBPF map operation failures on {{ $labels.map_name }}. Datapath may be degraded." query: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0' severity: warning for: 5m - name: Cilium agent BPF map pressure description: "Cilium agent {{ $labels.pod }} eBPF map {{ $labels.map_name }} is above 90% utilization. Map may become full." query: "cilium_bpf_map_pressure{} > 0.9" severity: warning for: 5m comments: Map pressure is a ratio from 0 to 1. At 1.0, the map is full and new entries will be dropped. # Conntrack and NAT - name: Cilium agent conntrack table full description: "Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks." query: 'sum(rate(cilium_drop_count_total{reason="CT: Map insertion failed"}[5m])) by (pod) > 0' severity: critical for: 5m - name: Cilium agent conntrack failed garbage collection description: "Cilium agent {{ $labels.pod }} conntrack garbage collection is failing. Stale entries may accumulate." 
query: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0' severity: warning for: 5m - name: Cilium agent NAT table full description: "Cilium agent {{ $labels.pod }} NAT table is full, causing masquerade failures. Increase NAT map size or investigate." query: 'sum(rate(cilium_drop_count_total{reason="No mapping for NAT masquerade"}[1m])) by (pod) > 0' severity: critical for: 5m # Packet drops - name: Cilium agent high denied rate description: "Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct." query: 'sum(rate(cilium_drop_count_total{reason="Policy denied"}[1m])) by (pod) > 0' severity: info for: 10m comments: Policy denials may be expected behavior. Investigate only if unexpected traffic is being blocked. - name: Cilium agent high drop rate description: "Cilium agent {{ $labels.pod }} is dropping packets for reason {{ $labels.reason }}. This indicates infrastructure issues." query: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0' severity: warning for: 5m # Policy - name: Cilium agent policy map pressure description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply." query: 'sum(cilium_bpf_map_pressure{map_name=~"cilium_policy_.*"}) by (pod) > 0.9' severity: warning for: 5m - name: Cilium agent policy import errors description: "Cilium agent {{ $labels.pod }} is failing to import network policies. Policy enforcement may be incomplete." query: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0' severity: warning for: 5m - name: Cilium agent policy implementation delay description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies." 
query: "histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay[5m])) by (le, pod)) > 60" severity: warning for: 5m comments: Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity. # Identity - name: Cilium node-local high identity allocation description: "Cilium agent {{ $labels.pod }} node-local identity allocation is above 80%. Approaching the 65535 identity limit." query: '(sum(cilium_identity{type="node_local"}) by (pod) / (2^16-1)) > 0.8' severity: warning for: 5m - name: Cilium cluster high identity allocation description: "Cilium cluster-wide identity allocation is above 80%. Approaching the maximum identity limit." query: '(sum(cilium_identity{type="cluster_local"}) by () / (2^16-256)) > 0.8' severity: warning for: 5m # IPAM - name: Cilium operator exhausted IPAM IPs description: "Cilium operator has no available IPAM IPs. New pods will fail to schedule networking." query: 'sum(cilium_operator_ipam_ips{type="available"}) by () <= 0' severity: critical for: 5m - name: Cilium operator low available IPAM IPs description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion." query: 'sum(cilium_operator_ipam_ips{type!="available"}) by () / sum(cilium_operator_ipam_ips) by () > 0.9 and sum(cilium_operator_ipam_ips) by () > 0' severity: warning for: 5m comments: Threshold of 90% is a rough default. Adjust based on your pod churn rate and IP pool size. - name: Cilium operator IPAM interface creation failures description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted." query: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0' severity: warning for: 10m comments: | Some Cilium versions may not have a status label on this metric. Verify against your Cilium version. 
# API and K8s client - name: Cilium agent API errors description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy." query: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0' severity: warning for: 5m - name: Cilium agent Kubernetes client errors description: "Cilium agent {{ $labels.pod }} is receiving errors from K8s API for endpoint {{ $labels.endpoint }} ({{ $labels.return_code }})." query: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0' severity: info for: 5m # ClusterMesh - name: Cilium ClusterMesh remote cluster not ready description: "Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}." query: "count(cilium_clustermesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0" severity: critical for: 5m - name: Cilium ClusterMesh remote cluster failing description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing." query: "sum(rate(cilium_clustermesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0" severity: critical for: 5m # KVStoreMesh - name: Cilium KVStoreMesh remote cluster not ready description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}." query: "count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0" severity: critical for: 5m - name: Cilium KVStoreMesh remote cluster failing description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures." 
query: "sum(rate(cilium_kvstoremesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0" severity: critical for: 5m - name: Cilium KVStoreMesh sync errors description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors." query: "sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0" severity: critical for: 5m # Hubble - name: Cilium Hubble lost events description: "Cilium Hubble on {{ $labels.pod }} is losing flow events. Observability data may be incomplete." query: "sum(rate(hubble_lost_events_total[5m])) by (pod) > 0" severity: warning for: 5m - name: Cilium Hubble high DNS error rate description: "Cilium Hubble on {{ $labels.pod }} is observing more than 10% DNS error responses." query: 'sum(rate(hubble_dns_responses_total{rcode!="No Error"}[5m])) by (pod) / sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0.1 and sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0' severity: warning for: 5m comments: Threshold of 10% is a rough default. Some DNS errors may be normal depending on your workload. - name: WireGuard exporters: - name: MindFlavor/prometheus_wireguard_exporter slug: mindflavor-prometheus-wireguard-exporter doc_url: https://github.com/MindFlavor/prometheus_wireguard_exporter rules: - name: WireGuard peer handshake too old description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has not had a handshake for over 5 minutes. The tunnel may be down." query: 'time() - wireguard_latest_handshake_seconds > 300 and wireguard_latest_handshake_seconds > 0' severity: warning for: 2m comments: | The threshold of 300 seconds (5 minutes) is a rough default. WireGuard peers that are idle but reachable typically re-handshake every 2 minutes. Adjust based on your keepalive interval. The `> 0` guard excludes peers that have never completed a handshake (covered by a separate rule). 
- name: WireGuard peer handshake never established description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has never completed a handshake. Check peer configuration and network connectivity." query: 'wireguard_latest_handshake_seconds == 0' severity: critical for: 5m comments: | This alert will fire for all offline mobile/laptop peers. Consider filtering by expected-online peers. - name: WireGuard no traffic on peer description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has had no traffic for 15 minutes despite an active handshake." query: '(rate(wireguard_sent_bytes_total[15m]) + rate(wireguard_received_bytes_total[15m])) == 0 and wireguard_latest_handshake_seconds > 0 and (time() - wireguard_latest_handshake_seconds) < 300' severity: warning for: 15m comments: | This alert fires when a peer has a recent handshake but zero traffic flow. May indicate routing issues or a misconfigured allowed-ips. Only useful if you expect continuous traffic on all peers. - name: Storage services: - name: Ceph exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://docs.ceph.com/en/quincy/mgr/prometheus/ rules: - name: Ceph State description: Ceph instance unhealthy query: "ceph_health_status != 0" severity: critical - name: Ceph monitor clock skew description: Ceph monitor clock skew detected. Please check NTP and hardware clock settings query: "abs(ceph_monitor_clock_skew_seconds) > 0.2" severity: warning for: 2m - name: Ceph monitor low space description: Ceph monitor storage is low. query: "ceph_monitor_avail_percent < 10" severity: warning for: 2m - name: Ceph OSD Down description: Ceph Object Storage Daemon Down query: "ceph_osd_up == 0" severity: critical - name: Ceph high OSD latency description: "Ceph Object Storage Daemon latency is high. Please check that it is not stuck in a weird state."
query: "ceph_osd_perf_apply_latency_seconds > 5" severity: warning for: 1m - name: Ceph OSD low space description: Ceph Object Storage Daemon is running out of space. Please add more disks. query: ceph_osd_utilization > 90 severity: warning for: 2m - name: Ceph OSD reweighted description: Ceph Object Storage Daemon takes too much time to resize. query: "ceph_osd_weight < 1" severity: warning for: 2m - name: Ceph PG down description: Some Ceph placement groups are down. Please ensure that all the data are available. query: "ceph_pg_down > 0" severity: critical - name: Ceph PG incomplete description: Some Ceph placement groups are incomplete. Please ensure that all the data are available. query: "ceph_pg_incomplete > 0" severity: critical - name: Ceph PG inconsistent description: Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes. query: ceph_pg_inconsistent > 0 severity: warning - name: Ceph PG activation long description: Some Ceph placement groups are taking too long to activate. query: "ceph_pg_activating > 0" severity: warning for: 2m - name: Ceph PG backfill full description: Some Ceph placement groups are located on a full Object Storage Daemon in the cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules. query: "ceph_pg_backfill_toofull > 0" severity: warning for: 2m - name: Ceph PG unavailable description: Some Ceph placement groups are unavailable. query: "ceph_pg_total - ceph_pg_active > 0" severity: critical - name: ZFS exporters: - name: node-exporter slug: node-exporter doc_url: https://github.com/prometheus/node_exporter rules: - name: ZFS offline pool description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}."
query: 'node_zfs_zpool_state{state!="online"} > 0' severity: critical for: 1m - name: ZFS exporter slug: zfs_exporter doc_url: https://github.com/pdf/zfs_exporter rules: - name: ZFS pool out of space description: Disk is almost full (< 10% left) query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0" severity: warning - name: ZFS pool unhealthy description: ZFS pool state is {{ $value }}. See comments for more information. query: "zfs_pool_health > 0" severity: critical comments: | 0: ONLINE 1: DEGRADED 2: FAULTED 3: OFFLINE 4: UNAVAIL 5: REMOVED 6: SUSPENDED - name: ZFS collector failed description: ZFS collector for {{ $labels.instance }} has failed to collect information query: "zfs_scrape_collector_success != 1" severity: warning - name: OpenEBS exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: OpenEBS used pool capacity description: "OpenEBS pool is using more than 80% of its capacity" query: "openebs_used_pool_capacity_percent > 80" severity: warning for: 2m - name: Minio exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Minio cluster disk offline description: "Minio cluster disk is offline" query: "minio_cluster_drive_offline_total > 0" severity: critical - name: Minio node disk offline description: "Minio cluster node disk is offline" query: "minio_cluster_nodes_offline_total > 0" severity: critical - name: Minio disk space usage description: "Minio available free space is low (< 10%)" query: minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 and minio_cluster_capacity_raw_total_bytes > 0 severity: warning - name: Cloud providers services: - name: AWS CloudWatch exporters: - name: prometheus/cloudwatch_exporter slug: prometheus-cloudwatch-exporter doc_url: https://github.com/prometheus/cloudwatch_exporter comments: | CloudWatch metrics are exported as
aws_{namespace}_{metric_name}_{statistic} gauges. The rules below cover both exporter health and common AWS service alerts. Adjust thresholds and label filters to match your CloudWatch exporter configuration. rules: - name: CloudWatch exporter scrape error description: "CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API." query: "cloudwatch_exporter_scrape_error > 0" severity: warning for: 5m - name: CloudWatch exporter slow scrape description: "CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters." query: "cloudwatch_exporter_scrape_duration_seconds > 300" severity: warning for: 5m - name: CloudWatch API high request rate description: "CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs." query: "sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100" severity: warning comments: | CloudWatch API calls cost money (~$0.01 per 1000 GetMetricData requests). 100 requests/minute ≈ $45/month. Adjust the threshold based on your budget. - name: AWS EC2 high CPU utilization description: "EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%)." query: "aws_ec2_cpuutilization_average > 90" severity: warning for: 15m comments: Requires EC2 CPUUtilization metric configured in the CloudWatch exporter. - name: AWS RDS low free storage space description: "RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining)." query: "aws_rds_free_storage_space_average < 2000000000" severity: warning for: 5m comments: | Requires RDS FreeStorageSpace metric. The threshold of 2GB is a rough default. Adjust based on your database size. 
- name: AWS RDS high CPU utilization description: "RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%)." query: "aws_rds_cpuutilization_average > 90" severity: warning for: 15m comments: Requires RDS CPUUtilization metric configured in the CloudWatch exporter. - name: AWS RDS high database connections description: "RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections." query: "aws_rds_database_connections_average > 100" severity: warning for: 5m comments: | The threshold depends on the RDS instance class. Adjust based on your instance type's max_connections parameter. - name: AWS SQS queue messages visible description: "SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed." query: "aws_sqs_approximate_number_of_messages_visible_average > 1000" severity: warning for: 10m comments: | Requires SQS ApproximateNumberOfMessagesVisible metric. The threshold of 1000 is a rough default. Adjust based on your expected queue depth. - name: AWS SQS message age too old description: "SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s)." query: "aws_sqs_approximate_age_of_oldest_message_maximum > 3600" severity: warning comments: Requires SQS ApproximateAgeOfOldestMessage metric. - name: AWS ALB unhealthy targets description: "ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}." query: "aws_applicationelb_unhealthy_host_count_average > 0" severity: critical for: 5m comments: Requires ApplicationELB UnHealthyHostCount metric. - name: AWS ALB high 5xx error rate description: "ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%)." 
query: "(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0" severity: critical for: 5m comments: Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics. - name: AWS ALB high target response time description: "ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s)." query: "aws_applicationelb_target_response_time_average > 2" severity: warning for: 5m comments: Requires ApplicationELB TargetResponseTime metric. - name: AWS Lambda high error rate description: "Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%)." query: "(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0" severity: warning for: 5m comments: Requires Lambda Errors and Invocations metrics. - name: Google Cloud Stackdriver exporters: - name: prometheus-community/stackdriver_exporter slug: stackdriver-exporter doc_url: https://github.com/prometheus-community/stackdriver_exporter comments: | Self-monitoring metrics use the stackdriver_monitoring_* prefix. All self-monitoring metrics include a project_id label. rules: - name: Stackdriver exporter scrape error description: "Stackdriver exporter failed to scrape metrics from Google Cloud Monitoring API for project {{ $labels.project_id }}." query: "stackdriver_monitoring_last_scrape_error > 0" severity: warning for: 5m - name: Stackdriver exporter slow scrape description: "Stackdriver exporter scrape for project {{ $labels.project_id }} is taking more than 5 minutes ({{ $value }}s)." query: "stackdriver_monitoring_last_scrape_duration_seconds > 300" severity: warning for: 5m - name: Stackdriver exporter scrape errors increasing description: "Stackdriver exporter has had {{ $value }} scrape errors in the last 15 minutes for project {{ $labels.project_id }}." 
query: "increase(stackdriver_monitoring_scrape_errors_total[15m]) > 5" severity: warning - name: Stackdriver exporter high API calls description: "Stackdriver exporter is making {{ $value }} API calls per minute for project {{ $labels.project_id }}. This may hit Google Cloud Monitoring API quotas." query: "rate(stackdriver_monitoring_api_calls_total[5m]) * 60 > 100" severity: warning - name: Stackdriver exporter scrape stale description: "Stackdriver exporter has not successfully scraped metrics for project {{ $labels.project_id }} in the last 10 minutes." query: "time() - stackdriver_monitoring_last_scrape_timestamp > 600" severity: warning - name: DigitalOcean exporters: - name: metalmatze/digitalocean_exporter slug: digitalocean-exporter doc_url: https://github.com/metalmatze/digitalocean_exporter rules: - name: DigitalOcean droplet down description: "DigitalOcean droplet {{ $labels.name }} ({{ $labels.id }}) in {{ $labels.region }} is not running." query: "digitalocean_droplet_up == 0" severity: critical for: 5m - name: DigitalOcean account not active description: "DigitalOcean account is not active. It may be suspended or locked." query: "digitalocean_account_active != 1" severity: critical for: 5m - name: DigitalOcean database down description: "DigitalOcean managed database {{ $labels.name }} ({{ $labels.engine }}) in {{ $labels.region }} is offline." query: "digitalocean_database_status == 0" severity: critical for: 2m - name: DigitalOcean Kubernetes cluster down description: "DigitalOcean Kubernetes cluster {{ $labels.name }} ({{ $labels.version }}) in {{ $labels.region }} is not running." query: "digitalocean_kubernetes_cluster_up == 0" severity: critical for: 5m - name: DigitalOcean load balancer down description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) is not active." 
query: "digitalocean_loadbalancer_status == 0" severity: critical for: 2m - name: DigitalOcean load balancer no backends description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) has no droplets attached." query: "digitalocean_loadbalancer_droplets == 0" severity: warning for: 1m - name: DigitalOcean floating IP not assigned description: "DigitalOcean floating IP {{ $labels.ipv4 }} in {{ $labels.region }} is not assigned to any droplet." query: "digitalocean_floating_ipv4_active == 0" severity: warning - name: DigitalOcean active incidents description: "DigitalOcean platform has {{ $value }} active incident(s)." query: "digitalocean_incidents_total > 0" severity: warning - name: DigitalOcean exporter collection errors description: "DigitalOcean exporter {{ $labels.collector }} collector has {{ $value }} errors." query: "increase(digitalocean_errors_total[5m]) > 0" severity: warning for: 5m - name: DigitalOcean droplet limit approaching description: "DigitalOcean account is using {{ $value }}% of its droplet quota." query: "(count(digitalocean_droplet_up) / max(digitalocean_account_droplet_limit)) * 100 > 80 and max(digitalocean_account_droplet_limit) > 0" severity: warning comments: Fires when more than 80% of the account's droplet limit is in use. The limit is aggregated with max() so it matches the label-less count() result. - name: Azure exporters: - name: webdevops/azure-metrics-exporter slug: azure-metrics-exporter doc_url: https://github.com/webdevops/azure-metrics-exporter comments: | The exporter uses azurerm_resource_metric as the default metric name for forwarded Azure Monitor metrics. The metric name can be customized via the name parameter in probe configuration. Self-monitoring metrics use the azurerm_stats_* and azurerm_api_* prefixes. rules: - name: Azure exporter request errors description: "Azure metrics exporter on {{ $labels.instance }} has {{ $value }} API request errors in the last 15 minutes."
query: 'increase(azurerm_stats_metric_requests{result="error"}[15m]) > 5' severity: warning - name: Azure exporter high error rate description: "Azure metrics exporter on {{ $labels.instance }} has an error rate above 10% ({{ $value }}%)." query: 'sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m])) / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10 and sum by (instance) (rate(azurerm_stats_metric_requests[5m])) > 0' severity: warning for: 5m - name: Azure API read rate limit approaching description: "Azure API read rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining)." query: 'azurerm_api_ratelimit{type="read"} < 100' severity: warning comments: | Azure Resource Manager enforces rate limits per subscription. The threshold of 100 remaining calls is a rough default. Adjust based on your scrape interval and number of monitored resources. - name: Azure API write rate limit approaching description: "Azure API write rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining)." query: 'azurerm_api_ratelimit{type="write"} < 50' severity: warning - name: Azure exporter slow collection description: "Azure metrics exporter on {{ $labels.instance }} metric collection is taking more than 5 minutes ({{ $value }}s)." query: "azurerm_stats_metric_collecttime > 300" severity: warning for: 5m - name: Observability services: - name: Thanos exporters: - name: Thanos Compactor slug: thanos-compactor rules: - name: Thanos Compactor Multiple Running description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running." query: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1' severity: warning for: 5m - name: Thanos Compactor Halted description: "Thanos Compact {{$labels.job}} has failed to run and now is halted." 
query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1' severity: warning for: 5m - name: Thanos Compactor High Compaction Failures description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions." query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0' severity: warning for: 15m - name: Thanos Compact Bucket High Operation Failures description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations." query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) > 0' severity: warning for: 15m - name: Thanos Compact Has Not Run description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours." query: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24' severity: warning - name: Thanos Query slug: thanos-query rules: - name: Thanos Query Http Request Query Error Rate High description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.' 
query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m])) > 0' severity: critical for: 5m - name: Thanos Query Http Request Query Range Error Rate High description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.' query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0' severity: critical for: 5m - name: Thanos Query Grpc Server Error Rate description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests." query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) > 0' severity: warning for: 5m - name: Thanos Query Grpc Client Error Rate description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests." query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0' severity: warning for: 5m - name: Thanos Query High D N S Failures description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints." 
query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0' severity: warning for: 15m - name: Thanos Query Instant Latency High description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries." query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Query Range Latency High description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries." query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Query Overload description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support."
query: "(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)" severity: warning for: 15m - name: Thanos Receiver slug: thanos-receiver rules: - name: Thanos Receive Http Request Error Rate High description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests." query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0' severity: critical for: 5m - name: Thanos Receive Http Request Latency High description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests." query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Receive High Replication Failures description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests." query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100' severity: warning for: 5m - name: Thanos Receive High Forward Request Failures description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests." 
query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0' severity: info for: 5m - name: Thanos Receive High Hashring File Refresh Failures description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed." query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0' severity: warning for: 15m - name: Thanos Receive Config Reload Failure description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations." query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1' severity: warning for: 5m - name: Thanos Receive No Upload description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage." query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)' severity: critical for: 3h - name: Thanos Sidecar slug: thanos-sidecar rules: - name: Thanos Sidecar Bucket Operations Failed description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s)." query: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. 
severity: critical for: 5m - name: Thanos Sidecar No Connection To Started Prometheus description: "Thanos Sidecar {{$labels.instance}} is unhealthy." query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0' severity: critical for: 5m - name: Thanos Store slug: thanos-store rules: - name: Thanos Store Grpc Error Rate description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests." query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) > 0' severity: warning for: 5m - name: Thanos Store Series Gate Latency High description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests." query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)' severity: warning for: 10m - name: Thanos Store Bucket High Operation Failures description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations." 
query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) > 0' severity: warning for: 15m - name: Thanos Store Objstore Operation Latency High description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations." query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)' severity: warning for: 10m - name: Thanos Ruler slug: thanos-ruler rules: - name: Thanos Rule Queue Is Dropping Alerts description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s)." query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0' severity: critical for: 5m - name: Thanos Rule Sender Is Failing Alerts description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s)." query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0' severity: critical for: 5m - name: Thanos Rule High Rule Evaluation Failures description: "Thanos Rule {{$labels.instance}} is failing to evaluate {{$value | humanize}}% of rules." 
query: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) > 0' severity: critical for: 5m - name: Thanos Rule High Rule Evaluation Warnings description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s)." query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: info for: 15m - name: Thanos Rule Rule Evaluation Latency High description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}." query: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))' severity: warning for: 5m - name: Thanos Rule Grpc Error Rate description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests." query: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) > 0' severity: warning for: 5m - name: Thanos Rule Config Reload Failure description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration." 
query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1' severity: info for: 5m - name: Thanos Rule Query High D N S Failures description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints." query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' severity: warning for: 15m - name: Thanos Rule Alertmanager High D N S Failures description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints." query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' severity: warning for: 15m - name: Thanos Rule No Evaluation For10 Intervals description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval." query: 'time() - max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})' severity: info for: 5m - name: Thanos No Rule Evaluations description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes."
query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0' severity: critical for: 5m - name: Thanos Bucket Replicate slug: thanos-bucket-replicate rules: - name: Thanos Bucket Replicate Error Rate description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed." query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0' severity: critical for: 5m - name: Thanos Bucket Replicate Run Latency description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations." query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)' severity: critical for: 5m - name: Thanos Component Absent slug: thanos-component-absent rules: - name: Thanos Compact Is Down description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered." query: 'absent(up{job=~".*thanos-compact.*"} == 1)' severity: critical for: 5m - name: Thanos Query Is Down description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered." query: 'absent(up{job=~".*thanos-query.*"} == 1)' severity: critical for: 5m - name: Thanos Receive Is Down description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered."
query: 'absent(up{job=~".*thanos-receive.*"} == 1)' severity: critical for: 5m - name: Thanos Rule Is Down description: "ThanosRule has disappeared. Prometheus target for the component cannot be discovered." query: 'absent(up{job=~".*thanos-rule.*"} == 1)' severity: critical for: 5m - name: Thanos Sidecar Is Down description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered." query: 'absent(up{job=~".*thanos-sidecar.*"} == 1)' severity: critical for: 5m - name: Thanos Store Is Down description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered." query: absent(up{job=~".*thanos-store.*"} == 1) severity: critical for: 5m - name: Loki exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Loki process too many restarts description: A loki process had too many restarts (target {{ $labels.instance }}) query: changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2 severity: warning - name: Loki request errors description: 'The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf "%.2f" $value }}% errors.' 
query: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 and sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 0' severity: critical for: 15m - name: Loki request panic description: The {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics query: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 severity: critical for: 5m - name: Loki request latency description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency query: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le))) > 1 severity: critical for: 5m - name: Promtail exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Promtail request errors description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. query: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 and sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 0' severity: critical for: 5m - name: Promtail request latency description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
query: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, instance, le)) > 1 severity: critical for: 5m - name: Cortex exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Cortex ruler configuration reload failure description: Cortex ruler configuration reload failure (instance {{ $labels.instance }}) query: cortex_ruler_config_last_reload_successful != 1 severity: warning - name: Cortex not connected to Alertmanager description: Cortex not connected to Alertmanager (instance {{ $labels.instance }}) query: cortex_prometheus_notifications_alertmanagers_discovered < 1 severity: critical - name: Cortex notification are being dropped description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s)." query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05 comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical - name: Cortex notification error description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s)." query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05 comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical - name: Cortex ingester unhealthy description: Cortex has an unhealthy ingester query: cortex_ring_members{state="Unhealthy", name="ingester"} > 0 severity: critical - name: Cortex frontend queries stuck description: There are queued up queries in query-frontend. query: sum by (job) (cortex_query_frontend_queue_length) > 0 severity: critical for: 5m - name: Grafana Tempo exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://grafana.com/docs/tempo/latest/operations/monitor/ rules: - name: Tempo distributor unhealthy description: Tempo has {{ $value }} unhealthy distributor(s).
query: max by (job) (tempo_ring_members{state="Unhealthy", name="distributor"}) > 0 severity: warning for: 15m - name: Tempo live store unhealthy description: Tempo has {{ $value }} unhealthy live store(s). query: max by (job) (tempo_ring_members{state="Unhealthy", name="live-store"}) > 0 severity: critical for: 15m - name: Tempo metrics generator unhealthy description: Tempo has {{ $value }} unhealthy metrics generator(s). query: max by (job) (tempo_ring_members{state="Unhealthy", name="metrics-generator"}) > 0 severity: critical for: 15m - name: Tempo compactions failing description: "{{ $value }} compactions have failed in the past hour." query: sum by (job) (increase(tempodb_compaction_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_compaction_errors_total[5m])) > 0 severity: critical for: 1h comments: | Uses a two-window approach: 1h for historical count and 5m to confirm the issue is ongoing. - name: Tempo polls failing description: "{{ $value }} blocklist polls have failed in the past hour." query: sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0 severity: critical - name: Tempo tenant index failures description: "{{ $value }} tenant index failures in the past hour." query: sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0 severity: critical - name: Tempo no tenant index builders description: No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale. query: sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0 severity: critical for: 5m - name: Tempo tenant index too old description: Tenant index for {{ $labels.tenant }} is {{ $value }}s old. 
query: max by (tenant) (tempodb_blocklist_tenant_index_age_seconds) > 600 severity: critical for: 5m comments: | Threshold of 600s (10 minutes). Adjust based on your tenant index build interval. - name: Tempo block list rising quickly description: Tempo blocklist length is up {{ printf "%.0f" $value }}% over the last 7 days. Consider scaling compactors. query: (avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40 and avg(tempodb_blocklist_length offset 7d) > 0 severity: critical for: 15m comments: | Fires when the blocklist grows more than 40% over 7 days. - name: Tempo bad overrides description: '{{ $labels.job }} failed to reload runtime overrides.' query: sum by (job) (tempo_runtime_config_last_reload_successful == 0) > 0 severity: critical for: 15m - name: Tempo user configurable overrides reload failing description: "{{ $value }} user-configurable overrides reloads have failed in the past hour." query: sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[5m])) > 0 severity: critical - name: Tempo compaction too many outstanding blocks warning description: There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources. query: sum by (instance) (tempodb_compaction_outstanding_blocks) > 100 severity: warning for: 6h comments: | Threshold of 100 blocks per compactor instance. Adjust based on your environment. - name: Tempo compaction too many outstanding blocks critical description: There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately. query: sum by (instance) (tempodb_compaction_outstanding_blocks) > 250 severity: critical for: 24h comments: | Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration. 
- name: Tempo distributor usage tracker errors description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }})." query: sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0 severity: critical for: 30m - name: Tempo metrics generator processor updates failing description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m)." query: sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0 severity: critical for: 15m - name: Tempo metrics generator service graphs dropping spans description: Tempo metrics generator is dropping {{ printf "%.2f" $value }}% of spans in service graphs for {{ $labels.job }}. query: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0' severity: warning for: 15m - name: Tempo metrics generator collections failing description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m)." query: sum by (job) (increase(tempo_metrics_generator_registry_collections_failed_total[5m])) > 2 severity: critical for: 5m - name: Tempo memcached errors elevated description: 'Tempo memcached error rate is {{ printf "%.2f" $value }}% for {{ $labels.name }} in {{ $labels.job }}.' query: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20 and sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 0' severity: warning for: 10m comments: | Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching. 
- name: Grafana Mimir exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://grafana.com/docs/mimir/latest/manage/monitor-grafana-mimir/ comments: | Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. This is intentional and expected. rules: # Core alerts - name: Mimir ingester unhealthy description: Mimir has {{ $value }} unhealthy ingester(s) in the ring. query: min by (job) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 severity: critical for: 15m - name: Mimir request errors description: 'Mimir {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.' query: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1 and sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 0' severity: critical for: 15m - name: Mimir inconsistent runtime config description: An inconsistent runtime config file is used across Mimir instances. query: count(count by (job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 severity: critical for: 1h - name: Mimir bad runtime config description: '{{ $labels.job }} failed to reload runtime config.' query: sum by (job) (cortex_runtime_config_last_reload_successful == 0) > 0 severity: critical for: 5m - name: Mimir scheduler queries stuck description: There are {{ $value }} queued up queries in {{ $labels.job }}. query: sum by (job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 severity: critical for: 7m - name: Mimir cache request errors description: 'Mimir cache {{ $labels.name }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation.' 
query: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5 and sum by (name, operation, job) (rate(thanos_cache_operations_total[5m])) > 0' severity: warning for: 5m - name: Mimir KV store failure description: 'Mimir {{ $labels.job }} KV store {{ $labels.kv_name }} is failing with 100% error rate.' query: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1 and sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m])) > 0' severity: critical for: 5m - name: Mimir memory map areas too high description: 'Mimir {{ $labels.job }} is using {{ printf "%.0f" $value }}% of its memory map area limit.' query: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80 and process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} > 0' severity: critical for: 5m - name: Mimir ingester instance has no tenants description: Mimir ingester {{ $labels.instance }} has no tenants assigned. query: (cortex_ingester_memory_users == 0) and on (instance) (cortex_ingester_memory_users offset 1h > 0) severity: warning for: 1h - name: Mimir ruler instance has no rule groups description: Mimir ruler {{ $labels.instance }} has no rule groups assigned. query: (cortex_ruler_managers_total == 0) and on (instance) (cortex_ruler_managers_total offset 1h > 0) severity: warning for: 1h - name: Mimir ingested data too far in the future description: Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future. 
query: max by (job) (cortex_ingester_tsdb_head_max_timestamp_seconds - time() and cortex_ingester_tsdb_head_max_timestamp_seconds > 0) > 3600 severity: warning for: 5m - name: Mimir store gateway too many failed operations description: Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s). query: sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0.05 comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: warning for: 5m - name: Mimir ring members mismatch description: Mimir {{ $labels.name }} ring has inconsistent member counts across instances. query: max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members)) severity: warning for: 15m # Instance limits - name: Mimir ingester reaching series limit warning description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its series limit.' query: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 80) and cortex_ingester_instance_limits{limit="max_series"} > 0' severity: warning for: 3h - name: Mimir ingester reaching series limit critical description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its series limit.' query: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 90) and cortex_ingester_instance_limits{limit="max_series"} > 0' severity: critical for: 5m - name: Mimir ingester reaching tenants limit warning description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its tenants limit.' 
query: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 70) and cortex_ingester_instance_limits{limit="max_tenants"} > 0' severity: warning for: 5m - name: Mimir ingester reaching tenants limit critical description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its tenants limit.' query: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 80) and cortex_ingester_instance_limits{limit="max_tenants"} > 0' severity: critical for: 5m - name: Mimir reaching TCP connections limit description: 'Mimir instance {{ $labels.instance }} is using {{ printf "%.0f" $value }}% of its TCP connections limit.' query: cortex_tcp_connections / cortex_tcp_connections_limit * 100 > 80 and cortex_tcp_connections_limit > 0 severity: critical for: 5m - name: Mimir distributor inflight requests high description: 'Mimir distributor {{ $labels.instance }} is using {{ printf "%.0f" $value }}% of its inflight push requests limit.' query: '(cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"} * 100 > 80) and cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0' severity: critical for: 5m # Blocks and TSDB - name: Mimir ingester TSDB head compaction failed description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 severity: critical for: 15m - name: Mimir ingester TSDB head truncation failed description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s)." 
query: rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 severity: critical - name: Mimir ingester TSDB checkpoint creation failed description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 severity: critical - name: Mimir ingester TSDB checkpoint deletion failed description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 severity: critical - name: Mimir ingester TSDB WAL truncation failed description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 severity: warning - name: Mimir ingester TSDB WAL writes failed description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 severity: critical for: 3m - name: Mimir store gateway has not synced bucket description: Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes. query: (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 comments: | Threshold aligned with official Mimir mixin (30 minutes). severity: critical for: 5m - name: Mimir store gateway no synced tenants description: Mimir store-gateway {{ $labels.instance }} has no synced tenants.
query: (min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 1h > 0) severity: warning for: 1h - name: Mimir bucket index not updated description: 'Mimir bucket index for tenant {{ $labels.user }} has not been updated for more than 35 minutes.' query: min by (user, job) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 severity: critical # Compactor - name: Mimir compactor not cleaning up blocks description: Mimir compactor {{ $labels.instance }} has not cleaned up blocks in the last 6 hours. query: (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 21600) and cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0 severity: critical for: 1h - name: Mimir compactor not running compaction description: Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours. query: (time() - cortex_compactor_last_successful_run_timestamp_seconds > 86400) and cortex_compactor_last_successful_run_timestamp_seconds > 0 severity: critical for: 15m - name: Mimir compactor has consecutive failures description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours." query: increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1 severity: critical - name: Mimir compactor has run out of disk space description: Mimir compactor {{ $labels.instance }} has run out of disk space. query: increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1 severity: critical - name: Mimir compactor has not uploaded blocks description: Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours. 
query: (time() - thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 86400) and thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 0 severity: critical for: 15m - name: Mimir compactor skipped blocks description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }})." query: increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0 comments: | Using 24h window per official mixin — compaction skips are rare events. severity: warning for: 5m # Ruler - name: Mimir ruler too many failed pushes description: 'Mimir ruler {{ $labels.instance }} is failing to push {{ printf "%.2f" $value }}% of write requests.' query: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0' severity: critical for: 5m - name: Mimir ruler too many failed queries description: 'Mimir ruler {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% of query evaluations.' query: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 0' severity: critical for: 5m - name: Mimir ruler missed evaluations description: 'Mimir ruler {{ $labels.instance }} is missing {{ printf "%.2f" $value }}% of rule group evaluations.' query: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1 and sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 0' severity: warning for: 5m - name: Mimir ruler failed ring check description: Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s). 
query: sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05 comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical for: 5m # Alertmanager - name: Mimir alertmanager sync configs failing description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 severity: critical for: 30m - name: Mimir alertmanager ring check failing description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0 severity: critical for: 10m - name: Mimir alertmanager state merge failing description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0 severity: critical for: 10m - name: Mimir alertmanager replication failing description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0 severity: critical for: 10m - name: Mimir alertmanager persist state failing description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 severity: critical for: 1h - name: Mimir alertmanager initial sync failed description: Mimir alertmanager {{ $labels.job }} failed initial state sync. query: increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 severity: warning - name: Mimir alertmanager instance has no tenants description: Mimir alertmanager {{ $labels.instance }} has no tenants assigned. 
query: (cortex_alertmanager_tenants_owned == 0) and on (instance) (cortex_alertmanager_tenants_owned offset 1h > 0) severity: warning for: 1h # Gossip - name: Mimir gossip members count too high description: Mimir gossip cluster has more members than expected. query: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 1.15 + 10 < max(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)' severity: warning for: 20m - name: Mimir gossip members count too low description: Mimir gossip cluster has fewer members than expected. query: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 0.5 > min(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)' severity: warning for: 20m # Go runtime - name: Mimir go threads too high warning description: 'Mimir {{ $labels.instance }} has {{ $value }} Go threads.' query: 'go_threads{job=~".*(mimir|cortex).*"} > 5000' severity: warning for: 15m comments: | A high number of Go threads may indicate a goroutine leak. - name: Mimir go threads too high critical description: 'Mimir {{ $labels.instance }} has {{ $value }} Go threads.' query: 'go_threads{job=~".*(mimir|cortex).*"} > 8000' severity: critical for: 15m - name: Grafana Alloy exporters: - slug: embedded-exporter rules: - name: Grafana Alloy service down description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running." query: "count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)" severity: critical - name: OpenTelemetry Collector exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://opentelemetry.io/docs/collector/internal-telemetry/ comments: | OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint. These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly. 
All collector internal metrics are prefixed with 'otelcol_'. rules: - name: OpenTelemetry Collector down description: OpenTelemetry Collector instance has disappeared or is not being scraped query: 'up{job=~".*otel.*collector.*"} == 0' severity: critical for: 1m - name: OpenTelemetry Collector receiver refused spans description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}." query: 'rate(otelcol_receiver_refused_spans[5m]) > 0' severity: critical for: 5m - name: OpenTelemetry Collector receiver refused metric points description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}." query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0' severity: critical for: 5m - name: OpenTelemetry Collector receiver refused log records description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}." query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0' severity: critical for: 5m - name: OpenTelemetry Collector exporter failed spans description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}." query: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: warning for: 5m - name: OpenTelemetry Collector exporter failed metric points description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}." query: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: warning for: 5m - name: OpenTelemetry Collector exporter failed log records description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}." 
query: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: warning for: 5m - name: OpenTelemetry Collector exporter queue nearly full description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full" query: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and otelcol_exporter_queue_capacity > 0' severity: warning - name: OpenTelemetry Collector processor refused spans description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure." query: 'rate(otelcol_processor_refused_spans[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: warning for: 5m - name: OpenTelemetry Collector processor refused metric points description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure." query: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. 
severity: warning for: 5m - name: OpenTelemetry Collector high memory usage description: "OpenTelemetry Collector memory usage is above 90%" query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9' severity: warning for: 5m - name: OpenTelemetry Collector OTLP receiver errors description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused" query: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0' severity: critical for: 2m - name: Jaeger exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://www.jaegertracing.io/docs/latest/monitoring/ rules: - name: Jaeger agent HTTP server errors description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors." query: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 0' severity: warning for: 15m - name: Jaeger client RPC request errors description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors." query: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 0' severity: warning for: 15m - name: Jaeger client spans dropped description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans." 
query: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 0' severity: warning for: 15m - name: Jaeger agent spans dropped description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches." query: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 0' severity: warning for: 15m - name: Jaeger collector dropping spans description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans." query: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 0' severity: warning for: 15m - name: Jaeger sampling update failing description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates." query: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 0' severity: warning for: 15m - name: Jaeger throttling update failing description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates." 
query: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 0' severity: warning for: 15m - name: Jaeger query request failures description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests." query: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 0' severity: warning for: 15m - name: Other services: - name: APC UPS exporters: - name: mdlayher/apcupsd_exporter slug: apcupsd_exporter doc_url: https://github.com/mdlayher/apcupsd_exporter rules: - name: APC UPS Battery nearly empty description: Battery is almost empty (< 10% left) query: "apcupsd_battery_charge_percent < 10" severity: critical - name: APC UPS Less than 15 Minutes of battery time remaining description: Battery is almost empty (< 15 Minutes remaining) query: "apcupsd_battery_time_left_seconds < 900" severity: critical - name: APC UPS AC input outage description: UPS now running on battery (since {{$value | humanizeDuration}}) query: "apcupsd_battery_time_on_seconds > 0" severity: warning - name: APC UPS low battery voltage description: Battery voltage is lower than nominal (< 95%) query: "(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95" severity: warning - name: APC UPS high temperature description: Internal temperature is high ({{$value}}°C) query: "apcupsd_internal_temperature_celsius >= 40" severity: warning for: 2m - name: APC UPS high load description: UPS load is > 80% query: "apcupsd_ups_load_percent > 80" severity: warning - name: Graph Node exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Provider failed because net_version 
failed description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`" query: "eth_rpc_status == 1" severity: critical - name: Provider failed because get genesis failed description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`" query: "eth_rpc_status == 2" severity: critical - name: Provider failed because net_version timeout description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`" query: "eth_rpc_status == 3" severity: critical - name: Provider failed because get genesis timeout description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`" query: "eth_rpc_status == 4" severity: critical - name: Store connection slow description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`" query: "store_connection_wait_time_ms > 10" severity: warning - name: Store connection very slow description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`" query: "store_connection_wait_time_ms > 20" severity: critical ================================================ FILE: _layouts/default.html ================================================ {% seo %} Skip to the content.
{{ content }}
================================================ FILE: alertmanager.md ================================================

Global configuration

If you notice a delay between an event and the first notification, read the following blog post => [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html). ## Prometheus configuration {% highlight yaml %} # prometheus.yml global: scrape_interval: 20s # A short evaluation_interval will check alerting rules very often. # It can be costly if you run Prometheus with 100+ alerts. evaluation_interval: 20s ... rule_files: - 'alerts/*.yml' scrape_configs: ... {% endhighlight %} {% highlight yaml %} # alerts/example-redis.yml groups: - name: ExampleRedisGroup rules: - alert: ExampleRedisDown expr: redis_up{} == 0 for: 2m labels: severity: critical annotations: summary: "Redis instance down" description: "Whatever" {% endhighlight %} ## AlertManager configuration {% highlight yaml %} {% raw %} # alertmanager.yml route: # When a new group of alerts is created by an incoming alert, wait at # least 'group_wait' to send the initial notification. # This way ensures that you get multiple alerts for the same group that start # firing shortly after another are batched together on the first # notification. group_wait: 10s # When the first notification was sent, wait 'group_interval' to send a batch # of new alerts that started firing for that group. group_interval: 30s # If an alert has successfully been sent, wait 'repeat_interval' to # resend them. repeat_interval: 30m # A default receiver receiver: "slack" # All the above attributes are inherited by all child routes and can # be overwritten on each.
routes: - receiver: "slack" group_wait: 10s match_re: severity: critical|warning continue: true - receiver: "pager" group_wait: 10s match_re: severity: critical continue: true receivers: - name: "slack" slack_configs: - api_url: 'https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXX/xxxxxxxxxxxxxxxxxxxxxxxxxxx' send_resolved: true channel: 'monitoring' text: "{{ range .Alerts }} {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}" - name: "pager" webhook_configs: - url: http://a.b.c.d:8080/send/sms send_resolved: true {% endraw %} {% endhighlight %} ## Reduce Prometheus server load For expensive or frequent PromQL queries, Prometheus lets you precompute the results with recording rules. {% highlight yaml %} {% raw %} groups: # first define the recording rule - name: ExampleRecordedGroup rules: - record: job:rabbitmq_queue_messages_delivered_total:rate:5m expr: rate(rabbitmq_queue_messages_delivered_total[5m]) # then use it in alerts - name: ExampleAlertingGroup rules: - alert: ExampleRabbitmqLowMessageDelivery expr: sum(job:rabbitmq_queue_messages_delivered_total:rate:5m) < 10 for: 2m labels: severity: critical annotations: summary: "Low delivery rate in Rabbitmq queues" {% endraw %} {% endhighlight %} ## Troubleshooting If the notification takes too much time to be triggered, check the following delays: - `scrape_interval = 20s` (prometheus.yml) - `evaluation_interval = 20s` (prometheus.yml) - `increase(mysql_global_status_slow_queries[1m]) > 0` (alerts/example-mysql.yml) - `for: 5m` (alerts/example-mysql.yml) - `group_wait = 10s` (alertmanager.yml) Also read: - [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
- [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/) - [https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/](https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/) ================================================ FILE: assets/css/app.css ================================================ a.anchor { font-size: 15px; vertical-align: middle; color: darkblue; display: inline-block; padding-bottom: 5px; margin-right: 5px; opacity: 0; transition: opacity 0.4s; } h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor { opacity: 1; } summary { position: relative; padding-left: 60px; padding-right: 50px; margin-bottom: 15px; font-size: 15px; } h2 { position: relative; } .clipboard-single, .clipboard-multiple { right: 0; position: absolute; cursor: pointer; font-size: 14px; color: #606c71; } /* NAVBAR */ #rules-navbar.affix { /* showed by JS */ display: none; position: fixed; overflow: auto; top: 0; right: 0; max-width: 250px; max-height: 100%; padding-top: 20px; padding-bottom: 20px; padding-left: 20px; padding-right: 10px; background-color: #f3f6fa; } /* hide menu on small screens */ @media screen and (max-width: 1350px) { #rules-navbar.affix { display: none !important; } } /* hide menu scrollbar */ #rules-navbar.affix::-webkit-scrollbar { display: none; } #rules-navbar.affix { -ms-overflow-style: none; /* IE and Edge */ scrollbar-width: none; /* Firefox */ } #rules-navbar.affix h3 { margin-bottom: 10px; } #rules-navbar.affix h4 { margin: 0; font-weight: bold; font-size: 14px; line-height: 14px; } #rules-navbar.affix ul, #rules-navbar.affix ul li { margin: 0; padding-top: 0; padding-bottom: 0; line-height: normal; } #rules-navbar.affix>ul { padding-left: 0; padding-right: 0; } #rules-navbar.affix>ul>li { margin-bottom: 10px; padding-left: 0; padding-right: 0; } #rules-navbar.affix a 
{ font-size: 14px; line-height: 14px; } /* https://github.com/samber/awesome-prometheus-alerts/issues/356 */ @media screen and (min-width: 64em) { .main-content { max-width: 85rem; } } ul#sponsoring { display: flex; align-items: center; justify-content: center; margin-top: 50px; } ul#sponsoring li { display: flex; padding: 0px 15px; font-size: 16px; } ul#sponsoring li a { display: flex; } ul#sponsoring li a img { max-width: 180px; max-height: 40px; } .page-header { padding-bottom: 30px; } @media (prefers-color-scheme: dark) { #rules-navbar.affix { background-color: #2b2b2b; } /*********************** style.css overrides ******************************/ /* This should *probably* be its own theme instead. */ body { color: #a3b0b6; background-color: #242424; } .page-header { color: #fff; text-align: center; background-color: #006128; background-image: linear-gradient(120deg, #002968, #003c04); } .site-footer { border-top: solid 1px #525354; } hr { background-color: #525354!important; } a { color: #3d86d6; } .main-content h1 , .main-content h2 , .main-content h3 , .main-content h4 , .main-content h5 , .main-content h6 { color: #55c883; } /* Syntax Highlighting from dark-plus of pygments-styles */ /* See: https://github.com/lepture/pygments-styles */ .main-content pre { background: #1E1E1E; border: solid 1px #272f36; } code, .highlight { background: #1E1E1E; color: #D4D4D4 } .highlight .hll { background-color: #ADD6FF26 } .highlight .c { color: #6A9955 } .highlight .err { color: #F44747 } .highlight .k { color: #C586C0 } .highlight .l { color: #CE9178 } .highlight .ch { color: #6A9955 } .highlight .cm { color: #6A9955 } .highlight .cp { color: #C586C0 } .highlight .cpf { color: #CE9178 } .highlight .c1 { color: #6A9955 } .highlight .cs { color: #6A9955 } .highlight .gd { color: #CE9178 } .highlight .ge { font-style: italic } .highlight .gr { color: #F44747 } .highlight .gh { color: #569CD6 } .highlight .gi { color: #B5CEA8 } .highlight .go { color: #CE9178 } .highlight 
.gp { color: #C8C8C8 } .highlight .gs { color: #569CD6; font-weight: bold } .highlight .gu { color: #569CD6 } .highlight .gt { color: #F44747 } .highlight .kc { color: #CE9178 } .highlight .kd { color: #C586C0 } .highlight .kn { color: #C586C0 } .highlight .kp { color: #D7BA7D } .highlight .kr { color: #C586C0 } .highlight .kt { color: #569CD6 } .highlight .ld { color: #CE9178 } .highlight .m { color: #B5CEA8 } .highlight .s { color: #CE9178 } .highlight .na { color: #9CDCFE } .highlight .nb { color: #DCDCAA } .highlight .nc { color: #4EC9B0 } .highlight .no { color: #B5CEA8 } .highlight .nd { color: #DCDCAA } .highlight .ne { color: #4EC9B0 } .highlight .nf { color: #DCDCAA } .highlight .nl { color: #C8C8C8 } .highlight .nx { color: #D4D4D4 } .highlight .nt { color: #569CD6 } .highlight .w { color: #D4D4D4 } .highlight .mb { color: #B5CEA8 } .highlight .mf { color: #B5CEA8 } .highlight .mh { color: #B5CEA8 } .highlight .mi { color: #B5CEA8 } .highlight .mo { color: #B5CEA8 } .highlight .sa { color: #CE9178 } .highlight .sb { color: #CE9178 } .highlight .sc { color: #CE9178 } .highlight .dl { color: #CE9178 } .highlight .sd { color: #CE9178 } .highlight .s2 { color: #CE9178 } .highlight .se { color: #CE9178 } .highlight .sh { color: #CE9178 } .highlight .si { color: #569CD6 } .highlight .sx { color: #CE9178 } .highlight .sr { color: #D16969 } .highlight .s1 { color: #CE9178 } .highlight .ss { color: #CE9178 } .highlight .bp { color: #D7BA7D } .highlight .fm { color: #DCDCAA } .highlight .il { color: #B5CEA8 } } ================================================ FILE: assets/js/app.js ================================================ $(function () { var clipboardRules = new ClipboardJS('.clipboard-single', { text: function (trigger) { const id = trigger.getAttribute('data-clipboard-target-id'); const html = $("#" + id + " .highlight"); return html.text() + '\n'; }, }); var clipboardCategories = new ClipboardJS('.clipboard-multiple', { text: function (trigger) { const 
id = trigger.getAttribute('data-clipboard-target-id'); const html = $("[id^=" + id + "] .highlight"); return Array.from(html.map((i, target) => $(target).text())).join('\n\n'); }, }); }); ================================================ FILE: blackbox-exporter.md ================================================

Blackbox exporter

## Worldwide probes Blackbox Exporter gives you the ability to probe endpoints over HTTP, HTTPS, DNS, TCP and ICMP. You should deploy blackbox exporters in multiple Points of Presence around the globe, to monitor latency. Feel free to use the following endpoints for your own projects: - https://probe-montreal.cleverapps.io - https://probe-paris.cleverapps.io - https://probe-jeddah.cleverapps.io - https://probe-singapore.cleverapps.io - https://probe-sydney.cleverapps.io - https://probe-warsaw.cleverapps.io ☝️ Logs have been disabled. More probes from the community would be appreciated, please contribute here! These blackbox exporters use the following configuration. ## Prometheus Configuration Blackbox exporters and endpoints must be declared in Prometheus. Here is a simple configuration, inspired by [Hayk Davtyan's Medium post](https://medium.com/geekculture/single-prometheus-job-for-dozens-of-blackbox-exporters-2a7ba492d6c8): ```yml # sd/blackbox.yml - targets: # # Montreal # # http - probe-montreal.cleverapps.io:_:http_2xx:_:Montreal:_:f229cy:_:https://api.screeb.app - probe-montreal.cleverapps.io:_:http_2xx:_:Montreal:_:f229cy:_:https://t.screeb.app/tag.js # icmp - probe-montreal.cleverapps.io:_:icmp_ipv4:_:Montreal:_:f229cy:_:api.screeb.app - probe-montreal.cleverapps.io:_:icmp_ipv4:_:Montreal:_:f229cy:_:t.screeb.app # # Paris # # http - probe-paris.cleverapps.io:_:http_2xx:_:Paris:_:u09tgy:_:https://api.screeb.app - probe-paris.cleverapps.io:_:http_2xx:_:Paris:_:u09tgy:_:https://t.screeb.app/tag.js # icmp - probe-paris.cleverapps.io:_:icmp_ipv4:_:Paris:_:u09tgy:_:api.screeb.app - probe-paris.cleverapps.io:_:icmp_ipv4:_:Paris:_:u09tgy:_:t.screeb.app # # Sydney # # http - probe-sydney.cleverapps.io:_:http_2xx:_:Sydney:_:r3gpkn:_:https://api.screeb.app - probe-sydney.cleverapps.io:_:http_2xx:_:Sydney:_:r3gpkn:_:https://t.screeb.app/tag.js # icmp - probe-sydney.cleverapps.io:_:icmp_ipv4:_:Sydney:_:r3gpkn:_:api.screeb.app -
probe-sydney.cleverapps.io:_:icmp_ipv4:_:Sydney:_:r3gpkn:_:t.screeb.app # ... ``` ```yml # prometheus.yml global: # ... scrape_configs: - job_name: 'blackbox' metrics_path: /probe scrape_interval: 30s scheme: https file_sd_configs: - files: - /etc/prometheus/sd/blackbox.yml relabel_configs: # adds "module" label in the final labelset - source_labels: [__address__] regex: '.*:_:(.*):_:.*:_:.*:_:.*' target_label: module # adds "geohash" label in the final labelset - source_labels: [__address__] regex: '.*:_:.*:_:.*:_:(.*):_:.*' target_label: geohash # rewrites "instance" label with corresponding URL - source_labels: [__address__] regex: '.*:_:.*:_:.*:_:.*:_:(.*)' target_label: instance # rewrites "pop" label with corresponding location name - source_labels: [__address__] regex: '.*:_:.*:_:(.*):_:.*:_:.*' target_label: pop # passes "module" parameter to Blackbox exporter - source_labels: [module] target_label: __param_module # passes "target" parameter to Blackbox exporter - source_labels: [instance] target_label: __param_target # the Blackbox exporter's real hostname:port - source_labels: [__address__] regex: '(.*):_:.*:_:.*:_:.*:_:.*' target_label: __address__ # ... ``` ## Geohash ![](assets/grafana-map-panel.png) To display nice maps in Grafana, you need to instruct blackbox exporters about the location. 
Grafana map panel speaks the "geohash" format: - go to Google Maps - extract the lat/long from the URL - convert lat/long to geohash here: http://geohash.co ## Grafana Some great dashboards have been created by the community: https://grafana.com/grafana/dashboards/?search=blackbox Since Grafana v5.0.0, a map panel is available: https://grafana.com/docs/grafana/latest/panels-visualizations/visualizations/geomap/ ================================================ FILE: dist/rules/apache/lusitaniae-apache-exporter.yml ================================================ groups: - name: LusitaniaeApacheExporter rules: - alert: ApacheDown expr: 'apache_up == 0' for: 0m labels: severity: critical annotations: summary: Apache down (instance {{ $labels.instance }}) description: "Apache down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ApacheWorkersLoad expr: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0' for: 2m labels: severity: warning annotations: summary: Apache workers load (instance {{ $labels.instance }}) description: "Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ApacheRestart expr: 'apache_uptime_seconds_total / 60 < 1' for: 0m labels: severity: warning annotations: summary: Apache restart (instance {{ $labels.instance }}) description: "Apache has just been restarted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/apache-flink/flink-prometheus-reporter.yml ================================================ groups: - name: FlinkPrometheusReporter rules: - alert: FlinkJobIsNotRunning expr: 'flink_jobmanager_numRunningJobs == 0' for: 1m labels: severity: critical annotations: summary: Flink job is not running (instance {{ $labels.instance }}) description: "No Flink jobs are
currently running. All jobs may have failed or been cancelled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FlinkNoTaskmanagersRegistered expr: 'flink_jobmanager_numRegisteredTaskManagers == 0' for: 1m labels: severity: critical annotations: summary: Flink no TaskManagers registered (instance {{ $labels.instance }}) description: "No TaskManagers are registered with the JobManager. The cluster has no processing capacity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This alert fires when there are no available task slots. Adjust the threshold if your cluster is expected to run at full capacity. - alert: FlinkAllTaskSlotsUsed expr: 'flink_jobmanager_taskSlotsAvailable == 0' for: 5m labels: severity: warning annotations: summary: Flink all task slots used (instance {{ $labels.instance }}) description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # A single restart may be normal during deployments. Adjust threshold based on restart tolerance. - alert: FlinkJobRestartIncreasing expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 1' for: 5m labels: severity: warning annotations: summary: Flink job restart increasing (instance {{ $labels.instance }}) description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FlinkCheckpointFailures expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1' for: 5m labels: severity: warning annotations: summary: Flink checkpoint failures (instance {{ $labels.instance }}) description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Value is converted from milliseconds to seconds for correct humanizeDuration display. # Threshold is 60 seconds. Adjust based on your checkpoint interval and state size. 
- alert: FlinkCheckpointDurationHigh expr: 'flink_jobmanager_job_lastCheckpointDuration / 1000 > 60' for: 5m labels: severity: warning annotations: summary: Flink checkpoint duration high (instance {{ $labels.instance }}) description: "Flink job {{ $labels.job_name }} last checkpoint took {{ $value | humanizeDuration }} to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FlinkTaskBackpressured expr: 'flink_taskmanager_job_task_isBackPressured == 1' for: 5m labels: severity: warning annotations: summary: Flink task backpressured (instance {{ $labels.instance }}) description: "Flink task {{ $labels.task_name }} in job {{ $labels.job_name }} is backpressured.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Fires when a task spends more than 500ms/sec backpressured. This indicates the task cannot keep up with upstream data rate. - alert: FlinkTaskHighBackpressureTime expr: 'flink_taskmanager_job_task_backPressuredTimeMsPerSecond > 500' for: 5m labels: severity: warning annotations: summary: Flink task high backpressure time (instance {{ $labels.instance }}) description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FlinkTaskmanagerHeapMemoryHigh expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9' for: 5m labels: severity: warning annotations: summary: Flink TaskManager heap memory high (instance {{ $labels.instance }}) description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FlinkJobmanagerHeapMemoryHigh expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9' for: 5m labels: severity: warning annotations: summary: Flink JobManager heap memory high (instance {{ $labels.instance }}) description: "Flink JobManager {{ $labels.instance }} heap memory usage is 
above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload. - alert: FlinkTaskmanagerGcTimeHigh expr: 'rate(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100' for: 5m labels: severity: warning annotations: summary: Flink TaskManager GC time high (instance {{ $labels.instance }}) description: "Flink TaskManager {{ $labels.instance }} is spending more than 10% of time in garbage collection.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Only fires for tasks that have previously received records, to avoid false positives during startup. - alert: FlinkNoRecordsProcessed expr: 'rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0' for: 5m labels: severity: warning annotations: summary: Flink no records processed (instance {{ $labels.instance }}) description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/apache-spark/spark-prometheus.yml ================================================ groups: - name: SparkPrometheus # Spark exposes metrics via two built-in endpoints: # - PrometheusServlet: master/worker/driver metrics at /metrics/prometheus/ (ports 8080, 8081, 4040) # - PrometheusResource: executor metrics at /metrics/executors/prometheus/ (port 4040, requires spark.ui.prometheus.enabled=true in Spark 3.x) # Metric names from PrometheusServlet include a dynamic namespace (application ID), making static PromQL queries challenging. 
# Configuration: spark.metrics.conf.*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet rules: - alert: SparkNoAliveWorkers expr: 'metrics_master_aliveWorkers_Value == 0' for: 1m labels: severity: critical annotations: summary: Spark no alive workers (instance {{ $labels.instance }}) description: "No Spark workers are alive. The cluster has no processing capacity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Adjust the threshold based on your cluster's typical queuing behavior. - alert: SparkTooManyWaitingApps expr: 'metrics_master_waitingApps_Value > 10' for: 5m labels: severity: warning annotations: summary: Spark too many waiting apps (instance {{ $labels.instance }}) description: "Spark has {{ $value }} applications waiting for resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SparkWorkerMemoryExhausted expr: 'metrics_worker_memFree_MB_Value == 0' for: 2m labels: severity: warning annotations: summary: Spark worker memory exhausted (instance {{ $labels.instance }}) description: "Spark worker {{ $labels.instance }} has no free memory ({{ $value }}MB free).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Fires when a worker has no free cores. This may be normal under high load but can indicate capacity issues. - alert: SparkWorkerCoresExhausted expr: 'metrics_worker_coresFree_Value == 0' for: 5m labels: severity: warning annotations: summary: Spark worker cores exhausted (instance {{ $labels.instance }}) description: "Spark worker {{ $labels.instance }} has no free cores.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Fires when more than 10% of executor time is spent in garbage collection. # This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/). 
- alert: SparkExecutorHighGcTime expr: 'metrics_executor_totalGCTime_seconds_total / metrics_executor_totalDuration > 0.1 and metrics_executor_totalDuration > 0' for: 5m labels: severity: warning annotations: summary: Spark executor high GC time (instance {{ $labels.instance }}) description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SparkExecutorAllTasksFailing expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0' for: 5m labels: severity: critical annotations: summary: Spark executor all tasks failing (instance {{ $labels.instance }}) description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SparkExecutorHighTaskFailureRate expr: 'metrics_executor_failedTasks_total / metrics_executor_totalTasks_total > 0.1 and metrics_executor_totalTasks_total > 0' for: 5m labels: severity: warning annotations: summary: Spark executor high task failure rate (instance {{ $labels.instance }}) description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default. # Disk spilling indicates insufficient memory for the workload. - alert: SparkExecutorHighDiskSpill expr: 'metrics_executor_diskUsed_bytes > 1e9' for: 5m labels: severity: warning annotations: summary: Spark executor high disk spill (instance {{ $labels.instance }}) description: "Spark executor {{ $labels.executor_id }} is spilling data to disk. 
Consider increasing executor memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/apc-ups/apcupsd_exporter.yml ================================================ groups: - name: Apcupsd_exporter rules: - alert: ApcUpsBatteryNearlyEmpty expr: 'apcupsd_battery_charge_percent < 10' for: 0m labels: severity: critical annotations: summary: APC UPS Battery nearly empty (instance {{ $labels.instance }}) description: "Battery is almost empty (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ApcUpsLessThan15MinutesOfBatteryTimeRemaining expr: 'apcupsd_battery_time_left_seconds < 900' for: 0m labels: severity: critical annotations: summary: APC UPS Less than 15 Minutes of battery time remaining (instance {{ $labels.instance }}) description: "Battery is almost empty (< 15 Minutes remaining)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ApcUpsAcInputOutage expr: 'apcupsd_battery_time_on_seconds > 0' for: 0m labels: severity: warning annotations: summary: APC UPS AC input outage (instance {{ $labels.instance }}) description: "UPS now running on battery (since {{$value | humanizeDuration}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ApcUpsLowBatteryVoltage expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95' for: 0m labels: severity: warning annotations: summary: APC UPS low battery voltage (instance {{ $labels.instance }}) description: "Battery voltage is lower than nominal (< 95%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ApcUpsHighTemperature expr: 'apcupsd_internal_temperature_celsius >= 40' for: 2m labels: severity: warning annotations: summary: APC UPS high temperature (instance {{ $labels.instance }}) description: "Internal temperature is high ({{$value}}°C)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ApcUpsHighLoad expr: 'apcupsd_ups_load_percent > 80' for: 0m labels: severity: warning annotations: summary: 
APC UPS high load (instance {{ $labels.instance }}) description: "UPS load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/argocd/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: ArgocdServiceNotSynced expr: 'argocd_app_info{sync_status!="Synced"} != 0' for: 15m labels: severity: warning annotations: summary: ArgoCD service not synced (instance {{ $labels.instance }}) description: "Service {{ $labels.name }} run by argo is currently not in sync.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ArgocdServiceUnhealthy expr: 'argocd_app_info{health_status!="Healthy"} != 0' for: 15m labels: severity: warning annotations: summary: ArgoCD service unhealthy (instance {{ $labels.instance }}) description: "Service {{ $labels.name }} run by argo is currently not healthy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml ================================================ groups: - name: PrometheusCloudwatchExporter # CloudWatch metrics are exported as aws_{namespace}_{metric_name}_{statistic} gauges. # The rules below cover both exporter health and common AWS service alerts. # Adjust thresholds and label filters to match your CloudWatch exporter configuration. 
rules: - alert: CloudwatchExporterScrapeError expr: 'cloudwatch_exporter_scrape_error > 0' for: 5m labels: severity: warning annotations: summary: CloudWatch exporter scrape error (instance {{ $labels.instance }}) description: "CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CloudwatchExporterSlowScrape expr: 'cloudwatch_exporter_scrape_duration_seconds > 300' for: 5m labels: severity: warning annotations: summary: CloudWatch exporter slow scrape (instance {{ $labels.instance }}) description: "CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # CloudWatch API calls cost money (~$0.01 per 1000 GetMetricData requests). # 100 requests/minute ≈ $45/month. Adjust the threshold based on your budget. - alert: CloudwatchApiHighRequestRate expr: 'sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100' for: 0m labels: severity: warning annotations: summary: CloudWatch API high request rate (instance {{ $labels.instance }}) description: "CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Requires EC2 CPUUtilization metric configured in the CloudWatch exporter. - alert: AwsEc2HighCpuUtilization expr: 'aws_ec2_cpuutilization_average > 90' for: 15m labels: severity: warning annotations: summary: AWS EC2 high CPU utilization (instance {{ $labels.instance }}) description: "EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Requires RDS FreeStorageSpace metric. The threshold of 2GB is a rough default. # Adjust based on your database size. 
- alert: AwsRdsLowFreeStorageSpace expr: 'aws_rds_free_storage_space_average < 2000000000' for: 5m labels: severity: warning annotations: summary: AWS RDS low free storage space (instance {{ $labels.instance }}) description: "RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Requires RDS CPUUtilization metric configured in the CloudWatch exporter. - alert: AwsRdsHighCpuUtilization expr: 'aws_rds_cpuutilization_average > 90' for: 15m labels: severity: warning annotations: summary: AWS RDS high CPU utilization (instance {{ $labels.instance }}) description: "RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The threshold depends on the RDS instance class. Adjust based on your # instance type's max_connections parameter. - alert: AwsRdsHighDatabaseConnections expr: 'aws_rds_database_connections_average > 100' for: 5m labels: severity: warning annotations: summary: AWS RDS high database connections (instance {{ $labels.instance }}) description: "RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Requires SQS ApproximateNumberOfMessagesVisible metric. The threshold of 1000 # is a rough default. Adjust based on your expected queue depth. - alert: AwsSqsQueueMessagesVisible expr: 'aws_sqs_approximate_number_of_messages_visible_average > 1000' for: 10m labels: severity: warning annotations: summary: AWS SQS queue messages visible (instance {{ $labels.instance }}) description: "SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Requires SQS ApproximateAgeOfOldestMessage metric. 
- alert: AwsSqsMessageAgeTooOld expr: 'aws_sqs_approximate_age_of_oldest_message_maximum > 3600' for: 0m labels: severity: warning annotations: summary: AWS SQS message age too old (instance {{ $labels.instance }}) description: "SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Requires ApplicationELB UnHealthyHostCount metric. - alert: AwsAlbUnhealthyTargets expr: 'aws_applicationelb_unhealthy_host_count_average > 0' for: 5m labels: severity: critical annotations: summary: AWS ALB unhealthy targets (instance {{ $labels.instance }}) description: "ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics. - alert: AwsAlbHigh5xxErrorRate expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0' for: 5m labels: severity: critical annotations: summary: AWS ALB high 5xx error rate (instance {{ $labels.instance }}) description: "ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Requires ApplicationELB TargetResponseTime metric. - alert: AwsAlbHighTargetResponseTime expr: 'aws_applicationelb_target_response_time_average > 2' for: 5m labels: severity: warning annotations: summary: AWS ALB high target response time (instance {{ $labels.instance }}) description: "ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Requires Lambda Errors and Invocations metrics. 
- alert: AwsLambdaHighErrorRate expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0' for: 5m labels: severity: warning annotations: summary: AWS Lambda high error rate (instance {{ $labels.instance }}) description: "Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/azure/azure-metrics-exporter.yml ================================================ groups: - name: AzureMetricsExporter # The exporter uses azurerm_resource_metric as the default metric name for forwarded Azure Monitor metrics. # The metric name can be customized via the name parameter in probe configuration. # Self-monitoring metrics use the azurerm_stats_* and azurerm_api_* prefixes. rules: - alert: AzureExporterRequestErrors expr: 'increase(azurerm_stats_metric_requests{result="error"}[15m]) > 5' for: 0m labels: severity: warning annotations: summary: Azure exporter request errors (instance {{ $labels.instance }}) description: "Azure metrics exporter on {{ $labels.instance }} has {{ $value }} API request errors in the last 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: AzureExporterHighErrorRate expr: 'sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m])) / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10 and sum by (instance) (rate(azurerm_stats_metric_requests[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Azure exporter high error rate (instance {{ $labels.instance }}) description: "Azure metrics exporter on {{ $labels.instance }} has an error rate above 10% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Azure Resource Manager enforces rate limits per subscription. # The threshold of 100 remaining calls is a rough default. Adjust based on your # scrape interval and number of monitored resources. 
- alert: AzureApiReadRateLimitApproaching expr: 'azurerm_api_ratelimit{type="read"} < 100' for: 0m labels: severity: warning annotations: summary: Azure API read rate limit approaching (instance {{ $labels.instance }}) description: "Azure API read rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: AzureApiWriteRateLimitApproaching expr: 'azurerm_api_ratelimit{type="write"} < 50' for: 0m labels: severity: warning annotations: summary: Azure API write rate limit approaching (instance {{ $labels.instance }}) description: "Azure API write rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: AzureExporterSlowCollection expr: 'azurerm_stats_metric_collecttime > 300' for: 5m labels: severity: warning annotations: summary: Azure exporter slow collection (instance {{ $labels.instance }}) description: "Azure metrics exporter on {{ $labels.instance }} metric collection is taking more than 5 minutes ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/blackbox/blackbox-exporter.yml ================================================ groups: - name: BlackboxExporter rules: - alert: BlackboxProbeFailed expr: 'probe_success == 0' for: 0m labels: severity: critical annotations: summary: Blackbox probe failed (instance {{ $labels.instance }}) description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxConfigurationReloadFailure expr: 'blackbox_exporter_config_last_reload_successful != 1' for: 0m labels: severity: warning annotations: summary: Blackbox configuration reload failure (instance {{ $labels.instance }}) description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxSlowProbe expr: 'probe_duration_seconds > 
1' for: 1m labels: severity: warning annotations: summary: Blackbox slow probe (instance {{ $labels.instance }}) description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxProbeHttpFailure expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400' for: 0m labels: severity: critical annotations: summary: Blackbox probe HTTP failure (instance {{ $labels.instance }}) description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxSslCertificateWillExpireSoon expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20' for: 0m labels: severity: warning annotations: summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxSslCertificateWillExpireVerySoon expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3' for: 0m labels: severity: critical annotations: summary: Blackbox SSL certificate will expire very soon (instance {{ $labels.instance }}) description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # For probe_ssl_earliest_cert_expiry to be exposed after expiration, you # need to enable insecure_skip_verify. Note that this will disable # certificate validation. 
# See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config - alert: BlackboxSslCertificateExpired expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' for: 0m labels: severity: critical annotations: summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxProbeSlowHttp expr: 'probe_http_duration_seconds > 1' for: 1m labels: severity: warning annotations: summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxProbeSlowPing expr: 'probe_icmp_duration_seconds > 1' for: 1m labels: severity: warning annotations: summary: Blackbox probe slow ping (instance {{ $labels.instance }}) description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/caddy/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: # sum() rather than count(): count() returns the number of series regardless of their value and is never 0, so the alert could not fire. The gauge is expected to be 1 (healthy) or 0 (unhealthy) per upstream - verify against your Caddy version. - alert: CaddyReverseProxyDown expr: 'sum(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0' for: 0m labels: severity: critical annotations: summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }}) description: "All Caddy reverse proxies are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CaddyHighHttp4xxErrorRateService expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0' for: 1m labels: severity: critical annotations: summary: Caddy high HTTP 4xx error rate service (instance {{ $labels.instance }}) description: "Caddy service 4xx error rate is above 5%\n
VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CaddyHighHttp5xxErrorRateService expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0' for: 1m labels: severity: critical annotations: summary: Caddy high HTTP 5xx error rate service (instance {{ $labels.instance }}) description: "Caddy service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/cassandra/criteo-cassandra-exporter.yml ================================================ groups: - name: CriteoCassandraExporter rules: - alert: CassandraHintsCount expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3' for: 0m labels: severity: critical annotations: summary: Cassandra hints count (instance {{ $labels.instance }}) description: "Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraCompactionTaskPending expr: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100' for: 2m labels: severity: warning annotations: summary: Cassandra compaction task pending (instance {{ $labels.instance }}) description: "Many Cassandra compaction tasks are pending. 
You might need to increase I/O capacity by adding nodes to the cluster.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraViewwriteLatency expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000' for: 2m labels: severity: warning annotations: summary: Cassandra viewwrite latency (instance {{ $labels.instance }}) description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraAuthenticationFailures expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5' for: 2m labels: severity: warning annotations: summary: Cassandra authentication failures (instance {{ $labels.instance }}) description: "Increase of Cassandra authentication failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # 1m delay allows a restart without triggering an alert. - alert: CassandraNodeDown expr: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0' for: 1m labels: severity: critical annotations: summary: Cassandra node down (instance {{ $labels.instance }}) description: "Cassandra node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraCommitlogPendingTasks(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15' for: 2m labels: severity: warning annotations: summary: Cassandra commitlog pending tasks (Criteo) (instance {{ $labels.instance }}) description: "Unexpected number of Cassandra commitlog pending tasks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraCompactionExecutorBlockedTasks(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0' for: 2m labels: severity: warning annotations: summary: Cassandra compaction executor blocked tasks 
(Criteo) (instance {{ $labels.instance }}) description: "Some Cassandra compaction executor tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraFlushWriterBlockedTasks(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0' for: 2m labels: severity: warning annotations: summary: Cassandra flush writer blocked tasks (Criteo) (instance {{ $labels.instance }}) description: "Some Cassandra flush writer tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraRepairPendingTasks expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2' for: 2m labels: severity: warning annotations: summary: Cassandra repair pending tasks (instance {{ $labels.instance }}) description: "Some Cassandra repair tasks are pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraRepairBlockedTasks expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0' for: 2m labels: severity: warning annotations: summary: Cassandra repair blocked tasks (instance {{ $labels.instance }}) description: "Some Cassandra repair tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraConnectionTimeoutsTotal(criteo) expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5' for: 2m labels: severity: critical annotations: summary: Cassandra connection timeouts total (Criteo) (instance {{ $labels.instance }}) description: "Some connection between nodes are ending in timeout\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraStorageExceptions(criteo) expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1' for: 0m labels: severity: critical annotations: summary: Cassandra storage 
exceptions (Criteo) (instance {{ $labels.instance }}) description: "Something is going wrong with cassandra storage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraTombstoneDump(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000' for: 0m labels: severity: critical annotations: summary: Cassandra tombstone dump (Criteo) (instance {{ $labels.instance }}) description: "Too much tombstones scanned in queries\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestUnavailableWrite(criteo) expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Cassandra client request unavailable write (Criteo) (instance {{ $labels.instance }}) description: "Write failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestUnavailableRead(criteo) expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Cassandra client request unavailable read (Criteo) (instance {{ $labels.instance }}) description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestWriteFailure(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0' for: 0m labels: severity: critical annotations: summary: Cassandra client request write failure (Criteo) (instance {{ $labels.instance }}) description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. 
The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestReadFailure(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0' for: 0m labels: severity: critical annotations: summary: Cassandra client request read failure (Criteo) (instance {{ $labels.instance }}) description: "A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraCacheHitRateKeyCache expr: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85' for: 2m labels: severity: critical annotations: summary: Cassandra cache hit rate key cache (instance {{ $labels.instance }}) description: "Key cache hit rate is below 85%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/cassandra/instaclustr-cassandra-exporter.yml ================================================ groups: - name: InstaclustrCassandraExporter rules: # 1m delay allows a restart without triggering an alert. 
- alert: CassandraNodeIsUnavailable expr: 'cassandra_endpoint_active < 1' for: 1m labels: severity: critical annotations: summary: Cassandra Node is unavailable (instance {{ $labels.instance }}) description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraManyCompactionTasksArePending expr: 'cassandra_table_estimated_pending_compactions > 100' for: 0m labels: severity: warning annotations: summary: Cassandra many compaction tasks are pending (instance {{ $labels.instance }}) description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraCommitlogPendingTasks(instaclustr) expr: 'cassandra_commit_log_pending_tasks > 15' for: 2m labels: severity: warning annotations: summary: Cassandra commitlog pending tasks (Instaclustr) (instance {{ $labels.instance }}) description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraCompactionExecutorBlockedTasks(instaclustr) expr: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15' for: 2m labels: severity: warning annotations: summary: Cassandra compaction executor blocked tasks (Instaclustr) (instance {{ $labels.instance }}) description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraFlushWriterBlockedTasks(instaclustr) expr: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15' for: 2m labels: severity: warning annotations: summary: Cassandra flush writer blocked tasks (Instaclustr) (instance {{ $labels.instance }}) description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: 
CassandraConnectionTimeoutsTotal(instaclustr) expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5' for: 2m labels: severity: critical annotations: summary: Cassandra connection timeouts total (Instaclustr) (instance {{ $labels.instance }}) description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraStorageExceptions(instaclustr) expr: 'changes(cassandra_storage_exceptions_total[1m]) > 1' for: 0m labels: severity: critical annotations: summary: Cassandra storage exceptions (Instaclustr) (instance {{ $labels.instance }}) description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraTombstoneDump(instaclustr) expr: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100' for: 2m labels: severity: critical annotations: summary: Cassandra tombstone dump (Instaclustr) (instance {{ $labels.instance }}) description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestUnavailableWrite(instaclustr) expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0' for: 2m labels: severity: critical annotations: summary: Cassandra client request unavailable write (Instaclustr) (instance {{ $labels.instance }}) description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestUnavailableRead(instaclustr) expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0' for: 2m labels: severity: critical annotations: summary: Cassandra client request unavailable read (Instaclustr) (instance {{ $labels.instance }}) 
description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestWriteFailure(instaclustr) expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0' for: 2m labels: severity: critical annotations: summary: Cassandra client request write failure (Instaclustr) (instance {{ $labels.instance }}) description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestReadFailure(instaclustr) expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0' for: 2m labels: severity: critical annotations: summary: Cassandra client request read failure (Instaclustr) (instance {{ $labels.instance }}) description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/ceph/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: CephState expr: 'ceph_health_status != 0' for: 0m labels: severity: critical annotations: summary: Ceph State (instance {{ $labels.instance }}) description: "Ceph instance unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephMonitorClockSkew expr: 'abs(ceph_monitor_clock_skew_seconds) > 0.2' for: 2m labels: severity: warning annotations: summary: Ceph monitor clock skew (instance {{ $labels.instance }}) description: "Ceph monitor clock skew detected. 
Please check ntp and hardware clock settings\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephMonitorLowSpace expr: 'ceph_monitor_avail_percent < 10' for: 2m labels: severity: warning annotations: summary: Ceph monitor low space (instance {{ $labels.instance }}) description: "Ceph monitor storage is low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephOsdDown expr: 'ceph_osd_up == 0' for: 0m labels: severity: critical annotations: summary: Ceph OSD Down (instance {{ $labels.instance }}) description: "Ceph Object Storage Daemon Down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephHighOsdLatency expr: 'ceph_osd_perf_apply_latency_seconds > 5' for: 1m labels: severity: warning annotations: summary: Ceph high OSD latency (instance {{ $labels.instance }}) description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephOsdLowSpace expr: 'ceph_osd_utilization > 90' for: 2m labels: severity: warning annotations: summary: Ceph OSD low space (instance {{ $labels.instance }}) description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephOsdReweighted expr: 'ceph_osd_weight < 1' for: 2m labels: severity: warning annotations: summary: Ceph OSD reweighted (instance {{ $labels.instance }}) description: "Ceph Object Storage Daemon takes too much time to resize.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephPgDown expr: 'ceph_pg_down > 0' for: 0m labels: severity: critical annotations: summary: Ceph PG down (instance {{ $labels.instance }}) description: "Some Ceph placement groups are down. 
Please ensure that all the data are available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephPgIncomplete expr: 'ceph_pg_incomplete > 0' for: 0m labels: severity: critical annotations: summary: Ceph PG incomplete (instance {{ $labels.instance }}) description: "Some Ceph placement groups are incomplete. Please ensure that all the data are available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephPgInconsistent expr: 'ceph_pg_inconsistent > 0' for: 0m labels: severity: warning annotations: summary: Ceph PG inconsistent (instance {{ $labels.instance }}) description: "Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephPgActivationLong expr: 'ceph_pg_activating > 0' for: 2m labels: severity: warning annotations: summary: Ceph PG activation long (instance {{ $labels.instance }}) description: "Some Ceph placement groups are too long to activate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephPgBackfillFull expr: 'ceph_pg_backfill_toofull > 0' for: 2m labels: severity: warning annotations: summary: Ceph PG backfill full (instance {{ $labels.instance }}) description: "Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. 
Please check OSDs, change weight or reconfigure CRUSH rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephPgUnavailable expr: 'ceph_pg_total - ceph_pg_active > 0' for: 0m labels: severity: critical annotations: summary: Ceph PG unavailable (instance {{ $labels.instance }}) description: "Some Ceph placement groups are unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/cert-manager/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: Cert-managerAbsent expr: 'absent(up{job="cert-manager"})' for: 10m labels: severity: critical annotations: summary: Cert-Manager absent (instance {{ $labels.instance }}) description: "Cert-Manager has disappeared from Prometheus service discovery. New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 21 days is a rough default. ACME certificates are typically renewed 30 days before expiry, so expiring within 21 days may indicate issuer misconfiguration. 
- alert: Cert-managerCertificateExpiringSoon expr: 'avg by (exported_namespace, namespace, name) (certmanager_certificate_expiration_timestamp_seconds - time()) < (21 * 24 * 3600)' for: 1h labels: severity: warning annotations: summary: Cert-Manager certificate expiring soon (instance {{ $labels.instance }}) description: "The certificate {{ $labels.name }} is expiring in less than 21 days.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: Cert-managerCertificateNotReady expr: 'max by (name, exported_namespace, namespace, condition) (certmanager_certificate_ready_status{condition!="True"} == 1)' for: 10m labels: severity: critical annotations: summary: Cert-Manager certificate not ready (instance {{ $labels.instance }}) description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version. - alert: Cert-managerHittingAcmeRateLimits expr: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Cert-Manager hitting ACME rate limits (instance {{ $labels.instance }}) description: "Cert-Manager is being rate-limited by the ACME provider. Certificate issuance and renewal may be blocked for up to a week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/cilium/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: # Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+). 
- alert: CiliumAgentUnreachableNodes expr: 'sum(cilium_unreachable_nodes{}) by (pod) > 0' for: 15m labels: severity: warning annotations: summary: Cilium agent unreachable nodes (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+). - alert: CiliumAgentUnreachableHealthEndpoints expr: 'sum(cilium_unreachable_health_endpoints{}) by (pod) > 0' for: 15m labels: severity: warning annotations: summary: Cilium agent unreachable health endpoints (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+). - alert: CiliumAgentFailingControllers expr: 'sum(cilium_controllers_failing{}) by (pod) > 0' for: 5m labels: severity: warning annotations: summary: Cilium agent failing controllers (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} has {{ $value }} failing controller(s). 
Check cilium-agent logs for details.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentEndpointFailures expr: 'sum(cilium_endpoint_state{endpoint_state="invalid"}) by (pod) > 0' for: 5m labels: severity: warning annotations: summary: Cilium agent endpoint failures (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentEndpointRegenerationFailures expr: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0' for: 5m labels: severity: warning annotations: summary: Cilium agent endpoint regeneration failures (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentEndpointUpdateFailure expr: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0' for: 5m labels: severity: warning annotations: summary: Cilium agent endpoint update failure (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentEndpointCreateFailure expr: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0' for: 5m labels: severity: info annotations: summary: Cilium agent endpoint create failure (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. 
New pods may fail to get networking.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentMapOperationFailures expr: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0' for: 5m labels: severity: warning annotations: summary: Cilium agent map operation failures (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} has eBPF map operation failures on {{ $labels.map_name }}. Datapath may be degraded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Map pressure is a ratio from 0 to 1. At 1.0, the map is full and new entries will be dropped. - alert: CiliumAgentBpfMapPressure expr: 'cilium_bpf_map_pressure{} > 0.9' for: 5m labels: severity: warning annotations: summary: Cilium agent BPF map pressure (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} eBPF map {{ $labels.map_name }} is above 90% utilization. Map may become full.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentConntrackTableFull expr: 'sum(rate(cilium_drop_count_total{reason="CT: Map insertion failed"}[5m])) by (pod) > 0' for: 5m labels: severity: critical annotations: summary: Cilium agent conntrack table full (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentConntrackFailedGarbageCollection expr: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0' for: 5m labels: severity: warning annotations: summary: Cilium agent conntrack failed garbage collection (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} conntrack garbage collection is failing. 
Stale entries may accumulate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentNatTableFull expr: 'sum(rate(cilium_drop_count_total{reason="No mapping for NAT masquerade"}[1m])) by (pod) > 0' for: 5m labels: severity: critical annotations: summary: Cilium agent NAT table full (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} NAT table is full, causing masquerade failures. Increase NAT map size or investigate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Policy denials may be expected behavior. Investigate only if unexpected traffic is being blocked. - alert: CiliumAgentHighDeniedRate expr: 'sum(rate(cilium_drop_count_total{reason="Policy denied"}[1m])) by (pod) > 0' for: 10m labels: severity: info annotations: summary: Cilium agent high denied rate (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentHighDropRate expr: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0' for: 5m labels: severity: warning annotations: summary: Cilium agent high drop rate (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} is dropping packets for reason {{ $labels.reason }}. This indicates infrastructure issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # max() rather than sum(): map pressure is a per-map ratio, so adding several policy maps together can exceed 0.9 without any single map being close to full. - alert: CiliumAgentPolicyMapPressure expr: 'max(cilium_bpf_map_pressure{map_name=~"cilium_policy_.*"}) by (pod) > 0.9' for: 5m labels: severity: warning annotations: summary: Cilium agent policy map pressure (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization.
New policies may fail to apply.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentPolicyImportErrors expr: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0' for: 5m labels: severity: warning annotations: summary: Cilium agent policy import errors (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} is failing to import network policies. Policy enforcement may be incomplete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity. histogram_quantile() needs the _bucket series (the ones carrying the le label) of the histogram. - alert: CiliumAgentPolicyImplementationDelay expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60' for: 5m labels: severity: warning annotations: summary: Cilium agent policy implementation delay (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumNode-localHighIdentityAllocation expr: '(sum(cilium_identity{type="node_local"}) by (pod) / (2^16-1)) > 0.8' for: 5m labels: severity: warning annotations: summary: Cilium node-local high identity allocation (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} node-local identity allocation is above 80%. Approaching the 65535 identity limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumClusterHighIdentityAllocation expr: '(sum(cilium_identity{type="cluster_local"}) by () / (2^16-256)) > 0.8' for: 5m labels: severity: warning annotations: summary: Cilium cluster high identity allocation (instance {{ $labels.instance }}) description: "Cilium cluster-wide identity allocation is above 80%.
Approaching the maximum identity limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumOperatorExhaustedIpamIps expr: 'sum(cilium_operator_ipam_ips{type="available"}) by () <= 0' for: 5m labels: severity: critical annotations: summary: Cilium operator exhausted IPAM IPs (instance {{ $labels.instance }}) description: "Cilium operator has no available IPAM IPs. New pods will fail to schedule networking.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 90% is a rough default. Adjust based on your pod churn rate and IP pool size. - alert: CiliumOperatorLowAvailableIpamIps expr: 'sum(cilium_operator_ipam_ips{type!="available"}) by () / sum(cilium_operator_ipam_ips) by () > 0.9 and sum(cilium_operator_ipam_ips) by () > 0' for: 5m labels: severity: warning annotations: summary: Cilium operator low available IPAM IPs (instance {{ $labels.instance }}) description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Some Cilium versions may not have a status label on this metric. Verify against your Cilium version. - alert: CiliumOperatorIpamInterfaceCreationFailures expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0' for: 10m labels: severity: warning annotations: summary: Cilium operator IPAM interface creation failures (instance {{ $labels.instance }}) description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentApiErrors expr: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0' for: 5m labels: severity: warning annotations: summary: Cilium agent API errors (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). 
Agent may be unhealthy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentKubernetesClientErrors expr: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0' for: 5m labels: severity: info annotations: summary: Cilium agent Kubernetes client errors (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} is receiving errors from K8s API for endpoint {{ $labels.endpoint }} ({{ $labels.return_code }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumClustermeshRemoteClusterNotReady expr: 'count(cilium_clustermesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0' for: 5m labels: severity: critical annotations: summary: Cilium ClusterMesh remote cluster not ready (instance {{ $labels.instance }}) description: "Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumClustermeshRemoteClusterFailing expr: 'sum(rate(cilium_clustermesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0' for: 5m labels: severity: critical annotations: summary: Cilium ClusterMesh remote cluster failing (instance {{ $labels.instance }}) description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumKvstoremeshRemoteClusterNotReady expr: 'count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0' for: 5m labels: severity: critical annotations: summary: Cilium KVStoreMesh remote cluster not ready (instance {{ $labels.instance }}) description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: 
CiliumKvstoremeshRemoteClusterFailing expr: 'sum(rate(cilium_kvstoremesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0' for: 5m labels: severity: critical annotations: summary: Cilium KVStoreMesh remote cluster failing (instance {{ $labels.instance }}) description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumKvstoremeshSyncErrors expr: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0' for: 5m labels: severity: critical annotations: summary: Cilium KVStoreMesh sync errors (instance {{ $labels.instance }}) description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumHubbleLostEvents expr: 'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0' for: 5m labels: severity: warning annotations: summary: Cilium Hubble lost events (instance {{ $labels.instance }}) description: "Cilium Hubble on {{ $labels.pod }} is losing flow events. Observability data may be incomplete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 10% is a rough default. Some DNS errors may be normal depending on your workload. 
- alert: CiliumHubbleHighDnsErrorRate expr: 'sum(rate(hubble_dns_responses_total{rcode!="No Error"}[5m])) by (pod) / sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0.1 and sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0' for: 5m labels: severity: warning annotations: summary: Cilium Hubble high DNS error rate (instance {{ $labels.instance }}) description: "Cilium Hubble on {{ $labels.pod }} is observing more than 10% DNS error responses.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/clickhouse/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: # Adjust the job label to match your Prometheus configuration. - alert: ClickhouseNodeDown expr: 'up{job="clickhouse"} == 0' for: 2m labels: severity: critical annotations: summary: ClickHouse node down (instance {{ $labels.instance }}) description: "No metrics received from ClickHouse exporter for over 2 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseMemoryUsageCritical expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0' for: 5m labels: severity: critical annotations: summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }}) description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseMemoryUsageWarning expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0' for: 5m labels: severity: warning annotations: summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }}) description: "Memory usage is over 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseDiskSpaceLowOnDefault expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / 
(ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0' for: 2m labels: severity: warning annotations: summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }}) description: "Disk space on default is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseDiskSpaceCriticalOnDefault expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0' for: 2m labels: severity: critical annotations: summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }}) description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseDiskSpaceLowOnBackups expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0' for: 2m labels: severity: warning annotations: summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }}) description: "Disk space on backups is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseReplicaErrors expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1' for: 0m labels: severity: critical annotations: summary: ClickHouse Replica Errors (instance {{ $labels.instance }}) description: "Critical replica errors detected, either all replicas are stale or lost.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseNoAvailableReplicas expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1' for: 0m labels: 
severity: critical annotations: summary: ClickHouse No Available Replicas (instance {{ $labels.instance }}) description: "No available replicas in ClickHouse.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseNoLiveReplicas expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1' for: 0m labels: severity: critical annotations: summary: ClickHouse No Live Replicas (instance {{ $labels.instance }}) description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Please replace the threshold with an appropriate value - alert: ClickhouseHighTcpConnections expr: 'ClickHouseMetrics_TCPConnection > 400' for: 5m labels: severity: warning annotations: summary: ClickHouse High TCP Connections (instance {{ $labels.instance }}) description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Adjust the threshold based on your cluster size and expected replication traffic. 
- alert: ClickhouseInterserverConnectionIssues expr: 'ClickHouseMetrics_InterserverConnection > 50' for: 5m labels: severity: warning annotations: summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }}) description: "High number of interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseZookeeperConnectionIssues expr: 'ClickHouseMetrics_ZooKeeperSession != 1' for: 3m labels: severity: warning annotations: summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }}) description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseAuthenticationFailures expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3' for: 0m labels: severity: info annotations: summary: ClickHouse Authentication Failures (instance {{ $labels.instance }}) description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseAccessDeniedErrors expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3' for: 0m labels: severity: info annotations: summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }}) description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseRejectedInsertQueries expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0' for: 1m labels: severity: warning annotations: summary: ClickHouse rejected insert queries (instance {{ $labels.instance }}) description: "INSERTs rejected due to too many active data parts. 
Reduce insert frequency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseDelayedInsertQueries expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0' for: 2m labels: severity: warning annotations: summary: ClickHouse delayed insert queries (instance {{ $labels.instance }}) description: "INSERTs delayed due to high number of active parts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseZookeeperHardwareException expr: 'increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0' for: 1m labels: severity: critical annotations: summary: ClickHouse zookeeper hardware exception (instance {{ $labels.instance }}) description: "Zookeeper hardware exception: network issues communicating with ZooKeeper\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Please replace the threshold with an appropriate value - alert: ClickhouseHighNetworkUsage expr: 'rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024' for: 2m labels: severity: warning annotations: summary: ClickHouse high network usage (instance {{ $labels.instance }}) description: "High network usage. 
ClickHouse network usage exceeds 100MB/s.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseDistributedRejectedInserts expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0' for: 2m labels: severity: critical annotations: summary: ClickHouse distributed rejected inserts (instance {{ $labels.instance }}) description: "INSERTs into Distributed tables rejected due to pending bytes limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/cloudflare/lablabs-cloudflare-exporter.yml ================================================ groups: - name: LablabsCloudflareExporter rules: - alert: CloudflareHttp4xxErrorRate expr: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[15m])) > 0' for: 0m labels: severity: warning annotations: summary: Cloudflare http 4xx error rate (instance {{ $labels.instance }}) description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CloudflareHttp5xxErrorRate expr: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[5m])) > 0' for: 0m labels: severity: critical annotations: summary: Cloudflare http 5xx error rate (instance {{ $labels.instance }}) description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/consul/consul-exporter.yml ================================================ groups: - name: ConsulExporter rules: - alert: ConsulServiceHealthcheckFailed expr: 'consul_catalog_service_node_healthy == 
0' for: 1m labels: severity: critical annotations: summary: Consul service healthcheck failed (instance {{ $labels.instance }}) description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ConsulMissingMasterNode expr: 'consul_raft_peers < 3' for: 0m labels: severity: critical annotations: summary: Consul missing master node (instance {{ $labels.instance }}) description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ConsulAgentUnhealthy expr: 'consul_health_node_status{status="critical"} == 1' for: 0m labels: severity: critical annotations: summary: Consul agent unhealthy (instance {{ $labels.instance }}) description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/coredns/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: CorednsPanicCount expr: 'increase(coredns_panics_total[1m]) > 0' for: 0m labels: severity: critical annotations: summary: CoreDNS Panic Count (instance {{ $labels.instance }}) description: "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/cortex/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: CortexRulerConfigurationReloadFailure expr: 'cortex_ruler_config_last_reload_successful != 1' for: 0m labels: severity: warning annotations: summary: Cortex ruler configuration reload failure (instance {{ $labels.instance }}) description: "Cortex ruler configuration reload failure (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CortexNotConnectedToAlertmanager expr: 
'cortex_prometheus_notifications_alertmanagers_discovered < 1' for: 0m labels: severity: critical annotations: summary: Cortex not connected to Alertmanager (instance {{ $labels.instance }}) description: "Cortex not connected to Alertmanager (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: CortexNotificationAreBeingDropped expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05' for: 0m labels: severity: critical annotations: summary: Cortex notification are being dropped (instance {{ $labels.instance }}) description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: CortexNotificationError expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05' for: 0m labels: severity: critical annotations: summary: Cortex notification error (instance {{ $labels.instance }}) description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CortexIngesterUnhealthy expr: 'cortex_ring_members{state="Unhealthy", name="ingester"} > 0' for: 0m labels: severity: critical annotations: summary: Cortex ingester unhealthy (instance {{ $labels.instance }}) description: "Cortex has an unhealthy ingester\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CortexFrontendQueriesStuck expr: 'sum by (job) (cortex_query_frontend_queue_length) > 0' for: 5m labels: severity: critical annotations: summary: Cortex frontend queries stuck (instance {{ $labels.instance }}) description: "There are queued up queries in query-frontend.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: 
dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml ================================================ groups: - name: GesellixCouchdbPrometheusExporter rules: - alert: CouchdbNodeDown expr: 'couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0' for: 2m labels: severity: critical annotations: summary: CouchDB node down (instance {{ $labels.instance }}) description: "CouchDB node is not responding (node_up metric is 0) for more than 2 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbAtomMemoryUsageCritical expr: 'couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom' for: 5m labels: severity: critical annotations: summary: CouchDB atom memory usage critical (instance {{ $labels.instance }}) description: "Atom memory usage is above 90% of limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbOpenDatabasesCritical expr: 'couchdb_httpd_open_databases > 0.9 * 1000' for: 5m labels: severity: critical annotations: summary: CouchDB open databases critical (instance {{ $labels.instance }}) description: "Number of open databases exceeds 90% of node capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbOpenOsFilesCritical expr: 'couchdb_httpd_open_os_files > 0.9 * 65535' for: 5m labels: severity: critical annotations: summary: CouchDB open OS files critical (instance {{ $labels.instance }}) description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The 5xx rates are summed over the code label so that the numerator can be matched one-to-one against the total request rate. NOTE(review): assumes couchdb_httpd_requests carries no code label - confirm against your exporter version. - alert: Couchdb5xxErrorRatioHigh expr: 'sum without (code) (rate(couchdb_httpd_status_codes{code=~"5.."}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0' for: 5m labels: severity: critical annotations: summary: CouchDB 5xx error ratio high (instance {{ $labels.instance }}) description: "More than 5% of HTTP requests are returning 5xx errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbTemporaryViewReadRateCritical expr:
'rate(couchdb_httpd_temporary_view_reads[5m]) > 100' for: 5m labels: severity: critical annotations: summary: CouchDB temporary view read rate critical (instance {{ $labels.instance }}) description: "Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbMangoQueriesScanningTooManyDocs expr: 'rate(couchdb_mango_too_many_docs_scanned[5m]) > 50' for: 5m labels: severity: warning annotations: summary: CouchDB Mango queries scanning too many docs (instance {{ $labels.instance }}) description: "Some Mango queries are scanning too many documents, consider adding indexes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbMangoQueriesFailedDueToInvalidIndex expr: 'rate(couchdb_mango_query_invalid_index[5m]) > 5' for: 5m labels: severity: warning annotations: summary: CouchDB Mango queries failed due to invalid index (instance {{ $labels.instance }}) description: "Some Mango queries failed to execute because the index was missing or invalid\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbMangoDocsExaminedHigh expr: 'rate(couchdb_mango_docs_examined[5m]) > 1000' for: 5m labels: severity: warning annotations: summary: CouchDB Mango docs examined high (instance {{ $labels.instance }}) description: "High number of documents examined per Mango queries, consider indexing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbReplicatorManagerDied expr: 'increase(couchdb_replicator_changes_manager_deaths[5m]) > 0' for: 1m labels: severity: critical annotations: summary: CouchDB Replicator manager died (instance {{ $labels.instance }}) description: "Replication manager process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbReplicatorQueueProcessDied expr: 'increase(couchdb_replicator_changes_queue_deaths[5m]) > 0' for: 1m labels: severity: critical annotations: summary: CouchDB Replicator queue process died (instance 
{{ $labels.instance }}) description: "Replication queue process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbReplicatorReaderProcessDied expr: 'increase(couchdb_replicator_changes_reader_deaths[5m]) > 0' for: 1m labels: severity: critical annotations: summary: CouchDB Replicator reader process died (instance {{ $labels.instance }}) description: "Replication reader process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbReplicatorFailedToStart expr: 'increase(couchdb_replicator_failed_starts[5m]) > 0' for: 1m labels: severity: critical annotations: summary: CouchDB Replicator failed to start (instance {{ $labels.instance }}) description: "One or more replication tasks failed to start\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbReplicationClusterUnstable expr: 'couchdb_replicator_cluster_is_stable == 0' for: 2m labels: severity: critical annotations: summary: CouchDB replication cluster unstable (instance {{ $labels.instance }}) description: "The replication cluster is unstable, replication may be interrupted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbReplicationReadFailures expr: 'increase(couchdb_replicator_changes_read_failures[5m]) > 5' for: 5m labels: severity: warning annotations: summary: CouchDB replication read failures (instance {{ $labels.instance }}) description: "Replication changes feed has failed reads more than 5 times in 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbFileDescriptorsHigh expr: 'process_open_fds / process_max_fds > 0.85 and process_max_fds > 0' for: 5m labels: severity: warning annotations: summary: CouchDB file descriptors high (instance {{ $labels.instance }}) description: "Process is using more than 85% of allowed file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbProcessRestarted expr: 'changes(process_start_time_seconds[1h]) > 0' for: 1m labels: severity: info 
annotations: summary: CouchDB process restarted (instance {{ $labels.instance }}) description: "CouchDB process has restarted recently\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbCriticalLogEntries expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 0' for: 1m labels: severity: critical annotations: summary: CouchDB critical log entries (instance {{ $labels.instance }}) description: "Critical or error log entries detected in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/digitalocean/digitalocean-exporter.yml ================================================ groups: - name: DigitaloceanExporter rules: - alert: DigitaloceanDropletDown expr: 'digitalocean_droplet_up == 0' for: 5m labels: severity: critical annotations: summary: DigitalOcean droplet down (instance {{ $labels.instance }}) description: "DigitalOcean droplet {{ $labels.name }} ({{ $labels.id }}) in {{ $labels.region }} is not running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: DigitaloceanAccountNotActive expr: 'digitalocean_account_active != 1' for: 5m labels: severity: critical annotations: summary: DigitalOcean account not active (instance {{ $labels.instance }}) description: "DigitalOcean account is not active. 
It may be suspended or locked.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: DigitaloceanDatabaseDown expr: 'digitalocean_database_status == 0' for: 2m labels: severity: critical annotations: summary: DigitalOcean database down (instance {{ $labels.instance }}) description: "DigitalOcean managed database {{ $labels.name }} ({{ $labels.engine }}) in {{ $labels.region }} is offline.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: DigitaloceanKubernetesClusterDown expr: 'digitalocean_kubernetes_cluster_up == 0' for: 5m labels: severity: critical annotations: summary: DigitalOcean Kubernetes cluster down (instance {{ $labels.instance }}) description: "DigitalOcean Kubernetes cluster {{ $labels.name }} ({{ $labels.version }}) in {{ $labels.region }} is not running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: DigitaloceanLoadBalancerDown expr: 'digitalocean_loadbalancer_status == 0' for: 2m labels: severity: critical annotations: summary: DigitalOcean load balancer down (instance {{ $labels.instance }}) description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) is not active.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: DigitaloceanLoadBalancerNoBackends expr: 'digitalocean_loadbalancer_droplets == 0' for: 1m labels: severity: warning annotations: summary: DigitalOcean load balancer no backends (instance {{ $labels.instance }}) description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) has no droplets attached.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: DigitaloceanFloatingIpNotAssigned expr: 'digitalocean_floating_ipv4_active == 0' for: 0m labels: severity: warning annotations: summary: DigitalOcean floating IP not assigned (instance {{ $labels.instance }}) description: "DigitalOcean floating IP {{ $labels.ipv4 }} in {{ $labels.region }} is not assigned to any droplet.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: DigitaloceanActiveIncidents expr: 
'digitalocean_incidents_total > 0' for: 0m labels: severity: warning annotations: summary: DigitalOcean active incidents (instance {{ $labels.instance }}) description: "DigitalOcean platform has {{ $value }} active incident(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: DigitaloceanExporterCollectionErrors expr: 'increase(digitalocean_errors_total[5m]) > 0' for: 5m labels: severity: warning annotations: summary: DigitalOcean exporter collection errors (instance {{ $labels.instance }}) description: "DigitalOcean exporter {{ $labels.collector }} collector has {{ $value }} errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Fires when more than 80% of the account's droplet limit is in use. Droplets are counted per (instance, job) and matched on those labels, because an ungrouped count() has no labels and would never match the limit series. - alert: DigitaloceanDropletLimitApproaching expr: '(count by (instance, job) (digitalocean_droplet_up) / on (instance, job) digitalocean_account_droplet_limit) * 100 > 80 and on (instance, job) digitalocean_account_droplet_limit > 0' for: 0m labels: severity: warning annotations: summary: DigitalOcean droplet limit approaching (instance {{ $labels.instance }}) description: "DigitalOcean account is using {{ $value }}% of its droplet quota.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/docker-containers/google-cadvisor.yml ================================================ groups: - name: GoogleCadvisor rules: # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. - alert: ContainerKilled expr: 'time() - container_last_seen > 60' for: 0m labels: severity: warning annotations: summary: Container killed (instance {{ $labels.instance }}) description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- alert: ContainerAbsent
  expr: 'absent(container_last_seen)'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Container absent (instance {{ $labels.instance }})
    description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard.
- alert: ContainerHighCpuUtilization
  expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Container High CPU utilization (instance {{ $labels.instance }})
    description: "Container CPU utilization is above 80% (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
- alert: ContainerHighMemoryUsage
  expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Container High Memory usage (instance {{ $labels.instance }})
    description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Inode usage per instance. The same name!="" selector is applied to the free
# count, the total and the non-zero guard so all three sums cover the same series.
- alert: ContainerVolumeUsage
  expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total{name!=""}) BY (instance) > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Container Volume usage (instance {{ $labels.instance }})
    description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Ratio of throttled CFS periods to all CFS periods; the trailing clause skips groups with no periods.
- alert: ContainerHighThrottleRate
  expr: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Container high throttle rate (instance {{ $labels.instance }})
    description: "Container is being throttled ({{ $value | humanizePercentage }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: ContainerHighLowChangeCpuUsage
  expr: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Container high low change CPU usage (instance {{ $labels.instance }})
    description: "This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: ContainerLowCpuUtilization
  expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
  for: 7d
  labels:
    severity: info
  annotations:
    summary: Container Low CPU utilization (instance {{ $labels.instance }})
    description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: ContainerLowMemoryUsage
  expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
  for: 7d
  labels:
    severity: info
  annotations:
    summary: Container Low Memory usage (instance {{ $labels.instance }})
    description: "Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

================================================
FILE: dist/rules/ebpf/ebpf-exporter.yml
================================================
groups:

- name: EbpfExporter

  rules:

    # The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running.
    - alert: EbpfExporterProgramNotAttached
      expr: 'ebpf_exporter_ebpf_program_attached == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: eBPF exporter program not attached (instance {{ $labels.instance }})
        description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EbpfExporterDecoderErrors
      expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: eBPF exporter decoder errors (instance {{ $labels.instance }})
        description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Matches both a reported value of 0 and the metric being missing entirely.
    - alert: EbpfExporterNoEnabledConfigs
      expr: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: eBPF exporter no enabled configs (instance {{ $labels.instance }})
        description: "eBPF exporter has no enabled configurations. No eBPF programs are being run. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

================================================
FILE: dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
================================================
groups:

- name: PrometheusCommunityElasticsearchExporter

  rules:

    - alert: ElasticsearchHeapUsageTooHigh
      expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})
        description: "The heap usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: ElasticsearchHeapUsageWarning
      expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch Heap Usage warning (instance {{ $labels.instance }})
        description: "The heap usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Less than 10% of the data filesystem available, i.e. more than 90% used.
    - alert: ElasticsearchDiskOutOfSpace
      expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch disk out of space (instance {{ $labels.instance }})
        description: "The disk usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Less than 20% of the data filesystem available, i.e. more than 80% used.
    - alert: ElasticsearchDiskSpaceLow
      expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch disk space low (instance {{ $labels.instance }})
        description: "The disk usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: ElasticsearchClusterRed
      expr: 'elasticsearch_cluster_health_status{color="red"} == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch Cluster Red (instance {{ $labels.instance }})
        description: "Elastic Cluster Red status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: ElasticsearchClusterYellow
      expr: 'elasticsearch_cluster_health_status{color="yellow"} == 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
        description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # 1m delay allows a restart without triggering an alert.
    - alert: ElasticsearchHealthyNodes
      expr: 'elasticsearch_cluster_health_number_of_nodes < 3'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch Healthy Nodes (instance {{ $labels.instance }})
        description: "Missing node in Elasticsearch cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # 1m delay allows a restart without triggering an alert.
# Fires when fewer than 3 data nodes have been reported for 1m.
- alert: ElasticsearchHealthyDataNodes
  expr: 'elasticsearch_cluster_health_number_of_data_nodes < 3'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Elasticsearch Healthy Data Nodes (instance {{ $labels.instance }})
    description: "Missing data node in Elasticsearch cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Relocating shards: informational immediately, warning once sustained for 15m.
- alert: ElasticsearchRelocatingShards
  expr: 'elasticsearch_cluster_health_relocating_shards > 0'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Elasticsearch relocating shards (instance {{ $labels.instance }})
    description: "Elasticsearch is relocating shards\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: ElasticsearchRelocatingShardsTooLong
  expr: 'elasticsearch_cluster_health_relocating_shards > 0'
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch relocating shards too long (instance {{ $labels.instance }})
    description: "Elasticsearch has been relocating shards for 15min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Initializing shards: informational immediately, warning once sustained for 15m.
- alert: ElasticsearchInitializingShards
  expr: 'elasticsearch_cluster_health_initializing_shards > 0'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Elasticsearch initializing shards (instance {{ $labels.instance }})
    description: "Elasticsearch is initializing shards\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: ElasticsearchInitializingShardsTooLong
  expr: 'elasticsearch_cluster_health_initializing_shards > 0'
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch initializing shards too long (instance {{ $labels.instance }})
    description: "Elasticsearch has been initializing shards for 15 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: ElasticsearchUnassignedShards
  expr: 'elasticsearch_cluster_health_unassigned_shards > 0'
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Elasticsearch unassigned shards (instance {{ $labels.instance }})
    description: "Elasticsearch has unassigned shards\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchPendingTasks
  expr: 'elasticsearch_cluster_health_number_of_pending_tasks > 0'
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch pending tasks (instance {{ $labels.instance }})
    description: "Elasticsearch has pending tasks. Cluster works slowly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: ElasticsearchNoNewDocuments
  expr: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch no new documents (instance {{ $labels.instance }})
    description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Average seconds per indexing operation; the trailing clause skips series with no indexing activity.
- alert: ElasticsearchHighIndexingLatency
  expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
    description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: ElasticsearchHighIndexingRate
  expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
    description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: ElasticsearchHighQueryRate
  expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
    description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Average seconds per search query; the trailing clause skips series with no queries.
- alert: ElasticsearchHighQueryLatency
  expr: 'rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
    description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

================================================
FILE: dist/rules/envoy/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  rules:

    - alert: EnvoyServerNotLive
      expr: 'envoy_server_live != 1'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Envoy server not live (instance {{ $labels.instance }})
        description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyHighMemoryUsage
      expr: 'envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high memory usage (instance {{ $labels.instance }})
        description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyHighDownstreamHttp5xxErrorRate
      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Envoy high downstream HTTP 5xx error rate (instance {{ $labels.instance }})
        description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyHighDownstreamHttp4xxErrorRate
      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high downstream HTTP 4xx error rate (instance {{ $labels.instance }})
        description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyDownstreamConnectionsOverflowing
      expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Envoy downstream connections overflowing (instance {{ $labels.instance }})
        description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyClusterMembershipEmpty
      expr: 'envoy_cluster_membership_healthy == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Envoy cluster membership empty (instance {{ $labels.instance }})
        description: "Envoy cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} has no healthy members\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyClusterMembershipDegraded
      expr: 'envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy cluster membership degraded (instance {{ $labels.instance }})
        description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyHighClusterUpstreamConnectionFailures
      expr: 'increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
        description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyHighClusterUpstreamRequestTimeoutRate
      expr: 'rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high cluster upstream request timeout rate (instance {{ $labels.instance }})
        description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyHighClusterUpstream5xxErrorRate
      expr: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Envoy high cluster upstream 5xx error rate (instance {{ $labels.instance }})
        description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyClusterHealthCheckFailures
      expr: 'increase(envoy_cluster_health_check_failure[5m]) > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy cluster health check failures (instance {{ $labels.instance }})
        description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyClusterOutlierDetectionEjectionsActive
      expr: 'envoy_cluster_outlier_detection_ejections_active > 0'
      for: 5m
      labels:
        severity: info
      annotations:
        summary: Envoy cluster outlier detection ejections active (instance {{ $labels.instance }})
        description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyListenerSslConnectionErrors
      expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Envoy listener SSL connection errors (instance {{ $labels.instance }})
        description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyGlobalDownstreamConnectionsOverflowing
      expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Envoy global downstream connections overflowing (instance {{ $labels.instance }})
        description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoySslCertificateExpiringSoon
      expr: 'envoy_server_days_until_first_cert_expiring < 7'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Envoy SSL certificate expiring soon (instance {{ $labels.instance }})
        description: "SSL certificate loaded by Envoy on {{ $labels.instance }} expires in less than 7 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Compared with `<= 0` rather than `< 0`: a days-remaining gauge that bottoms out at 0
    # would never satisfy a strictly-negative comparison, so the alert could not fire.
    # NOTE(review): presumably Envoy reports 0 both for an expired certificate and for one
    # in its final day - confirm against the Envoy version in use.
    - alert: EnvoySslCertificateExpired
      expr: 'envoy_server_days_until_first_cert_expiring <= 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Envoy SSL certificate expired (instance {{ $labels.instance }})
        description: "SSL certificate loaded by Envoy on {{ $labels.instance }} has expired\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Either the connection or the request circuit breaker being open triggers this alert.
    - alert: EnvoyClusterCircuitBreakerTripped
      expr: 'envoy_cluster_circuit_breakers_default_cx_open == 1 or envoy_cluster_circuit_breakers_default_rq_open == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Envoy cluster circuit breaker tripped (instance {{ $labels.instance }})
        description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyNoHealthyUpstream
      expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Envoy no healthy upstream (instance {{ $labels.instance }})
        description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EnvoyHighDownstreamRequestTimeoutRate
      expr: 'increase(envoy_http_downstream_rq_timeout[5m]) > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high downstream request timeout rate (instance {{ $labels.instance }})
        description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

================================================
FILE: dist/rules/etcd/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  rules:

    # Fires when the number of etcd_server_id series is even.
    - alert: EtcdInsufficientMembers
      expr: 'count(etcd_server_id) % 2 == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Etcd insufficient Members (instance {{ $labels.instance }})
        description: "Etcd cluster should have an odd number of members\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EtcdNoLeader
      expr: 'etcd_server_has_leader == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Etcd no Leader (instance {{ $labels.instance }})
        description: "Etcd cluster has no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: EtcdHighNumberOfLeaderChanges
      expr: 'increase(etcd_server_leader_changes_seen_total[10m]) > 2'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Etcd high number of leader changes (instance {{ $labels.instance }})
        description: "Etcd leader changed {{ $value }} times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
    - alert: EtcdHighNumberOfFailedGrpcRequestsWarning
      expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Etcd high number of failed GRPC requests warning (instance {{ $labels.instance }})
        description: "More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
# Share of gRPC calls ending in one of the listed error codes, per service and method (5% threshold).
- alert: EtcdHighNumberOfFailedGrpcRequestsCritical
  expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Etcd high number of failed GRPC requests critical (instance {{ $labels.instance }})
    description: "More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: EtcdGrpcRequestsSlow
  expr: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
    description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Failed-to-received HTTP request ratio per method: 1% warns, 5% is critical.
- alert: EtcdHighNumberOfFailedHttpRequestsWarning
  expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }})
    description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: EtcdHighNumberOfFailedHttpRequestsCritical
  expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }})
    description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: EtcdHttpRequestsSlow
  expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Etcd HTTP requests slow (instance {{ $labels.instance }})
    description: "HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: EtcdMemberCommunicationSlow
  expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Etcd member communication slow (instance {{ $labels.instance }})
    description: "Etcd member communication slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: EtcdHighNumberOfFailedProposals
  expr: 'increase(etcd_server_proposals_failed_total[1h]) > 5'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
    description: "Etcd server got {{ $value }} failed proposals in the past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: EtcdHighFsyncDurations
  expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Etcd high fsync durations (instance {{ $labels.instance }})
    description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: EtcdHighCommitDurations
  expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Etcd high commit durations (instance {{ $labels.instance }})
    description: "Etcd commit duration increasing, 99th percentile is over 0.25s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

================================================
FILE: dist/rules/fluxcd/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  rules:

    # All Flux rules select gotk_resource_info series with ready="False" and differ only in the resource kinds matched.
    - alert: FluxKustomizationFailure
      expr: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Flux Kustomization Failure (instance {{ $labels.instance }})
        description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: FluxHelmreleaseFailure
      expr: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Flux HelmRelease Failure (instance {{ $labels.instance }})
        description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: FluxSourceIssue
      expr: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Flux Source Issue (instance {{ $labels.instance }})
        description: "Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: FluxImageIssue
      expr: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Flux Image Issue (instance {{ $labels.instance }})
        description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

================================================
FILE: dist/rules/freeswitch/znerol-freeswitch-exporter.yml
================================================
groups:

- name: ZnerolFreeswitchExporter

  rules:

    - alert: FreeswitchDown
      expr: 'freeswitch_up == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Freeswitch down (instance {{ $labels.instance }})
        description: "Freeswitch is unresponsive\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Active sessions as a percentage of the session limit: above 80% warns, above 90% is critical.
    - alert: FreeswitchSessionsWarning
      expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Freeswitch Sessions Warning (instance {{ $labels.instance }})
        description: "High sessions usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: FreeswitchSessionsCritical
      expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90 and freeswitch_session_limit > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Freeswitch Sessions Critical (instance {{ $labels.instance }})
        description: "High sessions usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

================================================
FILE: dist/rules/gitlab-ci/gitaly.yml
================================================
groups:

- name: Gitaly

  rules:

    # grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise.
    - alert: GitlabGitalyHighGrpcErrorRate
      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Gitaly high gRPC error rate (instance {{ $labels.instance }})
        description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
    # concurrency limits. This directly impacts users trying to push, pull, or clone.
    # This alert is derived from the GitLab Omnibus default rules.
# Percentage of Gitaly gRPC calls that end with the ResourceExhausted code.
- alert: GitlabGitalyResourceExhausted
  expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: GitLab Gitaly resource exhausted (instance {{ $labels.instance }})
    description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: GitlabGitalyHighRpcLatency
  expr: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le)) > 1'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: GitLab Gitaly high RPC latency (instance {{ $labels.instance }})
    description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: GitlabGitalyCpuThrottled
  expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: GitLab Gitaly CPU throttled (instance {{ $labels.instance }})
    description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: GitlabGitalyAuthenticationFailures
  expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: GitLab Gitaly authentication failures (instance {{ $labels.instance }})
    description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail.
# Check Gitaly service health and logs.
# Any transition into the "open" state within the last 5m fires immediately.
- alert: GitlabGitalyCircuitBreakerTripped
  expr: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: GitLab Gitaly circuit breaker tripped (instance {{ $labels.instance }})
    description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

================================================
FILE: dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
================================================
groups:

- name: GitlabBuiltInExporter

  rules:

    # Queued connections indicate Puma workers are saturated.
    # Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb.
    - alert: GitlabPumaHighQueuedConnections
      expr: 'puma_queued_connections > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Puma high queued connections (instance {{ $labels.instance }})
        description: "GitLab Puma has {{ $value }} queued connections on {{ $labels.instance }}. Requests are waiting for an available worker thread.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: GitlabPumaNoAvailablePoolCapacity
      expr: 'puma_pool_capacity == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: GitLab Puma no available pool capacity (instance {{ $labels.instance }})
        description: "GitLab Puma pool capacity on {{ $labels.instance }} has been at 0 for 5 minutes. All threads are busy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Compares the running worker count against the puma_workers gauge.
    - alert: GitlabPumaWorkersNotRunning
      expr: 'puma_running_workers < puma_workers'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Puma workers not running (instance {{ $labels.instance }})
        description: "GitLab Puma on {{ $labels.instance }} has {{ $value }} running workers out of expected total.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Threshold is 5% of all requests returning server errors.
    # Check GitLab logs at /var/log/gitlab/ for root cause.
# Percentage of requests whose status matches 5xx; the trailing clause skips periods with no traffic.
- alert: GitlabHighHttpErrorRate
  expr: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5 and sum(rate(http_requests_total[5m])) > 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: GitLab high HTTP error rate (instance {{ $labels.instance }})
    description: "GitLab is returning more than 5% HTTP 5xx errors on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Threshold of 10s may need adjustment based on your instance size and workload.
- alert: GitlabHighHttpRequestLatency
  expr: 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 10'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: GitLab high HTTP request latency (instance {{ $labels.instance }})
    description: "GitLab p95 HTTP request latency on {{ $labels.instance }} is above 10 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
# A sustained failure rate indicates background processing issues.
- alert: GitlabSidekiqJobsFailing
  expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0.1'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: GitLab Sidekiq jobs failing (instance {{ $labels.instance }})
    description: "GitLab Sidekiq jobs are failing at a rate of {{ $value }} per second on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# When running jobs approach the concurrency limit, new jobs will queue up.
# Consider scaling Sidekiq workers or increasing concurrency.
# Both sides are summed per instance so the comparison matches on instance and the annotations can name it. - alert: GitlabSidekiqQueueTooLarge expr: 'sum by (instance) (sidekiq_running_jobs) >= sum by (instance) (sidekiq_concurrency) * 0.9' for: 10m labels: severity: warning annotations: summary: GitLab Sidekiq queue too large (instance {{ $labels.instance }}) description: "GitLab Sidekiq has {{ $value }} running jobs, approaching concurrency limit on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. # The p95 is computed per worker class and per instance; the instance label is used in the annotations. - alert: GitlabSidekiqHighJobCompletionTime expr: 'histogram_quantile(0.95, sum(rate(sidekiq_jobs_completion_seconds_bucket[5m])) by (le, worker, instance)) > 300' for: 10m labels: severity: warning annotations: summary: GitLab Sidekiq high job completion time (instance {{ $labels.instance }}) description: "GitLab Sidekiq job p95 completion time on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. # High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes. # The p95 is computed per instance so the instance label used in the annotations is populated. - alert: GitlabSidekiqHighQueueLatency expr: 'histogram_quantile(0.95, sum(rate(sidekiq_jobs_queue_duration_seconds_bucket[5m])) by (le, instance)) > 60' for: 5m labels: severity: warning annotations: summary: GitLab Sidekiq high queue latency (instance {{ $labels.instance }}) description: "GitLab Sidekiq jobs on {{ $labels.instance }} are waiting more than 60 seconds before being processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # When the pool is near saturation, requests may block waiting for a connection. # Increase db_pool_size in gitlab.rb or investigate slow queries.
- alert: GitlabDatabaseConnectionPoolSaturation expr: 'gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90 and gitlab_database_connection_pool_size > 0' for: 5m labels: severity: warning annotations: summary: GitLab database connection pool saturation (instance {{ $labels.instance }}) description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) is {{ $value }}% busy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GitlabDatabaseConnectionPoolDeadConnections expr: 'gitlab_database_connection_pool_dead > 0' for: 5m labels: severity: warning annotations: summary: GitLab database connection pool dead connections (instance {{ $labels.instance }}) description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) has {{ $value }} dead connections.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GitlabDatabaseConnectionPoolWaiting expr: 'gitlab_database_connection_pool_waiting > 0' for: 5m labels: severity: warning annotations: summary: GitLab database connection pool waiting (instance {{ $labels.instance }}) description: "GitLab on {{ $labels.instance }} has {{ $value }} threads waiting for a database connection.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The p95 is computed per instance (le must stay in the grouping for histogram_quantile) so the instance label used in the annotations is populated. - alert: GitlabCiPipelineCreationSlow expr: 'histogram_quantile(0.95, sum(rate(gitlab_ci_pipeline_creation_duration_seconds_bucket[5m])) by (le, instance)) > 30' for: 5m labels: severity: warning annotations: summary: GitLab CI pipeline creation slow (instance {{ $labels.instance }}) description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This metric may not exist in all GitLab versions. Verify against your GitLab installation.
- alert: GitlabCiPipelineFailuresIncreasing expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0' for: 10m labels: severity: warning annotations: summary: GitLab CI pipeline failures increasing (instance {{ $labels.instance }}) description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Frequent runner auth failures may indicate expired tokens or misconfigured runners. - alert: GitlabCiRunnerAuthenticationFailures expr: 'increase(gitlab_ci_runner_authentication_failure_total[5m]) > 5' for: 5m labels: severity: warning annotations: summary: GitLab CI runner authentication failures (instance {{ $labels.instance }}) description: "GitLab CI runners are experiencing authentication failures on {{ $labels.instance }} ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 2GB may need adjustment based on your instance size. # High memory usage can lead to OOM kills and service disruptions. - alert: GitlabHighMemoryUsage expr: 'process_resident_memory_bytes{job=~".*gitlab.*"} > 2e+9' for: 10m labels: severity: warning annotations: summary: GitLab high memory usage (instance {{ $labels.instance }}) description: "GitLab process on {{ $labels.instance }} is using {{ $value | humanize1024 }}B of RSS memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Heap fragmentation above 50% means a significant amount of memory is wasted. # A Puma worker restart may help reclaim memory. - alert: GitlabRubyHeapFragmentation expr: 'ruby_gc_stat_ext_heap_fragmentation{job=~".*gitlab.*"} > 0.5' for: 15m labels: severity: warning annotations: summary: GitLab Ruby heap fragmentation (instance {{ $labels.instance }}) description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. 
High fragmentation wastes memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GitlabRackUncaughtErrors expr: 'rate(rack_uncaught_errors_total[5m]) > 0' for: 5m labels: severity: warning annotations: summary: GitLab rack uncaught errors (instance {{ $labels.instance }}) description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This may happen during a rolling deployment. If it persists, investigate incomplete upgrades. - alert: GitlabVersionMismatch expr: 'count(count by (version) (gitlab_build_info)) > 1' for: 0m labels: severity: warning annotations: summary: GitLab version mismatch (instance {{ $labels.instance }}) description: "Multiple GitLab versions are running across the fleet.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GitlabHighFileDescriptorUsage expr: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80 and process_max_fds > 0' for: 5m labels: severity: warning annotations: summary: GitLab high file descriptor usage (instance {{ $labels.instance }}) description: "GitLab on {{ $labels.instance }} is using {{ $value }}% of available file descriptors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GitlabRubyThreadsSaturated expr: 'sum by (instance) (gitlab_ruby_threads_running_threads) > on(instance) gitlab_ruby_threads_max_expected_threads * 1.5' for: 10m labels: severity: warning annotations: summary: GitLab Ruby threads saturated (instance {{ $labels.instance }}) description: "GitLab running threads on {{ $labels.instance }} have exceeded the expected maximum ({{ $value }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/gitlab-ci/workhorse.yml ================================================ groups: - name: Workhorse rules: # Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying. 
# Threshold from GitLab Omnibus default rules: 10% for high-traffic instances. # The ratio is computed per instance so that the instance label referenced in the annotations is populated. - alert: GitlabWorkhorseHighErrorRate expr: 'sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0' for: 5m labels: severity: critical annotations: summary: GitLab Workhorse high error rate (instance {{ $labels.instance }}) description: "GitLab Workhorse on {{ $labels.instance }} is returning more than 10% HTTP 5xx errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The p95 is computed per instance (le must stay in the grouping for histogram_quantile). - alert: GitlabWorkhorseHighLatency expr: 'histogram_quantile(0.95, sum(rate(gitlab_workhorse_http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10' for: 5m labels: severity: warning annotations: summary: GitLab Workhorse high latency (instance {{ $labels.instance }}) description: "GitLab Workhorse on {{ $labels.instance }} p95 request latency is above 10 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 100 may need adjustment based on instance size. - alert: GitlabWorkhorseHighIn-flightRequests expr: 'gitlab_workhorse_http_in_flight_requests > 100' for: 5m labels: severity: warning annotations: summary: GitLab Workhorse high in-flight requests (instance {{ $labels.instance }}) description: "GitLab Workhorse on {{ $labels.instance }} has {{ $value }} in-flight requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/golang/golang-exporter.yml ================================================ groups: - name: GolangExporter rules: # Threshold is a rough default. High-concurrency servers may legitimately run thousands of goroutines. Adjust to match your baseline.
- alert: GoGoroutineCountHigh expr: 'go_goroutines > 1000' for: 5m labels: severity: warning annotations: summary: Go goroutine count high (instance {{ $labels.instance }}) description: "Go application has too many goroutines (> 1000), potential goroutine leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # quantile="1" is the maximum observed GC pause in the current summary window, not p99. # A single outlier pause can push this above 1s. The for: 5m ensures the max stays elevated. - alert: GoGcDurationHigh expr: 'go_gc_duration_seconds{quantile="1"} > 1' for: 5m labels: severity: warning annotations: summary: Go GC duration high (instance {{ $labels.instance }}) description: "Go GC pause duration is too high (max > 1s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # go_memstats_sys_bytes is the total memory obtained from the OS by the Go runtime, not total host memory. # This ratio measures Go-internal memory utilization, not system-level memory pressure. - alert: GoMemoryUsageHigh expr: '(go_memstats_heap_alloc_bytes / go_memstats_sys_bytes) * 100 > 90' for: 5m labels: severity: warning annotations: summary: Go memory usage high (instance {{ $labels.instance }}) description: "Go heap allocation is using most of the runtime's reserved memory (> 90%), indicating the process may need more memory or has a leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline. - alert: GoThreadCountHigh expr: 'go_threads > 500' for: 5m labels: severity: warning annotations: summary: Go thread count high (instance {{ $labels.instance }}) description: "Go OS thread count is high (> 500), potential blocking syscall or CGo leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is a rough default. Adjust based on your application's normal object count. 
- alert: GoHeapObjectsCountHigh expr: 'go_memstats_heap_objects > 10000000' for: 5m labels: severity: warning annotations: summary: Go heap objects count high (instance {{ $labels.instance }}) description: "Go heap has too many live objects (> 10M), high GC pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # go_memstats_gc_cpu_fraction is deprecated since Go 1.20 and may return 0 in newer versions. # Consider using runtime/metrics-based alternatives if running Go >= 1.20. - alert: GoGcCpuFractionHigh expr: 'go_memstats_gc_cpu_fraction > 0.05' for: 5m labels: severity: warning annotations: summary: Go GC CPU fraction high (instance {{ $labels.instance }}) description: "Go GC is consuming too much CPU (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GoGoroutineSpike expr: 'deriv(go_goroutines[5m]) > 100' for: 5m labels: severity: warning annotations: summary: Go goroutine spike (instance {{ $labels.instance }}) description: "Go goroutine count is growing rapidly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GoHeapFragmentation expr: 'go_memstats_heap_idle_bytes / go_memstats_heap_sys_bytes > 0.9' for: 5m labels: severity: warning annotations: summary: Go heap fragmentation (instance {{ $labels.instance }}) description: "Go heap has high idle ratio (> 90%), indicating memory fragmentation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GoMemoryLeak expr: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9' for: 5m labels: severity: warning annotations: summary: Go memory leak (instance {{ $labels.instance }}) description: "Go application has sustained high allocation rate (> 1GB/s), potential memory leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GoStackMemoryHigh expr: 'go_memstats_stack_inuse_bytes > 1e9' for: 5m labels: severity: warning annotations: summary: Go stack memory high (instance {{ $labels.instance }}) description: "Go stack memory usage is high (> 1GB), likely excessive goroutines or deep 
recursion\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/google-cloud-stackdriver/stackdriver-exporter.yml ================================================ groups: - name: StackdriverExporter # Self-monitoring metrics use the stackdriver_monitoring_* prefix. # All self-monitoring metrics include a project_id label. rules: - alert: StackdriverExporterScrapeError expr: 'stackdriver_monitoring_last_scrape_error > 0' for: 5m labels: severity: warning annotations: summary: Stackdriver exporter scrape error (instance {{ $labels.instance }}) description: "Stackdriver exporter failed to scrape metrics from Google Cloud Monitoring API for project {{ $labels.project_id }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: StackdriverExporterSlowScrape expr: 'stackdriver_monitoring_last_scrape_duration_seconds > 300' for: 5m labels: severity: warning annotations: summary: Stackdriver exporter slow scrape (instance {{ $labels.instance }}) description: "Stackdriver exporter scrape for project {{ $labels.project_id }} is taking more than 5 minutes ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: StackdriverExporterScrapeErrorsIncreasing expr: 'increase(stackdriver_monitoring_scrape_errors_total[15m]) > 5' for: 0m labels: severity: warning annotations: summary: Stackdriver exporter scrape errors increasing (instance {{ $labels.instance }}) description: "Stackdriver exporter has had {{ $value }} scrape errors in the last 15 minutes for project {{ $labels.project_id }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: StackdriverExporterHighApiCalls expr: 'rate(stackdriver_monitoring_api_calls_total[5m]) * 60 > 100' for: 0m labels: severity: warning annotations: summary: Stackdriver exporter high API calls (instance {{ $labels.instance }}) description: "Stackdriver exporter is making {{ $value }} API calls per minute for project {{ $labels.project_id }}. 
This may hit Google Cloud Monitoring API quotas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: StackdriverExporterScrapeStale expr: 'time() - stackdriver_monitoring_last_scrape_timestamp > 600' for: 0m labels: severity: warning annotations: summary: Stackdriver exporter scrape stale (instance {{ $labels.instance }}) description: "Stackdriver exporter has not successfully scraped metrics for project {{ $labels.project_id }} in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/grafana-alloy/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: GrafanaAlloyServiceDown expr: 'count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)' for: 0m labels: severity: critical annotations: summary: Grafana Alloy service down (instance {{ $labels.instance }}) description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/grafana-mimir/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter # Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. This is intentional and expected. 
rules: - alert: MimirIngesterUnhealthy expr: 'min by (job) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0' for: 15m labels: severity: critical annotations: summary: Mimir ingester unhealthy (instance {{ $labels.instance }}) description: "Mimir has {{ $value }} unhealthy ingester(s) in the ring.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRequestErrors expr: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1 and sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 0' for: 15m labels: severity: critical annotations: summary: Mimir request errors (instance {{ $labels.instance }}) description: "Mimir {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirInconsistentRuntimeConfig expr: 'count(count by (job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1' for: 1h labels: severity: critical annotations: summary: Mimir inconsistent runtime config (instance {{ $labels.instance }}) description: "An inconsistent runtime config file is used across Mimir instances.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirBadRuntimeConfig expr: 'sum by (job) (cortex_runtime_config_last_reload_successful == 0) > 0' for: 5m labels: severity: critical annotations: summary: Mimir bad runtime config (instance {{ $labels.instance }}) description: "{{ $labels.job }} failed to reload runtime config.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirSchedulerQueriesStuck expr: 'sum by (job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0' for: 7m labels: severity: critical annotations: summary: Mimir scheduler queries stuck (instance {{ $labels.instance }}) description: "There are {{ $value 
}} queued up queries in {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirCacheRequestErrors expr: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5 and sum by (name, operation, job) (rate(thanos_cache_operations_total[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Mimir cache request errors (instance {{ $labels.instance }}) description: "Mimir cache {{ $labels.name }} is experiencing {{ printf \"%.2f\" $value }}% errors for {{ $labels.operation }} operation.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirKvStoreFailure expr: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1 and sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Mimir KV store failure (instance {{ $labels.instance }}) description: "Mimir {{ $labels.job }} KV store {{ $labels.kv_name }} is failing with 100% error rate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirMemoryMapAreasTooHigh expr: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80 and process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} > 0' for: 5m labels: severity: critical annotations: summary: Mimir memory map areas too high (instance {{ $labels.instance }}) description: "Mimir {{ $labels.job }} is using {{ printf \"%.0f\" $value }}% of its memory map area limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterInstanceHasNoTenants expr: '(cortex_ingester_memory_users == 0) and on (instance) (cortex_ingester_memory_users offset 1h > 0)' for: 1h labels: 
severity: warning annotations: summary: Mimir ingester instance has no tenants (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} has no tenants assigned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRulerInstanceHasNoRuleGroups expr: '(cortex_ruler_managers_total == 0) and on (instance) (cortex_ruler_managers_total offset 1h > 0)' for: 1h labels: severity: warning annotations: summary: Mimir ruler instance has no rule groups (instance {{ $labels.instance }}) description: "Mimir ruler {{ $labels.instance }} has no rule groups assigned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngestedDataTooFarInTheFuture expr: 'max by (job) (cortex_ingester_tsdb_head_max_timestamp_seconds - time() and cortex_ingester_tsdb_head_max_timestamp_seconds > 0) > 3600' for: 5m labels: severity: warning annotations: summary: Mimir ingested data too far in the future (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. 
- alert: MimirStoreGatewayTooManyFailedOperations expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0.05' for: 5m labels: severity: warning annotations: summary: Mimir store gateway too many failed operations (instance {{ $labels.instance }}) description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRingMembersMismatch expr: 'max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members))' for: 15m labels: severity: warning annotations: summary: Mimir ring members mismatch (instance {{ $labels.instance }}) description: "Mimir {{ $labels.name }} ring has inconsistent member counts across instances.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterReachingSeriesLimitWarning expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 80) and cortex_ingester_instance_limits{limit="max_series"} > 0' for: 3h labels: severity: warning annotations: summary: Mimir ingester reaching series limit warning (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterReachingSeriesLimitCritical expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 90) and cortex_ingester_instance_limits{limit="max_series"} > 0' for: 5m labels: severity: critical annotations: summary: Mimir ingester reaching series limit critical (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterReachingTenantsLimitWarning 
expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 70) and cortex_ingester_instance_limits{limit="max_tenants"} > 0' for: 5m labels: severity: warning annotations: summary: Mimir ingester reaching tenants limit warning (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterReachingTenantsLimitCritical expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 80) and cortex_ingester_instance_limits{limit="max_tenants"} > 0' for: 5m labels: severity: critical annotations: summary: Mimir ingester reaching tenants limit critical (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirReachingTcpConnectionsLimit expr: 'cortex_tcp_connections / cortex_tcp_connections_limit * 100 > 80 and cortex_tcp_connections_limit > 0' for: 5m labels: severity: critical annotations: summary: Mimir reaching TCP connections limit (instance {{ $labels.instance }}) description: "Mimir instance {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its TCP connections limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirDistributorInflightRequestsHigh expr: '(cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"} * 100 > 80) and cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0' for: 5m labels: severity: critical annotations: summary: Mimir distributor inflight requests high (instance {{ $labels.instance }}) description: "Mimir distributor {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its inflight push 
requests limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbHeadCompactionFailed expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0' for: 15m labels: severity: critical annotations: summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbHeadTruncationFailed expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0' for: 0m labels: severity: critical annotations: summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbCheckpointCreationFailed expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0' for: 0m labels: severity: critical annotations: summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbCheckpointDeletionFailed expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0' for: 0m labels: severity: critical annotations: summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbWalTruncationFailed expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0' for: 0m labels: severity: warning annotations: summary: Mimir ingester TSDB WAL truncation failed (instance {{ 
$labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbWalWritesFailed expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0' for: 3m labels: severity: critical annotations: summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold aligned with official Mimir mixin (30 minutes = 1800s); the description must state the same duration as the expr. - alert: MimirStoreGatewayHasNotSyncedBucket expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0' for: 5m labels: severity: critical annotations: summary: Mimir store gateway has not synced bucket (instance {{ $labels.instance }}) description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirStoreGatewayNoSyncedTenants expr: '(min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 1h > 0)' for: 1h labels: severity: warning annotations: summary: Mimir store gateway no synced tenants (instance {{ $labels.instance }}) description: "Mimir store-gateway {{ $labels.instance }} has no synced tenants.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirBucketIndexNotUpdated expr: 'min by (user, job) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100' for: 0m labels: severity: critical annotations: summary: Mimir bucket index not updated (instance {{ $labels.instance }}) description: "Mimir bucket index
for tenant {{ $labels.user }} has not been updated for more than 35 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirCompactorNotCleaningUpBlocks expr: '(time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 21600) and cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0' for: 1h labels: severity: critical annotations: summary: Mimir compactor not cleaning up blocks (instance {{ $labels.instance }}) description: "Mimir compactor {{ $labels.instance }} has not cleaned up blocks in the last 6 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirCompactorNotRunningCompaction expr: '(time() - cortex_compactor_last_successful_run_timestamp_seconds > 86400) and cortex_compactor_last_successful_run_timestamp_seconds > 0' for: 15m labels: severity: critical annotations: summary: Mimir compactor not running compaction (instance {{ $labels.instance }}) description: "Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirCompactorHasConsecutiveFailures expr: 'increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1' for: 0m labels: severity: critical annotations: summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }}) description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirCompactorHasRunOutOfDiskSpace expr: 'increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1' for: 0m labels: severity: critical annotations: summary: Mimir compactor has run out of disk space (instance {{ $labels.instance }}) description: "Mimir compactor {{ $labels.instance }} has run out of disk space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirCompactorHasNotUploadedBlocks expr: '(time() - 
thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 86400) and thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 0' for: 15m labels: severity: critical annotations: summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }}) description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Using 24h window per official mixin — compaction skips are rare events. - alert: MimirCompactorSkippedBlocks expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0' for: 5m labels: severity: warning annotations: summary: Mimir compactor skipped blocks (instance {{ $labels.instance }}) description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRulerTooManyFailedPushes expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Mimir ruler too many failed pushes (instance {{ $labels.instance }}) description: "Mimir ruler {{ $labels.instance }} is failing to push {{ printf \"%.2f\" $value }}% of write requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRulerTooManyFailedQueries expr: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Mimir ruler too many failed queries (instance {{ $labels.instance }}) description: "Mimir ruler {{ $labels.instance }} is failing {{ printf \"%.2f\" $value }}% of query 
evaluations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRulerMissedEvaluations expr: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1 and sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Mimir ruler missed evaluations (instance {{ $labels.instance }}) description: "Mimir ruler {{ $labels.instance }} is missing {{ printf \"%.2f\" $value }}% of rule group evaluations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirRulerFailedRingCheck expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05' for: 5m labels: severity: critical annotations: summary: Mimir ruler failed ring check (instance {{ $labels.instance }}) description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerSyncConfigsFailing expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0' for: 30m labels: severity: critical annotations: summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerRingCheckFailing expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0' for: 10m labels: severity: critical annotations: summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerStateMergeFailing expr: 
'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0' for: 10m labels: severity: critical annotations: summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerReplicationFailing expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0' for: 10m labels: severity: critical annotations: summary: Mimir alertmanager replication failing (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerPersistStateFailing expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0' for: 1h labels: severity: critical annotations: summary: Mimir alertmanager persist state failing (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerInitialSyncFailed expr: 'increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0' for: 0m labels: severity: warning annotations: summary: Mimir alertmanager initial sync failed (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} failed initial state sync.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerInstanceHasNoTenants expr: '(cortex_alertmanager_tenants_owned == 0) and on (instance) (cortex_alertmanager_tenants_owned offset 1h > 0)' for: 1h labels: severity: warning annotations: summary: Mimir alertmanager instance has no tenants (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.instance }} has no tenants assigned.\n VALUE = {{ $value }}\n LABELS = {{ $labels 
}}" - alert: MimirGossipMembersCountTooHigh expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 1.15 + 10 < max(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)' for: 20m labels: severity: warning annotations: summary: Mimir gossip members count too high (instance {{ $labels.instance }}) description: "Mimir gossip cluster has more members than expected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirGossipMembersCountTooLow expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 0.5 > min(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)' for: 20m labels: severity: warning annotations: summary: Mimir gossip members count too low (instance {{ $labels.instance }}) description: "Mimir gossip cluster has fewer members than expected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # A high number of Go threads may indicate a goroutine leak. - alert: MimirGoThreadsTooHighWarning expr: 'go_threads{job=~".*(mimir|cortex).*"} > 5000' for: 15m labels: severity: warning annotations: summary: Mimir go threads too high warning (instance {{ $labels.instance }}) description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirGoThreadsTooHighCritical expr: 'go_threads{job=~".*(mimir|cortex).*"} > 8000' for: 15m labels: severity: critical annotations: summary: Mimir go threads too high critical (instance {{ $labels.instance }}) description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/grafana-tempo/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: TempoDistributorUnhealthy expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="distributor"}) > 0' for: 15m labels: 
severity: warning annotations: summary: Tempo distributor unhealthy (instance {{ $labels.instance }}) description: "Tempo has {{ $value }} unhealthy distributor(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoLiveStoreUnhealthy expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="live-store"}) > 0' for: 15m labels: severity: critical annotations: summary: Tempo live store unhealthy (instance {{ $labels.instance }}) description: "Tempo has {{ $value }} unhealthy live store(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoMetricsGeneratorUnhealthy expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="metrics-generator"}) > 0' for: 15m labels: severity: critical annotations: summary: Tempo metrics generator unhealthy (instance {{ $labels.instance }}) description: "Tempo has {{ $value }} unhealthy metrics generator(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Uses a two-window approach: 1h for historical count and 5m to confirm the issue is ongoing. 
- alert: TempoCompactionsFailing expr: 'sum by (job) (increase(tempodb_compaction_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_compaction_errors_total[5m])) > 0' for: 1h labels: severity: critical annotations: summary: Tempo compactions failing (instance {{ $labels.instance }}) description: "{{ $value }} compactions have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoPollsFailing expr: 'sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0' for: 0m labels: severity: critical annotations: summary: Tempo polls failing (instance {{ $labels.instance }}) description: "{{ $value }} blocklist polls have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoTenantIndexFailures expr: 'sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0' for: 0m labels: severity: critical annotations: summary: Tempo tenant index failures (instance {{ $labels.instance }}) description: "{{ $value }} tenant index failures in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoNoTenantIndexBuilders expr: 'sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0' for: 5m labels: severity: critical annotations: summary: Tempo no tenant index builders (instance {{ $labels.instance }}) description: "No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 600s (10 minutes). Adjust based on your tenant index build interval. 
- alert: TempoTenantIndexTooOld expr: 'max by (tenant) (tempodb_blocklist_tenant_index_age_seconds) > 600' for: 5m labels: severity: critical annotations: summary: Tempo tenant index too old (instance {{ $labels.instance }}) description: "Tenant index for {{ $labels.tenant }} is {{ $value }}s old.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Fires when the blocklist grows more than 40% over 7 days. - alert: TempoBlockListRisingQuickly expr: '(avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40 and avg(tempodb_blocklist_length offset 7d) > 0' for: 15m labels: severity: critical annotations: summary: Tempo block list rising quickly (instance {{ $labels.instance }}) description: "Tempo blocklist length is up {{ printf \"%.0f\" $value }}% over the last 7 days. Consider scaling compactors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoBadOverrides expr: 'sum by (job) (tempo_runtime_config_last_reload_successful == 0) > 0' for: 15m labels: severity: critical annotations: summary: Tempo bad overrides (instance {{ $labels.instance }}) description: "{{ $labels.job }} failed to reload runtime overrides.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoUserConfigurableOverridesReloadFailing expr: 'sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[5m])) > 0' for: 0m labels: severity: critical annotations: summary: Tempo user configurable overrides reload failing (instance {{ $labels.instance }}) description: "{{ $value }} user-configurable overrides reloads have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 100 blocks per compactor instance. Adjust based on your environment. 
- alert: TempoCompactionTooManyOutstandingBlocksWarning expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 100' for: 6h labels: severity: warning annotations: summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }}) description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration. - alert: TempoCompactionTooManyOutstandingBlocksCritical expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250' for: 24h labels: severity: critical annotations: summary: Tempo compaction too many outstanding blocks critical (instance {{ $labels.instance }}) description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoDistributorUsageTrackerErrors expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0' for: 30m labels: severity: critical annotations: summary: Tempo distributor usage tracker errors (instance {{ $labels.instance }}) description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoMetricsGeneratorProcessorUpdatesFailing expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0' for: 15m labels: severity: critical annotations: summary: Tempo metrics generator processor updates failing (instance {{ $labels.instance }}) description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoMetricsGeneratorServiceGraphsDroppingSpans 
expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0' for: 15m labels: severity: warning annotations: summary: Tempo metrics generator service graphs dropping spans (instance {{ $labels.instance }}) description: "Tempo metrics generator is dropping {{ printf \"%.2f\" $value }}% of spans in service graphs for {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoMetricsGeneratorCollectionsFailing expr: 'sum by (job) (increase(tempo_metrics_generator_registry_collections_failed_total[5m])) > 2' for: 5m labels: severity: critical annotations: summary: Tempo metrics generator collections failing (instance {{ $labels.instance }}) description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching. 
- alert: TempoMemcachedErrorsElevated expr: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20 and sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 0' for: 10m labels: severity: warning annotations: summary: Tempo memcached errors elevated (instance {{ $labels.instance }}) description: "Tempo memcached error rate is {{ printf \"%.2f\" $value }}% for {{ $labels.name }} in {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/graph-node/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: ProviderFailedBecauseNet_versionFailed expr: 'eth_rpc_status == 1' for: 0m labels: severity: critical annotations: summary: Provider failed because net_version failed (instance {{ $labels.instance }}) description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ProviderFailedBecauseGetGenesisFailed expr: 'eth_rpc_status == 2' for: 0m labels: severity: critical annotations: summary: Provider failed because get genesis failed (instance {{ $labels.instance }}) description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ProviderFailedBecauseNet_versionTimeout expr: 'eth_rpc_status == 3' for: 0m labels: severity: critical annotations: summary: Provider failed because net_version timeout (instance {{ $labels.instance }}) description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ProviderFailedBecauseGetGenesisTimeout expr: 'eth_rpc_status == 4' for: 0m labels: severity: 
critical annotations: summary: Provider failed because get genesis timeout (instance {{ $labels.instance }}) description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: StoreConnectionSlow expr: 'store_connection_wait_time_ms > 10' for: 0m labels: severity: warning annotations: summary: Store connection slow (instance {{ $labels.instance }}) description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: StoreConnectionVerySlow expr: 'store_connection_wait_time_ms > 20' for: 0m labels: severity: critical annotations: summary: Store connection very slow (instance {{ $labels.instance }}) description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/hadoop/jmx_exporter.yml ================================================ groups: - name: Jmx_exporter rules: - alert: HadoopNameNodeDown expr: 'up{job="hadoop-namenode"} == 0' for: 5m labels: severity: critical annotations: summary: Hadoop Name Node Down (instance {{ $labels.instance }}) description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopResourceManagerDown expr: 'up{job="hadoop-resourcemanager"} == 0' for: 5m labels: severity: critical annotations: summary: Hadoop Resource Manager Down (instance {{ $labels.instance }}) description: "The Hadoop ResourceManager service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopDataNodeOutOfService expr: 'hadoop_datanode_last_heartbeat == 0' for: 10m labels: severity: warning annotations: summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }}) 
description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopHdfsDiskSpaceLow expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 and hadoop_hdfs_bytes_total > 0' for: 15m labels: severity: warning annotations: summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }}) description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopMapReduceTaskFailures expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100' for: 10m labels: severity: critical annotations: summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }}) description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopResourceManagerMemoryHigh expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8' for: 15m labels: severity: warning annotations: summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }}) description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopYarnContainerAllocationFailures expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10' for: 10m labels: severity: warning annotations: summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }}) description: "There is a significant number of YARN container allocation failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopHbaseRegionCountHigh expr: 'hadoop_hbase_region_count > 5000' for: 15m labels: severity: warning annotations: summary: Hadoop HBase Region Count High (instance {{ $labels.instance }}) description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopHbaseRegionServerHeapLow expr: 
'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8' for: 10m labels: severity: warning annotations: summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }}) description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopHbaseWriteRequestsLatencyHigh expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5' for: 10m labels: severity: warning annotations: summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }}) description: "HBase Write Requests are experiencing high latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/haproxy/embedded-exporter-v2.yml ================================================ groups: - name: EmbeddedExporterV2 rules: # The backend error-rate rules aggregate by proxy, so proxy is the only label left for the description. - alert: HaproxyHighHttp4xxErrorRateBackend expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp5xxErrorRateBackend expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ 
$labels }}" - alert: HaproxyHighHttp4xxErrorRateServer expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp5xxErrorRateServer expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerResponseErrors expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy server response errors (instance {{ $labels.instance }}) description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Aggregated by proxy, so the description can only name the proxy label. - alert: HaproxyBackendConnectionErrors expr: '(sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100' for: 1m labels: severity: critical annotations: summary: HAProxy backend connection errors (instance {{ $labels.instance }}) description: "Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). 
Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Aggregated by server so that the server label used in the description survives the aggregation. - alert: HaproxyServerConnectionErrors expr: '(sum by (server) (rate(haproxy_server_connection_errors_total[1m]))) > 100' for: 0m labels: severity: critical annotations: summary: HAProxy server connection errors (instance {{ $labels.instance }}) description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendMaxActiveSession>80% expr: '((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80' for: 2m labels: severity: warning annotations: summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }}) description: "Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # haproxy_backend_current_queue is a gauge (current queue depth), not a counter. 
- alert: HaproxyPendingRequests expr: 'sum by (proxy) (haproxy_backend_current_queue) > 0' for: 2m labels: severity: warning annotations: summary: HAProxy pending requests (instance {{ $labels.instance }}) description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHttpSlowingDown expr: 'avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1' for: 1m labels: severity: warning annotations: summary: HAProxy HTTP slowing down (instance {{ $labels.instance }}) description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyRetryHigh expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10' for: 2m labels: severity: warning annotations: summary: HAProxy retry high (instance {{ $labels.instance }}) description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHasNoAliveBackends expr: 'haproxy_backend_active_servers + haproxy_backend_backup_servers == 0' for: 0m labels: severity: critical annotations: summary: HAproxy has no alive backends (instance {{ $labels.instance }}) description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyFrontendSecurityBlockedRequests expr: 'sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10' for: 2m labels: severity: warning annotations: summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }}) description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerHealthcheckFailure expr: 'increase(haproxy_server_check_failures_total[1m]) > 0' for: 1m labels: severity: warning annotations: summary: HAProxy server healthcheck failure 
(instance {{ $labels.instance }}) description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/haproxy/haproxy-exporter-v1.yml ================================================ groups: - name: HaproxyExporterV1 rules: - alert: HaproxyDown expr: 'haproxy_up == 0' for: 0m labels: severity: critical annotations: summary: HAProxy down (instance {{ $labels.instance }}) description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The ratio is scaled by 100 so that the threshold of 5 means 5%, as stated in the description. - alert: HaproxyHighHttp4xxErrorRateBackend(v1) expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The ratio is scaled by 100 so that the threshold of 5 means 5%, as stated in the description. - alert: HaproxyHighHttp5xxErrorRateBackend(v1) expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp4xxErrorRateServer(v1) expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) 
(rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 4xx error rate server (v1) (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp5xxErrorRateServer(v1) expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 5xx error rate server (v1) (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerResponseErrors(v1) expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy server response errors (v1) (instance {{ $labels.instance }}) description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendConnectionErrors(v1) expr: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100' for: 1m labels: severity: critical annotations: summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }}) description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). 
Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerConnectionErrors(v1) expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100' for: 0m labels: severity: critical annotations: summary: HAProxy server connection errors (v1) (instance {{ $labels.instance }}) description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendMaxActiveSession expr: '((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0' for: 2m labels: severity: warning annotations: summary: HAProxy backend max active session (instance {{ $labels.instance }}) description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyPendingRequests(v1) expr: 'sum by (backend) (haproxy_backend_current_queue) > 0' for: 2m labels: severity: warning annotations: summary: HAProxy pending requests (v1) (instance {{ $labels.instance }}) description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHttpSlowingDown(v1) expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1' for: 1m labels: severity: warning annotations: summary: HAProxy HTTP slowing down (v1) (instance {{ $labels.instance }}) description: "Average request time is increasing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyRetryHigh(v1) expr: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10' for: 2m labels: severity: warning annotations: summary: HAProxy retry high (v1) (instance {{ $labels.instance }}) description: "High rate of retry on {{ $labels.fqdn }}/{{ 
$labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendDown expr: 'haproxy_backend_up == 0' for: 0m labels: severity: critical annotations: summary: HAProxy backend down (instance {{ $labels.instance }}) description: "HAProxy backend is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerDown expr: 'haproxy_server_up == 0' for: 0m labels: severity: critical annotations: summary: HAProxy server down (instance {{ $labels.instance }}) description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyFrontendSecurityBlockedRequests(v1) expr: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10' for: 2m labels: severity: warning annotations: summary: HAProxy frontend security blocked requests (v1) (instance {{ $labels.instance }}) description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerHealthcheckFailure(v1) expr: 'increase(haproxy_server_check_failures_total[1m]) > 0' for: 1m labels: severity: warning annotations: summary: HAProxy server healthcheck failure (v1) (instance {{ $labels.instance }}) description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/hashicorp-vault/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: VaultSealed expr: 'vault_core_unsealed == 0' for: 0m labels: severity: critical annotations: summary: Vault sealed (instance {{ $labels.instance }}) description: "Vault instance is sealed on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: VaultTooManyPendingTokens expr: 'avg(vault_token_create_count - vault_token_store_count) > 0' for: 5m labels: severity: warning annotations: summary: Vault too 
many pending tokens (instance {{ $labels.instance }}) description: "Too many pending tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: VaultTooManyInfinityTokens expr: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3' for: 5m labels: severity: warning annotations: summary: Vault too many infinity tokens (instance {{ $labels.instance }}) description: "Too many infinity tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: VaultClusterHealth expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5' for: 0m labels: severity: critical annotations: summary: Vault cluster health (instance {{ $labels.instance }}) description: "Vault cluster is not healthy {{ $labels.instance }}: {{ $value | humanizePercentage }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/host-and-hardware/node-exporter.yml ================================================ groups: - name: NodeExporter rules: - alert: HostOutOfMemory expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)' for: 2m labels: severity: warning annotations: summary: Host out of memory (instance {{ $labels.instance }}) description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostMemoryUnderMemoryPressure expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)' for: 0m labels: severity: warning annotations: summary: Host memory under memory pressure (instance {{ $labels.instance }}) description: "The node is under heavy memory pressure.
High rate of major page faults ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - alert: HostMemoryIsUnderutilized expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8' for: 0m labels: severity: info annotations: summary: Host Memory is underutilized (instance {{ $labels.instance }}) description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputIn expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0' for: 0m labels: severity: warning annotations: summary: Host unusual network throughput in (instance {{ $labels.instance }}) description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputOut expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0' for: 0m labels: severity: warning annotations: summary: Host unusual network throughput out (instance {{ $labels.instance }}) description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostDiskIoUtilizationHigh expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)' for: 0m labels: severity: warning annotations: summary: Host disk IO utilization high (instance {{ $labels.instance }}) description: "Disk utilization is high (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Please add ignored mountpoints in node_exporter parameters like # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
- alert: HostOutOfDiskSpace expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' for: 2m labels: severity: critical annotations: summary: Host out of disk space (instance {{ $labels.instance }}) description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Please add ignored mountpoints in node_exporter parameters like # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. - alert: HostDiskMayFillIn24Hours expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0' for: 2m labels: severity: warning annotations: summary: Host disk may fill in 24 hours (instance {{ $labels.instance }}) description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOutOfInodes expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0' for: 2m labels: severity: critical annotations: summary: Host out of inodes (instance {{ $labels.instance }}) description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostFilesystemDeviceError expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1' for: 2m labels: severity: critical annotations: summary: Host filesystem device error (instance {{ $labels.instance }}) description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostInodesMayFillIn24Hours expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and 
node_filesystem_files_free > 0' for: 2m labels: severity: warning annotations: summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }}) description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskReadLatency expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)' for: 2m labels: severity: warning annotations: summary: Host unusual disk read latency (instance {{ $labels.instance }}) description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskWriteLatency expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)' for: 2m labels: severity: warning annotations: summary: Host unusual disk write latency (instance {{ $labels.instance }}) description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostHighCpuLoad expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' for: 10m labels: severity: warning annotations: summary: Host high CPU load (instance {{ $labels.instance }}) description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - alert: HostCpuIsUnderutilized expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' for: 1w labels: severity: info annotations: summary: Host CPU is underutilized (instance {{ $labels.instance }}) description: "CPU load has been < 20% for 1 week. 
Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuStealNoisyNeighbor expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' for: 0m labels: severity: warning annotations: summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuHighIowait expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' for: 0m labels: severity: warning annotations: summary: Host CPU high iowait (instance {{ $labels.instance }}) description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskIo expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8' for: 5m labels: severity: warning annotations: summary: Host unusual disk IO (instance {{ $labels.instance }}) description: "Disk I/O utilization > 80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # x2 context switches is an arbitrary number. # The alert threshold depends on the nature of the application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 - alert: HostContextSwitchingHigh expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' for: 0m labels: severity: warning annotations: summary: Host context switching high (instance {{ $labels.instance }}) description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostSwapIsFillingUp expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0' for: 2m labels: severity: warning annotations: summary: Host swap is filling up (instance {{ $labels.instance }}) description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostSystemdServiceCrashed expr: '(node_systemd_unit_state{state="failed"} == 1)' for: 0m labels: severity: warning annotations: summary: Host systemd service crashed (instance {{ $labels.instance }}) description: "systemd service {{ $labels.name }} crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostPhysicalComponentTooHot expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius' for: 5m labels: severity: warning annotations: summary: Host physical component too hot (instance {{ $labels.instance }}) description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNodeOvertemperatureAlarm expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))' for: 0m labels: severity: critical annotations: summary: Host node overtemperature alarm (instance {{ $labels.instance }}) description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Uses ignoring(state) to handle additional labels on 
node_md_disks. Matches the official node-exporter mixin. - alert: HostSoftwareRaidInsufficientDrives expr: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)' for: 0m labels: severity: critical annotations: summary: Host software RAID insufficient drives (instance {{ $labels.instance }}) description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostSoftwareRaidDiskFailure expr: '(node_md_disks{state="failed"} > 0)' for: 2m labels: severity: warning annotations: summary: Host software RAID disk failure (instance {{ $labels.instance }}) description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostKernelVersionDeviations expr: 'changes(node_uname_info[1h]) > 0' for: 0m labels: severity: info annotations: summary: Host kernel version deviations (instance {{ $labels.instance }}) description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger. 
- alert: HostOomKillDetected expr: '(increase(node_vmstat_oom_kill[30m]) > 0)' for: 0m labels: severity: warning annotations: summary: Host OOM kill detected (instance {{ $labels.instance }}) description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostEdacCorrectableErrorsDetected expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)' for: 0m labels: severity: info annotations: summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostEdacUncorrectableErrorsDetected expr: '(node_edac_uncorrectable_errors_total > 0)' for: 0m labels: severity: warning annotations: summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in total.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkReceiveErrors expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0' for: 2m labels: severity: warning annotations: summary: Host Network Receive Errors (instance {{ $labels.instance }}) description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkTransmitErrors expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0' for: 2m labels: severity: warning annotations: summary: Host Network Transmit Errors (instance {{ $labels.instance }}) description: "Host {{ $labels.instance }}
interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkBondDegraded expr: '((node_bonding_active - node_bonding_slaves) != 0)' for: 2m labels: severity: warning annotations: summary: Host Network Bond Degraded (instance {{ $labels.instance }}) description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostConntrackLimit expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0' for: 5m labels: severity: warning annotations: summary: Host conntrack limit (instance {{ $labels.instance }}) description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostClockSkew expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))' for: 10m labels: severity: warning annotations: summary: Host clock skew (instance {{ $labels.instance }}) description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostClockNotSynchronising expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)' for: 2m labels: severity: warning annotations: summary: Host clock not synchronising (instance {{ $labels.instance }}) description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/ipmi/ipmi-exporter.yml ================================================ groups: - name: IpmiExporter rules: # The ipmi_up metric is per-collector. A value of 0 means the collector could not retrieve data from the BMC.
- alert: IpmiCollectorDown expr: 'ipmi_up == 0' for: 5m labels: severity: warning annotations: summary: IPMI collector down (instance {{ $labels.instance }}) description: "IPMI collector {{ $labels.collector }} on {{ $labels.instance }} failed to scrape sensor data. Check FreeIPMI tools and BMC connectivity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # State values: 0=nominal, 1=warning, 2=critical. Thresholds are defined in the BMC firmware. - alert: IpmiTemperatureSensorWarning expr: 'ipmi_temperature_state == 1' for: 5m labels: severity: warning annotations: summary: IPMI temperature sensor warning (instance {{ $labels.instance }}) description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiTemperatureSensorCritical expr: 'ipmi_temperature_state == 2' for: 0m labels: severity: critical annotations: summary: IPMI temperature sensor critical (instance {{ $labels.instance }}) description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Immediate attention required to prevent hardware damage.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiFanSpeedSensorWarning expr: 'ipmi_fan_speed_state == 1' for: 5m labels: severity: warning annotations: summary: IPMI fan speed sensor warning (instance {{ $labels.instance }}) description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiFanSpeedSensorCritical expr: 'ipmi_fan_speed_state == 2' for: 0m labels: severity: critical annotations: summary: IPMI fan speed sensor critical (instance {{ $labels.instance }}) description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. 
A fan may have failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiFanSpeedZero expr: 'ipmi_fan_speed_rpm == 0' for: 5m labels: severity: critical annotations: summary: IPMI fan speed zero (instance {{ $labels.instance }}) description: "IPMI fan {{ $labels.name }} on {{ $labels.instance }} reports 0 RPM. The fan may have failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiVoltageSensorWarning expr: 'ipmi_voltage_state == 1' for: 5m labels: severity: warning annotations: summary: IPMI voltage sensor warning (instance {{ $labels.instance }}) description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiVoltageSensorCritical expr: 'ipmi_voltage_state == 2' for: 0m labels: severity: critical annotations: summary: IPMI voltage sensor critical (instance {{ $labels.instance }}) description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. 
Power supply or motherboard issue possible.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiCurrentSensorWarning expr: 'ipmi_current_state == 1' for: 5m labels: severity: warning annotations: summary: IPMI current sensor warning (instance {{ $labels.instance }}) description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiCurrentSensorCritical expr: 'ipmi_current_state == 2' for: 0m labels: severity: critical annotations: summary: IPMI current sensor critical (instance {{ $labels.instance }}) description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiPowerSensorWarning expr: 'ipmi_power_state == 1' for: 5m labels: severity: warning annotations: summary: IPMI power sensor warning (instance {{ $labels.instance }}) description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiPowerSensorCritical expr: 'ipmi_power_state == 2' for: 0m labels: severity: critical annotations: summary: IPMI power sensor critical (instance {{ $labels.instance }}) description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts. 
- alert: IpmiGenericSensorCritical expr: 'ipmi_sensor_state == 2' for: 5m labels: severity: critical annotations: summary: IPMI generic sensor critical (instance {{ $labels.instance }}) description: "IPMI sensor {{ $labels.name }} (type={{ $labels.type }}) on {{ $labels.instance }} is in critical state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IpmiChassisPowerOff expr: 'ipmi_chassis_power_state == 0' for: 0m labels: severity: critical annotations: summary: IPMI chassis power off (instance {{ $labels.instance }}) description: "IPMI reports chassis power is off on {{ $labels.instance }}. The server may have shut down unexpectedly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The metric uses inverted logic: 1=no fault, 0=fault detected. - alert: IpmiChassisDriveFault expr: 'ipmi_chassis_drive_fault_state == 0' for: 0m labels: severity: critical annotations: summary: IPMI chassis drive fault (instance {{ $labels.instance }}) description: "IPMI reports a drive fault on {{ $labels.instance }}. Check disk health.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The metric uses inverted logic: 1=no fault, 0=fault detected. - alert: IpmiChassisCoolingFault expr: 'ipmi_chassis_cooling_fault_state == 0' for: 0m labels: severity: critical annotations: summary: IPMI chassis cooling fault (instance {{ $labels.instance }}) description: "IPMI reports a cooling/fan fault on {{ $labels.instance }}. Check fans and airflow.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # SEL storage is typically very limited (e.g., 16KB). When full, new events may be dropped. - alert: IpmiSelAlmostFull expr: 'ipmi_sel_free_space_bytes < 512' for: 5m labels: severity: warning annotations: summary: IPMI SEL almost full (instance {{ $labels.instance }}) description: "IPMI System Event Log on {{ $labels.instance }} has only {{ printf \"%.0f\" $value }} bytes free. 
Clear the SEL to prevent loss of new events.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/istio/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: IstioKubernetesGatewayAvailabilityDrop expr: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2' for: 1m labels: severity: warning annotations: summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }}) description: "Gateway pods have dropped. Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioPilotHighTotalRequestRate expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0' for: 1m labels: severity: warning annotations: summary: Istio Pilot high total request rate (instance {{ $labels.instance }}) description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioMixerPrometheusDispatchesLow expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180' for: 1m labels: severity: warning annotations: summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }}) description: "Number of Mixer dispatches to Prometheus is too low. 
Istio metrics might not be being exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHighTotalRequestRate expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000' for: 2m labels: severity: warning annotations: summary: Istio high total request rate (instance {{ $labels.instance }}) description: "Global request rate in the service mesh is unusually high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioLowTotalRequestRate expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100' for: 2m labels: severity: warning annotations: summary: Istio low total request rate (instance {{ $labels.instance }}) description: "Global request rate in the service mesh is unusually low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHigh4xxErrorRate expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' for: 1m labels: severity: warning annotations: summary: Istio high 4xx error rate (instance {{ $labels.instance }}) description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHigh5xxErrorRate expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' for: 1m labels: severity: warning annotations: summary: Istio high 5xx error rate (instance {{ $labels.instance }}) description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHighRequestLatency expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and 
rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0' for: 1m labels: severity: warning annotations: summary: Istio high request latency (instance {{ $labels.instance }}) description: "Istio average requests execution is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioLatency99Percentile expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000' for: 1m labels: severity: warning annotations: summary: Istio latency 99 percentile (instance {{ $labels.instance }}) description: "Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioPilotDuplicateEntry expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0' for: 0m labels: severity: critical annotations: summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }}) description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/jaeger/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: JaegerAgentHttpServerErrors expr: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning annotations: summary: Jaeger agent HTTP server errors (instance {{ $labels.instance }}) description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerClientRpcRequestErrors expr: '100 * 
sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning annotations: summary: Jaeger client RPC request errors (instance {{ $labels.instance }}) description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerClientSpansDropped expr: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning annotations: summary: Jaeger client spans dropped (instance {{ $labels.instance }}) description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerAgentSpansDropped expr: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning annotations: summary: Jaeger agent spans dropped (instance {{ $labels.instance }}) description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerCollectorDroppingSpans expr: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1 and 
sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning annotations: summary: Jaeger collector dropping spans (instance {{ $labels.instance }}) description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerSamplingUpdateFailing expr: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning annotations: summary: Jaeger sampling update failing (instance {{ $labels.instance }}) description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerThrottlingUpdateFailing expr: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning annotations: summary: Jaeger throttling update failing (instance {{ $labels.instance }}) description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerQueryRequestFailures expr: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning annotations: summary: Jaeger query request failures (instance {{ $labels.instance }}) description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | 
humanize }}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/jenkins/metric-plugin.yml ================================================ groups: - name: MetricPlugin rules: - alert: JenkinsNodeOffline expr: 'jenkins_node_offline_value > 0' for: 5m labels: severity: critical annotations: summary: Jenkins node offline (instance {{ $labels.instance }}) description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JenkinsNoNodeOnline expr: 'jenkins_node_online_value == 0' for: 0m labels: severity: critical annotations: summary: Jenkins no node online (instance {{ $labels.instance }}) description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JenkinsHealthcheck expr: 'jenkins_health_check_score < 1' for: 0m labels: severity: critical annotations: summary: Jenkins healthcheck (instance {{ $labels.instance }}) description: "Jenkins healthcheck score: {{$value}}. 
Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JenkinsOutdatedPlugins expr: 'sum(jenkins_plugins_withUpdate) by (instance) > 3' for: 1d labels: severity: warning annotations: summary: Jenkins outdated plugins (instance {{ $labels.instance }}) description: "{{ $value }} plugins need update\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JenkinsBuildsHealthScore expr: 'default_jenkins_builds_health_score < 1' for: 0m labels: severity: critical annotations: summary: Jenkins builds health score (instance {{ $labels.instance }}) description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # increase() is used instead of delta() because it compensates for counter resets (e.g. a Jenkins restart). Assumes jenkins_runs_failure_total is a counter, as its _total suffix suggests. - alert: JenkinsRunFailureTotal expr: 'increase(jenkins_runs_failure_total[1h]) > 100' for: 0m labels: severity: warning annotations: summary: Jenkins run failure total (instance {{ $labels.instance }}) description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JenkinsBuildTestsFailing expr: 'default_jenkins_builds_last_build_tests_failing > 0' for: 0m labels: severity: warning annotations: summary: Jenkins build tests failing (instance {{ $labels.instance }}) description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # * RUNNING -1 true - The build had no errors. # * SUCCESS 0 true - The build had no errors. # * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed. # * FAILURE 2 false - The build had a fatal error. 
# * NOT_BUILT 3 false - The module was not built. # * ABORTED 4 false - The build was manually aborted. - alert: JenkinsLastBuildFailed expr: 'default_jenkins_builds_last_build_result_ordinal == 2' for: 0m labels: severity: warning annotations: summary: Jenkins last build failed (instance {{ $labels.instance }}) description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/juniper/czerwonk-junos-exporter.yml ================================================ groups: - name: CzerwonkJunosExporter rules: - alert: JuniperSwitchDown expr: 'junos_up == 0' for: 0m labels: severity: critical annotations: summary: Juniper switch down (instance {{ $labels.instance }}) description: "The switch appears to be down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # rate(bytes) * 8 is bits per second, so the 1e+9 * 0.90 threshold is 0.90 Gbit/s (90% of a 1 Gbit/s link). - alert: JuniperCriticalBandwidthUsage1gib expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90' for: 1m labels: severity: critical annotations: summary: Juniper critical Bandwidth Usage 1GiB (instance {{ $labels.instance }}) description: "Interface is highly saturated. (> 0.90 Gbit/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JuniperWarningBandwidthUsage1gib expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80' for: 1m labels: severity: warning annotations: summary: Juniper warning Bandwidth Usage 1GiB (instance {{ $labels.instance }}) description: "Interface is getting saturated. 
(> 0.80 Gbit/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/jvm/jvm-exporter.yml ================================================ groups: - name: JvmExporter rules: - alert: JvmMemoryFillingUp expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80 and sum by (instance)(jvm_memory_max_bytes{area="heap"}) > 0' for: 2m labels: severity: warning annotations: summary: JVM memory filling up (instance {{ $labels.instance }}) description: "JVM memory is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Many JVM configurations leave metaspace unbounded, in which case jvm_memory_max_bytes{area="nonheap"} is -1 and this alert will not fire. # The query filters out max_bytes <= 0 so that such instances are skipped instead of producing a meaningless negative percentage. - alert: JvmNon-heapMemoryFillingUp expr: '(sum by (instance)(jvm_memory_used_bytes{area="nonheap"}) / (sum by (instance)(jvm_memory_max_bytes{area="nonheap"}) > 0)) * 100 > 80' for: 2m labels: severity: warning annotations: summary: JVM non-heap memory filling up (instance {{ $labels.instance }}) description: "JVM non-heap memory (metaspace/code cache) is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JvmGcTimeTooHigh expr: 'sum by (instance)(rate(jvm_gc_collection_seconds_sum[5m])) > 0.05' for: 5m labels: severity: warning annotations: summary: JVM GC time too high (instance {{ $labels.instance }}) description: "JVM is spending too much time in garbage collection (> 5% of wall clock time)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JvmThreadsDeadlocked expr: 'jvm_threads_deadlocked > 0' for: 1m labels: severity: critical annotations: summary: JVM threads deadlocked (instance {{ $labels.instance }}) description: "JVM has deadlocked threads\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JvmThreadCountHigh expr: 'jvm_threads_current > 300' for: 5m labels: 
severity: warning annotations: summary: JVM thread count high (instance {{ $labels.instance }}) description: "JVM thread count is high (> 300), potential thread leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JvmThreadsBlocked expr: 'jvm_threads_state{state="BLOCKED"} > 50' for: 5m labels: severity: warning annotations: summary: JVM threads BLOCKED (instance {{ $labels.instance }}) description: "JVM has high number of BLOCKED threads, indicating lock contention\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This regex matches CMS, G1, and Parallel collector names. It will not match ZGC or Shenandoah cycle names. # Adjust the gc label filter if you use a different collector. - alert: JvmOldGenGcFrequency expr: 'rate(jvm_gc_collection_seconds_count{gc=~".*old.*|.*major.*"}[5m]) > 0.3' for: 5m labels: severity: warning annotations: summary: JVM old gen GC frequency (instance {{ $labels.instance }}) description: "Frequent old/major GC cycles, indicating memory pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JvmDirectBufferPoolFillingUp expr: '(jvm_buffer_pool_used_bytes / jvm_buffer_pool_capacity_bytes) * 100 > 90 and jvm_buffer_pool_capacity_bytes > 0' for: 5m labels: severity: warning annotations: summary: JVM direct buffer pool filling up (instance {{ $labels.instance }}) description: "JVM direct buffer pool is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JvmObjectsPendingFinalization expr: 'jvm_memory_objects_pending_finalization > 1000' for: 5m labels: severity: warning annotations: summary: JVM objects pending finalization (instance {{ $labels.instance }}) description: "JVM has objects pending finalization, potential memory leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not JVM-specific. # This alert will also fire for Go, Python, or any process exposing these metrics. 
- alert: JvmFileDescriptorsExhaustion expr: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0' for: 5m labels: severity: warning annotations: summary: JVM file descriptors exhaustion (instance {{ $labels.instance }}) description: "JVM process is running out of file descriptors (> 90% used)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JvmClassLoadingAnomaly expr: 'rate(jvm_classes_loaded_total[5m]) > 100' for: 5m labels: severity: warning annotations: summary: JVM class loading anomaly (instance {{ $labels.instance }}) description: "Rapid class loading detected, potential classloader leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JvmCompilationTimeSpike expr: 'rate(jvm_compilation_time_seconds_total[5m]) > 0.1' for: 5m labels: severity: warning annotations: summary: JVM compilation time spike (instance {{ $labels.instance }}) description: "Excessive JIT compilation time consuming CPU\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/kafka/danielqsj-kafka-exporter.yml ================================================ groups: - name: DanielqsjKafkaExporter rules: - alert: KafkaTopicsReplicas expr: 'min(kafka_topic_partition_in_sync_replica) by (topic) < 3' for: 0m labels: severity: critical annotations: summary: Kafka topics replicas (instance {{ $labels.instance }}) description: "Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KafkaConsumerGroupLag expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000' for: 1m labels: severity: warning annotations: summary: Kafka consumer group lag (instance {{ $labels.instance }}) description: "Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/kafka/linkedin-kafka-exporter.yml 
================================================ groups: - name: LinkedinKafkaExporter rules: - alert: KafkaTopicOffsetDecreased expr: 'delta(kafka_burrow_partition_current_offset[1m]) < 0' for: 0m labels: severity: warning annotations: summary: Kafka topic offset decreased (instance {{ $labels.instance }}) description: "Kafka topic offset has decreased\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KafkaConsumerLag expr: 'kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0' for: 15m labels: severity: warning annotations: summary: Kafka consumer lag (instance {{ $labels.instance }}) description: "Kafka consumer has a 30 minutes and increasing lag\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml ================================================ groups: - name: AerogearKeycloakMetricsSpi rules: # Threshold of 5% is a rough default. Adjust based on your user base and expected error rates. # A spike in failed logins may indicate a brute-force attack or misconfigured client. 
- alert: KeycloakHighLoginFailureRate expr: '(sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])) / (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])))) * 100 > 5 and (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m]))) > 0' for: 5m labels: severity: warning annotations: summary: Keycloak high login failure rate (instance {{ $labels.instance }}) description: "More than 5% of login attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Only fires when login attempts exist but none succeed — may indicate an authentication outage. - alert: KeycloakNoSuccessfulLogins expr: 'sum by (realm) (rate(keycloak_logins_total[15m])) == 0 and (sum by (realm) (rate(keycloak_logins_total[15m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[15m]))) > 0' for: 5m labels: severity: critical annotations: summary: Keycloak no successful logins (instance {{ $labels.instance }}) description: "No successful logins in realm {{ $labels.realm }} for the last 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 10% is a rough default. High refresh token errors may indicate expired sessions or token store issues. - alert: KeycloakHighTokenRefreshErrorRate expr: '(sum by (realm) (rate(keycloak_refresh_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_refresh_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_refresh_tokens_total[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Keycloak high token refresh error rate (instance {{ $labels.instance }}) description: "More than 10% of token refresh attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 10% is a rough default. 
Code-to-token failures may indicate misconfigured OAuth clients or replay attacks. - alert: KeycloakHighCode-to-tokenExchangeErrorRate expr: '(sum by (realm) (rate(keycloak_code_to_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_code_to_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_code_to_tokens_total[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Keycloak high code-to-token exchange error rate (instance {{ $labels.instance }}) description: "More than 10% of code-to-token exchanges are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 10% is a rough default. - alert: KeycloakHighRegistrationFailureRate expr: '(sum by (realm) (rate(keycloak_registrations_errors_total[5m])) / sum by (realm) (rate(keycloak_registrations_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_registrations_total[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Keycloak high registration failure rate (instance {{ $labels.instance }}) description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default. 
- alert: KeycloakSlowRequestResponseTime expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Keycloak slow request response time (instance {{ $labels.instance }}) description: "Keycloak {{ $labels.method }} requests are taking more than 2 seconds on average.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/kubernetes/kubestate-exporter.yml ================================================ groups: - name: KubestateExporter rules: - alert: KubernetesNodeNotReady expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0' for: 10m labels: severity: critical annotations: summary: Kubernetes Node not ready (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Kubernetes Node with disabled schedules are fine. # This alarm can be useful to get warned if there are nodes which are longer unscheduled. 
- alert: KubernetesNodeSchedulingDisabled expr: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1' for: 30m labels: severity: warning annotations: summary: Kubernetes Node scheduling disabled (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesNodeMemoryPressure expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1' for: 2m labels: severity: critical annotations: summary: Kubernetes Node memory pressure (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesNodeDiskPressure expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1' for: 2m labels: severity: critical annotations: summary: Kubernetes Node disk pressure (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesNodeNetworkUnavailable expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1' for: 2m labels: severity: critical annotations: summary: Kubernetes Node network unavailable (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesNodeOutOfPodCapacity expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' for: 2m labels: severity: warning annotations: summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: 
KubernetesContainerOomKiller expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1' for: 0m labels: severity: warning annotations: summary: Kubernetes Container oom killer (instance {{ $labels.instance }}) description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesJobFailed expr: 'kube_job_status_failed > 0' for: 0m labels: severity: warning annotations: summary: Kubernetes Job failed (instance {{ $labels.instance }}) description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesJobNotStarting expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600' for: 0m labels: severity: warning annotations: summary: Kubernetes Job not starting (instance {{ $labels.instance }}) description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesCronjobFailing expr: '(kube_cronjob_status_last_schedule_time > kube_cronjob_status_last_successful_time) AND (kube_cronjob_status_active == 0) AND (kube_cronjob_spec_suspend == 0)' for: 0m labels: severity: critical annotations: summary: Kubernetes CronJob failing (instance {{ $labels.instance }}) description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesCronjobSuspended expr: 'kube_cronjob_spec_suspend != 0' for: 0m labels: severity: warning annotations: summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) description: 
"CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPersistentvolumeclaimPending expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1' for: 2m labels: severity: warning annotations: summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesVolumeOutOfDiskSpace expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 and kubelet_volume_stats_capacity_bytes > 0' for: 2m labels: severity: warning annotations: summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }}) description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesVolumeFullInFourDays expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0' for: 0m labels: severity: critical annotations: summary: Kubernetes Volume full in four days (instance {{ $labels.instance }}) description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. 
Projected available space in four days: {{ $value | humanize1024 }}B.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # KubernetesVolumeFullInFourDays: $value is the predict_linear() forecast of available bytes (negative when the volume is expected to be full), not a percentage. - alert: KubernetesPersistentvolumeError expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0' for: 0m labels: severity: critical annotations: summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetDown expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0' for: 1m labels: severity: critical annotations: summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaScaleInability expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0' for: 2m labels: severity: warning annotations: summary: Kubernetes HPA scale inability (instance {{ $labels.instance }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaMetricsUnavailability expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1' for: 0m labels: severity: warning annotations: summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaScaleMaximum expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) 
and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)' for: 2m labels: severity: info annotations: summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaUnderutilized expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3' for: 0m labels: severity: info annotations: summary: Kubernetes HPA underutilized (instance {{ $labels.instance }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPodNotHealthy expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0' for: 15m labels: severity: critical annotations: summary: Kubernetes Pod not healthy (instance {{ $labels.instance }}) description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPodCrashLooping expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3' for: 2m labels: severity: warning annotations: summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesReplicasetReplicasMismatch expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas' for: 10m labels: severity: warning annotations: summary: Kubernetes ReplicaSet replicas mismatch (instance {{ $labels.instance 
}}) description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDeploymentReplicasMismatch expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available' for: 10m labels: severity: warning annotations: summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetReplicasMismatch expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas' for: 10m labels: severity: warning annotations: summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }}) description: "StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDeploymentGenerationMismatch expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation' for: 10m labels: severity: critical annotations: summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetGenerationMismatch expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation' for: 10m labels: severity: critical annotations: summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }}) description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetUpdateNotRolledOut expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * 
(kube_statefulset_replicas != kube_statefulset_status_replicas_updated)' for: 10m labels: severity: warning annotations: summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }}) description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDaemonsetRolloutStuck expr: '(kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 and kube_daemonset_status_desired_number_scheduled > 0) or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0' for: 10m labels: severity: warning annotations: summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDaemonsetMisscheduled expr: 'kube_daemonset_status_number_misscheduled > 0' for: 1m labels: severity: critical annotations: summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold should be customized for each cronjob name. 
# `unless` drops every job that already reports a completion time; absent() cannot do this per job because it only yields a (label-less) sample when no kube_job_status_completion_time series exists at all. - alert: KubernetesCronjobTooLong expr: '(kube_job_status_start_time > 0 unless kube_job_status_completion_time) and (time() - kube_job_status_start_time) > 3600' for: 0m labels: severity: warning annotations: summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesJobSlowCompletion expr: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0' for: 12h labels: severity: critical annotations: summary: Kubernetes Job slow completion (instance {{ $labels.instance }}) description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiServerErrors expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3 and sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) > 0' for: 2m labels: severity: critical annotations: summary: Kubernetes API server errors (instance {{ $labels.instance }}) description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiClientErrors expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0' for: 2m labels: severity: critical annotations: summary: Kubernetes API client errors (instance {{ $labels.instance }}) description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesClientCertificateExpiresNextWeek expr: 
'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60' for: 0m labels: severity: warning annotations: summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }}) description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesClientCertificateExpiresSoon expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60' for: 0m labels: severity: critical annotations: summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }}) description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiServerLatency expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1' for: 2m labels: severity: warning annotations: summary: Kubernetes API server latency (instance {{ $labels.instance }}) description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/linkerd/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: # Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}. 
# Failure share of responses per workload above 10%, guarded against zero traffic.
- alert: LinkerdHighErrorRate
  expr: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0'
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: Linkerd high error rate (instance {{ $labels.instance }})
    description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/loki/embedded-exporter.yml
================================================
groups:
- name: EmbeddedExporter
  rules:
  # More than 2 changes of the process start time within 15 minutes.
  - alert: LokiProcessTooManyRestarts
    expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Loki process too many restarts (instance {{ $labels.instance }})
      description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # 5xx share per (namespace, job, route) above 10%, guarded against zero traffic.
  - alert: LokiRequestErrors
    expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 and sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 0'
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: Loki request errors (instance {{ $labels.instance }})
      description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: LokiRequestPanic
    expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Loki request panic (instance {{ $labels.instance }})
      description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Aggregated by (namespace, job, route, le) so that the job/route labels used in the description
  # survive the aggregation; grouping by `le` alone rendered them empty.
  - alert: LokiRequestLatency
    expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le))) > 1'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Loki request latency (instance {{ $labels.instance }})
      description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/meilisearch/embedded-exporter.yml
================================================
groups:
- name: EmbeddedExporter
  rules:
  - alert: MeilisearchIndexIsEmpty
    expr: 'meilisearch_index_docs_count == 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Meilisearch index is empty (instance {{ $labels.instance }})
      description: "Meilisearch index {{ $labels.index }} has zero documents\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # NOTE(review): meilisearch_http_response_time_seconds is believed to be exported as a histogram
  # (only _bucket/_sum/_count series exist), so the bare metric name would never return samples.
  # The p95 over 5m is used instead - confirm against your Meilisearch /metrics output.
  - alert: MeilisearchHttpResponseTime
    expr: 'histogram_quantile(0.95, sum by (instance, le) (rate(meilisearch_http_response_time_seconds_bucket[5m]))) > 0.5'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Meilisearch http response time (instance {{ $labels.instance }})
      description: "Meilisearch http response time is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/memcached/memcached-exporter.yml
================================================
groups:
- name: MemcachedExporter
  rules:
  # 1m delay allows a restart without triggering an alert.
- alert: MemcachedDown
  expr: 'memcached_up == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Memcached down (instance {{ $labels.instance }})
    description: "Memcached instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Connection usage as a percentage of max connections; the `> 0` guard skips a zero limit.
- alert: MemcachedConnectionLimitApproaching(>80%)
  expr: '(memcached_current_connections / memcached_max_connections * 100) > 80 and memcached_max_connections > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Memcached connection limit approaching (> 80%) (instance {{ $labels.instance }})
    description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Same ratio as the 80% rule, escalated to critical at 95%.
- alert: MemcachedConnectionLimitApproaching(>95%)
  expr: '(memcached_current_connections / memcached_max_connections * 100) > 95 and memcached_max_connections > 0'
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Memcached connection limit approaching (> 95%) (instance {{ $labels.instance }})
    description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Out-of-memory rate summed across slabs.
- alert: MemcachedOutOfMemoryErrors
  expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Memcached out of memory errors (instance {{ $labels.instance }})
    description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
# Stored bytes as a percentage of the configured limit; the `> 0` guard skips a zero limit.
- alert: MemcachedMemoryUsageHigh(>90%)
  expr: '(memcached_current_bytes / memcached_limit_bytes * 100) > 90 and memcached_limit_bytes > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Memcached memory usage high (> 90%) (instance {{ $labels.instance }})
    description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage.
# Threshold of 10 evictions/s is a rough default — adjust based on your workload.
- alert: MemcachedHighEvictionRate
  expr: 'rate(memcached_items_evicted_total[5m]) > 10'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Memcached high eviction rate (instance {{ $labels.instance }})
    description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short.
# Threshold of 80% is a rough default — adjust based on your workload and access patterns.
# hit / (hit + miss) for `get` commands. Both sides are aggregated `without (status)`: adding the
# status="hit" series to the status="miss" series directly never matches because their label
# sets differ, which made the original expression return nothing.
- alert: MemcachedLowCacheHitRate(<80%)
  expr: '(sum without (status) (rate(memcached_commands_total{command="get", status="hit"}[5m])) / sum without (status) (rate(memcached_commands_total{command="get", status=~"hit|miss"}[5m])) * 100) < 80 and sum without (status) (rate(memcached_commands_total{command="get", status=~"hit|miss"}[5m])) > 0'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: Memcached low cache hit rate (< 80%) (instance {{ $labels.instance }})
    description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MemcachedConnectionsRejected
  expr: 'increase(memcached_connections_rejected_total[5m]) > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Memcached connections rejected (instance {{ $labels.instance }})
    description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MemcachedItemsTooLarge
  expr: 'increase(memcached_item_too_large_total[5m]) > 0'
  for: 5m
  labels:
    severity: info
  annotations:
    summary: Memcached items too large (instance {{ $labels.instance }})
    description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/minio/embedded-exporter.yml
================================================
groups:
- name: EmbeddedExporter
  rules:
  - alert: MinioClusterDiskOffline
    expr: 'minio_cluster_drive_offline_total > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Minio cluster disk offline (instance {{ $labels.instance }})
      description: "Minio cluster disk is offline\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # The metric counts offline nodes (not drives); the description says so explicitly.
  - alert: MinioNodeDiskOffline
    expr: 'minio_cluster_nodes_offline_total > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Minio node disk offline (instance {{ $labels.instance }})
      description: "Minio cluster node is offline\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Free raw capacity below 10% of total, guarded against a zero total.
  - alert: MinioDiskSpaceUsage
    expr: 'minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 and minio_cluster_capacity_raw_total_bytes > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Minio disk space usage (instance {{ $labels.instance }})
      description: "Minio available free space is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/mongodb/dcu-mongodb-exporter.yml
================================================
groups:
- name: DcuMongodbExporter
  rules:
  - alert: MongodbReplicationLag(dcu)
    expr: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: MongoDB replication lag (DCU) (instance {{ $labels.instance }})
      description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MongodbReplicationStatus3
    expr: 'mongodb_replset_member_state == 3'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: MongoDB replication Status 3 (instance {{ $labels.instance }})
      description: "MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MongodbReplicationStatus6
    expr: 'mongodb_replset_member_state == 6'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: MongoDB replication Status 6 (instance {{ $labels.instance }})
      description: "MongoDB Replication set member as seen from another member of the set, is not yet known\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MongodbReplicationStatus8
    expr: 'mongodb_replset_member_state == 8'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: MongoDB replication Status 8 (instance {{ $labels.instance }})
      description: "MongoDB Replication set member as seen from another member of the set, is unreachable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MongodbReplicationStatus9
    expr: 'mongodb_replset_member_state == 9'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: MongoDB replication Status 9 (instance {{ $labels.instance }})
      description: "MongoDB Replication set member is actively performing a rollback. Data is not available for reads\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MongodbReplicationStatus10
    expr: 'mongodb_replset_member_state == 10'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: MongoDB replication Status 10 (instance {{ $labels.instance }})
      description: "MongoDB Replication set member was once in a replica set but was subsequently removed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MongodbNumberCursorsOpen(dcu)
    expr: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MongoDB number cursors open (DCU) (instance {{ $labels.instance }})
      description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MongodbCursorsTimeouts(dcu)
    expr: 'increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MongoDB cursors timeouts (DCU) (instance {{ $labels.instance }})
      description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # current / (current + available). Aggregated `without (state)` because series that differ in
  # the `state` label cannot be added or divided directly - the unaggregated form matched nothing.
  - alert: MongodbTooManyConnections(dcu)
    expr: 'sum without (state) (mongodb_connections{state="current"}) / sum without (state) (mongodb_connections{state=~"current|available"}) * 100 > 80 and sum without (state) (mongodb_connections{state=~"current|available"}) > 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MongoDB too many connections (DCU) (instance {{ $labels.instance }})
      description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/mongodb/percona-mongodb-exporter.yml
================================================
groups:
- name: PerconaMongodbExporter
  rules:
  # 1m delay allows a restart without triggering an alert.
  - alert: MongodbDown
    expr: 'mongodb_up == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: MongoDB Down (instance {{ $labels.instance }})
      description: "MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # 1m delay allows a restart without triggering an alert.
  - alert: MongodbReplicaMemberUnhealthy
    expr: 'mongodb_rs_members_health == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Mongodb replica member unhealthy (instance {{ $labels.instance }})
      description: "MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # PRIMARY optime minus each SECONDARY optime, joined on the replica `set`; divided by 1000
  # before comparing with the 10s threshold.
  - alert: MongodbReplicationLag(percona)
    expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: MongoDB replication lag (Percona) (instance {{ $labels.instance }})
      description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
- alert: MongodbReplicationHeadroom
  expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: MongoDB replication headroom (instance {{ $labels.instance }})
    description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbNumberCursorsOpen(percona)
  expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: MongoDB number cursors open (Percona) (instance {{ $labels.instance }})
    description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbCursorsTimeouts(percona)
  expr: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: MongoDB cursors timeouts (Percona) (instance {{ $labels.instance }})
    description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# current / (current + available). Aggregated `without (conn_type)` because series that differ
# in the `conn_type` label cannot be added or divided directly - the unaggregated form matched
# nothing, so the alert could never fire.
- alert: MongodbTooManyConnections(percona)
  expr: 'sum without (conn_type) (mongodb_ss_connections{conn_type="current"}) / sum without (conn_type) (mongodb_ss_connections{conn_type=~"current|available"}) * 100 > 80 and sum without (conn_type) (mongodb_ss_connections{conn_type=~"current|available"}) > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: MongoDB too many connections (Percona) (instance {{ $labels.instance }})
    description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/mongodb/stefanprodan-mgob-exporter.yml
================================================
groups:
- name: StefanprodanMgobExporter
  rules:
  # Any change of the status="500" backup counter within the last hour.
  - alert: MgobBackupFailed
    expr: 'changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Mgob backup failed (instance {{ $labels.instance }})
      description: "MongoDB backup has failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/mysql/mysqld-exporter.yml
================================================
groups:
- name: MysqldExporter
  rules:
  # 1m delay allows a restart without triggering an alert.
  - alert: MysqlDown
    expr: 'mysql_up == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: MySQL down (instance {{ $labels.instance }})
      description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Peak connected threads over 1m as a percentage of max_connections, guarded against a zero limit.
  - alert: MysqlTooManyConnections(>80%)
    expr: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
      description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MysqlHighPreparedStatementsUtilization(>80%)
    expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})
      description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MysqlHighThreadsRunning
    expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MySQL high threads running (instance {{ $labels.instance }})
      description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # 1m delay allows a restart without triggering an alert.
  - alert: MysqlSlaveIoThreadNotRunning
    expr: '( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
      description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # 1m delay allows a restart without triggering an alert.
  - alert: MysqlSlaveSqlThreadNotRunning
    expr: '( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: MySQL Slave SQL thread not running (instance {{ $labels.instance }})
      description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # seconds_behind_master minus sql_delay, evaluated only where master_server_id > 0.
  - alert: MysqlSlaveReplicationLag
    expr: '( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: MySQL Slave replication lag (instance {{ $labels.instance }})
      description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MysqlSlowQueries
    expr: 'increase(mysql_global_status_slow_queries[1m]) > 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MySQL slow queries (instance {{ $labels.instance }})
      description: "MySQL server mysql has some new slow query ({{ $value }} in the last minute).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MysqlInnodbLogWaits
    expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
      description: "MySQL innodb log writes stalling ({{ $value }} waits/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MysqlRestarted
    expr: 'mysql_global_status_uptime < 60'
    for: 0m
    labels:
      severity: info
    annotations:
      summary: MySQL restarted (instance {{ $labels.instance }})
      description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MysqlHighQps
    expr: 'irate(mysql_global_status_questions[1m]) > 10000'
    for: 2m
    labels:
      severity: info
    annotations:
      summary: MySQL High QPS (instance {{ $labels.instance }})
      description: "MySQL is being overload with unusual QPS (> 10k QPS).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MysqlTooManyOpenFiles
    expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MySQL too many open files (instance {{ $labels.instance }})
      description: "MySQL has too many open files, consider increase variables open_files_limit on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MysqlInnodbForceRecoveryIsEnabled
    expr: 'mysql_global_variables_innodb_force_recovery != 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
      description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: MysqlInnodbHistory_lenTooLong
    expr: 'mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
      description: "MySQL history_len (undo log) too long on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/nats/nats-exporter.yml
================================================
groups:
- name: NatsExporter
  rules:
  - alert: NatsHighRoutesCount
    expr: 'gnatsd_varz_routes > 10'
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: Nats high routes count (instance {{ $labels.instance }})
      description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Threshold is 200 * 1024 * 1024 bytes.
  - alert: NatsHighMemoryUsage
    expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Nats high memory usage (instance {{ $labels.instance }})
      description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NatsSlowConsumers
    expr: 'gnatsd_varz_slow_consumers > 0'
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: Nats slow consumers (instance {{ $labels.instance }})
      description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Fires when no `up` series exists at all for job="nats".
  - alert: NatsServerDown
    expr: 'absent(up{job="nats"})'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Nats server down (instance {{ $labels.instance }})
      description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
- alert: NatsHighCpuUsage
  expr: 'gnatsd_varz_cpu > 80'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high CPU usage (instance {{ $labels.instance }})
    description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighNumberOfConnections
  expr: 'gnatsd_connz_num_connections > 1000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high number of connections (instance {{ $labels.instance }})
    description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Used / configured max storage above 0.8, guarded against a zero limit.
- alert: NatsHighJetstreamStoreUsage
  expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high JetStream store usage (instance {{ $labels.instance }})
    description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Used / configured max memory above 0.8, guarded against a zero limit.
- alert: NatsHighJetstreamMemoryUsage
  expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
    description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighNumberOfSubscriptions
  expr: 'gnatsd_connz_subscriptions > 1000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high number of subscriptions (instance {{ $labels.instance }})
    description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighPendingBytes
  expr: 'gnatsd_connz_pending_bytes > 100000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high pending bytes (instance {{ $labels.instance }})
    description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsTooManyErrors
  expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats too many errors (instance {{ $labels.instance }})
    description: "NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsJetstreamAccountsExceeded
  expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }})
    description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsLeafNodeConnectionIssue
  expr: 'gnatsd_varz_leafnodes == 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats leaf node connection issue (instance {{ $labels.instance }})
    description: "No leaf node connections on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/netdata/embedded-exporter.yml
================================================
groups:
- name: EmbeddedExporter
  rules:
  # This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%.
  - alert: NetdataHighCpuUsage
    expr: 'netdata_cpu_cpu_percentage_average{dimension="idle"} < 20'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Netdata high cpu usage (instance {{ $labels.instance }})
      description: "Netdata high CPU usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NetdataCpuStealNoisyNeighbor
    expr: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Netdata CPU steal noisy neighbor (instance {{ $labels.instance }})
      description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Share of RAM in the free|cached dimensions relative to the sum of all dimensions.
  # Both sides are aggregated `without (dimension)`: the unaggregated `100 / X * X{dimension=...}`
  # matched every series with itself and therefore always evaluated to 100, so it never fired.
  - alert: NetdataHighMemoryUsage
    expr: '100 * sum without (dimension) (netdata_system_ram_MiB_average{dimension=~"free|cached"}) / sum without (dimension) (netdata_system_ram_MiB_average) < 20 and sum without (dimension) (netdata_system_ram_MiB_average) > 0'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Netdata high memory usage (instance {{ $labels.instance }})
      description: "Netdata high memory usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Same construction as the memory rule: available space relative to the sum of all dimensions.
  - alert: NetdataLowDiskSpace
    expr: '100 * sum without (dimension) (netdata_disk_space_GB_average{dimension=~"avail|cached"}) / sum without (dimension) (netdata_disk_space_GB_average) < 20 and sum without (dimension) (netdata_disk_space_GB_average) > 0'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Netdata low disk space (instance {{ $labels.instance }})
      description: "Netdata low disk space (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Linear extrapolation of the last 3h of available space, 24h ahead.
  - alert: NetdataPredictedDiskFull
    expr: 'predict_linear(netdata_disk_space_GB_average{dimension=~"avail|cached"}[3h], 24 * 3600) < 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Netdata predicted disk full (instance {{ $labels.instance }})
      description: "Netdata predicted disk full in 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NetdataMdMismatchCntUnsynchronizedBlocks
    expr: 'netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Netdata MD mismatch cnt unsynchronized blocks (instance {{ $labels.instance }})
      description: "RAID Array have unsynchronized blocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NetdataDiskReallocatedSectors
    expr: 'increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0'
    for: 0m
    labels:
      severity: info
    annotations:
      summary: Netdata disk reallocated sectors (instance {{ $labels.instance }})
      description: "Disk reallocated sectors detected ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NetdataDiskCurrentPendingSector
    expr: 'netdata_smartd_log_current_pending_sector_count_sectors_average > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Netdata disk current pending sector (instance {{ $labels.instance }})
      description: "Disk current pending sector\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NetdataReportedUncorrectableDiskSectors
    expr: 'increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Netdata reported uncorrectable disk sectors (instance {{ $labels.instance }})
      description: "Reported uncorrectable disk sectors ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/nginx/knyar-nginx-exporter.yml
================================================
groups:
- name: KnyarNginxExporter
  rules:
  # 4xx share of all requests above 5%, guarded against zero traffic.
  - alert: NginxHighHttp4xxErrorRate
    expr: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # 5xx share of all requests above 5%, guarded against zero traffic.
  - alert: NginxHighHttp5xxErrorRate
    expr: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NginxLatencyHigh
    expr: 'histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Nginx latency high (instance {{ $labels.instance }})
      description: "Nginx p99 latency is higher than 3 seconds\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/nomad/embedded-exporter.yml
================================================
groups:
- name: EmbeddedExporter
  rules:
  - alert: NomadJobFailed
    expr: 'nomad_nomad_job_summary_failed > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Nomad job failed (instance {{ $labels.instance }})
      description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NomadJobLost
    expr: 'nomad_nomad_job_summary_lost > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Nomad job lost (instance {{ $labels.instance }})
      description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NomadJobQueued
    expr: 'nomad_nomad_job_summary_queued > 0'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Nomad job queued (instance {{ $labels.instance }})
      description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: NomadBlockedEvaluation
    expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Nomad blocked evaluation (instance {{ $labels.instance }})
      description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/openebs/embedded-exporter.yml
================================================
groups:
- name: EmbeddedExporter
  rules:
  - alert: OpenebsUsedPoolCapacity
    expr: 'openebs_used_pool_capacity_percent > 80'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: OpenEBS used pool capacity (instance {{ $labels.instance }})
      description: "OpenEBS Pool use more than 80% of his capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
================================================
FILE: dist/rules/openstack/openstack-exporter.yml
================================================ groups: - name: OpenstackExporter rules: - alert: OpenstackExporterDown expr: 'up{job=~".*openstack.*"} == 0' for: 2m labels: severity: critical annotations: summary: OpenStack exporter down (instance {{ $labels.instance }}) description: "The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackNovaAgentDown expr: 'openstack_nova_agent_state{adminState="enabled"} == 0' for: 2m labels: severity: critical annotations: summary: OpenStack Nova agent down (instance {{ $labels.instance }}) description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackNeutronAgentDown expr: 'openstack_neutron_agent_state{adminState="up"} == 0' for: 2m labels: severity: critical annotations: summary: OpenStack Neutron agent down (instance {{ $labels.instance }}) description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackCinderAgentDown expr: 'openstack_cinder_agent_state{adminState="enabled"} == 0' for: 2m labels: severity: critical annotations: summary: OpenStack Cinder agent down (instance {{ $labels.instance }}) description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns. 
- alert: OpenstackHypervisorHighVcpuUsage expr: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0' for: 5m labels: severity: warning annotations: summary: OpenStack hypervisor high vCPU usage (instance {{ $labels.instance }}) description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns. - alert: OpenstackHypervisorHighMemoryUsage expr: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0' for: 5m labels: severity: warning annotations: summary: OpenStack hypervisor high memory usage (instance {{ $labels.instance }}) description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Same used/available > 0.9 ratio applied to hypervisor local storage; the `> 0` clause skips series whose available storage is reported as zero. - alert: OpenstackHypervisorHighDiskUsage expr: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0' for: 5m labels: severity: warning annotations: summary: OpenStack hypervisor high disk usage (instance {{ $labels.instance }}) description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
- alert: OpenstackNovaTenantVcpuQuotaNearlyExhausted expr: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0' for: 0m labels: severity: warning annotations: summary: OpenStack Nova tenant vCPU quota nearly exhausted (instance {{ $labels.instance }}) description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The `> 0` clause on each *_max series excludes quotas that are not positive before the ratio is considered. - alert: OpenstackNovaTenantMemoryQuotaNearlyExhausted expr: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0' for: 0m labels: severity: warning annotations: summary: OpenStack Nova tenant memory quota nearly exhausted (instance {{ $labels.instance }}) description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackNovaTenantInstanceQuotaNearlyExhausted expr: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0' for: 0m labels: severity: warning annotations: summary: OpenStack Nova tenant instance quota nearly exhausted (instance {{ $labels.instance }}) description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackCinderTenantVolumeQuotaNearlyExhausted expr: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0' for: 0m labels: severity: warning annotations: summary: OpenStack Cinder tenant volume quota nearly exhausted (instance {{ $labels.instance }}) description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackCinderPoolLowFreeCapacity expr: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and
openstack_cinder_pool_capacity_total_gb > 0' for: 5m labels: severity: warning annotations: summary: OpenStack Cinder pool low free capacity (instance {{ $labels.instance }}) description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackNeutronFloatingIpsAssociatedButNotActive expr: 'openstack_neutron_floating_ips_associated_not_active > 0' for: 5m labels: severity: warning annotations: summary: OpenStack Neutron floating IPs associated but not active (instance {{ $labels.instance }}) description: "{{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackNeutronRoutersNotActive expr: 'openstack_neutron_routers_not_active > 0' for: 5m labels: severity: warning annotations: summary: OpenStack Neutron routers not active (instance {{ $labels.instance }}) description: "{{ $value }} Neutron routers are not in ACTIVE state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackNeutronSubnetIpPoolExhaustion expr: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0' for: 0m labels: severity: warning annotations: summary: OpenStack Neutron subnet IP pool exhaustion (instance {{ $labels.instance }}) description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackNeutronPortsWithoutIps expr: 'openstack_neutron_ports_no_ips > 0' for: 5m labels: severity: warning annotations: summary: OpenStack Neutron ports without IPs (instance {{ $labels.instance }}) description: "{{ $value }} active ports have no IP addresses assigned\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackLoadBalancerNotOnline expr:
'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0' for: 5m labels: severity: warning annotations: summary: OpenStack load balancer not online (instance {{ $labels.instance }}) description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Servers are counted rather than summed: openstack-exporter publishes one server_status series per server whose sample value encodes the status (not 1), so count() makes $value the number of servers in ERROR, as the description states. - alert: OpenstackNovaInstancesInErrorState expr: 'count(openstack_nova_server_status{status="ERROR"}) > 0' for: 5m labels: severity: warning annotations: summary: OpenStack Nova instances in ERROR state (instance {{ $labels.instance }}) description: "{{ $value }} Nova instances are in ERROR state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackCinderVolumesInErrorState expr: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0' for: 5m labels: severity: warning annotations: summary: OpenStack Cinder volumes in error state (instance {{ $labels.instance }}) description: "{{ $value }} Cinder volumes are in an error state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This alert factors in the allocation ratio to compute effective capacity. # The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.
- alert: OpenstackPlacementResourceHighUsage expr: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0' for: 5m labels: severity: warning annotations: summary: OpenStack placement resource high usage (instance {{ $labels.instance }}) description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/opentelemetry-collector/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter # OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint. # These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly. # All collector internal metrics are prefixed with 'otelcol_'.
rules: # The job regex matches scrape jobs whose name contains "otel" followed later by "collector"; adjust it to your scrape configuration. - alert: OpentelemetryCollectorDown expr: 'up{job=~".*otel.*collector.*"} == 0' for: 1m labels: severity: critical annotations: summary: OpenTelemetry Collector down (instance {{ $labels.instance }}) description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The three receiver alerts differ only in signal type (spans, metric points, log records): any non-zero refusal rate sustained for 5m fires. - alert: OpentelemetryCollectorReceiverRefusedSpans expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0' for: 5m labels: severity: critical annotations: summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }}) description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpentelemetryCollectorReceiverRefusedMetricPoints expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0' for: 5m labels: severity: critical annotations: summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }}) description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpentelemetryCollectorReceiverRefusedLogRecords expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0' for: 5m labels: severity: critical annotations: summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }}) description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorExporterFailedSpans expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }}) description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorExporterFailedMetricPoints expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }}) description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorExporterFailedLogRecords expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }}) description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The ratio keeps only the (instance, job, exporter) labels, so the zero-capacity guard must match on the same label list; with default matching it would never pair with the fully-labelled capacity series and the alert could not fire. - alert: OpentelemetryCollectorExporterQueueNearlyFull expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) otelcol_exporter_queue_capacity > 0' for: 0m labels: severity: warning annotations: summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }}) description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorProcessorRefusedSpans expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }}) description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorProcessorRefusedMetricPoints expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }}) description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Ratio of heap_alloc_bytes to total_sys_memory_bytes, joined per (instance, job) and restricted to jobs matching the collector job regex. - alert: OpentelemetryCollectorHighMemoryUsage expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector high memory usage (instance {{ $labels.instance }}) description: "OpenTelemetry Collector memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Fires only when the otlp receiver accepts no spans at all over 5m while its refused-span rate is above zero. - alert: OpentelemetryCollectorOtlpReceiverErrors expr: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0' for: 2m labels: severity: critical annotations: summary: OpenTelemetry Collector OTLP receiver errors (instance {{ $labels.instance }}) description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE:
dist/rules/oracle-database/iamseth-oracledb-exporter.yml ================================================ groups: - name: IamsethOracledbExporter rules: # 1m delay allows a restart without triggering an alert. - alert: OracleDbDown expr: 'oracledb_up == 0' for: 1m labels: severity: critical annotations: summary: Oracle DB down (instance {{ $labels.instance }}) description: "Oracle Database instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is workload-dependent. Adjust 85% to suit your environment. The trailing `> 0` clause skips limit values that are not positive. - alert: OracleDbSessionsReachingLimit(>85%) expr: 'oracledb_resource_current_utilization{resource_name="sessions"} / oracledb_resource_limit_value{resource_name="sessions"} * 100 > 85 and oracledb_resource_limit_value{resource_name="sessions"} > 0' for: 5m labels: severity: warning annotations: summary: Oracle DB sessions reaching limit (> 85%) (instance {{ $labels.instance }}) description: "Oracle Database session utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is workload-dependent. Adjust 85% to suit your environment.
- alert: OracleDbProcessesReachingLimit(>85%) expr: 'oracledb_resource_current_utilization{resource_name="processes"} / oracledb_resource_limit_value{resource_name="processes"} * 100 > 85 and oracledb_resource_limit_value{resource_name="processes"} > 0' for: 5m labels: severity: warning annotations: summary: Oracle DB processes reaching limit (> 85%) (instance {{ $labels.instance }}) description: "Oracle Database process utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Warning tier: tablespace usage above 85% for 5m. The critical tier that follows uses 95% on the same metric. - alert: OracleDbTablespaceReachingCapacity(>85%) expr: 'oracledb_tablespace_used_percent > 85' for: 5m labels: severity: warning annotations: summary: Oracle DB tablespace reaching capacity (> 85%) (instance {{ $labels.instance }}) description: "Oracle Database tablespace {{ $labels.tablespace }} is above 85% usage on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Critical tier: same metric as the 85% warning, with a 95% threshold. - alert: OracleDbTablespaceFull(>95%) expr: 'oracledb_tablespace_used_percent > 95' for: 5m labels: severity: critical annotations: summary: Oracle DB tablespace full (> 95%) (instance {{ $labels.instance }}) description: "Oracle Database tablespace {{ $labels.tablespace }} is critically full on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # A high rollback rate (>20%) often indicates application-level issues such as deadlocks, constraint violations, or poorly designed transactions.
- alert: OracleDbHighUserRollbacks expr: 'rate(oracledb_activity_user_rollbacks[5m]) / (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) * 100 > 20 and (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Oracle DB high user rollbacks (instance {{ $labels.instance }}) description: "Oracle Database on {{ $labels.instance }} has a high rollback rate ({{ $value }}% of transactions are rolled back)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is highly workload-dependent. Adjust 200 to suit your environment. Only sessions with status ACTIVE and type USER are counted. - alert: OracleDbTooManyActiveSessions expr: 'oracledb_sessions_value{status="ACTIVE", type="USER"} > 200' for: 5m labels: severity: warning annotations: summary: Oracle DB too many active sessions (instance {{ $labels.instance }}) description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time. - alert: OracleDbHighWaitTime(userI/o) expr: 'oracledb_wait_time_user_io > 300' for: 5m labels: severity: warning annotations: summary: Oracle DB high wait time (user I/O) (instance {{ $labels.instance }}) description: "Oracle Database on {{ $labels.instance }} is experiencing high user I/O wait time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/patroni/embedded-exporter-patroni.yml ================================================ groups: - name: EmbeddedExporterPatroni rules: # 1m delay allows a restart without triggering an alert.
- alert: PatroniHasNoLeader expr: '(max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)' for: 1m labels: severity: critical annotations: summary: Patroni has no Leader (instance {{ $labels.instance }}) description: "A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml ================================================ groups: - name: SpreakerPgbouncerExporter rules: # Fixed threshold of 200 active server connections sustained for 2m. - alert: PgbouncerActiveConnections expr: 'pgbouncer_pools_server_active_connections > 200' for: 2m labels: severity: warning annotations: summary: PGBouncer active connections (instance {{ $labels.instance }}) description: "PGBouncer pools are filling up\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Counts every error message except "server conn crashed?"; fires on more than 10 errors within 1m. - alert: PgbouncerErrors expr: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10' for: 0m labels: severity: warning annotations: summary: PGBouncer errors (instance {{ $labels.instance }}) description: "PGBouncer is logging errors.
This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Selects only the max_client_conn error message; any increase within 2m fires immediately. - alert: PgbouncerMaxConnections expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0' for: 0m labels: severity: critical annotations: summary: PGBouncer max connections (instance {{ $labels.instance }}) description: "The number of PGBouncer client connections has reached max_client_conn.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/php-fpm/bakins-fpm-exporter.yml ================================================ groups: - name: BakinsFpmExporter rules: # Fires when the max-children counter grew by more than 3 within 5 minutes, summed per instance. - alert: Php-fpmMax-childrenReached expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 3' for: 0m labels: severity: warning annotations: summary: PHP-FPM max-children reached (instance {{ $labels.instance }}) description: "PHP-FPM reached max children on {{ $labels.instance }} ({{ $value }} times in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/postgresql/postgres-exporter.yml ================================================ groups: - name: PostgresExporter rules: # 1m delay allows a restart without triggering an alert.
- alert: PostgresqlDown expr: 'pg_up == 0' for: 1m labels: severity: critical annotations: summary: Postgresql down (instance {{ $labels.instance }}) description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlRestarted expr: 'time() - pg_postmaster_start_time_seconds < 60' for: 0m labels: severity: critical annotations: summary: Postgresql restarted (instance {{ $labels.instance }}) description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlExporterError expr: 'pg_exporter_last_scrape_error > 0' for: 0m labels: severity: critical annotations: summary: Postgresql exporter error (instance {{ $labels.instance }}) description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The per-table counters carry table-level labels that the instance-wide pg_settings_* series lack, so the threshold comparison joins on (instance, job) with group_left; without it the comparison never matches. Extend the on() list if one exporter instance scrapes several servers. - alert: PostgresqlTableNotAutoVacuumed expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > on (instance, job) group_left () pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10' for: 0m labels: severity: warning annotations: summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }}) description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Same (instance, job) group_left join as the auto-vacuum alert, against the analyze threshold setting. - alert: PostgresqlTableNotAutoAnalyzed expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > on (instance, job) group_left () pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10' for: 0m labels: severity: warning annotations: summary: Postgresql table not auto analyzed (instance {{ $labels.instance }}) description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTooManyConnections expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by
(instance, job, server) (pg_settings_max_connections * 0.8)' for: 2m labels: severity: warning annotations: summary: Postgresql too many connections (instance {{ $labels.instance }}) description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlNotEnoughConnections expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' for: 2m labels: severity: critical annotations: summary: Postgresql not enough connections (instance {{ $labels.instance }}) description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlDeadLocks expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' for: 0m labels: severity: warning annotations: summary: Postgresql dead locks (instance {{ $labels.instance }}) description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlHighRollbackRate expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02' for: 0m labels: severity: warning annotations: summary: Postgresql high rollback rate (instance {{ $labels.instance }}) description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlCommitRateLow expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5' for: 2m labels: severity: critical annotations: summary: Postgresql commit rate low (instance {{ $labels.instance }}) description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert:
PostgresqlLowXidConsumption expr: 'rate(pg_txid_current[1m]) < 5' for: 2m labels: severity: warning annotations: summary: Postgresql low XID consumption (instance {{ $labels.instance }}) description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The slot series carry per-slot labels that pg_replication_is_replica lacks, so the `and` matches on (instance, job); with default matching the two sides never pair up. - alert: PostgresqlUnusedReplicationSlot expr: '(pg_replication_slots_active == 0) and on (instance, job) (pg_replication_is_replica == 0)' for: 1m labels: severity: warning annotations: summary: Postgresql unused replication slot (instance {{ $labels.instance }}) description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTooManyDeadTuples expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 and (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0' for: 2m labels: severity: warning annotations: summary: Postgresql too many dead tuples (instance {{ $labels.instance }}) description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlConfigurationChanged expr: '{__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} != ON(__name__, instance) {__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} OFFSET 5m' for: 0m labels: severity: info annotations: summary: Postgresql configuration changed (instance {{ $labels.instance }}) description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlSslCompressionActive expr: 'sum by (instance) (pg_stat_ssl_compression) > 0' for: 0m labels: severity: warning annotations: summary: Postgresql SSL compression active (instance {{ $labels.instance }}) description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay.
Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The lock total is aggregated to the instance label only, so both the division and the zero guard match the settings series on (instance); with default matching the label sets differ and the alert never fires. - alert: PostgresqlTooManyLocksAcquired expr: '((sum by (instance) (pg_locks_count)) / on (instance) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and on (instance) (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0' for: 2m labels: severity: critical annotations: summary: Postgresql too many locks acquired (instance {{ $labels.instance }}) description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - alert: PostgresqlBloatIndexHigh(>80%) expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)' for: 1h labels: severity: warning annotations: summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }}) description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - alert: PostgresqlBloatTableHigh(>80%) expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)' for: 1h labels: severity: warning annotations: summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }}) description: "The table {{ $labels.relname }} is bloated.
You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - alert: PostgresqlInvalidIndex expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' for: 6h labels: severity: warning annotations: summary: Postgresql invalid index (instance {{ $labels.instance }}) description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlReplicationLag expr: 'pg_replication_lag_seconds > 5' for: 30s labels: severity: warning annotations: summary: Postgresql replication lag (instance {{ $labels.instance }}) description: "The PostgreSQL replication lag is high (> 5s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/process-exporter/process-exporter.yml ================================================ groups: - name: ProcessExporter rules: - alert: ProcessExporterGroupDown expr: 'namedprocess_namegroup_num_procs == 0' for: 5m labels: severity: warning annotations: summary: Process exporter group down (instance {{ $labels.instance }}) description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group. - alert: ProcessExporterHighMemoryUsage expr: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09' for: 5m labels: severity: warning annotations: summary: Process exporter high memory usage (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory.
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. Threshold of 80% is per-core. Adjust based on expected workload. CPU time is summed across the mode label (user/system) so the value is the group's total CPU usage rather than one value per mode. - alert: ProcessExporterHighCpuUsage expr: 'sum without (mode) (rate(namedprocess_namegroup_cpu_seconds_total[5m])) * 100 > 80' for: 5m labels: severity: warning annotations: summary: Process exporter high CPU usage (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ProcessExporterHighFileDescriptorUsage expr: 'namedprocess_namegroup_worst_fd_ratio > 0.8' for: 5m labels: severity: warning annotations: summary: Process exporter high file descriptor usage (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ProcessExporterFileDescriptorsExhausted expr: 'namedprocess_namegroup_worst_fd_ratio > 0.95' for: 2m labels: severity: critical annotations: summary: Process exporter file descriptors exhausted (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 512MB is arbitrary. Adjust per group and environment. - alert: ProcessExporterHighSwapUsage expr: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06' for: 5m labels: severity: warning annotations: summary: Process exporter high swap usage (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap.
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ProcessExporterZombieProcesses expr: 'namedprocess_namegroup_states{state="Zombie"} > 5' for: 5m labels: severity: warning annotations: summary: Process exporter zombie processes (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Filters to voluntary switches only — involuntary switches are normal under CPU contention. Threshold of 50000/s is a rough default. Adjust based on workload. - alert: ProcessExporterHighContextSwitching expr: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000' for: 5m labels: severity: warning annotations: summary: Process exporter high context switching (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 100MB/s is arbitrary. Adjust per group. - alert: ProcessExporterHighDiskWriteIo expr: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06' for: 5m labels: severity: warning annotations: summary: Process exporter high disk write IO (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Detects restarts by watching for changes in the oldest process start time within the group.
- alert: ProcessExporterProcessRestarting expr: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0' for: 0m labels: severity: info annotations: summary: Process exporter process restarting (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/prometheus-self-monitoring/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: PrometheusJobMissing expr: 'absent(up{job="prometheus"})' for: 0m labels: severity: warning annotations: summary: Prometheus job missing (instance {{ $labels.instance }}) description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Only fire if at least one target in the job is still up. # If all targets are down, PrometheusJobMissing or PrometheusAllTargetsMissing will fire instead. - alert: PrometheusTargetMissing expr: 'up == 0 unless on(job) (sum by (job) (up) == 0)' for: 1m labels: severity: critical annotations: summary: Prometheus target missing (instance {{ $labels.instance }}) description: "A Prometheus target has disappeared. 
An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusAllTargetsMissing expr: 'sum by (job) (up) == 0' for: 1m labels: severity: critical annotations: summary: Prometheus all targets missing (instance {{ $labels.instance }}) description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTargetMissingWithWarmupTime expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))' for: 1m labels: severity: critical annotations: summary: Prometheus target missing with warmup time (instance {{ $labels.instance }}) description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusConfigurationReloadFailure expr: 'prometheus_config_last_reload_successful != 1' for: 0m labels: severity: warning annotations: summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTooManyRestarts expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2' for: 0m labels: severity: warning annotations: summary: Prometheus too many restarts (instance {{ $labels.instance }}) description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusAlertmanagerJobMissing expr: 'absent(up{job="alertmanager"})' for: 0m labels: severity: warning annotations: summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusAlertmanagerConfigurationReloadFailure expr: 'alertmanager_config_last_reload_successful != 1' for: 0m labels: severity: warning annotations: summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusAlertmanagerConfigNotSynced expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1' for: 0m labels: severity: warning annotations: summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusAlertmanagerE2eDeadManSwitch expr: 'vector(1)' for: 0m labels: severity: critical annotations: summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusNotConnectedToAlertmanager expr: 'prometheus_notifications_alertmanagers_discovered < 1' for: 0m labels: severity: critical annotations: summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusRuleEvaluationFailures expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTemplateTextExpansionFailures expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusRuleEvaluationSlow expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds' for: 5m labels: severity: warning annotations: summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusNotificationsBacklog expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0' for: 0m labels: severity: warning annotations: summary: Prometheus notifications backlog (instance {{ $labels.instance }}) description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusAlertmanagerNotificationFailing expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) description: "Alertmanager is failing sending notifications ({{ $value }} notifications/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTargetEmpty expr: 'prometheus_sd_discovered_targets == 0' for: 0m labels: severity: critical annotations: summary: Prometheus target empty (instance {{ $labels.instance }}) description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTargetScrapingSlow expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05' for: 5m labels: severity: warning annotations: summary: Prometheus target scraping slow (instance {{ $labels.instance }}) description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. 
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusLargeScrape expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10' for: 5m labels: severity: warning annotations: summary: Prometheus large scrape (instance {{ $labels.instance }}) description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTargetScrapeDuplicate expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3' for: 0m labels: severity: warning annotations: summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTsdbCheckpointCreationFailures expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTsdbCheckpointDeletionFailures expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTsdbCompactionsFailed expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value 
}}\n LABELS = {{ $labels }}" - alert: PrometheusTsdbHeadTruncationsFailed expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTsdbReloadFailures expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTsdbWalCorruptions expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTsdbWalTruncationsFailed expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0' for: 0m labels: severity: critical annotations: summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTimeseriesCardinality expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000' for: 0m labels: severity: warning annotations: summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/promtail/embedded-exporter.yml 
================================================ groups: - name: EmbeddedExporter rules: - alert: PromtailRequestErrors expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 and sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 0' for: 5m labels: severity: critical annotations: summary: Promtail request errors (instance {{ $labels.instance }}) description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Aggregates on the same labels as PromtailRequestErrors (plus le) so that the instance, job and route labels used in the annotations are populated. - alert: PromtailRequestLatency expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, instance, le)) > 1' for: 5m labels: severity: critical annotations: summary: Promtail request latency (instance {{ $labels.instance }}) description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/proxmox-ve/prometheus-pve-exporter.yml ================================================ groups: - name: PrometheusPveExporter rules: - alert: PveNodeDown expr: 'pve_up{id=~"node/.*"} == 0' for: 2m labels: severity: critical annotations: summary: PVE node down (instance {{ $labels.instance }}) description: "Proxmox VE node {{ $labels.id }} is down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This alert triggers for all VMs and containers that are not running. # You may want to filter by specific guests using the `id` label, or exclude # intentionally stopped guests with additional label matchers.
- alert: PveVm/ctDown expr: 'pve_up{id=~"(qemu|lxc)/.*"} == 0' for: 5m labels: severity: warning annotations: summary: PVE VM/CT down (instance {{ $labels.instance }}) description: "Proxmox VE guest {{ $labels.id }} is not running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PveHighCpuUsage expr: 'pve_cpu_usage_ratio * 100 > 90' for: 5m labels: severity: warning annotations: summary: PVE high CPU usage (instance {{ $labels.instance }}) description: "Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf \"%.2f\" }}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PveHighMemoryUsage expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0' for: 5m labels: severity: warning annotations: summary: PVE high memory usage (instance {{ $labels.instance }}) description: "Proxmox VE memory usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf \"%.2f\" }}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PveStorageFillingUp expr: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 80 and pve_disk_size_bytes{id=~"storage/.*"} > 0' for: 5m labels: severity: warning annotations: summary: PVE storage filling up (instance {{ $labels.instance }}) description: "Proxmox VE storage {{ $labels.id }} is above 80% used. Current value: {{ $value | printf \"%.2f\" }}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PveStorageAlmostFull expr: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 95 and pve_disk_size_bytes{id=~"storage/.*"} > 0' for: 2m labels: severity: critical annotations: summary: PVE storage almost full (instance {{ $labels.instance }}) description: "Proxmox VE storage {{ $labels.id }} is above 95% used. 
Current value: {{ $value | printf \"%.2f\" }}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PveGuestNotBackedUp expr: 'pve_not_backed_up_total > 0' for: 0m labels: severity: warning annotations: summary: PVE guest not backed up (instance {{ $labels.instance }}) description: "{{ $value }} Proxmox VE guest(s) are not covered by any backup job.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PveReplicationFailed expr: 'pve_replication_failed_syncs > 0' for: 0m labels: severity: warning annotations: summary: PVE replication failed (instance {{ $labels.instance }}) description: "Proxmox VE replication for {{ $labels.id }} has {{ $value }} failed sync(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Loss of quorum means the cluster cannot make decisions about VM placement # and fencing. This requires immediate attention. - alert: PveClusterNotQuorate expr: 'pve_cluster_info{quorate="0"} == 1' for: 0m labels: severity: critical annotations: summary: PVE cluster not quorate (instance {{ $labels.instance }}) description: "Proxmox VE cluster has lost quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/pulsar/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: PulsarSubscriptionHighNumberOfBacklogEntries expr: 'sum(pulsar_subscription_back_log) by (subscription) > 5000' for: 1h labels: severity: warning annotations: summary: Pulsar subscription high number of backlog entries (instance {{ $labels.instance }}) description: "The number of subscription backlog entries is over 5k\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarSubscriptionVeryHighNumberOfBacklogEntries expr: 'sum(pulsar_subscription_back_log) by (subscription) > 100000' for: 1h labels: severity: critical annotations: summary: Pulsar subscription very high number of backlog entries (instance {{ $labels.instance }}) description: "The 
number of subscription backlog entries is over 100k\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarTopicLargeBacklogStorageSize expr: 'sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024' for: 1h labels: severity: warning annotations: summary: Pulsar topic large backlog storage size (instance {{ $labels.instance }}) description: "The topic backlog storage size is over 5 GB\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarTopicVeryLargeBacklogStorageSize expr: 'sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024' for: 1h labels: severity: critical annotations: summary: Pulsar topic very large backlog storage size (instance {{ $labels.instance }}) description: "The topic backlog storage size is over 20 GB\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarHighWriteLatency expr: 'sum(pulsar_storage_write_latency_overflow > 0) by (topic)' for: 1h labels: severity: critical annotations: summary: Pulsar high write latency (instance {{ $labels.instance }}) description: "Messages cannot be written in a timely fashion\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarLargeMessagePayload expr: 'sum(pulsar_entry_size_overflow > 0) by (topic)' for: 1h labels: severity: warning annotations: summary: Pulsar large message payload (instance {{ $labels.instance }}) description: "Observing large message payload (> 1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarHighLedgerDiskUsage expr: 'sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75' for: 1h labels: severity: critical annotations: summary: Pulsar high ledger disk usage (instance {{ $labels.instance }}) description: "Observing Ledger Disk Usage (> 75%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarReadOnlyBookies expr: 'count(bookie_SERVER_STATUS{} == 0) by (pod)' for: 5m labels: severity: critical annotations: summary: Pulsar read only bookies (instance {{ $labels.instance }}) 
description: "Observing Readonly Bookies\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # rate() returns a per-second value, so the threshold is 10 errors per second averaged over 1m. - alert: PulsarHighNumberOfFunctionErrors expr: 'sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10' for: 1m labels: severity: critical annotations: summary: Pulsar high number of function errors (instance {{ $labels.instance }}) description: "Observing more than 10 Function errors per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # rate() returns a per-second value, so the threshold is 10 errors per second averaged over 1m. - alert: PulsarHighNumberOfSinkErrors expr: 'sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10' for: 1m labels: severity: critical annotations: summary: Pulsar high number of sink errors (instance {{ $labels.instance }}) description: "Observing more than 10 Sink errors per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/python/python-exporter.yml ================================================ groups: - name: PythonExporter rules: - alert: PythonGcObjectsUncollectable expr: 'increase(python_gc_objects_uncollectable_total[5m]) > 0' for: 5m labels: severity: warning annotations: summary: Python GC objects uncollectable (instance {{ $labels.instance }}) description: "Python has uncollectable objects, potential memory leak via reference cycles\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PythonGcCollectionsHigh expr: 'rate(python_gc_objects_collected_total[5m]) > 10000' for: 5m labels: severity: warning annotations: summary: Python GC collections high (instance {{ $labels.instance }}) description: "Python GC is collecting too many objects (> 10k/s), high allocation pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not Python-specific.
- alert: PythonFileDescriptorsExhaustion expr: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0' for: 5m labels: severity: warning annotations: summary: Python file descriptors exhaustion (instance {{ $labels.instance }}) description: "Python process is running out of file descriptors (> 90% used)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Gen2 collection rate > 1/s is very high. In most applications, gen2 runs are infrequent. Adjust threshold based on your workload. - alert: PythonGcGeneration2CollectionsHigh expr: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1' for: 5m labels: severity: warning annotations: summary: Python GC generation 2 collections high (instance {{ $labels.instance }}) description: "Python full GC (generation 2) is running too frequently, indicating memory pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is a rough default. Adjust based on your application's expected memory footprint. - alert: PythonVirtualMemoryHigh expr: 'process_virtual_memory_bytes > 4e9' for: 5m labels: severity: warning annotations: summary: Python virtual memory high (instance {{ $labels.instance }}) description: "Python process virtual memory is high (> 4GB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml ================================================ groups: - name: KbuddeRabbitmqExporter rules: # 1m delay allows a restart without triggering an alert. - alert: RabbitmqDown expr: 'rabbitmq_up == 0' for: 1m labels: severity: critical annotations: summary: RabbitMQ down (instance {{ $labels.instance }}) description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # 1m delay allows a restart without triggering an alert. 
- alert: RabbitmqClusterDown expr: 'sum(rabbitmq_running) < 3' for: 1m labels: severity: critical annotations: summary: RabbitMQ cluster down (instance {{ $labels.instance }}) description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqClusterPartition expr: 'rabbitmq_partitions > 0' for: 0m labels: severity: critical annotations: summary: RabbitMQ cluster partition (instance {{ $labels.instance }}) description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqOutOfMemory expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0' for: 2m labels: severity: warning annotations: summary: RabbitMQ out of memory (instance {{ $labels.instance }}) description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqInstanceTooManyConnections expr: 'rabbitmq_connectionsTotal > 1000' for: 2m labels: severity: warning annotations: summary: RabbitMQ instance too many connections (instance {{ $labels.instance }}) description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Indicate the queue name in dedicated label. - alert: RabbitmqDeadLetterQueueFillingUp expr: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10' for: 1m labels: severity: warning annotations: summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }}) description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Indicate the queue name in dedicated label. 
- alert: RabbitmqTooManyMessagesInQueue expr: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000' for: 2m labels: severity: warning annotations: summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }}) description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Indicate the queue name in dedicated label. - alert: RabbitmqSlowQueueConsuming expr: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60' for: 2m labels: severity: warning annotations: summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }}) description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Allows a short service restart. - alert: RabbitmqNoConsumer expr: 'rabbitmq_queue_consumers == 0' for: 5m labels: severity: critical annotations: summary: RabbitMQ no consumer (instance {{ $labels.instance }}) description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Indicate the queue name in dedicated label. - alert: RabbitmqTooManyConsumers expr: 'rabbitmq_queue_consumers{queue="my-queue"} > 1' for: 0m labels: severity: critical annotations: summary: RabbitMQ too many consumers (instance {{ $labels.instance }}) description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Indicate the exchange name in dedicated label. 
- alert: RabbitmqInactiveExchange expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' for: 2m labels: severity: warning annotations: summary: RabbitMQ inactive exchange (instance {{ $labels.instance }}) description: "Exchange receive less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/rabbitmq/rabbitmq-exporter.yml ================================================ groups: - name: RabbitmqExporter rules: # 1m delay allows a restart without triggering an alert. - alert: RabbitmqNodeDown expr: 'sum(rabbitmq_build_info) < 3' for: 1m labels: severity: critical annotations: summary: RabbitMQ node down (instance {{ $labels.instance }}) description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # 1m delay allows a restart without triggering an alert. - alert: RabbitmqNodeNotDistributed expr: 'erlang_vm_dist_node_state < 3' for: 1m labels: severity: critical annotations: summary: RabbitMQ node not distributed (instance {{ $labels.instance }}) description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqInstancesDifferentVersions expr: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1' for: 1h labels: severity: warning annotations: summary: RabbitMQ instances different versions (instance {{ $labels.instance }}) description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqMemoryHigh expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0' for: 2m labels: severity: warning annotations: summary: RabbitMQ memory high (instance {{ $labels.instance }}) description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - 
alert: RabbitmqFileDescriptorsUsage expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0' for: 2m labels: severity: warning annotations: summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }}) description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The expression aggregates by queue, so queue is the only label available to the description. - alert: RabbitmqTooManyReadyMessages expr: 'sum(rabbitmq_queue_messages_ready) BY (queue) > 1000' for: 1m labels: severity: warning annotations: summary: RabbitMQ too many ready messages (instance {{ $labels.instance }}) description: "RabbitMQ too many ready messages on queue {{ $labels.queue }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqTooManyUnackMessages expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' for: 1m labels: severity: warning annotations: summary: RabbitMQ too many unack messages (instance {{ $labels.instance }}) description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqTooManyConnections expr: 'rabbitmq_connections > 1000' for: 2m labels: severity: warning annotations: summary: RabbitMQ too many connections (instance {{ $labels.instance }}) description: "The total connections of a node is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqNoQueueConsumer expr: 'rabbitmq_queue_consumers < 1' for: 1m labels: severity: warning annotations: summary: RabbitMQ no queue consumer (instance {{ $labels.instance }}) description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqUnroutableMessages expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0' for: 2m labels: severity: warning annotations: summary: RabbitMQ unroutable messages (instance {{ $labels.instance }}) description: "A queue has unroutable messages ({{ $value }} in the last 1m)\n
VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/redis/oliver006-redis-exporter.yml ================================================ groups: - name: Oliver006RedisExporter rules: # 1m delay allows a restart without triggering an alert. - alert: RedisDown expr: 'redis_up == 0' for: 1m labels: severity: critical annotations: summary: Redis down (instance {{ $labels.instance }}) description: "Redis instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisMissingMaster expr: '(count(redis_instance_info{role="master"}) or vector(0)) < 1' for: 0m labels: severity: critical annotations: summary: Redis missing master (instance {{ $labels.instance }}) description: "Redis cluster has no node marked as master.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # 1m delay allows a restart without triggering an alert. - alert: RedisTooManyMasters expr: 'count(redis_instance_info{role="master"}) > 1' for: 1m labels: severity: critical annotations: summary: Redis too many masters (instance {{ $labels.instance }}) description: "Redis cluster has too many nodes marked as master.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisDisconnectedSlaves expr: 'count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0' for: 0m labels: severity: critical annotations: summary: Redis disconnected slaves (instance {{ $labels.instance }}) description: "Redis not replicating for all slaves. 
Consider reviewing the redis replication status.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisReplicationBroken expr: 'delta(redis_connected_slaves[1m]) < 0' for: 0m labels: severity: critical annotations: summary: Redis replication broken (instance {{ $labels.instance }}) description: "Redis instance lost a slave\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisClusterFlapping expr: 'changes(redis_connected_slaves[1m]) > 1' for: 2m labels: severity: critical annotations: summary: Redis cluster flapping (instance {{ $labels.instance }}) description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisMissingBackup expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48' for: 0m labels: severity: critical annotations: summary: Redis missing backup (instance {{ $labels.instance }}) description: "Redis has not been backed up for 48 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. 
- alert: RedisOutOfSystemMemory expr: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 and redis_total_system_memory_bytes > 0' for: 2m labels: severity: warning annotations: summary: Redis out of system memory (instance {{ $labels.instance }}) description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisOutOfConfiguredMaxmemory expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0' for: 2m labels: severity: warning annotations: summary: Redis out of configured maxmemory (instance {{ $labels.instance }}) description: "Redis is running out of configured maxmemory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisTooManyConnections expr: 'redis_connected_clients / redis_config_maxclients * 100 > 90 and redis_config_maxclients > 0' for: 2m labels: severity: warning annotations: summary: Redis too many connections (instance {{ $labels.instance }}) description: "Redis is running out of connections (> 90% used)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisNotEnoughConnections expr: 'redis_connected_clients < 5' for: 2m labels: severity: warning annotations: summary: Redis not enough connections (instance {{ $labels.instance }}) description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisRejectedConnections expr: 'increase(redis_rejected_connections_total[1m]) > 5' for: 0m labels: severity: warning annotations: summary: Redis rejected connections (instance {{ $labels.instance }}) description: "Some connections to Redis have been rejected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/ruby/ruby-exporter.yml ================================================ groups: - name: RubyExporter rules: # Threshold is a rough default.
Adjust based on your application's normal heap size. - alert: RubyHeapLiveSlotsHigh expr: 'ruby_heap_live_slots > 500000' for: 5m labels: severity: warning annotations: summary: Ruby heap live slots high (instance {{ $labels.instance }}) description: "Ruby heap has too many live slots (> 500k), heap bloat\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RubyHeapFreeSlotsHigh expr: 'ruby_heap_free_slots > 500000' for: 5m labels: severity: warning annotations: summary: Ruby heap free slots high (instance {{ $labels.instance }}) description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection. - alert: RubyMajorGcRateHigh expr: 'rate(ruby_major_gc_ops_total[5m]) > 5' for: 5m labels: severity: warning annotations: summary: Ruby major GC rate high (instance {{ $labels.instance }}) description: "Ruby is performing too many major GC cycles, indicating memory pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RubyRssHigh expr: 'ruby_rss > 1e9' for: 5m labels: severity: warning annotations: summary: Ruby RSS high (instance {{ $labels.instance }}) description: "Ruby process RSS is high (> 1GB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RubyAllocatedObjectsSpike expr: 'rate(ruby_allocated_objects_total[5m]) > 100000' for: 5m labels: severity: warning annotations: summary: Ruby allocated objects spike (instance {{ $labels.instance }}) description: "Ruby is allocating objects at a high rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml ================================================ groups: - name: SmartctlExporter rules: - alert: SmartDeviceTemperatureWarning expr: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) 
unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60' for: 0m labels: severity: warning annotations: summary: SMART device temperature warning (instance {{ $labels.instance }}) description: "Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartDeviceTemperatureCritical expr: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70' for: 0m labels: severity: critical annotations: summary: SMART device temperature critical (instance {{ $labels.instance }}) description: "Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartDeviceTemperatureOverTripValue expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}' for: 0m labels: severity: critical annotations: summary: SMART device temperature over trip value (instance {{ $labels.instance }}) description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartDeviceTemperatureNearingTripValue expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)' for: 0m labels: severity: warning annotations: summary: SMART device temperature nearing trip value (instance {{ $labels.instance }}) description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartStatus expr: 'smartctl_device_smart_status != 1' for: 0m labels: severity: critical annotations:
summary: SMART status (instance {{ $labels.instance }}) description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartCriticalWarning expr: 'smartctl_device_critical_warning > 0' for: 0m labels: severity: critical annotations: summary: SMART critical warning (instance {{ $labels.instance }}) description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartMediaErrors expr: 'smartctl_device_media_errors > 0' for: 0m labels: severity: critical annotations: summary: SMART media errors (instance {{ $labels.instance }}) description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartWearoutIndicator expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold' for: 0m labels: severity: critical annotations: summary: SMART Wearout Indicator (instance {{ $labels.instance }}) description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/sidekiq/strech-sidekiq-exporter.yml ================================================ groups: - name: StrechSidekiqExporter rules: - alert: SidekiqQueueSize expr: 'sidekiq_queue_size > 100' for: 1m labels: severity: warning annotations: summary: Sidekiq queue size (instance {{ $labels.instance }}) description: "Sidekiq queue {{ $labels.name }} is growing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SidekiqSchedulingLatencyTooHigh expr: 'max(sidekiq_queue_latency) > 60' for: 0m labels: severity: critical annotations: summary: Sidekiq scheduling latency too high (instance {{ $labels.instance }}) description: "Sidekiq jobs are taking more than 1min to be picked up.
Users may be seeing delays in background processing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/snmp/snmp-exporter.yml ================================================ groups: - name: SnmpExporter # These rules use standard IF-MIB and SNMPv2-MIB metrics. Metric names depend on your snmp.yml module configuration. # Thresholds for bandwidth and error rates are rough defaults - adjust to your environment. rules: # From the official snmp-mixin. - alert: SnmpTargetDown expr: 'up{job=~"snmp.*"} == 0' for: 5m labels: severity: critical annotations: summary: SNMP target down (instance {{ $labels.instance }}) description: "SNMP device {{ $labels.instance }} is unreachable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SnmpInterfaceDown expr: '(ifOperStatus{job=~"snmp.*"} == 2) and on(instance, job, ifIndex) (ifAdminStatus{job=~"snmp.*"} == 1)' for: 2m labels: severity: critical annotations: summary: SNMP interface down (instance {{ $labels.instance }}) description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is a rough default. Adjust based on your network environment. - alert: SnmpInterfaceHighInboundErrorRate expr: 'rate(ifInErrors{job=~"snmp.*"}[5m]) / (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0' for: 5m labels: severity: warning annotations: summary: SNMP interface high inbound error rate (instance {{ $labels.instance }}) description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an inbound error rate above 5%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is a rough default. 
Adjust based on your network environment. - alert: SnmpInterfaceHighOutboundErrorRate expr: 'rate(ifOutErrors{job=~"snmp.*"}[5m]) / (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0' for: 5m labels: severity: warning annotations: summary: SNMP interface high outbound error rate (instance {{ $labels.instance }}) description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead. - alert: SnmpInterfaceHighBandwidthUsageInbound expr: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0' for: 15m labels: severity: warning annotations: summary: SNMP interface high bandwidth usage inbound (instance {{ $labels.instance }}) description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead. - alert: SnmpInterfaceHighBandwidthUsageOutbound expr: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0' for: 15m labels: severity: warning annotations: summary: SNMP interface high bandwidth usage outbound (instance {{ $labels.instance }}) description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} outbound utilization is above 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # sysUpTime is in centiseconds (hundredths of a second). 
- alert: SnmpDeviceRestarted expr: 'sysUpTime / 100 < 300' for: 0m labels: severity: info annotations: summary: SNMP device restarted (instance {{ $labels.instance }}) description: "SNMP device {{ $labels.instance }} has restarted (uptime < 5 minutes).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/solr/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: SolrUpdateErrors expr: 'increase(solr_metrics_core_update_handler_errors_total[1m]) > 1' for: 0m labels: severity: critical annotations: summary: Solr update errors (instance {{ $labels.instance }}) description: "Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SolrQueryErrors expr: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1' for: 5m labels: severity: warning annotations: summary: Solr query errors (instance {{ $labels.instance }}) description: "Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SolrReplicationErrors expr: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1' for: 0m labels: severity: critical annotations: summary: Solr replication errors (instance {{ $labels.instance }}) description: "Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SolrLowLiveNodeCount expr: 'solr_collections_live_nodes < 2' for: 0m labels: severity: critical annotations: summary: Solr low live node count (instance {{ $labels.instance }}) description: "Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} 
on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/speedtest/nlamirault-speedtest-exporter.yml ================================================ groups: - name: NlamiraultSpeedtestExporter rules: - alert: SpeedtestSlowInternetDownload expr: 'avg_over_time(speedtest_download[10m]) < 100' for: 0m labels: severity: warning annotations: summary: SpeedTest Slow Internet Download (instance {{ $labels.instance }}) description: "Internet download speed is currently {{humanize $value}} Mbps.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpeedtestSlowInternetUpload expr: 'avg_over_time(speedtest_upload[10m]) < 20' for: 0m labels: severity: warning annotations: summary: SpeedTest Slow Internet Upload (instance {{ $labels.instance }}) description: "Internet upload speed is currently {{humanize $value}} Mbps.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/spinnaker/embedded-exporter.yml ================================================ groups: - name: EmbeddedExporter rules: - alert: SpinnakerCircuitBreakerOpen expr: 'resilience4j_circuitbreaker_state{state="open"} == 1' for: 5m labels: severity: warning annotations: summary: Spinnaker circuit breaker open (instance {{ $labels.instance }}) description: "Circuit breaker {{ $labels.name }} is open on {{ $labels.instance }}, indicating repeated downstream failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # In a healthy Spinnaker, queue_ready_depth should stay at or near 0. # Sustained non-zero values indicate Orca cannot keep up with incoming work. - alert: SpinnakerOrcaQueueBackingUp expr: 'queue_ready_depth > 0' for: 5m labels: severity: warning annotations: summary: Spinnaker Orca queue backing up (instance {{ $labels.instance }}) description: "Orca work queue has {{ $value }} messages ready for delivery but not yet picked up. 
Pipeline executions may be delayed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The 30s threshold is a rough default. Adjust based on your pipeline SLOs. - alert: SpinnakerOrcaQueueMessageLagHigh expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30 and rate(queue_message_lag_seconds_count[5m]) > 0' for: 5m labels: severity: warning annotations: summary: Spinnaker Orca queue message lag high (instance {{ $labels.instance }}) description: "Orca queue message lag is {{ $value }}s. Pipeline stages are waiting too long before being processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpinnakerDeadMessages expr: 'rate(queue_dead_messages_total[5m]) > 0' for: 2m labels: severity: critical annotations: summary: Spinnaker dead messages (instance {{ $labels.instance }}) description: "Orca is producing dead-lettered messages ({{ $value }} per second). These are tasks that exhausted all retries and will not be executed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Zombies are pipeline executions that are running but have lost their queue entry. # See https://spinnaker.io/docs/guides/runbooks/orca-zombie-executions/ - alert: SpinnakerZombieExecutions expr: 'rate(queue_zombies_total[5m]) > 0' for: 5m labels: severity: warning annotations: summary: Spinnaker zombie executions (instance {{ $labels.instance }}) description: "{{ $value }} zombie pipeline executions detected. These are executions with no corresponding queue messages.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpinnakerThreadPoolExhaustion expr: 'threadpool_blockingQueueSize > 0' for: 5m labels: severity: warning annotations: summary: Spinnaker thread pool exhaustion (instance {{ $labels.instance }}) description: "Orca message handler thread pool has {{ $value }} blocked threads on {{ $labels.instance }}. 
Pipeline execution throughput is degraded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # When this threshold is exceeded, Igor stops triggering pipelines for the affected monitor. # See https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds - alert: SpinnakerPollingMonitorItemsOverThreshold expr: 'sum by (monitor, partition) (pollingMonitor_itemsOverThreshold) > 0' for: 5m labels: severity: critical annotations: summary: Spinnaker polling monitor items over threshold (instance {{ $labels.instance }}) description: "Igor polling monitor {{ $labels.monitor }} for {{ $labels.partition }} has exceeded its item threshold, preventing pipeline triggers.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpinnakerPollingMonitorFailures expr: 'rate(pollingMonitor_failed_total[5m]) > 0' for: 5m labels: severity: warning annotations: summary: Spinnaker polling monitor failures (instance {{ $labels.instance }}) description: "Igor polling monitor is experiencing failures ({{ $value }} per second). CI/SCM integrations may not trigger pipelines.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The 5% threshold is a rough default. Adjust based on your traffic patterns. 
- alert: SpinnakerHighApiErrorRate expr: 'sum by (instance) (rate(controller_invocations_total{status="5xx"}[5m])) / sum by (instance) (rate(controller_invocations_total[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Spinnaker high API error rate (instance {{ $labels.instance }}) description: "Spinnaker API 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpinnakerApiRateLimitThrottling expr: 'rate(rateLimitThrottling_total[5m]) > 0' for: 2m labels: severity: warning annotations: summary: Spinnaker API rate limit throttling (instance {{ $labels.instance }}) description: "Gate is actively throttling API requests on {{ $labels.instance }} ({{ $value }} throttled requests per second).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpinnakerClouddriverHighErrorRate expr: 'sum by (instance) (rate(controller_invocations_total{status="5xx", job=~".*clouddriver.*"}[5m])) / sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Spinnaker Clouddriver high error rate (instance {{ $labels.instance }}) description: "Clouddriver 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}. Cloud operations may be failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This metric is specific to AWS cloud providers in Clouddriver. # The 1000ms threshold is a rough default. Adjust based on your AWS usage patterns. 
- alert: SpinnakerAwsRateLimiting expr: 'amazonClientProvider_rateLimitDelayMil > 1000' for: 5m labels: severity: warning annotations: summary: Spinnaker AWS rate limiting (instance {{ $labels.instance }}) description: "Clouddriver is being rate-limited by AWS on {{ $labels.instance }} ({{ $value }}ms delay). Cloud operations will be slower.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/sql-server/ozarklake-mssql-exporter.yml ================================================ groups: - name: OzarklakeMssqlExporter rules: # 1m delay allows a restart without triggering an alert. - alert: SqlServerDown expr: 'mssql_up == 0' for: 1m labels: severity: critical annotations: summary: SQL Server down (instance {{ $labels.instance }}) description: "SQL server instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SqlServerDeadlock expr: 'mssql_deadlocks > 5' for: 1m labels: severity: warning annotations: summary: SQL Server deadlock (instance {{ $labels.instance }}) description: "SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml ================================================ groups: - name: RibbybibbySslExporter rules: - alert: SslCertificateProbeFailed expr: 'ssl_probe_success == 0' for: 0m labels: severity: critical annotations: summary: SSL certificate probe failed (instance {{ $labels.instance }}) description: "Failed to fetch SSL information {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SslCertificateOscpStatusUnknown expr: 'ssl_ocsp_response_status == 2' for: 0m labels: severity: warning annotations: summary: SSL certificate OCSP status unknown (instance {{ $labels.instance }}) description: "Failed to get the OCSP status {{ $labels.instance }}\n VALUE = {{ $value }}\n
LABELS = {{ $labels }}" - alert: SslCertificateRevoked expr: 'ssl_ocsp_response_status == 1' for: 0m labels: severity: critical annotations: summary: SSL certificate revoked (instance {{ $labels.instance }}) description: "SSL certificate revoked {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SslCertificateExpiry(<7Days) expr: 'ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7' for: 0m labels: severity: warning annotations: summary: SSL certificate expiry (< 7 days) (instance {{ $labels.instance }}) description: "{{ $labels.instance }} Certificate is expiring in 7 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/systemd/systemd-exporter.yml ================================================ groups: - name: SystemdExporter rules: - alert: SystemdUnitFailed expr: 'systemd_unit_state{state="failed"} == 1' for: 5m labels: severity: warning annotations: summary: Systemd unit failed (instance {{ $labels.instance }}) description: "Systemd unit {{ $labels.name }} has entered failed state. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Many units are legitimately inactive. You must adjust the name=~ filter to match your critical services. - alert: SystemdUnitInactive expr: 'systemd_unit_state{state="inactive", type="service", name=~"your-critical-service.+"} == 1' for: 5m labels: severity: warning annotations: summary: Systemd unit inactive (instance {{ $labels.instance }}) description: "Systemd unit {{ $labels.name }} is inactive. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SystemdServiceCrashLooping expr: 'increase(systemd_service_restart_total[1h]) > 5' for: 5m labels: severity: critical annotations: summary: Systemd service crash looping (instance {{ $labels.instance }}) description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SystemdUnitTasksNearLimit expr: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0' for: 5m labels: severity: warning annotations: summary: Systemd unit tasks near limit (instance {{ $labels.instance }}) description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SystemdSocketRefusedConnections expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0' for: 2m labels: severity: warning annotations: summary: Systemd socket refused connections (instance {{ $labels.instance }}) description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 100 connections is arbitrary. Adjust to your workload. - alert: SystemdSocketHighConnections expr: 'systemd_socket_current_connections > 100' for: 2m labels: severity: warning annotations: summary: Systemd socket high connections (instance {{ $labels.instance }}) description: "Systemd socket {{ $labels.name }} has {{ $value }} active connections. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Triggers if timer hasn't fired in 24 hours. Adjust threshold per timer schedule. - alert: SystemdTimerMissedTrigger expr: '(time() - systemd_timer_last_trigger_seconds) / 3600 > 24 and systemd_timer_last_trigger_seconds > 0' for: 5m labels: severity: warning annotations: summary: Systemd timer missed trigger (instance {{ $labels.instance }}) description: "Systemd timer {{ $labels.name }} has not triggered for over 24 hours. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/thanos/thanos-bucket-replicate.yml ================================================ groups: - name: ThanosBucketReplicate rules: - alert: ThanosBucketReplicateErrorRate expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Thanos Bucket Replicate Error Rate (instance {{ $labels.instance }}) description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # histogram_quantile() needs the le label, so the bucket rate is aggregated by (job, le); the traffic guard stays by (job). - alert: ThanosBucketReplicateRunLatency expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)' for: 5m labels: severity: critical annotations: summary: Thanos Bucket Replicate Run Latency (instance {{ $labels.instance }}) description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/thanos/thanos-compactor.yml ================================================ groups: - name: ThanosCompactor rules: - alert: ThanosCompactorMultipleRunning expr: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1' for: 5m labels: severity: warning annotations: summary: Thanos Compactor Multiple Running (instance {{ $labels.instance }})
description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosCompactorHalted expr: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1' for: 5m labels: severity: warning annotations: summary: Thanos Compactor Halted (instance {{ $labels.instance }}) description: "Thanos Compact {{$labels.job}} has failed to run and now is halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosCompactorHighCompactionFailures expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0' for: 15m labels: severity: warning annotations: summary: Thanos Compactor High Compaction Failures (instance {{ $labels.instance }}) description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosCompactBucketHighOperationFailures expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) > 0' for: 15m labels: severity: warning annotations: summary: Thanos Compact Bucket High Operation Failures (instance {{ $labels.instance }}) description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosCompactHasNotRun expr: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24' 
for: 0m labels: severity: warning annotations: summary: Thanos Compact Has Not Run (instance {{ $labels.instance }}) description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/thanos/thanos-component-absent.yml ================================================ groups: - name: ThanosComponentAbsent rules: - alert: ThanosCompactIsDown expr: 'absent(up{job=~".*thanos-compact.*"} == 1)' for: 5m labels: severity: critical annotations: summary: Thanos Compact Is Down (instance {{ $labels.instance }}) description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryIsDown expr: 'absent(up{job=~".*thanos-query.*"} == 1)' for: 5m labels: severity: critical annotations: summary: Thanos Query Is Down (instance {{ $labels.instance }}) description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveIsDown expr: 'absent(up{job=~".*thanos-receive.*"} == 1)' for: 5m labels: severity: critical annotations: summary: Thanos Receive Is Down (instance {{ $labels.instance }}) description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleIsDown expr: 'absent(up{job=~".*thanos-rule.*"} == 1)' for: 5m labels: severity: critical annotations: summary: Thanos Rule Is Down (instance {{ $labels.instance }}) description: "ThanosRule has disappeared. 
Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosSidecarIsDown expr: 'absent(up{job=~".*thanos-sidecar.*"} == 1)' for: 5m labels: severity: critical annotations: summary: Thanos Sidecar Is Down (instance {{ $labels.instance }}) description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosStoreIsDown expr: 'absent(up{job=~".*thanos-store.*"} == 1)' for: 5m labels: severity: critical annotations: summary: Thanos Store Is Down (instance {{ $labels.instance }}) description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/thanos/thanos-query.yml ================================================ groups: - name: ThanosQuery rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Thanos Query Http Request Query Error Rate High (instance {{ $labels.instance }}) description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0' for: 5m 
labels: severity: critical annotations: summary: Thanos Query Http Request Query Range Error Rate High (instance {{ $labels.instance }}) description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query_range\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryGrpcServerErrorRate expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Thanos Query Grpc Server Error Rate (instance {{ $labels.instance }}) description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryGrpcClientErrorRate expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Thanos Query Grpc Client Error Rate (instance {{ $labels.instance }}) description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryHighDNSFailures expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0' for: 15m labels: severity: warning annotations: summary: Thanos Query High D N S Failures (instance {{ 
$labels.instance }}) description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryInstantLatencyHigh expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)' for: 10m labels: severity: critical annotations: summary: Thanos Query Instant Latency High (instance {{ $labels.instance }}) description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryRangeLatencyHigh expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)' for: 10m labels: severity: critical annotations: summary: Thanos Query Range Latency High (instance {{ $labels.instance }}) description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryOverload expr: '(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)' for: 15m labels: severity: warning annotations: summary: Thanos Query Overload (instance {{ $labels.instance }}) description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. 
Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/thanos/thanos-receiver.yml ================================================ groups: - name: ThanosReceiver rules: - alert: ThanosReceiveHttpRequestErrorRateHigh expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Thanos Receive Http Request Error Rate High (instance {{ $labels.instance }}) description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveHttpRequestLatencyHigh expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)' for: 10m labels: severity: critical annotations: summary: Thanos Receive Http Request Latency High (instance {{ $labels.instance }}) description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveHighReplicationFailures expr: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) 
(floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100' for: 5m labels: severity: warning annotations: summary: Thanos Receive High Replication Failures (instance {{ $labels.instance }}) description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveHighForwardRequestFailures expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0' for: 5m labels: severity: info annotations: summary: Thanos Receive High Forward Request Failures (instance {{ $labels.instance }}) description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveHighHashringFileRefreshFailures expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0' for: 15m labels: severity: warning annotations: summary: Thanos Receive High Hashring File Refresh Failures (instance {{ $labels.instance }}) description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveConfigReloadFailure expr: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1' for: 5m labels: severity: warning annotations: summary: Thanos Receive Config 
Reload Failure (instance {{ $labels.instance }}) description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveNoUpload expr: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)' for: 3h labels: severity: critical annotations: summary: Thanos Receive No Upload (instance {{ $labels.instance }}) description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/thanos/thanos-ruler.yml ================================================ groups: - name: ThanosRuler rules: - alert: ThanosRuleQueueIsDroppingAlerts expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleSenderIsFailingAlerts expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleHighRuleEvaluationFailures expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) 
(rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 5m labels: severity: critical annotations: summary: Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.instance}} is failing to evaluate {{$value | humanize}}% of rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: ThanosRuleHighRuleEvaluationWarnings expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05' for: 15m labels: severity: info annotations: summary: Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleRuleEvaluationLatencyHigh expr: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))' for: 5m labels: severity: warning annotations: summary: Thanos Rule Rule Evaluation Latency High (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleGrpcErrorRate expr: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Thanos Rule Grpc Error Rate (instance {{ $labels.instance }}) description: "Thanos Rule 
{{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleConfigReloadFailure expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1' for: 5m labels: severity: info annotations: summary: Thanos Rule Config Reload Failure (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleQueryHighDNSFailures expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 15m labels: severity: warning annotations: summary: Thanos Rule Query High D N S Failures (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleAlertmanagerHighDNSFailures expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 15m labels: severity: warning annotations: summary: Thanos Rule Alertmanager High D N S Failures (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleNoEvaluationFor10Intervals expr: 'time() - max by (job, instance, group) 
(prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})' for: 5m labels: severity: info annotations: summary: Thanos Rule No Evaluation For10 Intervals (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosNoRuleEvaluations expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0' for: 5m labels: severity: critical annotations: summary: Thanos No Rule Evaluations (instance {{ $labels.instance }}) description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/thanos/thanos-sidecar.yml ================================================ groups: - name: ThanosSidecar rules: # Threshold of 0.05/s avoids firing on transient single-event spikes. 
- alert: ThanosSidecarBucketOperationsFailed expr: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0.05' for: 5m labels: severity: critical annotations: summary: Thanos Sidecar Bucket Operations Failed (instance {{ $labels.instance }}) description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosSidecarNoConnectionToStartedPrometheus expr: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0' for: 5m labels: severity: critical annotations: summary: Thanos Sidecar No Connection To Started Prometheus (instance {{ $labels.instance }}) description: "Thanos Sidecar {{$labels.instance}} is unhealthy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/thanos/thanos-store.yml ================================================ groups: - name: ThanosStore rules: - alert: ThanosStoreGrpcErrorRate expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) > 0' for: 5m labels: severity: warning annotations: summary: Thanos Store Grpc Error Rate (instance {{ $labels.instance }}) description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosStoreSeriesGateLatencyHigh expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) 
(rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)' for: 10m labels: severity: warning annotations: summary: Thanos Store Series Gate Latency High (instance {{ $labels.instance }}) description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosStoreBucketHighOperationFailures expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) > 0' for: 15m labels: severity: warning annotations: summary: Thanos Store Bucket High Operation Failures (instance {{ $labels.instance }}) description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosStoreObjstoreOperationLatencyHigh expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)' for: 10m labels: severity: warning annotations: summary: Thanos Store Objstore Operation Latency High (instance {{ $labels.instance }}) description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/traefik/embedded-exporter-v1.yml ================================================ groups: - name: EmbeddedExporterV1 rules: - alert: TraefikBackendDown expr: 'count(traefik_backend_server_up) by (backend) == 0' for: 0m labels: severity: critical 
annotations: summary: Traefik backend down (instance {{ $labels.instance }}) description: "All Traefik backends are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TraefikHighHttp4xxErrorRateBackend expr: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0' for: 1m labels: severity: critical annotations: summary: Traefik high HTTP 4xx error rate backend (instance {{ $labels.instance }}) description: "Traefik backend 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TraefikHighHttp5xxErrorRateBackend expr: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0' for: 1m labels: severity: critical annotations: summary: Traefik high HTTP 5xx error rate backend (instance {{ $labels.instance }}) description: "Traefik backend 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/traefik/embedded-exporter-v2.yml ================================================ groups: - name: EmbeddedExporterV2 rules: - alert: TraefikServiceDown expr: 'count(traefik_service_server_up) by (service) == 0' for: 0m labels: severity: critical annotations: summary: Traefik service down (instance {{ $labels.instance }}) description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TraefikHighHttp4xxErrorRateService expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0' for: 1m labels: severity: critical annotations: summary: Traefik high HTTP 4xx error 
rate service (instance {{ $labels.instance }}) description: "Traefik service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TraefikHighHttp5xxErrorRateService expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0' for: 1m labels: severity: critical annotations: summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }}) description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/vmware/pryorda-vmware-exporter.yml ================================================ groups: - name: PryordaVmwareExporter rules: - alert: VirtualMachineMemoryWarning expr: 'vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90' for: 5m labels: severity: warning annotations: summary: Virtual Machine Memory Warning (instance {{ $labels.instance }}) description: "High memory usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: VirtualMachineMemoryCritical expr: 'vmware_vm_mem_usage_average / 100 >= 90' for: 1m labels: severity: critical annotations: summary: Virtual Machine Memory Critical (instance {{ $labels.instance }}) description: "High memory usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HighNumberOfSnapshots expr: 'vmware_vm_snapshots > 3' for: 30m labels: severity: warning annotations: summary: High Number of Snapshots (instance {{ $labels.instance }}) description: "High snapshots number on {{ $labels.instance }}: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OutdatedSnapshots expr: '(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3' for: 
5m labels: severity: warning annotations: summary: Outdated Snapshots (instance {{ $labels.instance }}) description: "Outdated snapshots on {{ $labels.instance }}: {{ $value | printf \"%.0f\"}} days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/windows-server/windows-exporter.yml ================================================ groups: - name: WindowsExporter rules: - alert: WindowsServerCollectorError expr: 'windows_exporter_collector_success == 0' for: 0m labels: severity: critical annotations: summary: Windows Server collector Error (instance {{ $labels.instance }}) description: "Collector {{ $labels.collector }} was not successful\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: WindowsServerServiceStatus expr: 'windows_service_status{status="ok"} != 1' for: 1m labels: severity: critical annotations: summary: Windows Server service Status (instance {{ $labels.instance }}) description: "Windows Service state is not OK\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: WindowsServerCpuUsage expr: '100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80' for: 0m labels: severity: warning annotations: summary: Windows Server CPU Usage (instance {{ $labels.instance }}) description: "CPU Usage is more than 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: WindowsServerMemoryUsage expr: '100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90' for: 2m labels: severity: warning annotations: summary: Windows Server memory Usage (instance {{ $labels.instance }}) description: "Memory usage is more than 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: WindowsServerDiskSpaceUsage expr: '100 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80 and windows_logical_disk_size_bytes > 0' for: 2m labels: severity: critical annotations: summary: Windows Server disk Space Usage 
(instance {{ $labels.instance }}) description: "Disk usage is more than 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml ================================================ groups: - name: MindflavorPrometheusWireguardExporter rules: # The threshold of 300 seconds (5 minutes) is a rough default. WireGuard peers that are idle but reachable # typically re-handshake every 2 minutes. Adjust based on your keepalive interval. # The `> 0` guard excludes peers that have never completed a handshake (covered by a separate rule). - alert: WireguardPeerHandshakeTooOld expr: 'time() - wireguard_latest_handshake_seconds > 300 and wireguard_latest_handshake_seconds > 0' for: 2m labels: severity: warning annotations: summary: WireGuard peer handshake too old (instance {{ $labels.instance }}) description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has not had a handshake for over 5 minutes. The tunnel may be down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This alert will fire for all offline mobile/laptop peers. Consider filtering by expected-online peers. - alert: WireguardPeerHandshakeNeverEstablished expr: 'wireguard_latest_handshake_seconds == 0' for: 5m labels: severity: critical annotations: summary: WireGuard peer handshake never established (instance {{ $labels.instance }}) description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has never completed a handshake. Check peer configuration and network connectivity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This alert fires when a peer has a recent handshake but zero traffic flow. # May indicate routing issues or a misconfigured allowed-ips. # Only useful if you expect continuous traffic on all peers. 
- alert: WireguardNoTrafficOnPeer expr: '(rate(wireguard_sent_bytes_total[15m]) + rate(wireguard_received_bytes_total[15m])) == 0 and wireguard_latest_handshake_seconds > 0 and (time() - wireguard_latest_handshake_seconds) < 300' for: 15m labels: severity: warning annotations: summary: WireGuard no traffic on peer (instance {{ $labels.instance }}) description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has had no traffic for 15 minutes despite an active handshake.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/zfs/node-exporter.yml ================================================ groups: - name: NodeExporter rules: - alert: ZfsOfflinePool expr: 'node_zfs_zpool_state{state!="online"} > 0' for: 1m labels: severity: critical annotations: summary: ZFS offline pool (instance {{ $labels.instance }}) description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/zfs/zfs_exporter.yml ================================================ groups: - name: Zfs_exporter rules: - alert: ZfsPoolOutOfSpace expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0' for: 0m labels: severity: warning annotations: summary: ZFS pool out of space (instance {{ $labels.instance }}) description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # 0: ONLINE # 1: DEGRADED # 2: FAULTED # 3: OFFLINE # 4: UNAVAIL # 5: REMOVED # 6: SUSPENDED - alert: ZfsPoolUnhealthy expr: 'zfs_pool_health > 0' for: 0m labels: severity: critical annotations: summary: ZFS pool unhealthy (instance {{ $labels.instance }}) description: "ZFS pool state is {{ $value }}. 
See comments for more information.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ZfsCollectorFailed expr: 'zfs_scrape_collector_success != 1' for: 0m labels: severity: warning annotations: summary: ZFS collector failed (instance {{ $labels.instance }}) description: "ZFS collector for {{ $labels.instance }} has failed to collect information\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" ================================================ FILE: dist/rules/zookeeper/cloudflare-kafka-zookeeper-exporter.yml ================================================ groups: - name: CloudflareKafkaZookeeperExporter rules: ================================================ FILE: dist/rules/zookeeper/dabealu-zookeeper-exporter.yml ================================================ groups: - name: DabealuZookeeperExporter rules: # 1m delay allows a restart without triggering an alert. - alert: ZookeeperDown expr: 'zk_up == 0' for: 1m labels: severity: critical annotations: summary: Zookeeper Down (instance {{ $labels.instance }}) description: "Zookeeper down on instance {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ZookeeperMissingLeader expr: 'sum(zk_server_leader) == 0' for: 0m labels: severity: critical annotations: summary: Zookeeper missing leader (instance {{ $labels.instance }}) description: "Zookeeper cluster has no node marked as leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ZookeeperTooManyLeaders expr: 'sum(zk_server_leader) > 1' for: 0m labels: severity: critical annotations: summary: Zookeeper Too Many Leaders (instance {{ $labels.instance }}) description: "Zookeeper cluster has too many nodes marked as leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ZookeeperNotOk expr: 'zk_ruok == 0' for: 3m labels: severity: warning annotations: summary: Zookeeper Not Ok (instance {{ $labels.instance }}) description: "Zookeeper instance is not ok\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 
================================================ FILE: dist/template.yml ================================================ groups: {% assign groupName = slug | split: '-' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} - name: {{ groupNameCamelcase | remove: ' ' | remove: '-' }} {% assign lines = comments | split: " " %}{% for line in lines %}# {{ line | strip }} {% endfor %} rules: {% for rule in rules %}{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {% assign lines = rule.comments | split: " " %}{% for line in lines %}# {{ line | strip }} {% endfor %}- alert: {{ ruleNameCamelcase | remove: ' ' }} expr: '{{ rule.query }}' for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %} labels: severity: {{ rule.severity }} annotations: summary: {% if rule.summary %}{{ rule.summary }}{% else %}{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}){% endif %} description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}" {% endfor %} ================================================ FILE: docker-compose.yml ================================================ version: '3' services: jekyll: image: jekyll/jekyll:latest command: jekyll serve volumes: - ./:/srv/jekyll ports: - 4000:4000 ================================================ FILE: index.md ================================================ ![Prometheus logo](/assets/prometheus-logo.png){: .center-image }

Hello world

AlertManager configuration Alerting time window

Out-of-the-box Prometheus alerting rules

    {% for group in site.data.rules.groups %}
  • {% comment %} Total number of rules in this group: add up the size of every exporter's rule list (a missing list counts as 0). {% endcomment %}
    {% assign nbrRules = 0 %}
    {% for service in group.services %}
      {% for exporter in service.exporters %}
        {% assign exporterRuleCount = exporter.rules | size %}
        {% assign nbrRules = nbrRules | plus: exporterRuleCount %}
      {% endfor %}
    {% endfor %}

    {{ group.name }} ({{ nbrRules }} rules)

  • {% endfor %}
================================================ FILE: package.json ================================================ { "scripts": { "test": "awesome-lint" }, "devDependencies": { "awesome-lint": "*" } } ================================================ FILE: rules.md ================================================

⚠️ Caution ⚠️

Alert thresholds depend on the nature of applications.
Some queries on this page may have arbitrary tolerance thresholds.

Building an efficient and battle-tested monitoring platform takes time. 😉



    {% for group in site.data.rules.groups %} {% assign groupIndex = forloop.index %} {% for service in group.services %} {% assign serviceIndex = forloop.index %} {% assign nbrExporters = service.exporters | size %} {% for exporter in service.exporters %} {% assign exporterIndex = forloop.index %} {% assign nbrRules = exporter.rules | size %}
  • {% assign serviceId = service.name | replace: " ", "-" | downcase %}

    # {{ groupIndex }}.{{ serviceIndex }}.{% if nbrExporters > 1 %}{{ exporterIndex }}.{% endif %} {{ service.name }} {% if exporter.name %}: {% if exporter.doc_url %} {{ exporter.name }} {% else %} {{ exporter.name }} {% endif %} {% endif %} {% if nbrRules > 0 %} ({{ nbrRules }} rules) [copy section] {% endif %}

    {% if nbrRules == 0 %} {% highlight javascript %} // @TODO: Please contribute => https://github.com/samber/awesome-prometheus-alerts 👋 {% endhighlight %} {% else %} {{ exporter.comments | strip | newline_to_br }} {% highlight bash %} $ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/refs/heads/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml {% endhighlight %} {% endif %}
      {% for rule in exporter.rules %} {% assign ruleIndex = forloop.index %} {% assign comments = rule.comments | strip | newline_to_br | split: '
      ' %}
    • # {{ groupIndex}}.{{ serviceIndex }}.{% if nbrExporters > 1 %}{{ exporterIndex }}.{% endif %}{{ ruleIndex }}. {{ rule.name }}

      {{ rule.description }} [copy]
      {% comment %}
        Render the copy-paste YAML snippet for the current rule.
        - The alert name is the rule name with every word capitalized and the spaces removed.
        - The summary uses rule.summary when it is set and otherwise falls back to
          "<rule name> (instance ...)", the same fallback dist/template.yml applies,
          so the snippet shown on the page matches the generated rule file.
        - Prometheus template expressions are wrapped in raw tags so Liquid leaves them untouched.
      {% endcomment %}
      {% assign ruleName = rule.name | split: ' ' %}
      {% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
      {% highlight yaml %}
{% for comment in comments %}# {{ comment | strip }}
{% endfor %}- alert: {{ ruleNameCamelcase | remove: ' ' }}
  expr: {{ rule.query }}
  for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %}
  labels:
    severity: {{ rule.severity }}
  annotations:
    summary: {% if rule.summary %}{{ rule.summary }}{% else %}{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}){% endif %}
    description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}"
{% endhighlight %}

    • {% endfor %}

  • {% endfor %} {% endfor %} {% endfor %}

Menu

    {% for group in site.data.rules.groups %}
  • {{ group.name }}

  • {% endfor %}
================================================ FILE: sleep-peacefully.md ================================================

Sleep Peacefully

## Alerting time window In some applications, load and activity can vary over the day/week/year. In order to prevent alarm fatigue and busy pager, alerts can be disabled during a period of time (such as night or weekend). Example: - Weekday: `node_load5 > 10 and ON() (0 < day_of_week() < 6)` - Day time: `node_load5 > 10 and ON() (8 < hour() < 18)` - Exclude December: `node_load5 > 10 and ON() (month() != 12)` ## Advanced time windows and timezones ```yml # rules.yml groups: - name: timezones rules: - record: european_summer_time_offset expr: | (vector(1) and (month() > 3 and month() < 10)) or (vector(1) and (month() == 3 and (day_of_month() - day_of_week()) >= 25) and absent((day_of_month() >= 25) and (day_of_week() == 0))) or (vector(1) and (month() == 10 and (day_of_month() - day_of_week()) < 25) and absent((day_of_month() >= 25) and (day_of_week() == 0))) or (vector(1) and ((month() == 10 and hour() < 1) or (month() == 3 and hour() > 0)) and ((day_of_month() >= 25) and (day_of_week() == 0))) or vector(0) - record: europe_london_time expr: time() + 3600 * european_summer_time_offset - record: europe_paris_time expr: time() + 3600 * (1 + european_summer_time_offset) - record: europe_london_hour expr: hour(europe_london_time) - record: europe_paris_hour expr: hour(europe_paris_time) - record: europe_london_weekday expr: 0 < day_of_week(europe_london_time) < 6 - record: europe_paris_weekday expr: 0 < day_of_week(europe_paris_time) < 6 # opposite - record: not_europe_london_weekday expr: absent(europe_london_weekday) - record: not_europe_paris_weekday expr: absent(europe_paris_weekday) - record: europe_london_business_hours expr: 9 <= europe_london_hour < 18 - record: europe_paris_business_hours expr: 9 <= europe_paris_hour < 18 # opposite - record: not_europe_london_business_hours expr: absent(europe_london_business_hours) - record: not_europe_paris_business_hours expr: absent(europe_paris_business_hours) # new year's day / xmas / labor day / all saints' day / ... 
- record: europe_french_public_holidays expr: | (vector(1) and month(europe_paris_time) == 1 and day_of_month(europe_paris_time) == 1) or (vector(1) and month(europe_paris_time) == 12 and day_of_month(europe_paris_time) == 25) or (vector(1) and month(europe_paris_time) == 5 and day_of_month(europe_paris_time) == 1) or (vector(1) and month(europe_paris_time) == 11 and day_of_month(europe_paris_time) == 1) or vector(0) # opposite - record: not_europe_french_public_holidays expr: absent(europe_french_public_holidays) ``` ```yml # alerts.yml groups: - name: CPU Load rules: - alert: HighLoadQuietDuringWeekendAndNight expr: node_load5 > 10 and ON() (europe_london_weekday and europe_paris_weekday) - alert: HighLoadQuietDuringBackup expr: node_load5 > 10 and ON() absent(hour() == 2) - alert: HighLoad expr: | node_load5 > 20 and ON() (europe_london_weekday and europe_paris_weekday) or node_load5 > 10 ``` ## Sources - [https://medium.com/@tom.fawcett/time-of-day-based-notifications-with-prometheus-and-alertmanager-1bf7a23b7695](https://medium.com/@tom.fawcett/time-of-day-based-notifications-with-prometheus-and-alertmanager-1bf7a23b7695) - [https://promcon.io/2019-munich/slides/improved-alerting-with-prometheus-and-alertmanager.pdf](https://promcon.io/2019-munich/slides/improved-alerting-with-prometheus-and-alertmanager.pdf)