Showing preview only (921K chars total). Download the full file or copy to clipboard to get everything.
Repository: samber/awesome-prometheus-alerts
Branch: master
Commit: 9d9c648cddd1
Files: 133
Total size: 873.6 KB
Directory structure:
gitextract_epx1dxil/
├── .github/
│ ├── FUNDING.yml
│ ├── dependabot.yml
│ └── workflows/
│ ├── dist.yml
│ └── test.yml
├── .gitignore
├── .travis.yml
├── CLAUDE.md
├── CONTRIBUTING.md
├── Gemfile
├── LICENSE
├── README.md
├── _config.yml
├── _data/
│ └── rules.yml
├── _layouts/
│ └── default.html
├── alertmanager.md
├── assets/
│ ├── css/
│ │ └── app.css
│ └── js/
│ └── app.js
├── blackbox-exporter.md
├── dist/
│ ├── rules/
│ │ ├── apache/
│ │ │ └── lusitaniae-apache-exporter.yml
│ │ ├── apache-flink/
│ │ │ └── flink-prometheus-reporter.yml
│ │ ├── apache-spark/
│ │ │ └── spark-prometheus.yml
│ │ ├── apc-ups/
│ │ │ └── apcupsd_exporter.yml
│ │ ├── argocd/
│ │ │ └── embedded-exporter.yml
│ │ ├── aws-cloudwatch/
│ │ │ └── prometheus-cloudwatch-exporter.yml
│ │ ├── azure/
│ │ │ └── azure-metrics-exporter.yml
│ │ ├── blackbox/
│ │ │ └── blackbox-exporter.yml
│ │ ├── caddy/
│ │ │ └── embedded-exporter.yml
│ │ ├── cassandra/
│ │ │ ├── criteo-cassandra-exporter.yml
│ │ │ └── instaclustr-cassandra-exporter.yml
│ │ ├── ceph/
│ │ │ └── embedded-exporter.yml
│ │ ├── cert-manager/
│ │ │ └── embedded-exporter.yml
│ │ ├── cilium/
│ │ │ └── embedded-exporter.yml
│ │ ├── clickhouse/
│ │ │ └── embedded-exporter.yml
│ │ ├── cloudflare/
│ │ │ └── lablabs-cloudflare-exporter.yml
│ │ ├── consul/
│ │ │ └── consul-exporter.yml
│ │ ├── coredns/
│ │ │ └── embedded-exporter.yml
│ │ ├── cortex/
│ │ │ └── embedded-exporter.yml
│ │ ├── couchdb/
│ │ │ └── gesellix-couchdb-prometheus-exporter.yml
│ │ ├── digitalocean/
│ │ │ └── digitalocean-exporter.yml
│ │ ├── docker-containers/
│ │ │ └── google-cadvisor.yml
│ │ ├── ebpf/
│ │ │ └── ebpf-exporter.yml
│ │ ├── elasticsearch/
│ │ │ └── prometheus-community-elasticsearch-exporter.yml
│ │ ├── envoy/
│ │ │ └── embedded-exporter.yml
│ │ ├── etcd/
│ │ │ └── embedded-exporter.yml
│ │ ├── fluxcd/
│ │ │ └── embedded-exporter.yml
│ │ ├── freeswitch/
│ │ │ └── znerol-freeswitch-exporter.yml
│ │ ├── gitlab-ci/
│ │ │ ├── gitaly.yml
│ │ │ ├── gitlab-built-in-exporter.yml
│ │ │ └── workhorse.yml
│ │ ├── golang/
│ │ │ └── golang-exporter.yml
│ │ ├── google-cloud-stackdriver/
│ │ │ └── stackdriver-exporter.yml
│ │ ├── grafana-alloy/
│ │ │ └── embedded-exporter.yml
│ │ ├── grafana-mimir/
│ │ │ └── embedded-exporter.yml
│ │ ├── grafana-tempo/
│ │ │ └── embedded-exporter.yml
│ │ ├── graph-node/
│ │ │ └── embedded-exporter.yml
│ │ ├── hadoop/
│ │ │ └── jmx_exporter.yml
│ │ ├── haproxy/
│ │ │ ├── embedded-exporter-v2.yml
│ │ │ └── haproxy-exporter-v1.yml
│ │ ├── hashicorp-vault/
│ │ │ └── embedded-exporter.yml
│ │ ├── host-and-hardware/
│ │ │ └── node-exporter.yml
│ │ ├── ipmi/
│ │ │ └── ipmi-exporter.yml
│ │ ├── istio/
│ │ │ └── embedded-exporter.yml
│ │ ├── jaeger/
│ │ │ └── embedded-exporter.yml
│ │ ├── jenkins/
│ │ │ └── metric-plugin.yml
│ │ ├── juniper/
│ │ │ └── czerwonk-junos-exporter.yml
│ │ ├── jvm/
│ │ │ └── jvm-exporter.yml
│ │ ├── kafka/
│ │ │ ├── danielqsj-kafka-exporter.yml
│ │ │ └── linkedin-kafka-exporter.yml
│ │ ├── keycloak/
│ │ │ └── aerogear-keycloak-metrics-spi.yml
│ │ ├── kubernetes/
│ │ │ └── kubestate-exporter.yml
│ │ ├── linkerd/
│ │ │ └── embedded-exporter.yml
│ │ ├── loki/
│ │ │ └── embedded-exporter.yml
│ │ ├── meilisearch/
│ │ │ └── embedded-exporter.yml
│ │ ├── memcached/
│ │ │ └── memcached-exporter.yml
│ │ ├── minio/
│ │ │ └── embedded-exporter.yml
│ │ ├── mongodb/
│ │ │ ├── dcu-mongodb-exporter.yml
│ │ │ ├── percona-mongodb-exporter.yml
│ │ │ └── stefanprodan-mgob-exporter.yml
│ │ ├── mysql/
│ │ │ └── mysqld-exporter.yml
│ │ ├── nats/
│ │ │ └── nats-exporter.yml
│ │ ├── netdata/
│ │ │ └── embedded-exporter.yml
│ │ ├── nginx/
│ │ │ └── knyar-nginx-exporter.yml
│ │ ├── nomad/
│ │ │ └── embedded-exporter.yml
│ │ ├── openebs/
│ │ │ └── embedded-exporter.yml
│ │ ├── openstack/
│ │ │ └── openstack-exporter.yml
│ │ ├── opentelemetry-collector/
│ │ │ └── embedded-exporter.yml
│ │ ├── oracle-database/
│ │ │ └── iamseth-oracledb-exporter.yml
│ │ ├── patroni/
│ │ │ └── embedded-exporter-patroni.yml
│ │ ├── pgbouncer/
│ │ │ └── spreaker-pgbouncer-exporter.yml
│ │ ├── php-fpm/
│ │ │ └── bakins-fpm-exporter.yml
│ │ ├── postgresql/
│ │ │ └── postgres-exporter.yml
│ │ ├── process-exporter/
│ │ │ └── process-exporter.yml
│ │ ├── prometheus-self-monitoring/
│ │ │ └── embedded-exporter.yml
│ │ ├── promtail/
│ │ │ └── embedded-exporter.yml
│ │ ├── proxmox-ve/
│ │ │ └── prometheus-pve-exporter.yml
│ │ ├── pulsar/
│ │ │ └── embedded-exporter.yml
│ │ ├── python/
│ │ │ └── python-exporter.yml
│ │ ├── rabbitmq/
│ │ │ ├── kbudde-rabbitmq-exporter.yml
│ │ │ └── rabbitmq-exporter.yml
│ │ ├── redis/
│ │ │ └── oliver006-redis-exporter.yml
│ │ ├── ruby/
│ │ │ └── ruby-exporter.yml
│ │ ├── s.m.a.r.t-device-monitoring/
│ │ │ └── smartctl-exporter.yml
│ │ ├── sidekiq/
│ │ │ └── strech-sidekiq-exporter.yml
│ │ ├── snmp/
│ │ │ └── snmp-exporter.yml
│ │ ├── solr/
│ │ │ └── embedded-exporter.yml
│ │ ├── speedtest/
│ │ │ └── nlamirault-speedtest-exporter.yml
│ │ ├── spinnaker/
│ │ │ └── embedded-exporter.yml
│ │ ├── sql-server/
│ │ │ └── ozarklake-mssql-exporter.yml
│ │ ├── ssl/
│ │ │ └── tls/
│ │ │ └── ribbybibby-ssl-exporter.yml
│ │ ├── systemd/
│ │ │ └── systemd-exporter.yml
│ │ ├── thanos/
│ │ │ ├── thanos-bucket-replicate.yml
│ │ │ ├── thanos-compactor.yml
│ │ │ ├── thanos-component-absent.yml
│ │ │ ├── thanos-query.yml
│ │ │ ├── thanos-receiver.yml
│ │ │ ├── thanos-ruler.yml
│ │ │ ├── thanos-sidecar.yml
│ │ │ └── thanos-store.yml
│ │ ├── traefik/
│ │ │ ├── embedded-exporter-v1.yml
│ │ │ └── embedded-exporter-v2.yml
│ │ ├── vmware/
│ │ │ └── pryorda-vmware-exporter.yml
│ │ ├── windows-server/
│ │ │ └── windows-exporter.yml
│ │ ├── wireguard/
│ │ │ └── mindflavor-prometheus-wireguard-exporter.yml
│ │ ├── zfs/
│ │ │ ├── node-exporter.yml
│ │ │ └── zfs_exporter.yml
│ │ └── zookeeper/
│ │ ├── cloudflare-kafka-zookeeper-exporter.yml
│ │ └── dabealu-zookeeper-exporter.yml
│ └── template.yml
├── docker-compose.yml
├── index.md
├── package.json
├── rules.md
└── sleep-peacefully.md
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/FUNDING.yml
================================================
# Funding links for the repository: a GitHub account list and a Ko-fi account.
github: [samber]
ko_fi: samuelberthe
================================================
FILE: .github/dependabot.yml
================================================
---
# Dependabot configuration: monthly update checks for the "github-actions" ecosystem.
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "monthly"
================================================
FILE: .github/workflows/dist.yml
================================================
# Publish workflow: renders every exporter in _data/rules.yml through
# dist/template.yml into dist/rules/<service>/<exporter-slug>.yml, then commits
# and pushes the regenerated files when anything changed.
name: Publish

on:
  workflow_dispatch:
  push:
    branches:
      - master

# The job pushes a commit back to the repository, so it needs write access.
permissions:
  contents: write

jobs:
  publish:
    name: Publish
    # Only run in repositories owned by 'samber'; the same push event in a
    # fork must not regenerate and commit dist/.
    if: github.repository_owner == 'samber'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v6

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          # Quoted so YAML keeps the version as a string instead of a float.
          ruby-version: '3.4'

      - name: Set up yq
        uses: mikefarah/yq@v4

      - name: Install liquid
        run: |
          gem install liquid -v 5.5.1
          gem install liquid-cli

      - name: Build rule configuration
        run: |
          # Convert the YAML source to single-line JSON so jq can walk it.
          cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
          # Start from a clean tree so removed exporters disappear from dist/.
          rm -rf dist/rules

          # Each service/exporter object is passed around base64-encoded so it
          # survives shell word splitting in the for loops.
          for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do
            # Directory name = lower-cased service name with spaces replaced by '-'.
            # A '/' in the name is kept as-is and yields a nested directory
            # (e.g. dist/rules/ssl/tls/).
            subdir=dist/rules/$(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')
            mkdir -p "${subdir}"

            # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))

            for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
              exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
              # Render the template with the exporter JSON as the liquid context.
              cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
              echo ${subdir}/${exporterName}.yml
            done
          done

          rm _data/rules.json

      # https://peterevans.dev/posts/github-actions-how-to-automate-code-formatting-in-pull-requests/
      - name: Check for modified files
        id: git-check
        # Exposes the number of changed paths as the step output "modified".
        run: echo "modified=$(git status -s --porcelain | wc -l | awk '{$1=$1};1')" >> $GITHUB_OUTPUT

      - name: Push changes
        # Skip the commit entirely when the regenerated files are unchanged.
        if: steps.git-check.outputs.modified != '0'
        run: |
          git config --global user.name 'samber'
          git config --global user.email 'samber@users.noreply.github.com'
          git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
          git add .
          git commit -m "Publish"
          git push
================================================
FILE: .github/workflows/test.yml
================================================
# Promtool check workflow: renders every exporter in _data/rules.yml through
# dist/template.yml into test/rules/ and validates the result with promtool.
name: Promtool check

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  promtool-check:
    name: Check alert rules syntax
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v6

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          # Quoted so YAML keeps the version as a string instead of a float.
          ruby-version: '3.4'

      - name: Set up yq
        uses: mikefarah/yq@v4

      - name: Install liquid
        # Same liquid version as the Publish workflow (.github/workflows/dist.yml),
        # so the rules validated here are rendered exactly like the published ones.
        run: |
          gem install liquid -v 5.5.1
          gem install liquid-cli

      - name: Build rule configuration
        run: |
          # Convert the YAML source to single-line JSON so jq can walk it.
          cat _data/rules.yml | yq -I 0 -o json > _data/rules.json

          # Each service/exporter object is passed around base64-encoded so it
          # survives shell word splitting in the for loops.
          for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do
            # Directory name = lower-cased service name with spaces AND slashes
            # replaced by '-'. Without the slash replacement a name such as
            # "SSL/TLS" creates test/rules/ssl/tls/, which is one level deeper
            # than the 'test/rules/*/*.yml' glob below and is silently skipped.
            subdir=test/rules/$(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-") | split("/") | join("-")')
            mkdir -p "${subdir}"

            # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))

            for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
              exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
              # Render the template with the exporter JSON as the liquid context.
              cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
              echo ${subdir}/${exporterName}.yml
            done
          done

          rm _data/rules.json

      - name: Check Prometheus alert rules
        uses: peimanja/promtool-github-actions@master
        with:
          promtool_actions_subcommand: 'rules'
          # Every rendered file lives exactly two levels below test/rules/.
          promtool_actions_files: 'test/rules/*/*.yml'
          promtool_actions_comment: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
================================================
FILE: .gitignore
================================================
# Jekyll build output, caches and metadata
_site/
.sass-cache/
.jekyll-cache/
.jekyll-metadata
# Intermediate JSON and rendered rule files written by the CI workflows
_data/rules.json
test/rules/
# Node.js dependencies
/node_modules
# NOTE(review): presumably local git worktrees — confirm
.worktrees/
================================================
FILE: .travis.yml
================================================
# Travis CI configuration: a Node.js build using the 'node' version alias.
language: node_js
node_js:
- 'node'
================================================
FILE: CLAUDE.md
================================================
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
A curated collection of ~940 Prometheus alerting rules covering 90+ services across 100+ exporters, organized in 7 categories: basic resource monitoring (Prometheus, host/hardware, SMART, Docker, Blackbox, Windows, VMware, Netdata), databases and brokers (MySQL, PostgreSQL, Redis, MongoDB, RabbitMQ, Elasticsearch, Cassandra, Clickhouse, Kafka, etc.), reverse proxies and load balancers (Nginx, Apache, HaProxy, Traefik, Caddy), runtimes (PHP-FPM, JVM, Sidekiq), orchestrators (Kubernetes, Nomad, Consul, Etcd, Istio, ArgoCD, FluxCD), network/security/storage (Ceph, ZFS, Minio, SSL/TLS, CoreDNS, Vault, Cloudflare), and observability tools (Thanos, Loki, Cortex, OpenTelemetry Collector, Jenkins).
All rules are stored in a single YAML data file (`_data/rules.yml`) and rendered as a Jekyll-based GitHub Pages site at https://samber.github.io/awesome-prometheus-alerts. The site provides copy-pasteable Prometheus alert snippets and downloadable rule files per exporter.
The project is community-driven. Most contributions are PRs adding or updating rules in `_data/rules.yml`. Files in `dist/rules/` are auto-generated on merge — never edit them manually.
## Architecture
- **`_data/rules.yml`** — The single source of truth for all alerting rules. This is the main file contributors edit. It is NOT a valid Prometheus config; the site renders each rule into copy-pasteable Prometheus alert format.
- **`rules.md`** — Jekyll template that iterates over `_data/rules.yml` and renders the rules page with copy buttons and formatted YAML blocks.
- **`alertmanager.md`** — Static page with Prometheus/AlertManager configuration examples.
- **`_layouts/default.html`** — Site layout (Jekyll theme: cayman).
- **`_config.yml`** — Jekyll configuration.
- **`dist/rules/`** — Pre-built downloadable rule files organized by service/exporter (referenced in the site for `wget` commands).
## Rules YAML Structure
Services are listed in README.md.
`_data/rules.yml` hierarchy:
```
groups:
- name: "<category>" # e.g. "Basic resource monitoring"
services:
- name: "<service>" # e.g. "Host and hardware"
exporters:
- name: "<exporter>"
slug: "<slug>" # used for download URLs
doc_url: "<url>" # optional link to exporter docs
comments: # optional, exporter-level multiline notes rendered before rules
"<comment>"
rules:
- name: "<alert name>"
description: "<text>"
query: "<PromQL>"
severity: warning|critical|info
for: "<duration>" # optional, defaults to 0m
comments: # optional, rendered as multiline YAML comments
"<comment>"
```
Services are grouped by category. If you are not sure about the classification, ask the developer.
## Running Locally
```bash
# With Ruby/Bundler
gem install bundler
bundle install
jekyll serve
# With Docker Compose
docker compose up -d
# With Docker directly
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
```
Site serves at http://localhost:4000/awesome-prometheus-alerts.
## Contributing Rules
All rule changes go in `_data/rules.yml`. Each rule needs: `name`, `description`, `query` (valid PromQL), and `severity`. The `for` field is optional. Descriptions should be factual ("what") and include root cause hints ("why"). Queries must be tested against the latest exporter version. Never modify files in `dist/` — they are auto-generated on merge.
## Query Validation
- When adding or updating an alert, verify that the PromQL query references metric series that actually exist in the related exporter. Check the exporter's documentation or source code to confirm series names.
- If a metric series has been deprecated or removed in a newer version of the exporter, update the query to use the replacement series, or remove the rule if no replacement exists. Known examples: `kube_hpa_*` renamed to `kube_horizontalpodautoscaler_*` in kube-state-metrics 2.x; `node_hwmon_temp_alarm` does not exist (correct: `node_hwmon_temp_crit_alarm_celsius`); node-exporter CLI flags get renamed across versions.
- When writing or reviewing a query, search the internet (exporter docs, GitHub issues, changelogs) to validate correctness and catch outdated series names. When you are not sure about a metric name, always search the internet to confirm it exists and is spelled correctly before using it.
- Pay special attention to metric naming conventions: many exporters add `_total` suffixes for counters and `_seconds_total` for time-based counters. Verify the exact name from source code, not just docs. Known examples: Spark's PrometheusResource adds `_total` and `_seconds_total` suffixes (e.g., `metrics_executor_failedTasks_total`, not `metrics_executor_failedTasks`); Oracle's `oracledb_sessions_value` not `oracledb_sessions_activity`.
- Verify that label names used in `{{ $labels.xxx }}` template variables actually exist on the metric. Check the exporter source code for the exact label names. Known examples: cloudflare/ebpf_exporter uses `id` not `name` for programs, and `config` not `name` for decoder errors.
- When a metric uses info-style patterns (value always 1, information carried in labels), `== 0` will never be true — the metric simply won't exist. Use `absent()` instead. Known example: `ebpf_exporter_enabled_configs`.
- Some metrics are version-dependent. When a metric was renamed or removed in a newer version, add a comment noting the version requirement. Known examples: `go_memstats_gc_cpu_fraction` removed in client_golang v1.12+; cert-manager renamed `certmanager_http_acme_client_request_count` to `certmanager_acme_client_request_count` in v1.19+.
- Verify the unit of a metric before setting thresholds. Some metrics use milliseconds while descriptions assume seconds. Known example: Keycloak's `keycloak_request_duration` is in milliseconds, so `> 2` means 2ms not 2s.
- Some exporters expose labels that differ between services even within the same ecosystem. Known example: OpenStack Neutron uses `adminState="up"` while Nova and Cinder use `adminState="enabled"`.
- When an official mixin exists for a service, compare thresholds and time windows against it. Known deviations to watch for: Mimir store-gateway sync uses 1800s (not 600s), Mimir compactor skipped blocks uses `[24h]` (not `[5m]`), Tempo normalizes outstanding blocks per worker.
## Common Review Pitfalls (learned from PR history)
These are the most frequent issues raised during code review on this repo:
### Severity levels
- `critical` = requires immediate human attention. Do not use for informational/security notifications.
- `warning` = needs attention soon but not urgent.
- `info` = awareness only (e.g., config changes, underutilized resources).
- Authentication failures, security notifications, and config-change detections are typically `info`, not `critical`.
### `for` duration
- Omit `for` when the default (0m) is intentional and appropriate — do not add `for: 0m` explicitly.
- Add a `for` duration (e.g., `for: 2m` or `for: 5m`) to tolerate brief unavailability from restarts or transient spikes. Most "service down" rules should have at least `for: 1m`–`2m`.
- Do not blanket-change all `for: 0m` to `for: 1m` — it depends on the alert's semantics and the range window used in `increase()`/`rate()`.
### Query design
- Prefer symptom-based alerts over cause-based alerts to reduce alert fatigue. Example: "service is unreachable" is better than "specific internal counter changed". Metrics like heap object count, allocation rate, or free heap slots are causes, not symptoms — prefer GC duration, latency, or error rate alerts instead.
- Don't add unnecessary aggregation (`avg()`, `avg_over_time()`) on metrics that are local to a single node/instance. Only aggregate when the alert is cluster-wide.
- Don't combine `min_over_time()[1m]` with `for: 2m` redundantly — pick one mechanism for smoothing. Same applies to `avg_over_time()[5m]` with `for: 5m`.
- Remove unnecessary label filters (e.g., `job="cassandra"` or `cluster=~".*"`) that add noise without value.
- Verify comparison operators match the intent — e.g., "high snapshot count" must use `> N`, not `< N`.
- When dividing counters (e.g., error rate = errors / total), guard against division by zero with `and total > 0` or filter appropriately. This is the most common issue in new PRs — check every ratio query.
- Filter out system/template databases explicitly in DB queries (e.g., PostgreSQL: add `datid!="0"` alongside `datname!~"template.*|postgres"`).
- Never use `rate()` on a gauge metric — use `deriv()` instead. `rate()` is for monotonically increasing counters only.
- When using `increase()` for ratio calculations, prefer `rate()` instead — `increase()` can produce incorrect results when counters reset mid-window.
- When filtering gRPC error codes, don't use `grpc_code!="OK"` — this includes normal application responses like `NotFound`, `AlreadyExists`, and `Cancelled`. Filter to actual errors: `grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"`.
- When computing ratios with `rate()` on a metric that is itself already a normalized rate (e.g., Oracle's `v$waitclassmetric`), applying `rate()` computes the rate-of-change of a rate, which is not meaningful.
- When a multi-label metric is used in a binary operation with a metric that has fewer labels, use `ignoring(extra_label)` to avoid join failures. Known example: `systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max`.
- When a query groups by labels (e.g., `by (le, worker)`), consider the cardinality impact — hundreds of label values means hundreds of independent alerts.
- Ensure `{{ $value | humanizeDuration }}` is only used on values in seconds. If the metric is in milliseconds, divide by 1000 first or use `{{ $value | humanize }}ms`.
- Avoid using `up{job=~"exporter-name"} == 0` or `absent(up{job=~"exporter-name"})` to detect whether a service is down. When targets are managed via service discovery or a job reaches multiple targets, a disappeared target causes the `up` series to become stale and vanish rather than drop to 0, so the alert never fires. Prefer application-level or cluster-level metrics instead (e.g., "number of consul cluster members < 3", "PostgreSQL primary node absent").
### Thresholds
- Alert thresholds are inherently arbitrary and depend on workload. Use `comments:` to note this when a threshold is a rough default.
- When threshold values in a PR seem unreasonable (too high or too low), challenge them with real-world reasoning or exporter docs.
- Watch for thresholds that are so high they only catch catastrophic scenarios and miss real problems. Examples: Go goroutine spike at 100/s (misses gradual leaks), Ruby major GC at 5/s (only fires if app is non-functional), Python gen2 GC at >1/s (extremely rare).
- Watch for thresholds that will fire on normal healthy operation. Examples: Memcached at 90% memory is desired (it's a cache), Flink TaskManager at 90% JVM heap is normal, cache hit rate < 80% is common for cold caches.
- For SNMP bandwidth utilization, `ifSpeed` (Gauge32) maxes at ~4.29 Gbps. For 10G+ interfaces, use `ifHighSpeed * 1000000` instead.
- For alerts using `> 0` on counters with `rate()` or `increase()`, consider whether a single event truly warrants alerting. In most cases, a small threshold (e.g., `> 0.05` for rate, `> 3` for increase) better distinguishes real problems from transient noise.
### Comments
- When an alert or its query needs explanation (e.g., non-obvious PromQL logic, threshold rationale, edge cases), use the rule-level `comments:` field. Use multiline comments when needed.
- Use the exporter-level `comments:` field for notes that apply to all rules under that exporter (e.g., exporter version requirements, known quirks, setup prerequisites).
- Comments are rendered as YAML `#` comments in the output, so they are visible to users who copy-paste the rules.
### Descriptions
- Keep descriptions short, factual, and actionable.
- Include what is happening ("Disk is almost full") and why it matters or what to check.
- Use `{{ $labels.instance }}`, `{{ $value }}`, and other template variables in descriptions when useful.
- If the description says "average" but the query uses `histogram_quantile(0.95, ...)`, fix the description to say "p95" (or vice versa).
- When alerting on rates or ratios that may not be intuitive, include `{{ $value }}` in the description so operators can see the actual number.
### Structure
- Some services have multiple exporters (e.g., MongoDB has `percona/mongodb_exporter` and `dcu/mongodb_exporter`). Place rules under the correct exporter.
- Search for duplicates before adding a new rule — a similar alert may already exist under a different exporter or with different thresholds.
- The `slug` field must be unique per exporter and is used for download URLs.
## Reference Sources for Cross-Checking Alerts
Use these sources to criticize and validate PromQL queries, compare thresholds, and find inspiration for new rules.
Every time you use an external resource to change a PromQL query, please compare before/after and explain why you think the external source is right.
### Official project mixins (alerts maintained by the project itself)
- https://github.com/prometheus/node_exporter/tree/master/docs/node-mixin/alerts
- https://github.com/prometheus/prometheus/tree/main/documentation/prometheus-mixin
- https://github.com/prometheus/alertmanager/tree/main/doc/alertmanager-mixin
- https://github.com/prometheus/snmp_exporter/tree/main/snmp-mixin
- https://github.com/prometheus/mysqld_exporter/tree/main/mysqld-mixin
- https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin
- https://github.com/prometheus-community/elasticsearch_exporter (mixin via Grafana docs)
- https://github.com/etcd-io/etcd/tree/main/contrib/mixin
- https://github.com/thanos-io/thanos/tree/main/mixin (also: examples/alerts/)
- https://github.com/grafana/loki/tree/main/production/loki-mixin (also: promtail-mixin/)
- https://github.com/grafana/mimir/tree/main/operations/mimir-mixin
- https://github.com/grafana/tempo/tree/main/operations/tempo-mixin
- https://github.com/grafana/grafana/tree/main/grafana-mixin
- https://github.com/ceph/ceph/tree/main/monitoring/ceph-mixin (in-tree; also https://github.com/ceph/ceph-mixins)
- https://github.com/jaegertracing/jaeger/tree/main/monitoring/jaeger-mixin
- https://github.com/kubernetes-monitoring/kubernetes-mixin (includes runbook.md)
- https://github.com/kubernetes/kube-state-metrics/tree/main/jsonnet/kube-state-metrics-mixin
- https://github.com/prometheus-operator/prometheus-operator/tree/main/jsonnet/mixin
- https://github.com/prometheus-operator/kube-prometheus
- https://github.com/cortexproject/cortex-jsonnet
- https://github.com/gluster/gluster-mixins
### Standalone mixin repositories
- https://github.com/povilasv/coredns-mixin
- https://github.com/adinhodovic/rabbitmq-mixin
- https://github.com/adinhodovic/blackbox-exporter-mixin
- https://github.com/adinhodovic/django-mixin
- https://github.com/adinhodovic/argo-cd-mixin
- https://github.com/adinhodovic/ingress-nginx-mixin
- https://github.com/adinhodovic/kubernetes-autoscaling-mixin
- https://github.com/metalmatze/kube-cockroachdb (CockroachDB on Kubernetes)
- https://github.com/bitnami-labs/sealed-secrets (sealed-secrets mixin)
- https://github.com/lukas-vlcek/elasticsearch-mixin (includes runbook.md)
- https://github.com/adinhodovic/postgresql-mixin
- https://github.com/imusmanmalik/cert-manager-mixin
- https://gitlab.com/uneeq-oss/cert-manager-mixin (alternative cert-manager mixin)
- https://github.com/uneeq-oss/spinnaker-mixin
- https://github.com/metalmatze/slo-libsonnet (SLO alerting/recording rules generation library)
### Grafana jsonnet-libs (93 mixins — browse for specific services)
- https://github.com/grafana/jsonnet-libs
- Notable mixins with alerts: consul, memcached, elasticsearch, haproxy, clickhouse, opensearch, redis, mongodb, kafka, nginx, rabbitmq, jvm, vault, envoy, istio, jenkins, caddy, cloudflare, docker, traefik, windows, snmp, argocd, nomad, pgbouncer, minio, ceph, and 60+ more.
### Mixin aggregators
- https://monitoring.mixins.dev/ (central registry of all monitoring mixins)
- https://github.com/monitoring-mixins/website/blob/master/mixins.json (machine-readable list of all mixins with source URLs)
- https://github.com/nlamirault/monitoring-mixins (hub aggregating many mixins)
### GitLab monitoring & infrastructure
- https://gitlab.com/gitlab-com/runbooks (GitLab.com SRE runbooks — production alert rules, runbook docs, alertmanager config)
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules (production Mimir alerting rules organized by tenant/environment)
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules-jsonnet (jsonnet sources for GitLab alerting rules)
- https://gitlab.com/gitlab-org/omnibus-gitlab/-/tree/master/files/gitlab-cookbooks/monitoring/templates/rules (default Prometheus rules shipped with GitLab Omnibus)
### Community alert collections
- https://github.com/jpweber/prometheus-alert-rules
- https://github.com/bdossantos/prometheus-alert-rules
- https://github.com/giantswarm/prometheus-rules
- https://github.com/last9/awesome-prometheus-toolkit
- https://github.com/warpnet/awesome-prometheus (meta-list of Prometheus resources)
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing
## Adding an alerting rule
If you don't have time to write a PR, just copy and paste some alerts into an issue. We will format them accordingly.
Rules are here: `_data/rules.yml`.
### Guidelines
Please ensure your pull request adheres to the following guidelines:
- Search previous suggestions before making a new one, as yours may be a duplicate.
- Keep descriptions short and simple, but descriptive.
- Description must be factual (the "what?") and should provide root cause suggestions (the "why?"), for faster resolution.
- Queries must be tested on the latest exporter version.
## Improving the GitHub page
### Run locally
```
gem install bundler
bundle install
jekyll serve
```
Or with Docker:
```
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
```
Or with Docker Compose:
```
docker compose up -d
```
================================================
FILE: Gemfile
================================================
# Ruby dependencies for the Jekyll site, installed with `bundle install`.
source 'https://rubygems.org'
# The github-pages gem is declared in the :jekyll_plugins group.
gem 'github-pages', '>= 232', group: :jekyll_plugins
# NOTE(review): presumably required by `jekyll serve` on Ruby versions that no longer bundle webrick — confirm.
gem 'webrick', '~> 1.8'
================================================
FILE: LICENSE
================================================
Creative Commons Attribution 4.0 International License (CC BY 4.0)
http://creativecommons.org/licenses/by/4.0/
================================================
FILE: README.md
================================================
# 👋 Awesome Prometheus Alerts [![Awesome](https://awesome.re/badge.svg)](https://awesome.re)
> Most alerting rules are common to every Prometheus setup. We need a place to find them all. 🤘 🚨 📊
Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**
<div align="center">
<hr>
<sup><b>Sponsored by:</b></sup>
<br>
<a href="https://cast.ai/samuel">
<div>
<img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-cast-ai.png" width="200" alt="Cast AI">
</div>
<div>
Cut Kubernetes & AI costs, boost application stability.
</div>
</a>
<br>
<a href="https://betterstack.com">
<div>
<img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-betterstack.png" width="200" alt="Better Stack">
</div>
<div>
Better Stack lets you centralize, search, and visualize your logs.
</div>
</a>
<hr>
</div>
## ✨ Contents
- [Rules](#-rules)
- [Contributing](#-contributing)
- [Improvements](#-improvements)
- [Help us](#-show-your-support)
- [License](#-license)
## 🚨 Rules
#### Basic resource monitoring
- [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
- [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
- [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
- [IPMI](https://samber.github.io/awesome-prometheus-alerts/rules#ipmi)
- [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
- [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
- [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
- [VMWare](https://samber.github.io/awesome-prometheus-alerts/rules#vmware)
- [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve)
- [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
- [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
- [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter)
- [Systemd](https://samber.github.io/awesome-prometheus-alerts/rules#systemd)
#### Databases
- [MySQL](https://samber.github.io/awesome-prometheus-alerts/rules#mysql)
- [PostgreSQL](https://samber.github.io/awesome-prometheus-alerts/rules#postgresql)
- [SQL Server](https://samber.github.io/awesome-prometheus-alerts/rules#sql-server)
- [Oracle Database](https://samber.github.io/awesome-prometheus-alerts/rules#oracle-database)
- [Patroni](https://samber.github.io/awesome-prometheus-alerts/rules#patroni)
- [PGBouncer](https://samber.github.io/awesome-prometheus-alerts/rules#pgbouncer)
- [Redis](https://samber.github.io/awesome-prometheus-alerts/rules#redis)
- [Memcached](https://samber.github.io/awesome-prometheus-alerts/rules#memcached)
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
- [CouchDB](https://samber.github.io/awesome-prometheus-alerts/rules#couchdb)
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
#### Message brokers
- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
- [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)
#### Proxies, load balancers and service meshes
- [Nginx](https://samber.github.io/awesome-prometheus-alerts/rules#nginx)
- [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
- [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
- [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
- [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
- [Envoy](https://samber.github.io/awesome-prometheus-alerts/rules#envoy)
- [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd)
- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)
#### Runtimes
- [PHP-FPM](https://samber.github.io/awesome-prometheus-alerts/rules#php-fpm)
- [JVM](https://samber.github.io/awesome-prometheus-alerts/rules#jvm)
- [Golang](https://samber.github.io/awesome-prometheus-alerts/rules#golang)
- [Ruby](https://samber.github.io/awesome-prometheus-alerts/rules#ruby)
- [Python](https://samber.github.io/awesome-prometheus-alerts/rules#python)
- [Sidekiq](https://samber.github.io/awesome-prometheus-alerts/rules#sidekiq)
#### Data engineering
- [Apache Flink](https://samber.github.io/awesome-prometheus-alerts/rules#apache-flink)
- [Apache Spark](https://samber.github.io/awesome-prometheus-alerts/rules#apache-spark)
- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
#### Orchestrators
- [Kubernetes](https://samber.github.io/awesome-prometheus-alerts/rules#kubernetes)
- [Nomad](https://samber.github.io/awesome-prometheus-alerts/rules#nomad)
- [Consul](https://samber.github.io/awesome-prometheus-alerts/rules#consul)
- [Etcd](https://samber.github.io/awesome-prometheus-alerts/rules#etcd)
- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack)
#### CI/CD
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
- [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
- [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd)
- [GitLab CI](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab-ci)
- [Spinnaker](https://samber.github.io/awesome-prometheus-alerts/rules#spinnaker)
#### Network and security
- [SpeedTest](https://samber.github.io/awesome-prometheus-alerts/rules#speedtest)
- [SSL/TLS](https://samber.github.io/awesome-prometheus-alerts/rules#ssl/tls)
- [cert-manager](https://samber.github.io/awesome-prometheus-alerts/rules#cert-manager)
- [Juniper](https://samber.github.io/awesome-prometheus-alerts/rules#juniper)
- [CoreDNS](https://samber.github.io/awesome-prometheus-alerts/rules#coredns)
- [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch)
- [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault)
- [Keycloak](https://samber.github.io/awesome-prometheus-alerts/rules#keycloak)
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
- [SNMP](https://samber.github.io/awesome-prometheus-alerts/rules#snmp)
- [Cilium](https://samber.github.io/awesome-prometheus-alerts/rules#cilium)
- [WireGuard](https://samber.github.io/awesome-prometheus-alerts/rules#wireguard)
#### Storage
- [Ceph](https://samber.github.io/awesome-prometheus-alerts/rules#ceph)
- [ZFS](https://samber.github.io/awesome-prometheus-alerts/rules#zfs)
- [OpenEBS](https://samber.github.io/awesome-prometheus-alerts/rules#openebs)
- [Minio](https://samber.github.io/awesome-prometheus-alerts/rules#minio)
#### Cloud providers
- [AWS CloudWatch](https://samber.github.io/awesome-prometheus-alerts/rules#aws-cloudwatch)
- [Google Cloud Stackdriver](https://samber.github.io/awesome-prometheus-alerts/rules#google-cloud-stackdriver)
- [DigitalOcean](https://samber.github.io/awesome-prometheus-alerts/rules#digitalocean)
- [Azure](https://samber.github.io/awesome-prometheus-alerts/rules#azure)
#### Observability
- [Thanos](https://samber.github.io/awesome-prometheus-alerts/rules#thanos)
- [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
- [Grafana Tempo](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-tempo)
- [Grafana Mimir](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-mimir)
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
- [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector)
- [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger)
#### Other
- [APC UPS](https://samber.github.io/awesome-prometheus-alerts/rules#apc-ups)
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
## 🤝 Contributing
Contributions from the community (you!) are most welcome!
There are many ways to contribute: writing code, alerting rules, documentation, reporting issues, discussing better error tracking...
[Instructions here](CONTRIBUTING.md)
## 🏋️ Improvements
- Create an alert rule builder in Jekyll for custom alerts (severity, thresholds, instances...)
- Add resolution suggestions to rule descriptions, for faster incident resolution ([#85](https://github.com/samber/awesome-prometheus-alerts/issues/85)).
## 💫 Show your support
Give a ⭐️ if this project helped you!
[Support the project on Patreon](https://www.patreon.com/samber)
## 📝 License
[Creative Commons Attribution 4.0 International (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/legalcode)
Licensed under the Creative Commons Attribution 4.0 International License; see the LICENSE file for more details.
================================================
FILE: _config.yml
================================================
theme: jekyll-theme-cayman
title: Awesome Prometheus alerts
description: Collection of alerting rules
repository: samber/awesome-prometheus-alerts
baseurl: /awesome-prometheus-alerts
================================================
FILE: _data/rules.yml
================================================
#
# The following yaml cannot be copy-pasted to Prometheus configuration.
# Please navigate to https://samber.github.io/awesome-prometheus-alerts/rules instead.
#
# Contributing guidelines:
# https://github.com/samber/awesome-prometheus-alerts/blob/master/CONTRIBUTING.md
#
groups:
- name: Basic resource monitoring
services:
- name: Prometheus self-monitoring
exporters:
- slug: embedded-exporter
rules:
- name: Prometheus job missing
description: A Prometheus job has disappeared
query: 'absent(up{job="prometheus"})'
severity: warning
- name: Prometheus target missing
description: A Prometheus target has disappeared. An exporter might have crashed.
query: "up == 0 unless on(job) (sum by (job) (up) == 0)"
severity: critical
for: 1m
comments: |
Only fire if at least one target in the job is still up.
If all targets are down, PrometheusJobMissing or PrometheusAllTargetsMissing will fire instead.
- name: Prometheus all targets missing
description: A Prometheus job does not have any living target anymore.
query: "sum by (job) (up) == 0"
severity: critical
for: 1m
- name: Prometheus target missing with warmup time
description: "Allow a job time to start up (10 minutes) before alerting that it's down."
query: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
severity: critical
for: 1m
- name: Prometheus configuration reload failure
description: Prometheus configuration reload error
query: "prometheus_config_last_reload_successful != 1"
severity: warning
- name: Prometheus too many restarts
description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
severity: warning
- name: Prometheus AlertManager job missing
description: A Prometheus AlertManager job has disappeared
query: 'absent(up{job="alertmanager"})'
severity: warning
- name: Prometheus AlertManager configuration reload failure
description: AlertManager configuration reload error
query: "alertmanager_config_last_reload_successful != 1"
severity: warning
- name: Prometheus AlertManager config not synced
description: Configurations of AlertManager cluster instances are out of sync
query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
severity: warning
- name: Prometheus AlertManager E2E dead man switch
description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager."
query: "vector(1)"
severity: critical
- name: Prometheus not connected to alertmanager
description: Prometheus cannot connect to the Alertmanager
query: "prometheus_notifications_alertmanagers_discovered < 1"
severity: critical
- name: Prometheus rule evaluation failures
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
query: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
severity: critical
- name: Prometheus template text expansion failures
description: "Prometheus encountered {{ $value }} template text expansion failures"
query: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
severity: critical
- name: Prometheus rule evaluation slow
description: "Prometheus rule evaluation took more time than the scheduled interval. This indicates slow storage backend access or an overly complex query."
query: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
severity: warning
for: 5m
- name: Prometheus notifications backlog
description: The Prometheus notification queue has not been empty for 10 minutes
query: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
severity: warning
- name: Prometheus AlertManager notification failing
description: "Alertmanager is failing to send notifications ({{ $value }} notifications/s)"
query: "rate(alertmanager_notifications_failed_total[1m]) > 0"
severity: critical
- name: Prometheus target empty
description: Prometheus has no target in service discovery
query: "prometheus_sd_discovered_targets == 0"
severity: critical
- name: Prometheus target scraping slow
description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.
query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
severity: warning
for: 5m
- name: Prometheus large scrape
description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)"
query: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
severity: warning
for: 5m
- name: Prometheus target scrape duplicate
description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)"
query: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3"
severity: warning
- name: Prometheus TSDB checkpoint creation failures
description: "Prometheus encountered {{ $value }} checkpoint creation failures"
query: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
severity: critical
- name: Prometheus TSDB checkpoint deletion failures
description: "Prometheus encountered {{ $value }} checkpoint deletion failures"
query: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
severity: critical
- name: Prometheus TSDB compactions failed
description: "Prometheus encountered {{ $value }} TSDB compaction failures"
query: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
severity: critical
- name: Prometheus TSDB head truncations failed
description: "Prometheus encountered {{ $value }} TSDB head truncation failures"
query: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
severity: critical
- name: Prometheus TSDB reload failures
description: "Prometheus encountered {{ $value }} TSDB reload failures"
query: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
severity: critical
- name: Prometheus TSDB WAL corruptions
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions"
query: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
severity: critical
- name: Prometheus TSDB WAL truncations failed
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures"
query: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
severity: critical
- name: Prometheus timeseries cardinality
description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
severity: warning
- name: Host and hardware
exporters:
- name: node-exporter
slug: node-exporter
doc_url: https://github.com/prometheus/node_exporter
rules:
- name: Host out of memory
description: Node memory is filling up (< 10% left)
query: "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)"
severity: warning
for: 2m
- name: Host memory under memory pressure
description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s)."
query: "(rate(node_vmstat_pgmajfault[5m]) > 1000)"
severity: warning
- name: Host Memory is underutilized
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
query: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8"
severity: info
comments: |
You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- name: Host unusual network throughput in
description: Host receive bandwidth is high (>80%).
query: "((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0"
severity: warning
- name: Host unusual network throughput out
description: Host transmit bandwidth is high (>80%)
query: "((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0"
severity: warning
- name: Host disk IO utilization high
description: Disk utilization is high (> 80%)
query: "(rate(node_disk_io_time_seconds_total[5m]) > .80)"
severity: warning
- name: Host out of disk space
description: Disk is almost full (< 10% left)
query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
severity: critical
comments: |
Please add ignored mountpoints in node_exporter parameters like
"--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
for: 2m
- name: Host disk may fill in 24 hours
description: Filesystem will likely run out of space within the next 24 hours.
query: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
severity: warning
comments: |
Please add ignored mountpoints in node_exporter parameters like
"--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
for: 2m
- name: Host out of inodes
description: Disk is almost running out of available inodes (< 10% left)
query: "(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0"
severity: critical
for: 2m
- name: Host filesystem device error
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem"
query: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
severity: critical
for: 2m
- name: Host inodes may fill in 24 hours
description: Filesystem will likely run out of inodes within the next 24 hours at current write rate
query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
severity: warning
for: 2m
- name: Host unusual disk read latency
description: Disk latency is growing (read operations > 100ms)
query: "(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)"
severity: warning
for: 2m
- name: Host unusual disk write latency
description: Disk latency is growing (write operations > 100ms)
query: "(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)"
severity: warning
for: 2m
- name: Host high CPU load
description: CPU load is > 80%
query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
severity: warning
for: 10m
- name: Host CPU is underutilized
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
severity: info
for: 1w
comments: |
You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- name: Host CPU steal noisy neighbor
description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.
query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
severity: warning
- name: Host CPU high iowait
description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
severity: warning
- name: Host unusual disk IO
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities."
query: "rate(node_disk_io_time_seconds_total[5m]) > 0.8"
severity: warning
for: 5m
- name: Host context switching high
description: Context switching is growing on the node (twice the daily average during the last 15m)
query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
severity: warning
comments: |
x2 context switches is an arbitrary number.
The alert threshold depends on the nature of the application.
Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- name: Host swap is filling up
description: Swap is filling up (>80%)
query: "((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0"
severity: warning
for: 2m
- name: Host systemd service crashed
description: "systemd service {{ $labels.name }} crashed"
query: '(node_systemd_unit_state{state="failed"} == 1)'
severity: warning
- name: Host physical component too hot
description: "Physical hardware component too hot"
query: "node_hwmon_temp_celsius > node_hwmon_temp_max_celsius"
severity: warning
for: 5m
- name: Host node overtemperature alarm
description: "Physical node temperature alarm triggered"
query: "((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))"
severity: critical
- name: Host software RAID insufficient drives
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
query: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
comments: |
Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin.
severity: critical
- name: Host software RAID disk failure
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
query: '(node_md_disks{state="failed"} > 0)'
severity: warning
for: 2m
- name: Host kernel version deviations
description: Kernel version for {{ $labels.instance }} has changed.
query: "changes(node_uname_info[1h]) > 0"
severity: info
- name: Host OOM kill detected
description: OOM kill detected
query: "(increase(node_vmstat_oom_kill[30m]) > 0)"
severity: warning
comments: |
When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger.
- name: Host EDAC Correctable Errors detected
description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
query: "(increase(node_edac_correctable_errors_total[1m]) > 0)"
severity: info
- name: Host EDAC Uncorrectable Errors detected
description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
query: "(node_edac_uncorrectable_errors_total > 0)"
severity: warning
- name: Host Network Receive Errors
description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.'
query: "(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0"
severity: warning
for: 2m
- name: Host Network Transmit Errors
description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.'
query: "(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0"
severity: warning
for: 2m
- name: Host Network Bond Degraded
description: 'Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".'
query: "((node_bonding_active - node_bonding_slaves) != 0)"
severity: warning
for: 2m
- name: Host conntrack limit
description: "The number of conntrack entries is approaching the limit"
query: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0"
severity: warning
for: 5m
- name: Host clock skew
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host."
query: "((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))"
severity: warning
for: 10m
- name: Host clock not synchronising
description: "Clock not synchronising. Ensure NTP is configured on this host."
query: "(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)"
severity: warning
for: 2m
- name: S.M.A.R.T Device Monitoring
exporters:
- name: smartctl-exporter
slug: smartctl-exporter
doc_url: https://github.com/prometheus-community/smartctl_exporter
rules:
- name: SMART device temperature warning
description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C
query: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
severity: warning
- name: SMART device temperature critical
description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C
query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
severity: critical
- name: SMART device temperature over trip value
description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}
query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
severity: critical
- name: SMART device temperature nearing trip value
description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}
query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
severity: warning
- name: SMART status
description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}
query: "smartctl_device_smart_status != 1"
severity: critical
- name: SMART critical warning
description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}
query: "smartctl_device_critical_warning > 0"
severity: critical
- name: SMART media errors
description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}
query: "smartctl_device_media_errors > 0"
severity: critical
- name: SMART Wearout Indicator
description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}
query: "smartctl_device_available_spare < smartctl_device_available_spare_threshold"
severity: critical
- name: IPMI
exporters:
- name: prometheus-community/ipmi_exporter
slug: ipmi-exporter
doc_url: https://github.com/prometheus-community/ipmi_exporter
rules:
- name: IPMI collector down
description: "IPMI collector {{ $labels.collector }} on {{ $labels.instance }} failed to scrape sensor data. Check FreeIPMI tools and BMC connectivity."
query: 'ipmi_up == 0'
severity: warning
for: 5m
comments: |
The ipmi_up metric is per-collector. A value of 0 means the collector could not retrieve data from the BMC.
- name: IPMI temperature sensor warning
description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
query: 'ipmi_temperature_state == 1'
severity: warning
for: 5m
comments: |
State values: 0=nominal, 1=warning, 2=critical. Thresholds are defined in the BMC firmware.
- name: IPMI temperature sensor critical
description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Immediate attention required to prevent hardware damage."
query: 'ipmi_temperature_state == 2'
severity: critical
- name: IPMI fan speed sensor warning
description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
query: 'ipmi_fan_speed_state == 1'
severity: warning
for: 5m
- name: IPMI fan speed sensor critical
description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. A fan may have failed."
query: 'ipmi_fan_speed_state == 2'
severity: critical
- name: IPMI fan speed zero
description: "IPMI fan {{ $labels.name }} on {{ $labels.instance }} reports 0 RPM. The fan may have failed."
query: 'ipmi_fan_speed_rpm == 0'
severity: critical
for: 5m
- name: IPMI voltage sensor warning
description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
query: 'ipmi_voltage_state == 1'
severity: warning
for: 5m
- name: IPMI voltage sensor critical
description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Power supply or motherboard issue possible."
query: 'ipmi_voltage_state == 2'
severity: critical
- name: IPMI current sensor warning
description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
query: 'ipmi_current_state == 1'
severity: warning
for: 5m
- name: IPMI current sensor critical
description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state."
query: 'ipmi_current_state == 2'
severity: critical
- name: IPMI power sensor warning
description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
query: 'ipmi_power_state == 1'
severity: warning
for: 5m
- name: IPMI power sensor critical
description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state."
query: 'ipmi_power_state == 2'
severity: critical
- name: IPMI generic sensor critical
description: "IPMI sensor {{ $labels.name }} (type={{ $labels.type }}) on {{ $labels.instance }} is in critical state."
query: 'ipmi_sensor_state == 2'
severity: critical
for: 5m
comments: |
Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts.
- name: IPMI chassis power off
description: "IPMI reports chassis power is off on {{ $labels.instance }}. The server may have shut down unexpectedly."
query: 'ipmi_chassis_power_state == 0'
severity: critical
- name: IPMI chassis drive fault
description: "IPMI reports a drive fault on {{ $labels.instance }}. Check disk health."
query: 'ipmi_chassis_drive_fault_state == 0'
severity: critical
comments: |
The metric uses inverted logic: 1=no fault, 0=fault detected.
- name: IPMI chassis cooling fault
description: "IPMI reports a cooling/fan fault on {{ $labels.instance }}. Check fans and airflow."
query: 'ipmi_chassis_cooling_fault_state == 0'
severity: critical
comments: |
The metric uses inverted logic: 1=no fault, 0=fault detected.
- name: IPMI SEL almost full
description: "IPMI System Event Log on {{ $labels.instance }} has only {{ printf \"%.0f\" $value }} bytes free. Clear the SEL to prevent loss of new events."
query: 'ipmi_sel_free_space_bytes < 512'
severity: warning
for: 5m
comments: |
SEL storage is typically very limited (e.g., 16KB). When full, new events may be dropped.
- name: Docker containers
exporters:
- name: google/cAdvisor
slug: google-cadvisor
doc_url: https://github.com/google/cadvisor
rules:
- name: Container killed
description: A container has disappeared
query: "time() - container_last_seen > 60"
severity: warning
comments: |
This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- name: Container absent
description: A container is absent for 5 min
query: "absent(container_last_seen)"
severity: warning
for: 5m
comments: |
This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- name: Container High CPU utilization
description: 'Container CPU utilization is above 80% (current: {{ $value | printf "%.2f" }}%)'
query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
comments: |
Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard.
severity: warning
for: 2m
- name: Container High Memory usage
description: Container Memory usage is above 80%
query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
severity: warning
comments: See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
for: 2m
- name: Container Volume usage
  description: Container Volume inode usage is above 80%
  # Inode usage per instance: 1 - free/total, restricted to named containers.
  # The same {name!=""} selector is applied to the numerator and to both
  # denominators so that free and total inodes are summed over the same set
  # of series. The trailing "> 0" guard avoids dividing by a zero total.
  query: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total{name!=""}) BY (instance) > 0'
  severity: warning
  for: 2m
- name: Container high throttle rate
description: "Container is being throttled ({{ $value | humanizePercentage }})"
query: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
severity: warning
for: 5m
- name: Container high low change CPU usage
description: This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.
query: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
severity: info
- name: Container Low CPU utilization
description: 'Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf "%.2f" }}%)'
query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
severity: info
for: 7d
- name: Container Low Memory usage
description: Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.
query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
severity: info
for: 7d
- name: Blackbox
exporters:
- name: prometheus/blackbox_exporter
slug: blackbox-exporter
doc_url: https://github.com/prometheus/blackbox_exporter
rules:
- name: Blackbox probe failed
description: Probe failed
query: probe_success == 0
severity: critical
- name: Blackbox configuration reload failure
description: Blackbox configuration reload failure
query: "blackbox_exporter_config_last_reload_successful != 1"
severity: warning
- name: Blackbox slow probe
description: Blackbox probe took more than 1s to complete
query: "probe_duration_seconds > 1"
severity: warning
for: 1m
- name: Blackbox probe HTTP failure
description: HTTP status code is not 200-399
query: "probe_http_status_code <= 199 OR probe_http_status_code >= 400"
severity: critical
- name: Blackbox SSL certificate will expire soon
description: SSL certificate expires in less than 20 days
query: "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20"
severity: warning
- name: Blackbox SSL certificate will expire very soon
description: SSL certificate expires in less than 3 days
query: "0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3"
severity: critical
- name: Blackbox SSL certificate expired
description: SSL certificate has expired already
query: "round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0"
severity: critical
comments: |
For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
need to enable insecure_skip_verify. Note that this will disable
certificate validation.
See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
- name: Blackbox probe slow HTTP
description: HTTP request took more than 1s
query: "probe_http_duration_seconds > 1"
severity: warning
for: 1m
- name: Blackbox probe slow ping
description: Blackbox ping took more than 1s
query: "probe_icmp_duration_seconds > 1"
severity: warning
for: 1m
- name: Windows Server
exporters:
- name: prometheus-community/windows_exporter
slug: windows-exporter
doc_url: https://github.com/prometheus-community/windows_exporter
rules:
- name: Windows Server collector Error
description: "Collector {{ $labels.collector }} was not successful"
query: "windows_exporter_collector_success == 0"
severity: critical
- name: Windows Server service Status
description: Windows Service state is not OK
query: 'windows_service_status{status="ok"} != 1'
severity: critical
for: 1m
- name: Windows Server CPU Usage
description: CPU Usage is more than 80%
query: '100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80'
severity: warning
- name: Windows Server memory Usage
description: Memory usage is more than 90%
query: "100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90"
severity: warning
for: 2m
- name: Windows Server disk Space Usage
description: Disk usage is more than 80%
query: "100 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80 and windows_logical_disk_size_bytes > 0"
severity: critical
for: 2m
- name: VMware
exporters:
- name: pryorda/vmware_exporter
slug: pryorda-vmware-exporter
doc_url: https://github.com/pryorda/vmware_exporter
rules:
- name: Virtual Machine Memory Warning
description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
query: "vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90"
severity: warning
for: 5m
- name: Virtual Machine Memory Critical
description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
query: "vmware_vm_mem_usage_average / 100 >= 90"
severity: critical
for: 1m
- name: High Number of Snapshots
description: "High snapshots number on {{ $labels.instance }}: {{ $value }}"
query: "vmware_vm_snapshots > 3"
severity: warning
for: 30m
- name: Outdated Snapshots
description: 'Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days'
query: "(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3"
severity: warning
for: 5m
- name: Proxmox VE
exporters:
- name: prometheus-pve/prometheus-pve-exporter
slug: prometheus-pve-exporter
doc_url: https://github.com/prometheus-pve/prometheus-pve-exporter
rules:
- name: PVE node down
description: 'Proxmox VE node {{ $labels.id }} is down.'
query: 'pve_up{id=~"node/.*"} == 0'
severity: critical
for: 2m
- name: PVE VM/CT down
description: 'Proxmox VE guest {{ $labels.id }} is not running.'
query: 'pve_up{id=~"(qemu|lxc)/.*"} == 0'
severity: warning
for: 5m
comments: |
This alert triggers for all VMs and containers that are not running.
You may want to filter by specific guests using the `id` label, or exclude
intentionally stopped guests with additional label matchers.
- name: PVE high CPU usage
description: 'Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%'
query: 'pve_cpu_usage_ratio * 100 > 90'
severity: warning
for: 5m
- name: PVE high memory usage
description: 'Proxmox VE memory usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%'
query: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0'
severity: warning
for: 5m
- name: PVE storage filling up
description: 'Proxmox VE storage {{ $labels.id }} is above 80% used. Current value: {{ $value | printf "%.2f" }}%'
query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 80 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
severity: warning
for: 5m
- name: PVE storage almost full
description: 'Proxmox VE storage {{ $labels.id }} is above 95% used. Current value: {{ $value | printf "%.2f" }}%'
query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 95 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
severity: critical
for: 2m
- name: PVE guest not backed up
description: '{{ $value }} Proxmox VE guest(s) are not covered by any backup job.'
query: 'pve_not_backed_up_total > 0'
severity: warning
- name: PVE replication failed
description: 'Proxmox VE replication for {{ $labels.id }} has {{ $value }} failed sync(s).'
query: 'pve_replication_failed_syncs > 0'
severity: warning
- name: PVE cluster not quorate
description: 'Proxmox VE cluster has lost quorum.'
query: 'pve_cluster_info{quorate="0"} == 1'
severity: critical
comments: |
Loss of quorum means the cluster cannot make decisions about VM placement
and fencing. This requires immediate attention.
- name: Netdata
exporters:
- name: Embedded exporter
slug: embedded-exporter
doc_url: https://github.com/netdata/netdata/blob/master/backends/prometheus/README.md
rules:
- name: Netdata high cpu usage
description: Netdata high CPU usage (> 80%)
query: 'netdata_cpu_cpu_percentage_average{dimension="idle"} < 20'
severity: warning
for: 5m
comments: |
This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%.
- name: Netdata CPU steal noisy neighbor
description: CPU steal is > 10%. A noisy neighbor is degrading VM performance or a spot instance may be out of credit.
query: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10'
severity: warning
for: 5m
- name: Netdata high memory usage
  description: Netdata high memory usage (> 80%)
  # Percentage of RAM that is free or cached, relative to the sum of all RAM
  # dimensions. Both sides are aggregated with "without (dimension)" so the
  # division compares the free+cached total against the overall total instead
  # of dividing each dimension's series by itself (which always yields 100).
  # The trailing "> 0" guard avoids dividing by a zero total.
  query: '100 * sum without (dimension) (netdata_system_ram_MiB_average{dimension=~"free|cached"}) / sum without (dimension) (netdata_system_ram_MiB_average) < 20 and sum without (dimension) (netdata_system_ram_MiB_average) > 0'
  severity: warning
  for: 5m
- name: Netdata low disk space
  description: Netdata low disk space (> 80% used)
  # Percentage of disk space still available, relative to the sum of all
  # dimensions of the same disk chart. Both sides are aggregated with
  # "without (dimension)" so the division compares the available total against
  # the overall total instead of dividing each dimension's series by itself
  # (which always yields 100). The trailing "> 0" guard avoids dividing by a
  # zero total.
  query: '100 * sum without (dimension) (netdata_disk_space_GB_average{dimension=~"avail|cached"}) / sum without (dimension) (netdata_disk_space_GB_average) < 20 and sum without (dimension) (netdata_disk_space_GB_average) > 0'
  severity: warning
  for: 5m
- name: Netdata predicted disk full
description: Netdata predicted disk full in 24 hours
query: 'predict_linear(netdata_disk_space_GB_average{dimension=~"avail|cached"}[3h], 24 * 3600) < 0'
severity: warning
- name: Netdata MD mismatch cnt unsynchronized blocks
description: RAID array has unsynchronized blocks
query: "netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024"
severity: warning
for: 2m
- name: Netdata disk reallocated sectors
description: "Disk reallocated sectors detected ({{ $value }} sectors)"
query: "increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0"
severity: info
- name: Netdata disk current pending sector
description: Disk current pending sector
query: "netdata_smartd_log_current_pending_sector_count_sectors_average > 0"
severity: warning
- name: Netdata reported uncorrectable disk sectors
description: "Reported uncorrectable disk sectors ({{ $value }} sectors)"
query: "increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0"
severity: warning
- name: eBPF
exporters:
- name: cloudflare/ebpf_exporter
slug: ebpf-exporter
doc_url: https://github.com/cloudflare/ebpf_exporter
rules:
- name: eBPF exporter program not attached
description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})"
query: 'ebpf_exporter_ebpf_program_attached == 0'
severity: warning
for: 5m
comments: |
The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running.
- name: eBPF exporter decoder errors
description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})"
query: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
severity: warning
for: 5m
- name: eBPF exporter no enabled configs
description: "eBPF exporter has no enabled configurations. No eBPF programs are being run. (instance {{ $labels.instance }})"
query: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)'
severity: warning
for: 5m
- name: Process Exporter
exporters:
- name: ncabatoff/process-exporter
slug: process-exporter
doc_url: https://github.com/ncabatoff/process-exporter
rules:
- name: Process exporter group down
description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})"
query: 'namedprocess_namegroup_num_procs == 0'
severity: warning
for: 5m
- name: Process exporter high memory usage
description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})"
query: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
severity: warning
for: 5m
comments: |
Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group.
- name: Process exporter high CPU usage
description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})"
query: 'rate(namedprocess_namegroup_cpu_seconds_total[5m]) * 100 > 80'
severity: warning
for: 5m
comments: |
Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. Threshold of 80% is per-core. Adjust based on expected workload.
- name: Process exporter high file descriptor usage
description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})"
query: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
severity: warning
for: 5m
- name: Process exporter file descriptors exhausted
description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})"
query: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
severity: critical
for: 2m
- name: Process exporter high swap usage
description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})"
query: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
severity: warning
for: 5m
comments: |
Threshold of 512MB is arbitrary. Adjust per group and environment.
- name: Process exporter zombie processes
description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})"
query: 'namedprocess_namegroup_states{state="Zombie"} > 5'
severity: warning
for: 5m
- name: Process exporter high context switching
description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})"
query: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000'
severity: warning
for: 5m
comments: |
Filters to voluntary switches only — involuntary switches are normal under CPU contention. Threshold of 50000/s is a rough default. Adjust based on workload.
- name: Process exporter high disk write IO
description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})"
query: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
severity: warning
for: 5m
comments: |
Threshold of 100MB/s is arbitrary. Adjust per group.
- name: Process exporter process restarting
description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})"
query: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
severity: info
comments: |
Detects restarts by watching for changes in the oldest process start time within the group.
- name: Systemd
exporters:
- name: prometheus-community/systemd_exporter
slug: systemd-exporter
doc_url: https://github.com/prometheus-community/systemd_exporter
rules:
- name: Systemd unit failed
description: "Systemd unit {{ $labels.name }} has entered failed state. (instance {{ $labels.instance }})"
query: 'systemd_unit_state{state="failed"} == 1'
severity: warning
for: 5m
- name: Systemd unit inactive
description: "Systemd unit {{ $labels.name }} is inactive. (instance {{ $labels.instance }})"
query: 'systemd_unit_state{state="inactive", type="service", name=~"your-critical-service.+"} == 1'
severity: warning
for: 5m
comments: |
Many units are legitimately inactive. You must adjust the name=~ filter to match your critical services.
- name: Systemd service crash looping
description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})"
query: 'increase(systemd_service_restart_total[1h]) > 5'
severity: critical
for: 5m
- name: Systemd unit tasks near limit
description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})"
query: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0'
severity: warning
for: 5m
- name: Systemd socket refused connections
description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})"
query: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
severity: warning
for: 2m
- name: Systemd socket high connections
description: "Systemd socket {{ $labels.name }} has {{ $value }} active connections. (instance {{ $labels.instance }})"
query: 'systemd_socket_current_connections > 100'
severity: warning
for: 2m
comments: |
Threshold of 100 connections is arbitrary. Adjust to your workload.
- name: Systemd timer missed trigger
description: "Systemd timer {{ $labels.name }} has not triggered for over 24 hours. (instance {{ $labels.instance }})"
query: '(time() - systemd_timer_last_trigger_seconds) / 3600 > 24 and systemd_timer_last_trigger_seconds > 0'
severity: warning
for: 5m
comments: |
Triggers if timer hasn't fired in 24 hours. Adjust threshold per timer schedule.
- name: Databases
services:
- name: MySQL
exporters:
- name: prometheus/mysqld_exporter
slug: mysqld-exporter
doc_url: https://github.com/prometheus/mysqld_exporter
rules:
- name: MySQL down
description: MySQL instance is down on {{ $labels.instance }}
query: "mysql_up == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: MySQL too many connections (> 80%)
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}"
query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0"
severity: warning
for: 2m
- name: MySQL high prepared statements utilization (> 80%)
description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}"
query: "max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0"
severity: warning
for: 2m
- name: MySQL high threads running
description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}"
query: "max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0"
severity: warning
for: 2m
- name: MySQL Slave IO thread not running
description: "MySQL Slave IO thread not running on {{ $labels.instance }}"
query: "( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: MySQL Slave SQL thread not running
description: "MySQL Slave SQL thread not running on {{ $labels.instance }}"
query: "( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: MySQL Slave replication lag
description: "MySQL replication lag on {{ $labels.instance }}"
query: "( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30"
severity: critical
for: 1m
- name: MySQL slow queries
description: "MySQL server has new slow queries ({{ $value }} in the last minute)."
query: increase(mysql_global_status_slow_queries[1m]) > 0
severity: warning
for: 2m
- name: MySQL InnoDB log waits
description: "MySQL innodb log writes stalling ({{ $value }} waits/s)"
query: rate(mysql_global_status_innodb_log_waits[15m]) > 10
severity: warning
- name: MySQL restarted
description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
query: "mysql_global_status_uptime < 60"
severity: info
- name: MySQL High QPS
description: MySQL is being overloaded with unusual QPS (> 10k QPS).
query: "irate(mysql_global_status_questions[1m]) > 10000"
severity: info
for: 2m
- name: MySQL too many open files
description: MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}.
query: "mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0"
severity: warning
for: 2m
- name: MySQL InnoDB Force Recovery is enabled
description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}"
query: "mysql_global_variables_innodb_force_recovery != 0"
severity: warning
for: 2m
- name: MySQL InnoDB history_len too long
description: "MySQL history_len (undo log) too long on {{ $labels.instance }}"
query: "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000"
severity: warning
for: 2m
- name: PostgreSQL
exporters:
- name: prometheus-community/postgres_exporter
slug: postgres-exporter
doc_url: https://github.com/prometheus-community/postgres_exporter
rules:
- name: Postgresql down
description: Postgresql instance is down
query: "pg_up == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Postgresql restarted
description: Postgresql restarted
query: "time() - pg_postmaster_start_time_seconds < 60"
severity: critical
- name: Postgresql exporter error
description: Postgresql exporter is showing errors. A query may be buggy in queries.yaml
query: "pg_exporter_last_scrape_error > 0"
severity: critical
- name: Postgresql table not auto vacuumed
description: Table {{ $labels.relname }} has not been auto vacuumed for 10 days
query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
severity: warning
- name: Postgresql table not auto analyzed
description: Table {{ $labels.relname }} has not been auto analyzed for 10 days
query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
severity: warning
- name: Postgresql too many connections
description: PostgreSQL instance has too many connections (> 80%).
query: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
severity: warning
for: 2m
- name: Postgresql not enough connections
description: PostgreSQL instance should have more connections (> 5)
query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
severity: critical
for: 2m
- name: Postgresql dead locks
description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)"
query: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
severity: warning
- name: Postgresql high rollback rate
description: Ratio of transactions being aborted compared to committed is > 2 %
query: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
severity: warning
- name: Postgresql commit rate low
description: Postgresql seems to be processing very few transactions
query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
severity: critical
for: 2m
- name: Postgresql low XID consumption
description: Postgresql seems to be consuming transaction IDs very slowly
query: "rate(pg_txid_current[1m]) < 5"
severity: warning
for: 2m
- name: Postgresql unused replication slot
description: Unused Replication Slots
query: "(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)"
severity: warning
for: 1m
- name: Postgresql too many dead tuples
description: PostgreSQL dead tuples is too large
query: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 and (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0"
severity: warning
for: 2m
- name: Postgresql configuration changed
description: Postgres Database configuration change has occurred
query: '{__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} != ON(__name__, instance) {__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} OFFSET 5m'
severity: info
- name: Postgresql SSL compression active
description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
query: "sum by (instance) (pg_stat_ssl_compression) > 0"
severity: warning
- name: Postgresql too many locks acquired
description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
query: "((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0"
severity: critical
for: 2m
- name: Postgresql bloat index high (> 80%)
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`"
query: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
severity: warning
for: 1h
comments: |
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: Postgresql bloat table high (> 80%)
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`"
query: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
severity: warning
for: 1h
comments: |
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: Postgresql invalid index
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
severity: warning
for: 6h
comments: |
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: Postgresql replication lag
description: The PostgreSQL replication lag is high (> 5s)
query: "pg_replication_lag_seconds > 5"
severity: warning
for: 30s
- name: SQL Server
exporters:
- name: Ozarklake/prometheus-mssql-exporter
slug: ozarklake-mssql-exporter
doc_url: https://github.com/Ozarklake/prometheus-mssql-exporter
rules:
- name: SQL Server down
description: SQL server instance is down
query: mssql_up == 0
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: SQL Server deadlock
description: SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)
query: mssql_deadlocks > 5
severity: warning
for: 1m
- name: Oracle Database
exporters:
- name: iamseth/oracledb_exporter
slug: iamseth-oracledb-exporter
doc_url: https://github.com/iamseth/oracledb_exporter
rules:
- name: Oracle DB down
description: Oracle Database instance is down on {{ $labels.instance }}
query: "oracledb_up == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Oracle DB sessions reaching limit (> 85%)
description: "Oracle Database session utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)"
query: "oracledb_resource_current_utilization{resource_name=\"sessions\"} / oracledb_resource_limit_value{resource_name=\"sessions\"} * 100 > 85 and oracledb_resource_limit_value{resource_name=\"sessions\"} > 0"
severity: warning
for: 5m
comments: |
Threshold is workload-dependent. Adjust 85% to suit your environment.
- name: Oracle DB processes reaching limit (> 85%)
description: "Oracle Database process utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)"
query: "oracledb_resource_current_utilization{resource_name=\"processes\"} / oracledb_resource_limit_value{resource_name=\"processes\"} * 100 > 85 and oracledb_resource_limit_value{resource_name=\"processes\"} > 0"
severity: warning
for: 5m
comments: |
Threshold is workload-dependent. Adjust 85% to suit your environment.
- name: Oracle DB tablespace reaching capacity (> 85%)
description: "Oracle Database tablespace {{ $labels.tablespace }} is above 85% usage on {{ $labels.instance }} (current value: {{ $value }}%)"
query: "oracledb_tablespace_used_percent > 85"
severity: warning
for: 5m
- name: Oracle DB tablespace full (> 95%)
description: "Oracle Database tablespace {{ $labels.tablespace }} is critically full on {{ $labels.instance }} (current value: {{ $value }}%)"
query: "oracledb_tablespace_used_percent > 95"
severity: critical
for: 5m
- name: Oracle DB high user rollbacks
description: "Oracle Database on {{ $labels.instance }} has a high rollback rate ({{ $value }}% of transactions are rolled back)"
query: "rate(oracledb_activity_user_rollbacks[5m]) / (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) * 100 > 20 and (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) > 0"
severity: warning
for: 5m
comments: |
A high rollback rate (>20%) often indicates application-level issues such as deadlocks, constraint violations, or poorly designed transactions.
- name: Oracle DB too many active sessions
description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})"
query: "oracledb_sessions_value{status=\"ACTIVE\", type=\"USER\"} > 200"
severity: warning
for: 5m
comments: |
Threshold is highly workload-dependent. Adjust 200 to suit your environment.
- name: Oracle DB high wait time (user I/O)
description: "Oracle Database on {{ $labels.instance }} is experiencing high user I/O wait time"
query: "oracledb_wait_time_user_io > 300"
severity: warning
for: 5m
comments: |
The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time.
- name: Patroni
# Patroni leadership alert, evaluated per `scope` label.
exporters:
- name: Embedded exporter (Patroni >= 2.1.0)
slug: embedded-exporter-patroni
doc_url: https://patroni.readthedocs.io/en/latest/rest_api.html?highlight=prometheus#monitoring-endpoint
rules:
- name: Patroni has no Leader
# Both operands take the per-scope maximum, so the alert fires only for a scope
# in which no member reports patroni_primary >= 1 and no member reports
# patroni_standby_leader >= 1. Only the `scope` label survives the aggregation,
# which is why the description references the scope label and nothing else.
description: A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}
query: (max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: PGBouncer
# PGBouncer alerts: one static connection threshold plus two rules driven by
# the pgbouncer_errors_count series, selected through the `errmsg` label.
exporters:
- name: spreaker/prometheus-pgbouncer-exporter
slug: spreaker-pgbouncer-exporter
doc_url: https://github.com/spreaker/prometheus-pgbouncer-exporter
rules:
- name: PGBouncer active connections
# Fixed threshold: more than 200 active server connections on a series, held for 2 minutes.
description: PGBouncer pools are filling up
query: "pgbouncer_pools_server_active_connections > 200"
severity: warning
for: 2m
- name: PGBouncer errors
# The matcher excludes the "server conn crashed?" message; the alert fires when
# more than 10 other errors are counted within one minute. No `for` delay.
description: PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.
query: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10'
severity: warning
- name: PGBouncer max connections
# Watches only the max_client_conn error message. Any increase within the
# 2-minute window fires at once because the rule has no `for` delay.
description: The number of PGBouncer client connections has reached max_client_conn.
query: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
severity: critical
- name: Redis
# Redis alerts: availability, replication topology, persistence, memory and
# client-connection limits.
exporters:
- name: oliver006/redis_exporter
slug: oliver006-redis-exporter
doc_url: https://github.com/oliver006/redis_exporter
rules:
- name: Redis down
description: Redis instance is down
query: "redis_up == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Redis missing master
# `or vector(0)` supplies a 0 when no series carries role="master", so the
# `< 1` comparison still returns a result instead of an empty vector.
description: Redis cluster has no node marked as master.
query: '(count(redis_instance_info{role="master"}) or vector(0)) < 1'
severity: critical
- name: Redis too many masters
# Counts every series with role="master" across the whole query scope.
description: Redis cluster has too many nodes marked as master.
query: 'count(redis_instance_info{role="master"}) > 1'
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Redis disconnected slaves
# Compares the number of redis_connected_slaves series with the sum of their
# values, both aggregated with `instance` and `job` removed.
description: Redis not replicating for all slaves. Consider reviewing the redis replication status.
query: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0"
severity: critical
- name: Redis replication broken
# Fires when the connected-slave gauge dropped within the last minute.
description: Redis instance lost a slave
query: "delta(redis_connected_slaves[1m]) < 0"
severity: critical
- name: Redis cluster flapping
# More than one value change per minute, sustained for 2 minutes.
description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a. flapping).
query: "changes(redis_connected_slaves[1m]) > 1"
severity: critical
for: 2m
- name: Redis missing backup
# 60 * 60 * 48 is 48 hours expressed in seconds, matching the description.
description: Redis has not been backed up for 48 hours
query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48"
severity: critical
- name: Redis out of system memory
# The trailing `and ... > 0` clause keeps the rule from firing on a zero denominator.
description: Redis is running out of system memory (> 90%)
query: "redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 and redis_total_system_memory_bytes > 0"
severity: warning
for: 2m
comments: |
The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
- name: Redis out of configured maxmemory
# Same zero-denominator guard as above, matched on the `instance` label only.
description: Redis is running out of configured maxmemory (> 90%)
query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0"
severity: warning
for: 2m
- name: Redis too many connections
# Ratio of connected clients to redis_config_maxclients, guarded against a zero limit.
description: Redis is running out of connections (> 90% used)
query: "redis_connected_clients / redis_config_maxclients * 100 > 90 and redis_config_maxclients > 0"
severity: warning
for: 2m
- name: Redis not enough connections
# Fires when fewer than 5 clients are connected for 2 minutes.
description: Redis instance should have more connections (> 5)
query: "redis_connected_clients < 5"
severity: warning
for: 2m
- name: Redis rejected connections
# More than 5 rejected connections within one minute; no `for` delay.
description: Some connections to Redis have been rejected
query: "increase(redis_rejected_connections_total[1m]) > 5"
severity: warning
- name: Memcached
# Memcached alerts: availability, connection and memory limits, evictions,
# hit rate and rejected work. Ratio rules carry an `and <denominator> > 0`
# clause so they do not fire on a zero denominator.
exporters:
- name: prometheus/memcached_exporter
slug: memcached-exporter
doc_url: https://github.com/prometheus/memcached_exporter
rules:
- name: Memcached down
description: Memcached instance is down on {{ $labels.instance }}
query: "memcached_up == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Memcached connection limit approaching (> 80%)
# Warning tier of the connection-usage pair; the 95% rule below is the critical tier.
description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
query: "(memcached_current_connections / memcached_max_connections * 100) > 80 and memcached_max_connections > 0"
severity: warning
for: 2m
- name: Memcached connection limit approaching (> 95%)
# Same expression as the 80% rule with a higher threshold and critical severity.
description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)"
query: "(memcached_current_connections / memcached_max_connections * 100) > 95 and memcached_max_connections > 0"
severity: critical
for: 2m
- name: Memcached out of memory errors
# `sum without (slab)` folds the per-slab series into one value per remaining label set.
description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}"
query: "sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0"
severity: warning
for: 5m
- name: Memcached memory usage high (> 90%)
description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)"
query: "(memcached_current_bytes / memcached_limit_bytes * 100) > 90 and memcached_limit_bytes > 0"
severity: warning
for: 5m
comments: |
High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
- name: Memcached high eviction rate
description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)"
query: "rate(memcached_items_evicted_total[5m]) > 10"
severity: warning
for: 5m
comments: |
A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage. Threshold of 10 evictions/s is a rough default — adjust based on your workload.
- name: Memcached low cache hit rate (< 80%)
# hits / (hits + misses) for the `get` command, as a percentage. The right-hand
# side of `and` restricts the rule to windows with a non-zero get rate.
description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
query: '(rate(memcached_commands_total{command="get", status="hit"}[5m]) / (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0'
severity: warning
for: 10m
comments: |
A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. Threshold of 80% is a rough default — adjust based on your workload and access patterns.
- name: Memcached connections rejected
# Any rejection inside the 5-minute window, sustained for 5 minutes.
description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
query: "increase(memcached_connections_rejected_total[5m]) > 0"
severity: warning
for: 5m
- name: Memcached items too large
# Informational only: severity is `info`, unlike the other rules in this group.
description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
query: "increase(memcached_item_too_large_total[5m]) > 0"
severity: info
for: 5m
- name: MongoDB
# MongoDB alerts for three exporters. The Percona and DCU sections cover the
# same conditions with each exporter's own metric and label names; rule names
# carry a (Percona) / (DCU) suffix where both sections define the same alert.
exporters:
- name: percona/mongodb_exporter
slug: percona-mongodb-exporter
doc_url: https://github.com/percona/mongodb_exporter
rules:
- name: MongoDB Down
description: MongoDB instance is down
query: "mongodb_up == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Mongodb replica member unhealthy
description: MongoDB replica member is not healthy
query: "mongodb_rs_members_health == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: MongoDB replication lag (Percona)
# Subtracts each SECONDARY optimeDate from the PRIMARY value of the same `set`.
# The division by 1000 suggests the metric is in milliseconds - TODO confirm
# against the exporter.
description: MongoDB replication lag is more than 10s
query: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
severity: critical
- name: MongoDB replication headroom
# Oplog window (head minus tail timestamp) minus the primary/secondary optime gap.
description: MongoDB replication headroom is <= 0
query: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
severity: critical
comments: |
This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
- name: MongoDB number cursors open (Percona)
# 10 * 1000 is the "10k" mentioned in the description.
description: Too many cursors opened by MongoDB for clients (> 10k)
query: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
severity: warning
for: 2m
- name: MongoDB cursors timeouts (Percona)
description: "Too many cursors are timing out ({{ $value }} in the last minute)"
query: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"
severity: warning
for: 2m
- name: MongoDB too many connections (Percona)
# current / (current + available) as a percentage; the `and ... > 0` clause
# keeps the rule from firing on a zero denominator.
description: Too many connections (> 80%)
query: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) > 0'
severity: warning
for: 2m
- name: dcu/mongodb_exporter
slug: dcu-mongodb-exporter
doc_url: https://github.com/dcu/mongodb_exporter
rules:
- name: MongoDB replication lag (DCU)
# Difference between the average PRIMARY and average SECONDARY optime; unlike
# the Percona rule there is no unit conversion in the expression.
description: MongoDB replication lag is more than 10s
query: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
severity: critical
- name: MongoDB replication Status 3
# The five "Status N" rules each match one numeric value of
# mongodb_replset_member_state; the description explains that state.
description: MongoDB Replication set member is either performing startup self-checks, or transitioning from completing a rollback or resync
query: "mongodb_replset_member_state == 3"
severity: critical
- name: MongoDB replication Status 6
description: MongoDB Replication set member's state, as seen from another member of the set, is not yet known
query: "mongodb_replset_member_state == 6"
severity: critical
- name: MongoDB replication Status 8
description: MongoDB Replication set member, as seen from another member of the set, is unreachable
query: "mongodb_replset_member_state == 8"
severity: critical
- name: MongoDB replication Status 9
description: MongoDB Replication set member is actively performing a rollback. Data is not available for reads
query: "mongodb_replset_member_state == 9"
severity: critical
- name: MongoDB replication Status 10
description: MongoDB Replication set member was once in a replica set but was subsequently removed
query: "mongodb_replset_member_state == 10"
severity: critical
- name: MongoDB number cursors open (DCU)
description: Too many cursors opened by MongoDB for clients (> 10k)
query: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
severity: warning
for: 2m
- name: MongoDB cursors timeouts (DCU)
description: "Too many cursors are timing out ({{ $value }} in the last minute)"
query: "increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100"
severity: warning
for: 2m
- name: MongoDB too many connections (DCU)
# Same ratio and zero-denominator guard as the Percona rule, using the `state` label.
description: Too many connections (> 80%)
query: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0'
severity: warning
for: 2m
- name: stefanprodan/mgob
slug: stefanprodan-mgob-exporter
doc_url: https://github.com/stefanprodan/mgob
rules:
- name: Mgob backup failed
# Fires when the counter labelled status="500" changed during the last hour.
description: MongoDB backup has failed
query: 'changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0'
severity: critical
- name: Elasticsearch
# Elasticsearch alerts: JVM heap, data-path disk space, cluster health colour,
# node counts, shard movement, pending tasks and indexing/search throughput.
exporters:
- name: prometheus-community/elasticsearch_exporter
slug: prometheus-community-elasticsearch-exporter
doc_url: https://github.com/prometheus-community/elasticsearch_exporter
rules:
- name: Elasticsearch Heap Usage Too High
# Critical tier (90%) of the heap pair; the `and ... > 0` clause keeps the
# rule from firing on a zero heap maximum.
description: "The heap usage is over 90%"
query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
severity: critical
for: 2m
- name: Elasticsearch Heap Usage warning
# Warning tier (80%) of the same expression.
description: "The heap usage is over 80%"
query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
severity: warning
for: 2m
- name: Elasticsearch disk out of space
# The query measures the *available* share, so "< 10" corresponds to the
# "over 90%" usage stated in the description. This rule has no `for` delay.
description: The disk usage is over 90%
query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0"
severity: critical
- name: Elasticsearch disk space low
# Available share below 20%, i.e. usage above 80%.
description: The disk usage is over 80%
query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0"
severity: warning
for: 2m
- name: Elasticsearch Cluster Red
# The health status is selected through the `color` label; value 1 marks the active colour.
description: Elastic Cluster Red status
query: 'elasticsearch_cluster_health_status{color="red"} == 1'
severity: critical
- name: Elasticsearch Cluster Yellow
description: Elastic Cluster Yellow status
query: 'elasticsearch_cluster_health_status{color="yellow"} == 1'
severity: warning
- name: Elasticsearch Healthy Nodes
# The node count of 3 is a fixed threshold in the query.
description: "Missing node in Elasticsearch cluster"
query: "elasticsearch_cluster_health_number_of_nodes < 3"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Elasticsearch Healthy Data Nodes
# Same fixed threshold of 3, applied to data nodes.
description: "Missing data node in Elasticsearch cluster"
query: "elasticsearch_cluster_health_number_of_data_nodes < 3"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Elasticsearch relocating shards
# Immediate `info` notification; the next rule reuses the expression with a
# 15m `for` and `warning` severity.
description: "Elasticsearch is relocating shards"
query: "elasticsearch_cluster_health_relocating_shards > 0"
severity: info
- name: Elasticsearch relocating shards too long
description: "Elasticsearch has been relocating shards for 15min"
query: "elasticsearch_cluster_health_relocating_shards > 0"
severity: warning
for: 15m
- name: Elasticsearch initializing shards
# Same info/warning pairing as the relocating-shards rules.
description: "Elasticsearch is initializing shards"
query: "elasticsearch_cluster_health_initializing_shards > 0"
severity: info
- name: Elasticsearch initializing shards too long
description: "Elasticsearch has been initializing shards for 15 min"
query: "elasticsearch_cluster_health_initializing_shards > 0"
severity: warning
for: 15m
- name: Elasticsearch unassigned shards
description: "Elasticsearch has unassigned shards"
query: "elasticsearch_cluster_health_unassigned_shards > 0"
severity: critical
for: 2m
- name: Elasticsearch pending tasks
description: "Elasticsearch has pending tasks. Cluster works slowly."
query: "elasticsearch_cluster_health_number_of_pending_tasks > 0"
severity: warning
for: 15m
- name: Elasticsearch no new documents
# Restricted to series labelled es_data_node="true"; fires when fewer than one
# document was indexed in the 10-minute window.
description: "No new documents for 10 min!"
query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
severity: warning
- name: Elasticsearch High Indexing Latency
# Indexing time per indexed document; the `and` clause limits the rule to
# windows in which documents were actually indexed.
description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)."
query: "rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0"
severity: warning
for: 10m
- name: Elasticsearch High Indexing Rate
# Summed over all series, so the threshold applies to the whole query scope.
description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000"
severity: warning
for: 5m
- name: Elasticsearch High Query Rate
description: "The query rate on Elasticsearch cluster is higher than the threshold."
query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100"
severity: warning
for: 5m
- name: Elasticsearch High Query Latency
# Search time per query, guarded the same way as the indexing-latency rule.
description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)."
query: "rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0"
severity: warning
for: 5m
- name: Meilisearch
# Meilisearch alerts: empty indexes and slow HTTP responses. Neither rule has a
# `for` delay, so both fire on the first evaluation that matches.
exporters:
- name: Embedded exporter
slug: embedded-exporter
doc_url: https://github.com/orgs/meilisearch/discussions/625
rules:
- name: Meilisearch index is empty
# One series per index is expected, given the description reads the `index` label.
description: Meilisearch index {{ $labels.index }} has zero documents
query: "meilisearch_index_docs_count == 0"
severity: warning
- name: Meilisearch http response time
# NOTE(review): this compares the bare metric name against 0.5 seconds. If the
# endpoint exposes it as a histogram (_bucket/_sum/_count) rather than a plain
# gauge, the expression would match nothing - confirm against a live scrape.
description: Meilisearch http response time is too high
query: "meilisearch_http_response_time_seconds > 0.5"
severity: warning
- name: Cassandra
# Cassandra alerts for two exporters. The Instaclustr section uses dedicated
# metric names; the Criteo section selects everything from the single
# `cassandra_stats` metric through its `name` label. Rule names carry an
# (Instaclustr) / (Criteo) suffix where both sections define the same alert.
exporters:
- name: instaclustr/cassandra-exporter
slug: instaclustr-cassandra-exporter
doc_url: https://github.com/instaclustr/cassandra-exporter
rules:
- name: "Cassandra Node is unavailable"
description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}"
query: "cassandra_endpoint_active < 1"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: "Cassandra many compaction tasks are pending"
description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}"
query: "cassandra_table_estimated_pending_compactions > 100"
severity: warning
- name: "Cassandra commitlog pending tasks (Instaclustr)"
description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}"
query: "cassandra_commit_log_pending_tasks > 15"
for: 2m
severity: warning
- name: "Cassandra compaction executor blocked tasks (Instaclustr)"
# Thread pools are selected through the `pool` label.
description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}"
query: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
for: 2m
severity: warning
- name: "Cassandra flush writer blocked tasks (Instaclustr)"
description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}"
query: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
for: 2m
severity: warning
- name: "Cassandra connection timeouts total (Instaclustr)"
# Timeout rate summed per cluster and instance; more than 5 per second fires.
description: "Some connections between nodes are ending in timeout - {{ $labels.cassandra_cluster }}"
query: "sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5"
for: 2m
severity: critical
- name: "Cassandra storage exceptions (Instaclustr)"
# Requires more than one change of the counter within one minute.
description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}"
query: "changes(cassandra_storage_exceptions_total[1m]) > 1"
severity: critical
- name: "Cassandra tombstone dump (Instaclustr)"
# Averages the 0.99 quantile per instance, cluster and keyspace.
description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}"
query: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
for: 2m
severity: critical
- name: "Cassandra client request unavailable write (Instaclustr)"
# The four client-request rules below split by the `operation` label (write/read).
description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}"
query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
for: 2m
severity: critical
- name: "Cassandra client request unavailable read (Instaclustr)"
description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}"
query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
for: 2m
severity: critical
- name: "Cassandra client request write failure (Instaclustr)"
description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
for: 2m
severity: critical
- name: "Cassandra client request read failure (Instaclustr)"
description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
for: 2m
severity: critical
- name: criteo/cassandra_exporter
slug: criteo-cassandra-exporter
doc_url: https://github.com/criteo/cassandra_exporter
rules:
- name: Cassandra hints count
# More than 3 changes of the total-hints counter within one minute.
description: Cassandra hints count has changed on {{ $labels.instance }}, some nodes may go down
query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3'
severity: critical
- name: Cassandra compaction task pending
description: Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.
query: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100'
severity: warning
for: 2m
- name: Cassandra viewwrite latency
# Threshold applies to the 99th-percentile series; the unit of 100000 is not
# visible here - TODO confirm against the exporter.
description: High viewwrite latency on {{ $labels.instance }} cassandra node
query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000'
severity: warning
for: 2m
- name: Cassandra authentication failures
description: Increase of Cassandra authentication failures
query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
severity: warning
for: 2m
- name: Cassandra node down
# Sums the failure detector's down-endpoint count per service, group, cluster and env.
description: Cassandra node down
query: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Cassandra commitlog pending tasks (Criteo)
description: Unexpected number of Cassandra commitlog pending tasks
query: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15'
severity: warning
for: 2m
- name: Cassandra compaction executor blocked tasks (Criteo)
description: Some Cassandra compaction executor tasks are blocked
query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0'
severity: warning
for: 2m
- name: Cassandra flush writer blocked tasks (Criteo)
description: Some Cassandra flush writer tasks are blocked
query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0'
severity: warning
for: 2m
- name: Cassandra repair pending tasks
description: Some Cassandra repair tasks are pending
query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2'
severity: warning
for: 2m
- name: Cassandra repair blocked tasks
description: Some Cassandra repair tasks are blocked
query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0'
severity: warning
for: 2m
- name: Cassandra connection timeouts total (Criteo)
description: Some connections between nodes are ending in timeout
query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
severity: critical
for: 2m
- name: Cassandra storage exceptions (Criteo)
description: Something is going wrong with cassandra storage
query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1'
severity: critical
- name: Cassandra tombstone dump (Criteo)
description: Too many tombstones scanned in queries
query: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000'
severity: critical
- name: Cassandra client request unavailable write (Criteo)
description: Write failures have occurred because too many nodes are unavailable
query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0'
severity: critical
- name: Cassandra client request unavailable read (Criteo)
description: Read failures have occurred because too many nodes are unavailable
query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0'
severity: critical
- name: Cassandra client request write failure (Criteo)
# Reads the exporter's pre-computed one-minute rate, so no rate() is applied here.
description: A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.
query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
severity: critical
- name: Cassandra client request read failure (Criteo)
description: A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.
query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
severity: critical
- name: Cassandra cache hit rate key cache
# The series is compared against .85, so it is treated as a 0-1 fraction here.
description: Key cache hit rate is below 85%
query: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85'
severity: critical
for: 2m
- name: Clickhouse
# ClickHouse alerts: availability, memory, disk space, replica errors,
# connections, ZooKeeper, authentication and insert back-pressure.
# ClickHouseErrorMetric_* series are handled as cumulative error counts
# throughout this group, so every rule on them uses increase() over a window.
exporters:
- name: Embedded Exporter
slug: embedded-exporter
doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics
rules:
- name: ClickHouse node down
description: "No metrics received from ClickHouse exporter for over 2 minutes."
query: 'up{job="clickhouse"} == 0'
severity: critical
for: 2m
comments: |
Adjust the job label to match your Prometheus configuration.
- name: ClickHouse Memory Usage Critical
# Critical tier (90%) of the memory pair; the `and ... > 0` clause keeps the
# rule from firing on a zero denominator.
description: "Memory usage is critically high, over 90%."
query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0"
severity: critical
for: 5m
- name: ClickHouse Memory Usage Warning
description: "Memory usage is over 80%."
query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0"
severity: warning
for: 5m
- name: ClickHouse Disk Space Low on Default
# available / (available + used) as a percentage. The disk name ("default")
# is part of the metric name itself.
description: "Disk space on default disk is below 20%."
query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0"
severity: warning
for: 2m
- name: ClickHouse Disk Space Critical on Default
description: "Disk space on default disk is critically low, below 10%."
query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0"
severity: critical
for: 2m
- name: ClickHouse Disk Space Low on Backups
# Same expression for the disk named "backups".
description: "Disk space on backups disk is below 20%."
query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0"
severity: warning
for: 2m
- name: ClickHouse Replica Errors
# Uses increase() rather than an exact `== 1` match: on a cumulative error
# count, equality with 1 would only match while exactly one error has been
# recorded and would go silent from the second error onwards.
description: "Critical replica errors detected, either all replicas are stale or lost."
query: "increase(ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE[5m]) > 0 or increase(ClickHouseErrorMetric_ALL_REPLICAS_LOST[5m]) > 0"
severity: critical
- name: ClickHouse No Available Replicas
# Fires when the error count grew within the last 5 minutes.
description: "No available replicas in ClickHouse."
query: "increase(ClickHouseErrorMetric_NO_AVAILABLE_REPLICA[5m]) > 0"
severity: critical
- name: ClickHouse No Live Replicas
# Fires when the error count grew within the last 5 minutes.
description: "There are too few live replicas available, risking data loss and service disruption."
query: "increase(ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS[5m]) > 0"
severity: critical
- name: ClickHouse High TCP Connections
description: "High number of TCP connections, indicating heavy client or inter-cluster communication."
query: "ClickHouseMetrics_TCPConnection > 400"
severity: warning
for: 5m
comments: |
Please replace the threshold with an appropriate value
- name: ClickHouse Interserver Connection Issues
description: "High number of interserver connections may indicate replication or distributed query handling issues."
query: "ClickHouseMetrics_InterserverConnection > 50"
severity: warning
for: 5m
comments: |
Adjust the threshold based on your cluster size and expected replication traffic.
- name: ClickHouse ZooKeeper Connection Issues
# Any session count other than exactly 1 is treated as a problem.
description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination."
query: "ClickHouseMetrics_ZooKeeperSession != 1"
severity: warning
for: 3m
- name: ClickHouse Authentication Failures
# More than 3 new failures within 5 minutes; informational severity.
description: "Authentication failures detected, indicating potential security issues or misconfiguration."
query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3"
severity: info
- name: ClickHouse Access Denied Errors
description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts."
query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3"
severity: info
- name: ClickHouse rejected insert queries
description: "INSERTs rejected due to too many active data parts. Reduce insert frequency."
query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0"
severity: warning
for: 1m
- name: ClickHouse delayed insert queries
description: "INSERTs delayed due to high number of active parts."
query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0"
severity: warning
for: 2m
- name: ClickHouse zookeeper hardware exception
description: "Zookeeper hardware exception: network issues communicating with ZooKeeper"
query: "increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0"
severity: critical
for: 1m
- name: ClickHouse high network usage
# 100*1024*1024 bytes per second in either direction (send or receive).
description: High network usage. ClickHouse network usage exceeds 100MB/s.
query: "rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024"
severity: warning
for: 2m
comments: |
Please replace the threshold with an appropriate value
- name: ClickHouse distributed rejected inserts
description: "INSERTs into Distributed tables rejected due to pending bytes limit."
query: "increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0"
severity: critical
for: 2m
- name: CouchDB
exporters:
- name: gesellix/couchdb-prometheus-exporter
slug: gesellix-couchdb-prometheus-exporter
doc_url: https://github.com/gesellix/couchdb-prometheus-exporter
rules:
- name: CouchDB node down
description: CouchDB node is not responding (node_up metric is 0) for more than 2 minutes
query: "couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0"
severity: critical
for: 2m
- name: CouchDB atom memory usage critical
description: Atom memory usage is above 90% of limit
query: "couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom"
severity: critical
for: 5m
- name: CouchDB open databases critical
description: Number of open databases exceeds 90% of node capacity
query: "couchdb_httpd_open_databases > 0.9 * 1000"
severity: critical
for: 5m
- name: CouchDB open OS files critical
description: CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files
query: "couchdb_httpd_open_os_files > 0.9 * 65535"
severity: critical
for: 5m
- name: CouchDB 5xx error ratio high
description: More than 5% of HTTP requests are returning 5xx errors
# The 5xx series carry a `code` label; it is aggregated away so both sides of the division
# share one label set (assumes couchdb_httpd_requests has no `code` label - verify against the exporter).
# Summing over `code` also makes the ratio cover all 5xx codes together rather than each code separately.
query: "sum without (code) (rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0"
severity: critical
for: 5m
- name: CouchDB temporary view read rate critical
description: Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation
query: "rate(couchdb_httpd_temporary_view_reads[5m]) > 100"
severity: critical
for: 5m
- name: CouchDB Mango queries scanning too many docs
description: Some Mango queries are scanning too many documents, consider adding indexes
query: "rate(couchdb_mango_too_many_docs_scanned[5m]) > 50"
severity: warning
for: 5m
- name: CouchDB Mango queries failed due to invalid index
description: Some Mango queries failed to execute because the index was missing or invalid
query: "rate(couchdb_mango_query_invalid_index[5m]) > 5"
severity: warning
for: 5m
- name: CouchDB Mango docs examined high
description: High number of documents examined per Mango queries, consider indexing
query: "rate(couchdb_mango_docs_examined[5m]) > 1000"
severity: warning
for: 5m
- name: CouchDB Replicator manager died
description: Replication manager process has crashed
query: "increase(couchdb_replicator_changes_manager_deaths[5m]) > 0"
severity: critical
for: 1m
- name: CouchDB Replicator queue process died
description: Replication queue process has crashed
query: "increase(couchdb_replicator_changes_queue_deaths[5m]) > 0"
severity: critical
for: 1m
- name: CouchDB Replicator reader process died
description: Replication reader process has crashed
query: "increase(couchdb_replicator_changes_reader_deaths[5m]) > 0"
severity: critical
for: 1m
- name: CouchDB Replicator failed to start
description: One or more replication tasks failed to start
query: "increase(couchdb_replicator_failed_starts[5m]) > 0"
severity: critical
for: 1m
- name: CouchDB replication cluster unstable
description: The replication cluster is unstable, replication may be interrupted
query: "couchdb_replicator_cluster_is_stable == 0"
severity: critical
for: 2m
- name: CouchDB replication read failures
description: Replication changes feed has failed reads more than 5 times in 5 minutes
query: "increase(couchdb_replicator_changes_read_failures[5m]) > 5"
severity: warning
for: 5m
- name: CouchDB file descriptors high
description: Process is using more than 85% of allowed file descriptors
query: "process_open_fds / process_max_fds > 0.85 and process_max_fds > 0"
severity: warning
for: 5m
- name: CouchDB process restarted
description: CouchDB process has restarted recently
query: "changes(process_start_time_seconds[1h]) > 0"
severity: info
for: 1m
- name: CouchDB critical log entries
description: Critical or error log entries detected in the last 5 minutes
query: "increase(couchdb_server_couch_log{level=~\"error|critical\"}[5m]) > 0"
severity: critical
for: 1m
- name: Solr
exporters:
- name: embedded exporter
slug: embedded-exporter
doc_url: https://solr.apache.org/guide/8_11/monitoring-solr-with-prometheus-and-grafana.html
rules:
- name: Solr update errors
description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.
query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1"
severity: critical
- name: Solr query errors
description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.
query: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1'
severity: warning
for: 5m
- name: Solr replication errors
description: Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.
query: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1'
severity: critical
- name: Solr low live node count
description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}.
query: "solr_collections_live_nodes < 2"
severity: critical
- name: Message brokers
services:
- name: RabbitMQ
exporters:
- name: rabbitmq/rabbitmq-prometheus
slug: rabbitmq-exporter
doc_url: https://github.com/rabbitmq/rabbitmq-prometheus
rules:
- name: RabbitMQ node down
description: Less than 3 nodes running in RabbitMQ cluster
query: "sum(rabbitmq_build_info) < 3"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: RabbitMQ node not distributed
description: Distribution link state is not 'up'
query: "erlang_vm_dist_node_state < 3"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: RabbitMQ instances different versions
description: Running different versions of RabbitMQ in the same cluster can lead to failure.
query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1"
severity: warning
for: 1h
- name: RabbitMQ memory high
description: A node uses more than 90% of allocated RAM
query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0"
severity: warning
for: 2m
- name: RabbitMQ file descriptors usage
description: A node uses more than 90% of file descriptors
query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0"
severity: warning
for: 2m
- name: RabbitMQ too many ready messages
# The query aggregates by `queue`, so `queue` is the only label left for the description template.
description: RabbitMQ too many ready messages on queue {{ $labels.queue }}
query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000"
severity: warning
for: 1m
- name: RabbitMQ too many unack messages
description: Too many unacknowledged messages
query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000"
severity: warning
for: 1m
- name: RabbitMQ too many connections
description: The total connections of a node is too high
query: "rabbitmq_connections > 1000"
severity: warning
for: 2m
- name: RabbitMQ no queue consumer
description: A queue has less than 1 consumer
query: "rabbitmq_queue_consumers < 1"
severity: warning
for: 1m # allows a short service restart
- name: RabbitMQ unroutable messages
description: A queue has unroutable messages ({{ $value }} in the last 1m)
query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0"
severity: warning
for: 2m
- name: kbudde/rabbitmq-exporter
slug: kbudde-rabbitmq-exporter
doc_url: https://github.com/kbudde/rabbitmq_exporter
rules:
- name: RabbitMQ down
description: RabbitMQ node down
query: "rabbitmq_up == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: RabbitMQ cluster down
description: Less than 3 nodes running in RabbitMQ cluster
query: "sum(rabbitmq_running) < 3"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: RabbitMQ cluster partition
description: Cluster partition
query: "rabbitmq_partitions > 0"
severity: critical
- name: RabbitMQ out of memory
description: Memory available for RabbitMQ is low (< 10%)
query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0"
severity: warning
for: 2m
- name: RabbitMQ instance too many connections
description: RabbitMQ instance has too many connections (> 1000)
query: "rabbitmq_connectionsTotal > 1000"
severity: warning
for: 2m
- name: RabbitMQ dead letter queue filling up
description: Dead letter queue is filling up (> 10 msgs)
query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
severity: warning
for: 1m
comments: |
Indicate the queue name in dedicated label.
- name: RabbitMQ too many messages in queue
description: Queue is filling up (> 1000 msgs)
query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
severity: warning
for: 2m
comments: |
Indicate the queue name in dedicated label.
- name: RabbitMQ slow queue consuming
description: Queue messages are consumed slowly (> 60s)
query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
severity: warning
for: 2m
comments: |
Indicate the queue name in dedicated label.
- name: RabbitMQ no consumer
description: Queue has no consumer
query: "rabbitmq_queue_consumers == 0"
severity: critical
for: 5m
comments: |
Allows a short service restart.
- name: RabbitMQ too many consumers
description: Queue should have only 1 consumer
query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
severity: critical
comments: |
Indicate the queue name in dedicated label.
- name: RabbitMQ inactive exchange
description: Exchange receives fewer than 5 msgs per second
query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
severity: warning
comments: |
Indicate the exchange name in dedicated label.
for: 2m
- name: Zookeeper
exporters:
- name: cloudflare/kafka_zookeeper_exporter
slug: cloudflare-kafka-zookeeper-exporter
doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter
rules:
- name: dabealu/zookeeper-exporter
slug: dabealu-zookeeper-exporter
doc_url: https://github.com/dabealu/zookeeper-exporter
rules:
- name: Zookeeper Down
description: "Zookeeper down on instance {{ $labels.instance }}"
query: "zk_up == 0"
severity: critical
for: 1m
comments: |
1m delay allows a restart without triggering an alert.
- name: Zookeeper missing leader
description: "Zookeeper cluster has no node marked as leader"
query: "sum(zk_server_leader) == 0"
severity: critical
- name: Zookeeper Too Many Leaders
description: "Zookeeper cluster has too many nodes marked as leader"
query: "sum(zk_server_leader) > 1"
severity: critical
- name: Zookeeper Not Ok
description: "Zookeeper instance is not ok"
query: "zk_ruok == 0"
severity: warning
for: 3m
- name: Kafka
exporters:
- name: danielqsj/kafka_exporter
slug: danielqsj-kafka-exporter
doc_url: https://github.com/danielqsj/kafka_exporter
rules:
- name: Kafka topics replicas
description: Kafka topic {{ $labels.topic }} has a partition with fewer than 3 in-sync replicas
query: "min(kafka_topic_partition_in_sync_replica) by (topic) < 3"
severity: critical
- name: Kafka consumer group lag
description: Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)
query: "sum(kafka_consumergroup_lag) by (consumergroup) > 10000"
severity: warning
for: 1m
- name: linkedin/Burrow
slug: linkedin-kafka-exporter
doc_url: https://github.com/linkedin/Burrow
rules:
- name: Kafka topic offset decreased
description: Kafka topic offset has decreased
query: "delta(kafka_burrow_partition_current_offset[1m]) < 0"
severity: warning
- name: Kafka consumer lag
description: Kafka consumer has a 30 minutes and increasing lag
query: "kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0"
severity: warning
for: 15m
- name: Pulsar
exporters:
- name: embedded exporter
slug: embedded-exporter
doc_url: https://pulsar.apache.org/docs/reference-metrics/
rules:
- name: Pulsar subscription high number of backlog entries
description: "The number of subscription backlog entries is over 5k"
query: sum(pulsar_subscription_back_log) by (subscription) > 5000
for: 1h
severity: warning
- name: Pulsar subscription very high number of backlog entries
description: "The number of subscription backlog entries is over 100k"
query: sum(pulsar_subscription_back_log) by (subscription) > 100000
for: 1h
severity: critical
- name: Pulsar topic large backlog storage size
description: "The topic backlog storage size is over 5 GB"
query: sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024
for: 1h
severity: warning
- name: Pulsar topic very large backlog storage size
description: "The topic backlog storage size is over 20 GB"
query: sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024
for: 1h
severity: critical
- name: Pulsar high write latency
description: "Messages cannot be written in a timely fashion"
query: sum(pulsar_storage_write_latency_overflow > 0) by (topic)
for: 1h
severity: critical
- name: Pulsar large message payload
description: "Observing large message payload (> 1MB)"
query: sum(pulsar_entry_size_overflow > 0) by (topic)
for: 1h
severity: warning
- name: Pulsar high ledger disk usage
description: "Observing Ledger Disk Usage (> 75%)"
query: sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75
for: 1h
severity: critical
- name: Pulsar read only bookies
description: "Observing Readonly Bookies"
query: count(bookie_SERVER_STATUS{} == 0) by (pod)
for: 5m
severity: critical
- name: Pulsar high number of function errors
# rate() yields a per-second value, so the threshold below is 10 errors per second.
description: "Observing more than 10 Function errors per second"
query: sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10
for: 1m
severity: critical
- name: Pulsar high number of sink errors
# rate() yields a per-second value, so the threshold below is 10 errors per second.
description: "Observing more than 10 Sink errors per second"
query: sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10
for: 1m
severity: critical
- name: Nats
exporters:
- name: nats-io/prometheus-nats-exporter
slug: nats-exporter
doc_url: https://github.com/nats-io/prometheus-nats-exporter
rules:
- name: Nats high routes count
description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
query: "gnatsd_varz_routes > 10"
severity: warning
for: 3m
- name: Nats high memory usage
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
query: "gnatsd_varz_mem > 200 * 1024 * 1024"
severity: warning
for: 5m
- name: Nats slow consumers
description: There are slow consumers in NATS for {{ $labels.instance }}
query: "gnatsd_varz_slow_consumers > 0"
severity: critical
for: 3m
- name: Nats server down
description: NATS server has been down for more than 5 minutes
# `up == 0` covers a target that is still configured but failing its scrape;
# absent() covers the target disappearing entirely. Adjust the job label to your scrape config.
query: 'up{job="nats"} == 0 or absent(up{job="nats"})'
severity: critical
for: 5m
- name: Nats high CPU usage
description: NATS server is using more than 80% CPU for the last 5 minutes
query: "gnatsd_varz_cpu > 80"
severity: warning
for: 5m
comments: |
gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
- name: Nats high number of connections
description: NATS server has more than 1000 active connections
query: "gnatsd_connz_num_connections > 1000"
severity: warning
for: 5m
- name: Nats high JetStream store usage
description: JetStream store usage is over 80%
query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0"
severity: warning
for: 5m
- name: Nats high JetStream memory usage
description: JetStream memory usage is over 80%
query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0"
severity: warning
for: 5m
- name: Nats high number of subscriptions
description: NATS server has more than 1000 active subscriptions
query: "gnatsd_connz_subscriptions > 1000"
severity: warning
for: 5m
- name: Nats high pending bytes
description: NATS server has more than 100,000 pending bytes
query: "gnatsd_connz_pending_bytes > 100000"
severity: warning
for: 5m
- name: Nats too many errors
description: NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes
query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
severity: warning
for: 5m
- name: Nats JetStream accounts exceeded
description: JetStream has more than 100 active accounts
query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
severity: warning
for: 5m
- name: Nats leaf node connection issue
description: No leaf node connections on {{ $labels.instance }}
query: "gnatsd_varz_leafnodes == 0"
severity: warning
for: 5m
- name: Proxies, load balancers and service meshes
services:
- name: Nginx
exporters:
- name: knyar/nginx-lua-prometheus
slug: knyar-nginx-exporter
doc_url: https://github.com/knyar/nginx-lua-prometheus
rules:
- name: Nginx high HTTP 4xx error rate
description: Too many HTTP requests with status 4xx (> 5%)
query: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
severity: critical
for: 1m
- name: Nginx high HTTP 5xx error rate
description: Too many HTTP requests with status 5xx (> 5%)
query: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
severity: critical
for: 1m
- name: Nginx latency high
description: Nginx p99 latency is higher than 3 seconds
query: "histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3"
severity: warning
for: 2m
- name: Apache
exporters:
- name: Lusitaniae/apache_exporter
slug: lusitaniae-apache-exporter
doc_url: https://github.com/Lusitaniae/apache_exporter
rules:
- name: Apache down
description: Apache down
query: "apache_up == 0"
severity: critical
- name: Apache workers load
description: Apache workers in busy state are approaching the max workers count (> 80% of workers busy) on {{ $labels.instance }}
query: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0'
severity: warning
for: 2m
- name: Apache restart
description: Apache has just been restarted.
query: "apache_uptime_seconds_total / 60 < 1"
severity: warning
- name: HaProxy
exporters:
- name: Embedded exporter (HAProxy >= v2)
slug: embedded-exporter-v2
doc_url: https://github.com/haproxy/haproxy/tree/master/contrib/prometheus-exporter
rules:
- name: HAProxy high HTTP 4xx error rate backend
# The query aggregates by `proxy`, so `proxy` is the only label left for the description template.
description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}
query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0
severity: critical
for: 1m
- name: HAProxy high HTTP 5xx error rate backend
# The query aggregates by `proxy`, so `proxy` is the only label left for the description template.
description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}
query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0
severity: critical
for: 1m
- name: HAProxy high HTTP 4xx error rate server
description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0
severity: critical
for: 1m
- name: HAProxy high HTTP 5xx error rate server
description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0
severity: critical
for: 1m
- name: HAProxy server response errors
description: Too many response errors to {{ $labels.server }} server (> 5%).
query: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0
severity: critical
for: 1m
- name: HAProxy backend connection errors
# The query aggregates by `proxy`, so `proxy` is the only label left for the description template.
description: Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.
query: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100
severity: critical
for: 1m
- name: HAProxy server connection errors
description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
# Aggregated per server so the `server` label used in the description survives the sum,
# matching the other per-server HAProxy rules in this exporter section.
query: (sum by (server) (rate(haproxy_server_connection_errors_total[1m]))) > 100
severity: critical
- name: HAProxy backend max active session > 80%
description: Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf "%.2f"}}%
query: ((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80
severity: warning
for: 2m
- name: HAProxy pending requests
description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
query: sum by (proxy) (haproxy_backend_current_queue) > 0
comments: |
haproxy_backend_current_queue is a gauge (current queue depth), not a counter.
severity: warning
for: 2m
- name: HAProxy HTTP slowing down
description: Average request time is increasing - {{ $value | printf "%.2f"}}
query: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1
severity: warning
for: 1m
- name: HAProxy retry high
description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10
severity: warning
for: 2m
- name: HAproxy has no alive backends
description: HAProxy has no alive active or backup backends for {{ $labels.proxy }}
query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0
severity: critical
- name: HAProxy frontend security blocked requests
description: HAProxy is blocking requests for security reason
query: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10
severity: warning
for: 2m
- name: HAProxy server healthcheck failure
description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
query: increase(haproxy_server_check_failures_total[1m]) > 0
severity: warning
for: 1m
- name: prometheus/haproxy_exporter (HAProxy < v2)
slug: haproxy-exporter-v1
doc_url: https://github.com/prometheus/haproxy_exporter
rules:
- name: HAProxy down
description: HAProxy down
query: "haproxy_up == 0"
severity: critical
- name: HAProxy high HTTP 4xx error rate backend (v1)
# Only `backend` survives the aggregation, so it is the only label used in the description.
description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.backend }}
# The ratio is scaled by 100 so the `> 5` threshold is a percentage; an unscaled ratio never exceeds 1.
query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
severity: critical
for: 1m
- name: HAProxy high HTTP 5xx error rate backend (v1)
# Only `backend` survives the aggregation, so it is the only label used in the description.
description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.backend }}
# The ratio is scaled by 100 so the `> 5` threshold is a percentage; an unscaled ratio never exceeds 1.
query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
severity: critical
for: 1m
- name: HAProxy high HTTP 4xx error rate server (v1)
description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
severity: critical
for: 1m
- name: HAProxy high HTTP 5xx error rate server (v1)
description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
severity: critical
for: 1m
- name: HAProxy server response errors (v1)
description: Too many response errors to {{ $labels.server }} server (> 5%).
query: "sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0"
severity: critical
for: 1m
- name: HAProxy backend connection errors (v1)
description: Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.
query: "sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100"
severity: critical
for: 1m
- name: HAProxy server connection errors (v1)
description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
query: "sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100"
severity: critical
- name: HAProxy backend max active session
description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
query: "((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0"
severity: warning
for: 2m
- name: HAProxy pending requests (v1)
description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend
query: "sum by (backend) (haproxy_backend_current_queue) > 0"
severity: warning
for: 2m
- name: HAProxy HTTP slowing down (v1)
description: Average request time is increasing
query: "avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1"
severity: warning
for: 1m
- name: HAProxy retry high (v1)
description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend
query: "sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10"
severity: warning
for: 2m
- name: HAProxy backend down
description: HAProxy backend is down
query: "haproxy_backend_up == 0"
severity: critical
- name: HAProxy server down
description: HAProxy server is down
query: "haproxy_server_up == 0"
severity: critical
- name: HAProxy frontend security blocked requests (v1)
description: HAProxy is blocking requests for security reason
query: "sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10"
severity: warning
for: 2m
- name: HAProxy server healthcheck failure (v1)
description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
query: "increase(haproxy_server_check_failures_total[1m]) > 0"
severity: warning
for: 1m
- name: Traefik
exporters:
- name: Embedded exporter v2
slug: embedded-exporter-v2
doc_url: https://docs.traefik.io/observability/metrics/prometheus/
rules:
- name: Traefik service down
description: All Traefik services are down
# sum() rather than count(): count() of existing series is always >= 1 and can never equal 0,
# whereas the sum is 0 when every series for the service reports 0.
query: "sum(traefik_service_server_up) by (service) == 0"
severity: critical
- name: Traefik high HTTP 4xx error rate service
description: Traefik service 4xx error rate is above 5%
query: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0'
severity: critical
for: 1m
- name: Traefik high HTTP 5xx error rate service
description: Traefik service 5xx error rate is above 5%
query: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0'
severity: critical
for: 1m
- name: Embedded exporter v1
slug: embedded-exporter-v1
doc_url: https://docs.traefik.io/observability/metrics/prometheus/
rules:
- name: Traefik backend down
description: All Traefik backends are down
# sum() rather than count(): count() of existing series is always >= 1 and can never equal 0,
# whereas the sum is 0 when every series for the backend reports 0.
query: "sum(traefik_backend_server_up) by (backend) == 0"
severity: critical
- name: Traefik high HTTP 4xx error rate backend
description: Traefik backend 4xx error rate is above 5%
query: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0'
severity: critical
for: 1m
- name: Traefik high HTTP 5xx error rate backend
description: Traefik backend 5xx error rate is above 5%
query: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0'
severity: critical
for: 1m
- name: Caddy
exporters:
- name: Embedded exporter
slug: embedded-exporter
doc_url: https://caddyserver.com/docs/metrics
rules:
- name: Caddy Reverse Proxy Down
description: "All Caddy reverse proxies are down"
# sum() rather than count(): count() of existing series is always >= 1 and can never equal 0,
# whereas the sum is 0 when every series for the upstream reports 0.
query: "sum(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0"
severity: critical
- name: Caddy high HTTP 4xx error rate service
description: "Caddy service 4xx error rate is above 5%"
query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
severity: critical
for: 1m
- name: Caddy high HTTP 5xx error rate service
description: "Caddy service 5xx error rate is above 5%"
query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
severity: critical
for: 1m
- name: Envoy
exporters:
- name: Built-in metrics
slug: embedded-exporter
doc_url: https://www.envoyproxy.io/docs/envoy/latest/configuration/observability/statistics
rules:
- name: Envoy server not live
description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}"
query: "envoy_server_live != 1"
severity: critical
for: 1m
- name: Envoy high memory usage
description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}"
query: "envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0"
severity: warning
for: 5m
- name: Envoy high downstream HTTP 5xx error rate
description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
severity: critical
for: 1m
- name: Envoy high downstream HTTP 4xx error rate
description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
severity: warning
for: 5m
- name: Envoy downstream connections overflowing
description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)"
query: "increase(envoy_listener_downstream_cx_overflow[5m]) > 5"
severity: warning
- name: Envoy cluster membership empty
description: "Envoy cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} has no healthy members"
query: "envoy_cluster_membership_healthy == 0"
severity: critical
for: 1m
- name: Envoy cluster membership degraded
description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy"
query: "envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0"
severity: warning
for: 5m
- name: Envoy high cluster upstream connection failures
description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)"
query: "increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10"
severity: warning
for: 5m
- name: Envoy high cluster upstream request timeout rate
description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
query: "rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0"
severity: warning
for: 5m
- name: Envoy high cluster upstream 5xx error rate
description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
query: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
severity: critical
for: 1m
- name: Envoy cluster health check failures
description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)"
query: "increase(envoy_cluster_health_check_failure[5m]) > 5"
severity: warning
for: 5m
- name: Envoy cluster outlier detection ejections active
description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
query: "envoy_cluster_outlier_detection_ejections_active > 0"
severity: info
for: 5m
- name: Envoy listener SSL connection errors
description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)"
query: "increase(envoy_listener_ssl_connection_error[5m]) > 5"
severity: warning
- name: Envoy global downstream connections overflowing
description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)"
query: "increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5"
severity: critical
- name: Envoy SSL certificate expiring soon
description: "SSL certificate loaded by Envoy on {{ $labels.instance }} expires in less than 7 days"
query: "envoy_server_days_until_first_cert_expiring < 7"
severity: warning
- name: Envoy SSL certificate expired
description: "SSL certificate loaded by Envoy on {{ $labels.instance }} has expired or expires in less than 24 hours"
query: "envoy_server_days_until_first_cert_expiring <= 0"
severity: critical
- name: Envoy cluster circuit breaker tripped
description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
query: "envoy_cluster_circuit_breakers_default_cx_open == 1 or envoy_cluster_circuit_breakers_default_rq_open == 1"
severity: critical
- name: Envoy no healthy upstream
description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)"
query: "increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0"
severity: critical
- name: Envoy high downstream request timeout rate
description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)"
query: "increase(envoy_http_downstream_rq_timeout[5m]) > 5"
severity: warning
for: 5m
- name: Linkerd
exporters:
- name: Embedded exporter
slug: embedded-exporter
doc_url: https://linkerd.io/2/tasks/exporting-metrics/
rules:
- name: Linkerd high error rate
description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%"
query: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0'
comments: |
Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}.
severity: warning
for: 1m
- name: Istio
exporters:
- name: Embedded exporter
slug: embedded-exporter
doc_url: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/
rules:
- name: Istio Kubernetes gateway availability drop
description: Gateway pods have dropped. Inbound traffic will likely be affected.
query: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2'
severity: warning
for: 1m
- name: Istio Pilot high push error rate
description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.
query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0"
severity: warning
for: 1m
- name: Istio Mixer Prometheus dispatches low
description: Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be exported properly.
query: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
severity: warning
for: 1m
- name: Istio high total request rate
description: Global request rate in the service mesh is unusually high.
query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
severity: warning
for: 2m
- name: Istio low total request rate
description: Global request rate in the service mesh is unusually low.
query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
severity: warning
for: 2m
- name: Istio high 4xx error rate
description: High percentage of HTTP 4xx responses in Ist
gitextract_epx1dxil/ ├── .github/ │ ├── FUNDING.yml │ ├── dependabot.yml │ └── workflows/ │ ├── dist.yml │ └── test.yml ├── .gitignore ├── .travis.yml ├── CLAUDE.md ├── CONTRIBUTING.md ├── Gemfile ├── LICENSE ├── README.md ├── _config.yml ├── _data/ │ └── rules.yml ├── _layouts/ │ └── default.html ├── alertmanager.md ├── assets/ │ ├── css/ │ │ └── app.css │ └── js/ │ └── app.js ├── blackbox-exporter.md ├── dist/ │ ├── rules/ │ │ ├── apache/ │ │ │ └── lusitaniae-apache-exporter.yml │ │ ├── apache-flink/ │ │ │ └── flink-prometheus-reporter.yml │ │ ├── apache-spark/ │ │ │ └── spark-prometheus.yml │ │ ├── apc-ups/ │ │ │ └── apcupsd_exporter.yml │ │ ├── argocd/ │ │ │ └── embedded-exporter.yml │ │ ├── aws-cloudwatch/ │ │ │ └── prometheus-cloudwatch-exporter.yml │ │ ├── azure/ │ │ │ └── azure-metrics-exporter.yml │ │ ├── blackbox/ │ │ │ └── blackbox-exporter.yml │ │ ├── caddy/ │ │ │ └── embedded-exporter.yml │ │ ├── cassandra/ │ │ │ ├── criteo-cassandra-exporter.yml │ │ │ └── instaclustr-cassandra-exporter.yml │ │ ├── ceph/ │ │ │ └── embedded-exporter.yml │ │ ├── cert-manager/ │ │ │ └── embedded-exporter.yml │ │ ├── cilium/ │ │ │ └── embedded-exporter.yml │ │ ├── clickhouse/ │ │ │ └── embedded-exporter.yml │ │ ├── cloudflare/ │ │ │ └── lablabs-cloudflare-exporter.yml │ │ ├── consul/ │ │ │ └── consul-exporter.yml │ │ ├── coredns/ │ │ │ └── embedded-exporter.yml │ │ ├── cortex/ │ │ │ └── embedded-exporter.yml │ │ ├── couchdb/ │ │ │ └── gesellix-couchdb-prometheus-exporter.yml │ │ ├── digitalocean/ │ │ │ └── digitalocean-exporter.yml │ │ ├── docker-containers/ │ │ │ └── google-cadvisor.yml │ │ ├── ebpf/ │ │ │ └── ebpf-exporter.yml │ │ ├── elasticsearch/ │ │ │ └── prometheus-community-elasticsearch-exporter.yml │ │ ├── envoy/ │ │ │ └── embedded-exporter.yml │ │ ├── etcd/ │ │ │ └── embedded-exporter.yml │ │ ├── fluxcd/ │ │ │ └── embedded-exporter.yml │ │ ├── freeswitch/ │ │ │ └── znerol-freeswitch-exporter.yml │ │ ├── gitlab-ci/ │ │ │ ├── gitaly.yml │ │ │ ├── 
gitlab-built-in-exporter.yml │ │ │ └── workhorse.yml │ │ ├── golang/ │ │ │ └── golang-exporter.yml │ │ ├── google-cloud-stackdriver/ │ │ │ └── stackdriver-exporter.yml │ │ ├── grafana-alloy/ │ │ │ └── embedded-exporter.yml │ │ ├── grafana-mimir/ │ │ │ └── embedded-exporter.yml │ │ ├── grafana-tempo/ │ │ │ └── embedded-exporter.yml │ │ ├── graph-node/ │ │ │ └── embedded-exporter.yml │ │ ├── hadoop/ │ │ │ └── jmx_exporter.yml │ │ ├── haproxy/ │ │ │ ├── embedded-exporter-v2.yml │ │ │ └── haproxy-exporter-v1.yml │ │ ├── hashicorp-vault/ │ │ │ └── embedded-exporter.yml │ │ ├── host-and-hardware/ │ │ │ └── node-exporter.yml │ │ ├── ipmi/ │ │ │ └── ipmi-exporter.yml │ │ ├── istio/ │ │ │ └── embedded-exporter.yml │ │ ├── jaeger/ │ │ │ └── embedded-exporter.yml │ │ ├── jenkins/ │ │ │ └── metric-plugin.yml │ │ ├── juniper/ │ │ │ └── czerwonk-junos-exporter.yml │ │ ├── jvm/ │ │ │ └── jvm-exporter.yml │ │ ├── kafka/ │ │ │ ├── danielqsj-kafka-exporter.yml │ │ │ └── linkedin-kafka-exporter.yml │ │ ├── keycloak/ │ │ │ └── aerogear-keycloak-metrics-spi.yml │ │ ├── kubernetes/ │ │ │ └── kubestate-exporter.yml │ │ ├── linkerd/ │ │ │ └── embedded-exporter.yml │ │ ├── loki/ │ │ │ └── embedded-exporter.yml │ │ ├── meilisearch/ │ │ │ └── embedded-exporter.yml │ │ ├── memcached/ │ │ │ └── memcached-exporter.yml │ │ ├── minio/ │ │ │ └── embedded-exporter.yml │ │ ├── mongodb/ │ │ │ ├── dcu-mongodb-exporter.yml │ │ │ ├── percona-mongodb-exporter.yml │ │ │ └── stefanprodan-mgob-exporter.yml │ │ ├── mysql/ │ │ │ └── mysqld-exporter.yml │ │ ├── nats/ │ │ │ └── nats-exporter.yml │ │ ├── netdata/ │ │ │ └── embedded-exporter.yml │ │ ├── nginx/ │ │ │ └── knyar-nginx-exporter.yml │ │ ├── nomad/ │ │ │ └── embedded-exporter.yml │ │ ├── openebs/ │ │ │ └── embedded-exporter.yml │ │ ├── openstack/ │ │ │ └── openstack-exporter.yml │ │ ├── opentelemetry-collector/ │ │ │ └── embedded-exporter.yml │ │ ├── oracle-database/ │ │ │ └── iamseth-oracledb-exporter.yml │ │ ├── patroni/ │ │ │ └── 
embedded-exporter-patroni.yml │ │ ├── pgbouncer/ │ │ │ └── spreaker-pgbouncer-exporter.yml │ │ ├── php-fpm/ │ │ │ └── bakins-fpm-exporter.yml │ │ ├── postgresql/ │ │ │ └── postgres-exporter.yml │ │ ├── process-exporter/ │ │ │ └── process-exporter.yml │ │ ├── prometheus-self-monitoring/ │ │ │ └── embedded-exporter.yml │ │ ├── promtail/ │ │ │ └── embedded-exporter.yml │ │ ├── proxmox-ve/ │ │ │ └── prometheus-pve-exporter.yml │ │ ├── pulsar/ │ │ │ └── embedded-exporter.yml │ │ ├── python/ │ │ │ └── python-exporter.yml │ │ ├── rabbitmq/ │ │ │ ├── kbudde-rabbitmq-exporter.yml │ │ │ └── rabbitmq-exporter.yml │ │ ├── redis/ │ │ │ └── oliver006-redis-exporter.yml │ │ ├── ruby/ │ │ │ └── ruby-exporter.yml │ │ ├── s.m.a.r.t-device-monitoring/ │ │ │ └── smartctl-exporter.yml │ │ ├── sidekiq/ │ │ │ └── strech-sidekiq-exporter.yml │ │ ├── snmp/ │ │ │ └── snmp-exporter.yml │ │ ├── solr/ │ │ │ └── embedded-exporter.yml │ │ ├── speedtest/ │ │ │ └── nlamirault-speedtest-exporter.yml │ │ ├── spinnaker/ │ │ │ └── embedded-exporter.yml │ │ ├── sql-server/ │ │ │ └── ozarklake-mssql-exporter.yml │ │ ├── ssl/ │ │ │ └── tls/ │ │ │ └── ribbybibby-ssl-exporter.yml │ │ ├── systemd/ │ │ │ └── systemd-exporter.yml │ │ ├── thanos/ │ │ │ ├── thanos-bucket-replicate.yml │ │ │ ├── thanos-compactor.yml │ │ │ ├── thanos-component-absent.yml │ │ │ ├── thanos-query.yml │ │ │ ├── thanos-receiver.yml │ │ │ ├── thanos-ruler.yml │ │ │ ├── thanos-sidecar.yml │ │ │ └── thanos-store.yml │ │ ├── traefik/ │ │ │ ├── embedded-exporter-v1.yml │ │ │ └── embedded-exporter-v2.yml │ │ ├── vmware/ │ │ │ └── pryorda-vmware-exporter.yml │ │ ├── windows-server/ │ │ │ └── windows-exporter.yml │ │ ├── wireguard/ │ │ │ └── mindflavor-prometheus-wireguard-exporter.yml │ │ ├── zfs/ │ │ │ ├── node-exporter.yml │ │ │ └── zfs_exporter.yml │ │ └── zookeeper/ │ │ ├── cloudflare-kafka-zookeeper-exporter.yml │ │ └── dabealu-zookeeper-exporter.yml │ └── template.yml ├── docker-compose.yml ├── index.md ├── package.json ├── rules.md 
└── sleep-peacefully.md
Condensed preview — 133 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (929K chars).
[
{
"path": ".github/FUNDING.yml",
"chars": 37,
"preview": "github: [samber]\nko_fi: samuelberthe\n"
},
{
"path": ".github/dependabot.yml",
"chars": 123,
"preview": "---\nversion: 2\nupdates:\n - package-ecosystem: \"github-actions\"\n directory: \"/\"\n schedule:\n interval: \"monthl"
},
{
"path": ".github/workflows/dist.yml",
"chars": 2449,
"preview": "name: Publish\n\non:\n workflow_dispatch:\n push:\n branches:\n - master\n\npermissions:\n contents: write\n\njobs:\n pu"
},
{
"path": ".github/workflows/test.yml",
"chars": 1916,
"preview": "name: Promtool check\n\non:\n pull_request:\n push:\n branches:\n - master\n\njobs:\n promtool-check:\n name: Check "
},
{
"path": ".gitignore",
"chars": 106,
"preview": "_site/\n.sass-cache/\n.jekyll-cache/\n.jekyll-metadata\n_data/rules.json\ntest/rules/\n/node_modules\n.worktrees/"
},
{
"path": ".travis.yml",
"chars": 38,
"preview": "language: node_js\nnode_js:\n - 'node'\n"
},
{
"path": "CLAUDE.md",
"chars": 17705,
"preview": "# CLAUDE.md\n\nThis file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.\n\n## "
},
{
"path": "CONTRIBUTING.md",
"chars": 859,
"preview": "\n# Contributing\n\n## Adding alerting rule\n\nIf you don't have time to write a PR, just copy and paste some alerts into an "
},
{
"path": "Gemfile",
"chars": 106,
"preview": "source 'https://rubygems.org'\ngem 'github-pages', '>= 232', group: :jekyll_plugins\ngem 'webrick', '~> 1.8'"
},
{
"path": "LICENSE",
"chars": 112,
"preview": "Creative Commons Attribution 4.0 International License (CC BY 4.0)\n\nhttp://creativecommons.org/licenses/by/4.0/\n"
},
{
"path": "README.md",
"chars": 9726,
"preview": "# 👋 Awesome Prometheus Alerts [](https://awesome.re)\n\n> Most alerting rules"
},
{
"path": "_config.yml",
"chars": 186,
"preview": "theme: jekyll-theme-cayman\n\ntitle: Awesome Prometheus alerts\ndescription: Collection of alerting rules\n\nrepository: samb"
},
{
"path": "_data/rules.yml",
"chars": 369597,
"preview": "#\n# The following yaml cannot be copy-pasted to Prometheus configuration.\n# Please navigate to https://samber.github"
},
{
"path": "_layouts/default.html",
"chars": 5345,
"preview": "<!DOCTYPE html>\n<html lang=\"{{ site.lang | default: \"en-US\" }}\">\n\n<head>\n <meta charset=\"UTF-8\">\n {% seo %}\n <meta na"
},
{
"path": "alertmanager.md",
"chars": 4066,
"preview": "<h1 style=\"text-align: center;\">\n Global configuration\n</h1>\n\nIf you notice a delay between an event and the first noti"
},
{
"path": "assets/css/app.css",
"chars": 5869,
"preview": "a.anchor {\n font-size: 15px;\n vertical-align: middle;\n color: darkblue;\n display: inline-block;\n padding-"
},
{
"path": "assets/js/app.js",
"chars": 648,
"preview": "$(function () {\n var clipboardRules = new ClipboardJS('.clipboard-single', {\n text: function (trigger) {\n "
},
{
"path": "blackbox-exporter.md",
"chars": 4357,
"preview": "\n<h1 style=\"text-align: center;\">\n Blackbox exporter\n</h1>\n\n## Wordwide probes\n\n<a href=\"https://github.com/prometheus/"
},
{
"path": "dist/rules/apache/lusitaniae-apache-exporter.yml",
"chars": 1169,
"preview": "groups:\n\n- name: LusitaniaeApacheExporter\n\n \n rules:\n\n - alert: ApacheDown\n expr: 'apache_up == 0'\n for: "
},
{
"path": "dist/rules/apache-flink/flink-prometheus-reporter.yml",
"chars": 6013,
"preview": "groups:\n\n- name: FlinkPrometheusReporter\n\n \n rules:\n\n - alert: FlinkJobIsNotRunning\n expr: 'flink_jobmanager_n"
},
{
"path": "dist/rules/apache-spark/spark-prometheus.yml",
"chars": 4540,
"preview": "groups:\n\n- name: SparkPrometheus\n\n # Spark exposes metrics via two built-in endpoints:\n # - PrometheusServlet: master/"
},
{
"path": "dist/rules/apc-ups/apcupsd_exporter.yml",
"chars": 2237,
"preview": "groups:\n\n- name: Apcupsd_exporter\n\n \n rules:\n\n - alert: ApcUpsBatteryNearlyEmpty\n expr: 'apcupsd_battery_charg"
},
{
"path": "dist/rules/argocd/embedded-exporter.yml",
"chars": 817,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: ArgocdServiceNotSynced\n expr: 'argocd_app_info{sync_st"
},
{
"path": "dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml",
"chars": 7159,
"preview": "groups:\n\n- name: PrometheusCloudwatchExporter\n\n # CloudWatch metrics are exported as aws_{namespace}_{metric_name}_{sta"
},
{
"path": "dist/rules/azure/azure-metrics-exporter.yml",
"chars": 2925,
"preview": "groups:\n\n- name: AzureMetricsExporter\n\n # The exporter uses azurerm_resource_metric as the default metric name for forw"
},
{
"path": "dist/rules/blackbox/blackbox-exporter.yml",
"chars": 3696,
"preview": "groups:\n\n- name: BlackboxExporter\n\n \n rules:\n\n - alert: BlackboxProbeFailed\n expr: 'probe_success == 0'\n "
},
{
"path": "dist/rules/caddy/embedded-exporter.yml",
"chars": 1589,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: CaddyReverseProxyDown\n expr: 'count(caddy_reverse_prox"
},
{
"path": "dist/rules/cassandra/criteo-cassandra-exporter.yml",
"chars": 8748,
"preview": "groups:\n\n- name: CriteoCassandraExporter\n\n \n rules:\n\n - alert: CassandraHintsCount\n expr: 'changes(cassandra_s"
},
{
"path": "dist/rules/cassandra/instaclustr-cassandra-exporter.yml",
"chars": 5815,
"preview": "groups:\n\n- name: InstaclustrCassandraExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert."
},
{
"path": "dist/rules/ceph/embedded-exporter.yml",
"chars": 4737,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: CephState\n expr: 'ceph_health_status != 0'\n for: "
},
{
"path": "dist/rules/cert-manager/embedded-exporter.yml",
"chars": 2341,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: Cert-managerAbsent\n expr: 'absent(up{job=\"cert-manager"
},
{
"path": "dist/rules/cilium/embedded-exporter.yml",
"chars": 16260,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n # Metric name depends on Cilium version. Use cilium_unreachable_node"
},
{
"path": "dist/rules/clickhouse/embedded-exporter.yml",
"chars": 9012,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n # Adjust the job label to match your Prometheus configuration.\n -"
},
{
"path": "dist/rules/cloudflare/lablabs-cloudflare-exporter.yml",
"chars": 1215,
"preview": "groups:\n\n- name: LablabsCloudflareExporter\n\n \n rules:\n\n - alert: CloudflareHttp4xxErrorRate\n expr: '(sum by(zo"
},
{
"path": "dist/rules/consul/consul-exporter.yml",
"chars": 1168,
"preview": "groups:\n\n- name: ConsulExporter\n\n \n rules:\n\n - alert: ConsulServiceHealthcheckFailed\n expr: 'consul_catalog_se"
},
{
"path": "dist/rules/coredns/embedded-exporter.yml",
"chars": 387,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: CorednsPanicCount\n expr: 'increase(coredns_panics_tota"
},
{
"path": "dist/rules/cortex/embedded-exporter.yml",
"chars": 2731,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: CortexRulerConfigurationReloadFailure\n expr: 'cortex_r"
},
{
"path": "dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml",
"chars": 7412,
"preview": "groups:\n\n- name: GesellixCouchdbPrometheusExporter\n\n \n rules:\n\n - alert: CouchdbNodeDown\n expr: 'couchdb_httpd"
},
{
"path": "dist/rules/digitalocean/digitalocean-exporter.yml",
"chars": 4350,
"preview": "groups:\n\n- name: DigitaloceanExporter\n\n \n rules:\n\n - alert: DigitaloceanDropletDown\n expr: 'digitalocean_dropl"
},
{
"path": "dist/rules/docker-containers/google-cadvisor.yml",
"chars": 5318,
"preview": "groups:\n\n- name: GoogleCadvisor\n\n \n rules:\n\n # This rule can be very noisy in dynamic infra with legitimate contain"
},
{
"path": "dist/rules/ebpf/ebpf-exporter.yml",
"chars": 1624,
"preview": "groups:\n\n- name: EbpfExporter\n\n \n rules:\n\n # The exporter uses loose attachment: if a program fails to load (missin"
},
{
"path": "dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml",
"chars": 8288,
"preview": "groups:\n\n- name: PrometheusCommunityElasticsearchExporter\n\n \n rules:\n\n - alert: ElasticsearchHeapUsageTooHigh\n "
},
{
"path": "dist/rules/envoy/embedded-exporter.yml",
"chars": 9533,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: EnvoyServerNotLive\n expr: 'envoy_server_live != 1'\n "
},
{
"path": "dist/rules/etcd/embedded-exporter.yml",
"chars": 6390,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: EtcdInsufficientMembers\n expr: 'count(etcd_server_id) "
},
{
"path": "dist/rules/fluxcd/embedded-exporter.yml",
"chars": 1911,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: FluxKustomizationFailure\n expr: 'gotk_resource_info{re"
},
{
"path": "dist/rules/freeswitch/znerol-freeswitch-exporter.yml",
"chars": 1274,
"preview": "groups:\n\n- name: ZnerolFreeswitchExporter\n\n \n rules:\n\n - alert: FreeswitchDown\n expr: 'freeswitch_up == 0'\n "
},
{
"path": "dist/rules/gitlab-ci/gitaly.yml",
"chars": 3548,
"preview": "groups:\n\n- name: Gitaly\n\n \n rules:\n\n # grpc_code!=\"OK\" includes non-error codes like NotFound, AlreadyExists. Consi"
},
{
"path": "dist/rules/gitlab-ci/gitlab-built-in-exporter.yml",
"chars": 11318,
"preview": "groups:\n\n- name: GitlabBuiltInExporter\n\n \n rules:\n\n # Queued connections indicate Puma workers are saturated.\n #"
},
{
"path": "dist/rules/gitlab-ci/workhorse.yml",
"chars": 1795,
"preview": "groups:\n\n- name: Workhorse\n\n \n rules:\n\n # Workhorse sits in front of Puma and handles Git HTTP, file uploads, and p"
},
{
"path": "dist/rules/golang/golang-exporter.yml",
"chars": 4737,
"preview": "groups:\n\n- name: GolangExporter\n\n \n rules:\n\n # Threshold is a rough default. High-concurrency servers may legitimat"
},
{
"path": "dist/rules/google-cloud-stackdriver/stackdriver-exporter.yml",
"chars": 2555,
"preview": "groups:\n\n- name: StackdriverExporter\n\n # Self-monitoring metrics use the stackdriver_monitoring_* prefix.\n # All self-"
},
{
"path": "dist/rules/grafana-alloy/embedded-exporter.yml",
"chars": 502,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: GrafanaAlloyServiceDown\n expr: 'count by (instance) (a"
},
{
"path": "dist/rules/grafana-mimir/embedded-exporter.yml",
"chars": 25602,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n # Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. Th"
},
{
"path": "dist/rules/grafana-tempo/embedded-exporter.yml",
"chars": 9591,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: TempoDistributorUnhealthy\n expr: 'max by (job) (tempo_"
},
{
"path": "dist/rules/graph-node/embedded-exporter.yml",
"chars": 2592,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: ProviderFailedBecauseNet_versionFailed\n expr: 'eth_rpc"
},
{
"path": "dist/rules/hadoop/jmx_exporter.yml",
"chars": 4056,
"preview": "groups:\n\n- name: Jmx_exporter\n\n \n rules:\n\n - alert: HadoopNameNodeDown\n expr: 'up{job=\"hadoop-namenode\"} == 0'"
},
{
"path": "dist/rules/haproxy/embedded-exporter-v2.yml",
"chars": 7134,
"preview": "groups:\n\n- name: EmbeddedExporterV2\n\n \n rules:\n\n - alert: HaproxyHighHttp4xxErrorRateBackend\n expr: '((sum by "
},
{
"path": "dist/rules/haproxy/haproxy-exporter-v1.yml",
"chars": 7654,
"preview": "groups:\n\n- name: HaproxyExporterV1\n\n \n rules:\n\n - alert: HaproxyDown\n expr: 'haproxy_up == 0'\n for: 0m\n "
},
{
"path": "dist/rules/hashicorp-vault/embedded-exporter.yml",
"chars": 1614,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: VaultSealed\n expr: 'vault_core_unsealed == 0'\n fo"
},
{
"path": "dist/rules/host-and-hardware/node-exporter.yml",
"chars": 16691,
"preview": "groups:\n\n- name: NodeExporter\n\n \n rules:\n\n - alert: HostOutOfMemory\n expr: '(node_memory_MemAvailable_bytes / "
},
{
"path": "dist/rules/ipmi/ipmi-exporter.yml",
"chars": 7461,
"preview": "groups:\n\n- name: IpmiExporter\n\n \n rules:\n\n # The ipmi_up metric is per-collector. A value of 0 means the collector "
},
{
"path": "dist/rules/istio/embedded-exporter.yml",
"chars": 4944,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: IstioKubernetesGatewayAvailabilityDrop\n expr: 'min(kub"
},
{
"path": "dist/rules/jaeger/embedded-exporter.yml",
"chars": 5163,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: JaegerAgentHttpServerErrors\n expr: '100 * sum(rate(jae"
},
{
"path": "dist/rules/jenkins/metric-plugin.yml",
"chars": 3957,
"preview": "groups:\n\n- name: MetricPlugin\n\n \n rules:\n\n - alert: JenkinsNodeOffline\n expr: 'jenkins_node_offline_value > 0'"
},
{
"path": "dist/rules/juniper/czerwonk-junos-exporter.yml",
"chars": 1162,
"preview": "groups:\n\n- name: CzerwonkJunosExporter\n\n \n rules:\n\n - alert: JuniperSwitchDown\n expr: 'junos_up == 0'\n fo"
},
{
"path": "dist/rules/jvm/jvm-exporter.yml",
"chars": 5455,
"preview": "groups:\n\n- name: JvmExporter\n\n \n rules:\n\n - alert: JvmMemoryFillingUp\n expr: '(sum by (instance)(jvm_memory_us"
},
{
"path": "dist/rules/kafka/danielqsj-kafka-exporter.yml",
"chars": 828,
"preview": "groups:\n\n- name: DanielqsjKafkaExporter\n\n \n rules:\n\n - alert: KafkaTopicsReplicas\n expr: 'min(kafka_topic_part"
},
{
"path": "dist/rules/kafka/linkedin-kafka-exporter.yml",
"chars": 1126,
"preview": "groups:\n\n- name: LinkedinKafkaExporter\n\n \n rules:\n\n - alert: KafkaTopicOffsetDecreased\n expr: 'delta(kafka_bur"
},
{
"path": "dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml",
"chars": 4489,
"preview": "groups:\n\n- name: AerogearKeycloakMetricsSpi\n\n \n rules:\n\n # Threshold of 5% is a rough default. Adjust based on your"
},
{
"path": "dist/rules/kubernetes/kubestate-exporter.yml",
"chars": 18814,
"preview": "groups:\n\n- name: KubestateExporter\n\n \n rules:\n\n - alert: KubernetesNodeNotReady\n expr: 'kube_node_status_condi"
},
{
"path": "dist/rules/linkerd/embedded-exporter.yml",
"chars": 799,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n # Linkerd does not expose request_errors_total. Errors are tracked v"
},
{
"path": "dist/rules/loki/embedded-exporter.yml",
"chars": 2006,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: LokiProcessTooManyRestarts\n expr: 'changes(process_sta"
},
{
"path": "dist/rules/meilisearch/embedded-exporter.yml",
"chars": 786,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: MeilisearchIndexIsEmpty\n expr: 'meilisearch_index_docs"
},
{
"path": "dist/rules/memcached/memcached-exporter.yml",
"chars": 4972,
"preview": "groups:\n\n- name: MemcachedExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n - aler"
},
{
"path": "dist/rules/minio/embedded-exporter.yml",
"chars": 1183,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: MinioClusterDiskOffline\n expr: 'minio_cluster_drive_of"
},
{
"path": "dist/rules/mongodb/dcu-mongodb-exporter.yml",
"chars": 3897,
"preview": "groups:\n\n- name: DcuMongodbExporter\n\n \n rules:\n\n - alert: MongodbReplicationLag(dcu)\n expr: 'avg(mongodb_repls"
},
{
"path": "dist/rules/mongodb/percona-mongodb-exporter.yml",
"chars": 3442,
"preview": "groups:\n\n- name: PerconaMongodbExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n -"
},
{
"path": "dist/rules/mongodb/stefanprodan-mgob-exporter.yml",
"chars": 402,
"preview": "groups:\n\n- name: StefanprodanMgobExporter\n\n \n rules:\n\n - alert: MgobBackupFailed\n expr: 'changes(mgob_schedule"
},
{
"path": "dist/rules/mysql/mysqld-exporter.yml",
"chars": 6358,
"preview": "groups:\n\n- name: MysqldExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n - alert: "
},
{
"path": "dist/rules/nats/nats-exporter.yml",
"chars": 5033,
"preview": "groups:\n\n- name: NatsExporter\n\n \n rules:\n\n - alert: NatsHighRoutesCount\n expr: 'gnatsd_varz_routes > 10'\n "
},
{
"path": "dist/rules/netdata/embedded-exporter.yml",
"chars": 3916,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n # This is a gauge metric (not a counter). Checking idle < 20% means "
},
{
"path": "dist/rules/nginx/knyar-nginx-exporter.yml",
"chars": 1437,
"preview": "groups:\n\n- name: KnyarNginxExporter\n\n \n rules:\n\n - alert: NginxHighHttp4xxErrorRate\n expr: 'sum(rate(nginx_htt"
},
{
"path": "dist/rules/nomad/embedded-exporter.yml",
"chars": 1308,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: NomadJobFailed\n expr: 'nomad_nomad_job_summary_failed "
},
{
"path": "dist/rules/openebs/embedded-exporter.yml",
"chars": 410,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: OpenebsUsedPoolCapacity\n expr: 'openebs_used_pool_capa"
},
{
"path": "dist/rules/openstack/openstack-exporter.yml",
"chars": 9861,
"preview": "groups:\n\n- name: OpenstackExporter\n\n \n rules:\n\n - alert: OpenstackExporterDown\n expr: 'up{job=~\".*openstack.*\""
},
{
"path": "dist/rules/opentelemetry-collector/embedded-exporter.yml",
"chars": 6615,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n # OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by defau"
},
{
"path": "dist/rules/oracle-database/iamseth-oracledb-exporter.yml",
"chars": 4642,
"preview": "groups:\n\n- name: IamsethOracledbExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n "
},
{
"path": "dist/rules/patroni/embedded-exporter-patroni.yml",
"chars": 570,
"preview": "groups:\n\n- name: EmbeddedExporterPatroni\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n "
},
{
"path": "dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml",
"chars": 1312,
"preview": "groups:\n\n- name: SpreakerPgbouncerExporter\n\n \n rules:\n\n - alert: PgbouncerActiveConnections\n expr: 'pgbouncer_"
},
{
"path": "dist/rules/php-fpm/bakins-fpm-exporter.yml",
"chars": 491,
"preview": "groups:\n\n- name: BakinsFpmExporter\n\n \n rules:\n\n - alert: Php-fpmMax-childrenReached\n expr: 'sum(increase(phpfp"
},
{
"path": "dist/rules/postgresql/postgres-exporter.yml",
"chars": 9697,
"preview": "groups:\n\n- name: PostgresExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n - alert"
},
{
"path": "dist/rules/process-exporter/process-exporter.yml",
"chars": 5445,
"preview": "groups:\n\n- name: ProcessExporter\n\n \n rules:\n\n - alert: ProcessExporterGroupDown\n expr: 'namedprocess_namegroup"
},
{
"path": "dist/rules/prometheus-self-monitoring/embedded-exporter.yml",
"chars": 12253,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: PrometheusJobMissing\n expr: 'absent(up{job=\"prometheus"
},
{
"path": "dist/rules/promtail/embedded-exporter.yml",
"chars": 1220,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: PromtailRequestErrors\n expr: '100 * sum(rate(promtail_"
},
{
"path": "dist/rules/proxmox-ve/prometheus-pve-exporter.yml",
"chars": 4019,
"preview": "groups:\n\n- name: PrometheusPveExporter\n\n \n rules:\n\n - alert: PveNodeDown\n expr: 'pve_up{id=~\"node/.*\"} == 0'\n "
},
{
"path": "dist/rules/pulsar/embedded-exporter.yml",
"chars": 4105,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: PulsarSubscriptionHighNumberOfBacklogEntries\n expr: 's"
},
{
"path": "dist/rules/python/python-exporter.yml",
"chars": 2449,
"preview": "groups:\n\n- name: PythonExporter\n\n \n rules:\n\n - alert: PythonGcObjectsUncollectable\n expr: 'increase(python_gc_"
},
{
"path": "dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml",
"chars": 4463,
"preview": "groups:\n\n- name: KbuddeRabbitmqExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n -"
},
{
"path": "dist/rules/rabbitmq/rabbitmq-exporter.yml",
"chars": 4113,
"preview": "groups:\n\n- name: RabbitmqExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n - alert"
},
{
"path": "dist/rules/redis/oliver006-redis-exporter.yml",
"chars": 5008,
"preview": "groups:\n\n- name: Oliver006RedisExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n -"
},
{
"path": "dist/rules/ruby/ruby-exporter.yml",
"chars": 2030,
"preview": "groups:\n\n- name: RubyExporter\n\n \n rules:\n\n # Threshold is a rough default. Adjust based on your application's norma"
},
{
"path": "dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml",
"chars": 3853,
"preview": "groups:\n\n- name: SmartctlExporter\n\n \n rules:\n\n - alert: SmartDeviceTemperatureWarning\n expr: '(avg_over_time(s"
},
{
"path": "dist/rules/sidekiq/strech-sidekiq-exporter.yml",
"chars": 817,
"preview": "groups:\n\n- name: StrechSidekiqExporter\n\n \n rules:\n\n - alert: SidekiqQueueSize\n expr: 'sidekiq_queue_size > 100"
},
{
"path": "dist/rules/snmp/snmp-exporter.yml",
"chars": 4361,
"preview": "groups:\n\n- name: SnmpExporter\n\n # These rules use standard IF-MIB and SNMPv2-MIB metrics. Metric names depend on your s"
},
{
"path": "dist/rules/solr/embedded-exporter.yml",
"chars": 1857,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: SolrUpdateErrors\n expr: 'increase(solr_metrics_core_up"
},
{
"path": "dist/rules/speedtest/nlamirault-speedtest-exporter.yml",
"chars": 841,
"preview": "groups:\n\n- name: NlamiraultSpeedtestExporter\n\n \n rules:\n\n - alert: SpeedtestSlowInternetDownload\n expr: 'avg_o"
},
{
"path": "dist/rules/spinnaker/embedded-exporter.yml",
"chars": 6745,
"preview": "groups:\n\n- name: EmbeddedExporter\n\n \n rules:\n\n - alert: SpinnakerCircuitBreakerOpen\n expr: 'resilience4j_circu"
},
{
"path": "dist/rules/sql-server/ozarklake-mssql-exporter.yml",
"chars": 772,
"preview": "groups:\n\n- name: OzarklakeMssqlExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n -"
},
{
"path": "dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml",
"chars": 1544,
"preview": "groups:\n\n- name: RibbybibbySslExporter\n\n \n rules:\n\n - alert: SslCertificateProbeFailed\n expr: 'ssl_probe_succe"
},
{
"path": "dist/rules/systemd/systemd-exporter.yml",
"chars": 3523,
"preview": "groups:\n\n- name: SystemdExporter\n\n \n rules:\n\n - alert: SystemdUnitFailed\n expr: 'systemd_unit_state{state=\"fai"
},
{
"path": "dist/rules/thanos/thanos-bucket-replicate.yml",
"chars": 1473,
"preview": "groups:\n\n- name: ThanosBucketReplicate\n\n \n rules:\n\n - alert: ThanosBucketReplicateErrorRate\n expr: '(sum by (j"
},
{
"path": "dist/rules/thanos/thanos-compactor.yml",
"chars": 2790,
"preview": "groups:\n\n- name: ThanosCompactor\n\n \n rules:\n\n - alert: ThanosCompactorMultipleRunning\n expr: 'sum by (job) (up"
},
{
"path": "dist/rules/thanos/thanos-component-absent.yml",
"chars": 2430,
"preview": "groups:\n\n- name: ThanosComponentAbsent\n\n \n rules:\n\n - alert: ThanosCompactIsDown\n expr: 'absent(up{job=~\".*tha"
},
{
"path": "dist/rules/thanos/thanos-query.yml",
"chars": 5607,
"preview": "groups:\n\n- name: ThanosQuery\n\n \n rules:\n\n - alert: ThanosQueryHttpRequestQueryErrorRateHigh\n expr: '(sum by (j"
},
{
"path": "dist/rules/thanos/thanos-receiver.yml",
"chars": 4647,
"preview": "groups:\n\n- name: ThanosReceiver\n\n \n rules:\n\n - alert: ThanosReceiveHttpRequestErrorRateHigh\n expr: '(sum by (j"
},
{
"path": "dist/rules/thanos/thanos-ruler.yml",
"chars": 6775,
"preview": "groups:\n\n- name: ThanosRuler\n\n \n rules:\n\n - alert: ThanosRuleQueueIsDroppingAlerts\n expr: 'sum by (job, instan"
},
{
"path": "dist/rules/thanos/thanos-sidecar.yml",
"chars": 1138,
"preview": "groups:\n\n- name: ThanosSidecar\n\n \n rules:\n\n # Threshold of 0.05/s avoids firing on transient single-event spikes.\n "
},
{
"path": "dist/rules/thanos/thanos-store.yml",
"chars": 2798,
"preview": "groups:\n\n- name: ThanosStore\n\n \n rules:\n\n - alert: ThanosStoreGrpcErrorRate\n expr: '(sum by (job) (rate(grpc_s"
},
{
"path": "dist/rules/traefik/embedded-exporter-v1.yml",
"chars": 1506,
"preview": "groups:\n\n- name: EmbeddedExporterV1\n\n \n rules:\n\n - alert: TraefikBackendDown\n expr: 'count(traefik_backend_ser"
},
{
"path": "dist/rules/traefik/embedded-exporter-v2.yml",
"chars": 1506,
"preview": "groups:\n\n- name: EmbeddedExporterV2\n\n \n rules:\n\n - alert: TraefikServiceDown\n expr: 'count(traefik_service_ser"
},
{
"path": "dist/rules/vmware/pryorda-vmware-exporter.yml",
"chars": 1670,
"preview": "groups:\n\n- name: PryordaVmwareExporter\n\n \n rules:\n\n - alert: VirtualMachineMemoryWarning\n expr: 'vmware_vm_mem"
},
{
"path": "dist/rules/windows-server/windows-exporter.yml",
"chars": 2005,
"preview": "groups:\n\n- name: WindowsExporter\n\n \n rules:\n\n - alert: WindowsServerCollectorError\n expr: 'windows_exporter_co"
},
{
"path": "dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml",
"chars": 2334,
"preview": "groups:\n\n- name: MindflavorPrometheusWireguardExporter\n\n \n rules:\n\n # The threshold of 300 seconds (5 minutes) is a"
},
{
"path": "dist/rules/zfs/node-exporter.yml",
"chars": 402,
"preview": "groups:\n\n- name: NodeExporter\n\n \n rules:\n\n - alert: ZfsOfflinePool\n expr: 'node_zfs_zpool_state{state!=\"online"
},
{
"path": "dist/rules/zfs/zfs_exporter.yml",
"chars": 1327,
"preview": "groups:\n\n- name: Zfs_exporter\n\n \n rules:\n\n - alert: ZfsPoolOutOfSpace\n expr: 'zfs_pool_free_bytes * 100 / zfs_"
},
{
"path": "dist/rules/zookeeper/cloudflare-kafka-zookeeper-exporter.yml",
"chars": 63,
"preview": "groups:\n\n- name: CloudflareKafkaZookeeperExporter\n\n \n rules:\n"
},
{
"path": "dist/rules/zookeeper/dabealu-zookeeper-exporter.yml",
"chars": 1436,
"preview": "groups:\n\n- name: DabealuZookeeperExporter\n\n \n rules:\n\n # 1m delay allows a restart without triggering an alert.\n "
},
{
"path": "dist/template.yml",
"chars": 1160,
"preview": "groups:\n{% assign groupName = slug | split: '-' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | c"
},
{
"path": "docker-compose.yml",
"chars": 158,
"preview": "version: '3'\n\nservices:\n\n jekyll:\n image: jekyll/jekyll:latest\n command: jekyll serve\n volumes:\n - ./:/sr"
},
{
"path": "index.md",
"chars": 1158,
"preview": "\n<style>\n.center-image\n{\n margin: 0 auto;\n display: block;\n}\n</style>\n\n\n. The extraction includes 133 files (873.6 KB), approximately 224.7k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.