Full Code of samber/awesome-prometheus-alerts for AI

master 9d9c648cddd1 cached

133 files

873.6 KB

224.7k tokens

1 requests

Download .txt

Showing preview only (921K chars total). Download the full file or copy to clipboard to get everything.

Repository: samber/awesome-prometheus-alerts
Branch: master
Commit: 9d9c648cddd1
Files: 133
Total size: 873.6 KB

Directory structure:
gitextract_epx1dxil/

├── .github/
│   ├── FUNDING.yml
│   ├── dependabot.yml
│   └── workflows/
│       ├── dist.yml
│       └── test.yml
├── .gitignore
├── .travis.yml
├── CLAUDE.md
├── CONTRIBUTING.md
├── Gemfile
├── LICENSE
├── README.md
├── _config.yml
├── _data/
│   └── rules.yml
├── _layouts/
│   └── default.html
├── alertmanager.md
├── assets/
│   ├── css/
│   │   └── app.css
│   └── js/
│       └── app.js
├── blackbox-exporter.md
├── dist/
│   ├── rules/
│   │   ├── apache/
│   │   │   └── lusitaniae-apache-exporter.yml
│   │   ├── apache-flink/
│   │   │   └── flink-prometheus-reporter.yml
│   │   ├── apache-spark/
│   │   │   └── spark-prometheus.yml
│   │   ├── apc-ups/
│   │   │   └── apcupsd_exporter.yml
│   │   ├── argocd/
│   │   │   └── embedded-exporter.yml
│   │   ├── aws-cloudwatch/
│   │   │   └── prometheus-cloudwatch-exporter.yml
│   │   ├── azure/
│   │   │   └── azure-metrics-exporter.yml
│   │   ├── blackbox/
│   │   │   └── blackbox-exporter.yml
│   │   ├── caddy/
│   │   │   └── embedded-exporter.yml
│   │   ├── cassandra/
│   │   │   ├── criteo-cassandra-exporter.yml
│   │   │   └── instaclustr-cassandra-exporter.yml
│   │   ├── ceph/
│   │   │   └── embedded-exporter.yml
│   │   ├── cert-manager/
│   │   │   └── embedded-exporter.yml
│   │   ├── cilium/
│   │   │   └── embedded-exporter.yml
│   │   ├── clickhouse/
│   │   │   └── embedded-exporter.yml
│   │   ├── cloudflare/
│   │   │   └── lablabs-cloudflare-exporter.yml
│   │   ├── consul/
│   │   │   └── consul-exporter.yml
│   │   ├── coredns/
│   │   │   └── embedded-exporter.yml
│   │   ├── cortex/
│   │   │   └── embedded-exporter.yml
│   │   ├── couchdb/
│   │   │   └── gesellix-couchdb-prometheus-exporter.yml
│   │   ├── digitalocean/
│   │   │   └── digitalocean-exporter.yml
│   │   ├── docker-containers/
│   │   │   └── google-cadvisor.yml
│   │   ├── ebpf/
│   │   │   └── ebpf-exporter.yml
│   │   ├── elasticsearch/
│   │   │   └── prometheus-community-elasticsearch-exporter.yml
│   │   ├── envoy/
│   │   │   └── embedded-exporter.yml
│   │   ├── etcd/
│   │   │   └── embedded-exporter.yml
│   │   ├── fluxcd/
│   │   │   └── embedded-exporter.yml
│   │   ├── freeswitch/
│   │   │   └── znerol-freeswitch-exporter.yml
│   │   ├── gitlab-ci/
│   │   │   ├── gitaly.yml
│   │   │   ├── gitlab-built-in-exporter.yml
│   │   │   └── workhorse.yml
│   │   ├── golang/
│   │   │   └── golang-exporter.yml
│   │   ├── google-cloud-stackdriver/
│   │   │   └── stackdriver-exporter.yml
│   │   ├── grafana-alloy/
│   │   │   └── embedded-exporter.yml
│   │   ├── grafana-mimir/
│   │   │   └── embedded-exporter.yml
│   │   ├── grafana-tempo/
│   │   │   └── embedded-exporter.yml
│   │   ├── graph-node/
│   │   │   └── embedded-exporter.yml
│   │   ├── hadoop/
│   │   │   └── jmx_exporter.yml
│   │   ├── haproxy/
│   │   │   ├── embedded-exporter-v2.yml
│   │   │   └── haproxy-exporter-v1.yml
│   │   ├── hashicorp-vault/
│   │   │   └── embedded-exporter.yml
│   │   ├── host-and-hardware/
│   │   │   └── node-exporter.yml
│   │   ├── ipmi/
│   │   │   └── ipmi-exporter.yml
│   │   ├── istio/
│   │   │   └── embedded-exporter.yml
│   │   ├── jaeger/
│   │   │   └── embedded-exporter.yml
│   │   ├── jenkins/
│   │   │   └── metric-plugin.yml
│   │   ├── juniper/
│   │   │   └── czerwonk-junos-exporter.yml
│   │   ├── jvm/
│   │   │   └── jvm-exporter.yml
│   │   ├── kafka/
│   │   │   ├── danielqsj-kafka-exporter.yml
│   │   │   └── linkedin-kafka-exporter.yml
│   │   ├── keycloak/
│   │   │   └── aerogear-keycloak-metrics-spi.yml
│   │   ├── kubernetes/
│   │   │   └── kubestate-exporter.yml
│   │   ├── linkerd/
│   │   │   └── embedded-exporter.yml
│   │   ├── loki/
│   │   │   └── embedded-exporter.yml
│   │   ├── meilisearch/
│   │   │   └── embedded-exporter.yml
│   │   ├── memcached/
│   │   │   └── memcached-exporter.yml
│   │   ├── minio/
│   │   │   └── embedded-exporter.yml
│   │   ├── mongodb/
│   │   │   ├── dcu-mongodb-exporter.yml
│   │   │   ├── percona-mongodb-exporter.yml
│   │   │   └── stefanprodan-mgob-exporter.yml
│   │   ├── mysql/
│   │   │   └── mysqld-exporter.yml
│   │   ├── nats/
│   │   │   └── nats-exporter.yml
│   │   ├── netdata/
│   │   │   └── embedded-exporter.yml
│   │   ├── nginx/
│   │   │   └── knyar-nginx-exporter.yml
│   │   ├── nomad/
│   │   │   └── embedded-exporter.yml
│   │   ├── openebs/
│   │   │   └── embedded-exporter.yml
│   │   ├── openstack/
│   │   │   └── openstack-exporter.yml
│   │   ├── opentelemetry-collector/
│   │   │   └── embedded-exporter.yml
│   │   ├── oracle-database/
│   │   │   └── iamseth-oracledb-exporter.yml
│   │   ├── patroni/
│   │   │   └── embedded-exporter-patroni.yml
│   │   ├── pgbouncer/
│   │   │   └── spreaker-pgbouncer-exporter.yml
│   │   ├── php-fpm/
│   │   │   └── bakins-fpm-exporter.yml
│   │   ├── postgresql/
│   │   │   └── postgres-exporter.yml
│   │   ├── process-exporter/
│   │   │   └── process-exporter.yml
│   │   ├── prometheus-self-monitoring/
│   │   │   └── embedded-exporter.yml
│   │   ├── promtail/
│   │   │   └── embedded-exporter.yml
│   │   ├── proxmox-ve/
│   │   │   └── prometheus-pve-exporter.yml
│   │   ├── pulsar/
│   │   │   └── embedded-exporter.yml
│   │   ├── python/
│   │   │   └── python-exporter.yml
│   │   ├── rabbitmq/
│   │   │   ├── kbudde-rabbitmq-exporter.yml
│   │   │   └── rabbitmq-exporter.yml
│   │   ├── redis/
│   │   │   └── oliver006-redis-exporter.yml
│   │   ├── ruby/
│   │   │   └── ruby-exporter.yml
│   │   ├── s.m.a.r.t-device-monitoring/
│   │   │   └── smartctl-exporter.yml
│   │   ├── sidekiq/
│   │   │   └── strech-sidekiq-exporter.yml
│   │   ├── snmp/
│   │   │   └── snmp-exporter.yml
│   │   ├── solr/
│   │   │   └── embedded-exporter.yml
│   │   ├── speedtest/
│   │   │   └── nlamirault-speedtest-exporter.yml
│   │   ├── spinnaker/
│   │   │   └── embedded-exporter.yml
│   │   ├── sql-server/
│   │   │   └── ozarklake-mssql-exporter.yml
│   │   ├── ssl/
│   │   │   └── tls/
│   │   │       └── ribbybibby-ssl-exporter.yml
│   │   ├── systemd/
│   │   │   └── systemd-exporter.yml
│   │   ├── thanos/
│   │   │   ├── thanos-bucket-replicate.yml
│   │   │   ├── thanos-compactor.yml
│   │   │   ├── thanos-component-absent.yml
│   │   │   ├── thanos-query.yml
│   │   │   ├── thanos-receiver.yml
│   │   │   ├── thanos-ruler.yml
│   │   │   ├── thanos-sidecar.yml
│   │   │   └── thanos-store.yml
│   │   ├── traefik/
│   │   │   ├── embedded-exporter-v1.yml
│   │   │   └── embedded-exporter-v2.yml
│   │   ├── vmware/
│   │   │   └── pryorda-vmware-exporter.yml
│   │   ├── windows-server/
│   │   │   └── windows-exporter.yml
│   │   ├── wireguard/
│   │   │   └── mindflavor-prometheus-wireguard-exporter.yml
│   │   ├── zfs/
│   │   │   ├── node-exporter.yml
│   │   │   └── zfs_exporter.yml
│   │   └── zookeeper/
│   │       ├── cloudflare-kafka-zookeeper-exporter.yml
│   │       └── dabealu-zookeeper-exporter.yml
│   └── template.yml
├── docker-compose.yml
├── index.md
├── package.json
├── rules.md
└── sleep-peacefully.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/FUNDING.yml
================================================
# Funding links for the repository: one GitHub account and one Ko-fi account.
github: [samber]
ko_fi: samuelberthe


================================================
FILE: .github/dependabot.yml
================================================
---
# Dependabot configuration: keeps the GitHub Actions referenced by this repository up to date.
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      # Update checks run once a month
      interval: "monthly"


================================================
FILE: .github/workflows/dist.yml
================================================
name: Publish

# Regenerates dist/rules/ from _data/rules.yml and pushes the result back to the repository.
on:
  workflow_dispatch:
  push:
    branches:
      - master

permissions:
  contents: write

jobs:
  publish:
    name: Publish
    # Only run in the upstream repository, so that forks do not try to publish
    if: github.repository_owner == 'samber'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v6

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          # Quoted so YAML keeps the version a string instead of parsing it as a float
          ruby-version: '3.4'

      - name: Set up yq
        uses: mikefarah/yq@v4

      - name: Install liquid
        run: |
          gem install liquid -v 5.5.1
          gem install liquid-cli

      - name: Build rule configuration
        run: |
          # Fail on errors inside pipelines too, so a broken build never reaches the push step
          set -euo pipefail

          cat _data/rules.yml | yq -I 0 -o json > _data/rules.json

          # Resolve the service list BEFORE deleting anything: a jq failure inside
          # `for ... in $(...)` would be ignored and an emptied dist/rules would be pushed.
          services=$(jq -r '.groups[].services[] | @base64' _data/rules.json)
          if [ -z "${services}" ]; then
            echo "No services found in _data/rules.yml, refusing to wipe dist/rules" >&2
            exit 1
          fi

          rm -rf dist/rules

          # Entries are base64-encoded JSON objects (no whitespace), so word splitting is intended here
          for service in ${services}; do
            subdir=dist/rules/$(echo "${service}" | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')
            mkdir -p "${subdir}"

            # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))

            for exporter in $(echo "${service}" | base64 --decode | jq -r '.exporters[] | @base64'); do
              exporterName=$(echo "${exporter}" | base64 --decode | jq -r '.slug')
              # Render the shared template with the exporter JSON as the liquid context
              cat dist/template.yml | liquid "$(echo "${exporter}" | base64 --decode)" > "${subdir}/${exporterName}.yml"
              echo "${subdir}/${exporterName}.yml"
            done
          done

          rm _data/rules.json

      # https://peterevans.dev/posts/github-actions-how-to-automate-code-formatting-in-pull-requests/
      - name: Check for modified files
        id: git-check
        # Exposes the number of changed paths as the `modified` step output
        run: echo "modified=$(git status -s --porcelain | wc -l | awk '{$1=$1};1')" >> $GITHUB_OUTPUT
      - name: Push changes
        if: steps.git-check.outputs.modified != '0'
        run: |
          git config --global user.name 'samber'
          git config --global user.email 'samber@users.noreply.github.com'
          git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
          git add .
          git commit -m "Publish"
          git push


================================================
FILE: .github/workflows/test.yml
================================================
name: Promtool check

# Renders every exporter's rule file from _data/rules.yml into test/rules/ and validates it with promtool.
on:
  pull_request:
  push:
    branches:
      - master

jobs:
  promtool-check:
    name: Check alert rules syntax
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v6

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          # Quoted so YAML keeps the version a string instead of parsing it as a float
          ruby-version: '3.4'

      - name: Set up yq
        uses: mikefarah/yq@v4

      - name: Install liquid
        # Same liquid version as the Publish workflow, so the checked output matches the published one
        run: |
          gem install liquid -v 5.5.1
          gem install liquid-cli

      - name: Build rule configuration
        run: |
          # Fail on errors inside pipelines too, instead of validating a partial rule set
          set -euo pipefail

          cat _data/rules.yml | yq -I 0 -o json > _data/rules.json

          # Resolve the service list up front: a jq failure inside `for ... in $(...)`
          # would be ignored and no rule file would be generated or checked.
          services=$(jq -r '.groups[].services[] | @base64' _data/rules.json)
          if [ -z "${services}" ]; then
            echo "No services found in _data/rules.yml, nothing to check" >&2
            exit 1
          fi

          # Entries are base64-encoded JSON objects (no whitespace), so word splitting is intended here
          for service in ${services}; do
            subdir=test/rules/$(echo "${service}" | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')
            mkdir -p "${subdir}"

            # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))

            for exporter in $(echo "${service}" | base64 --decode | jq -r '.exporters[] | @base64'); do
              exporterName=$(echo "${exporter}" | base64 --decode | jq -r '.slug')
              # Render the shared template with the exporter JSON as the liquid context
              cat dist/template.yml | liquid "$(echo "${exporter}" | base64 --decode)" > "${subdir}/${exporterName}.yml"
              echo "${subdir}/${exporterName}.yml"
            done
          done

          rm _data/rules.json

      - name: Check Prometheus alert rules
        uses: peimanja/promtool-github-actions@master
        with:
          promtool_actions_subcommand: 'rules'
          promtool_actions_files: 'test/rules/*/*.yml'
          promtool_actions_comment: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .gitignore
================================================
# Jekyll output and caches (presumably; names follow Jekyll's defaults)
_site/
.sass-cache/
.jekyll-cache/
.jekyll-metadata
# Intermediate JSON and rendered rule files produced by the CI workflows
_data/rules.json
test/rules/
# Local tooling directories
/node_modules
.worktrees/

================================================
FILE: .travis.yml
================================================
# Travis CI configuration: Node.js build using the 'node' version alias.
language: node_js
node_js:
  - 'node'


================================================
FILE: CLAUDE.md
================================================
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

A curated collection of ~940 Prometheus alerting rules covering 90+ services across 100+ exporters, organized in 7 categories: basic resource monitoring (Prometheus, host/hardware, SMART, Docker, Blackbox, Windows, VMware, Netdata), databases and brokers (MySQL, PostgreSQL, Redis, MongoDB, RabbitMQ, Elasticsearch, Cassandra, Clickhouse, Kafka, etc.), reverse proxies and load balancers (Nginx, Apache, HaProxy, Traefik, Caddy), runtimes (PHP-FPM, JVM, Sidekiq), orchestrators (Kubernetes, Nomad, Consul, Etcd, Istio, ArgoCD, FluxCD), network/security/storage (Ceph, ZFS, Minio, SSL/TLS, CoreDNS, Vault, Cloudflare), and observability tools (Thanos, Loki, Cortex, OpenTelemetry Collector, Jenkins).

All rules are stored in a single YAML data file (`_data/rules.yml`) and rendered as a Jekyll-based GitHub Pages site at https://samber.github.io/awesome-prometheus-alerts. The site provides copy-pasteable Prometheus alert snippets and downloadable rule files per exporter.

The project is community-driven. Most contributions are PRs adding or updating rules in `_data/rules.yml`. Files in `dist/rules/` are auto-generated on merge — never edit them manually.

## Architecture

- **`_data/rules.yml`** — The single source of truth for all alerting rules. This is the main file contributors edit. It is NOT a valid Prometheus config; the site renders each rule into copy-pasteable Prometheus alert format.
- **`rules.md`** — Jekyll template that iterates over `_data/rules.yml` and renders the rules page with copy buttons and formatted YAML blocks.
- **`alertmanager.md`** — Static page with Prometheus/AlertManager configuration examples.
- **`_layouts/default.html`** — Site layout (Jekyll theme: cayman).
- **`_config.yml`** — Jekyll configuration.
- **`dist/rules/`** — Pre-built downloadable rule files organized by service/exporter (referenced in the site for `wget` commands).

## Rules YAML Structure

Services are listed in README.md.

`_data/rules.yml` hierarchy:
```
groups:
  - name: "<category>"        # e.g. "Basic resource monitoring"
    services:
      - name: "<service>"     # e.g. "Host and hardware"
        exporters:
          - name: "<exporter>"
            slug: "<slug>"          # used for download URLs
            doc_url: "<url>"        # optional link to exporter docs
            comments:               # optional, exporter-level multiline notes rendered before rules
              "<comment>"
            rules:
              - name: "<alert name>"
                description: "<text>"
                query: "<PromQL>"
                severity: warning|critical|info
                for: "<duration>"   # optional, defaults to 0m
                comments:           # optional, rendered as multiline YAML comments
                  "<comment>"
```

Services are grouped by category. If you are not sure about the classification, ask the developer.

## Running Locally

```bash
# With Ruby/Bundler
gem install bundler
bundle install
jekyll serve

# With Docker Compose
docker compose up -d

# With Docker directly
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
```

Site serves at http://localhost:4000/awesome-prometheus-alerts.

## Contributing Rules

All rule changes go in `_data/rules.yml`. Each rule needs: `name`, `description`, `query` (valid PromQL), and `severity`. The `for` field is optional. Descriptions should be factual ("what") and include root cause hints ("why"). Queries must be tested against the latest exporter version. Never modify files in `dist/` — they are auto-generated on merge.

## Query Validation

- When adding or updating an alert, verify that the PromQL query references metric series that actually exist in the related exporter. Check the exporter's documentation or source code to confirm series names.
- If a metric series has been deprecated or removed in a newer version of the exporter, update the query to use the replacement series, or remove the rule if no replacement exists. Known examples: `kube_hpa_*` renamed to `kube_horizontalpodautoscaler_*` in kube-state-metrics 2.x; `node_hwmon_temp_alarm` does not exist (correct: `node_hwmon_temp_crit_alarm_celsius`); node-exporter CLI flags get renamed across versions.
- When writing or reviewing a query, search the internet (exporter docs, GitHub issues, changelogs) to validate correctness and catch outdated series names. When you are not sure about a metric name, always search the internet to confirm it exists and is spelled correctly before using it.
- Pay special attention to metric naming conventions: many exporters add `_total` suffixes for counters and `_seconds_total` for time-based counters. Verify the exact name from source code, not just docs. Known examples: Spark's PrometheusResource adds `_total` and `_seconds_total` suffixes (e.g., `metrics_executor_failedTasks_total`, not `metrics_executor_failedTasks`); Oracle's `oracledb_sessions_value` not `oracledb_sessions_activity`.
- Verify that label names used in `{{ $labels.xxx }}` template variables actually exist on the metric. Check the exporter source code for the exact label names. Known examples: cloudflare/ebpf_exporter uses `id` not `name` for programs, and `config` not `name` for decoder errors.
- When a metric uses info-style patterns (value always 1, information carried in labels), `== 0` will never be true — the metric simply won't exist. Use `absent()` instead. Known example: `ebpf_exporter_enabled_configs`.
- Some metrics are version-dependent. When a metric was renamed or removed in a newer version, add a comment noting the version requirement. Known examples: `go_memstats_gc_cpu_fraction` removed in client_golang v1.12+; cert-manager renamed `certmanager_http_acme_client_request_count` to `certmanager_acme_client_request_count` in v1.19+.
- Verify the unit of a metric before setting thresholds. Some metrics use milliseconds while descriptions assume seconds. Known example: Keycloak's `keycloak_request_duration` is in milliseconds, so `> 2` means 2ms not 2s.
- Some exporters expose labels that differ between services even within the same ecosystem. Known example: OpenStack Neutron uses `adminState="up"` while Nova and Cinder use `adminState="enabled"`.
- When an official mixin exists for a service, compare thresholds and time windows against it. Known deviations to watch for: Mimir store-gateway sync uses 1800s (not 600s), Mimir compactor skipped blocks uses `[24h]` (not `[5m]`), Tempo normalizes outstanding blocks per worker.

## Common Review Pitfalls (learned from PR history)

These are the most frequent issues raised during code review on this repo:

### Severity levels
- `critical` = requires immediate human attention. Do not use for informational/security notifications.
- `warning` = needs attention soon but not urgent.
- `info` = awareness only (e.g., config changes, underutilized resources).
- Authentication failures, security notifications, and config-change detections are typically `info`, not `critical`.

### `for` duration
- Omit `for` when the default (0m) is intentional and appropriate — do not add `for: 0m` explicitly.
- Add a `for` duration (e.g., `for: 2m` or `for: 5m`) to tolerate brief unavailability from restarts or transient spikes. Most "service down" rules should have at least `for: 1m`–`2m`.
- Do not blanket-change all `for: 0m` to `for: 1m` — it depends on the alert's semantics and the range window used in `increase()`/`rate()`.

### Query design
- Prefer symptom-based alerts over cause-based alerts to reduce alert fatigue. Example: "service is unreachable" is better than "specific internal counter changed". Metrics like heap object count, allocation rate, or free heap slots are causes, not symptoms — prefer GC duration, latency, or error rate alerts instead.
- Don't add unnecessary aggregation (`avg()`, `avg_over_time()`) on metrics that are local to a single node/instance. Only aggregate when the alert is cluster-wide.
- Don't combine `min_over_time()[1m]` with `for: 2m` redundantly — pick one mechanism for smoothing. Same applies to `avg_over_time()[5m]` with `for: 5m`.
- Remove unnecessary label filters (e.g., `job="cassandra"` or `cluster=~".*"`) that add noise without value.
- Verify comparison operators match the intent — e.g., "high snapshot count" must use `> N`, not `< N`.
- When dividing counters (e.g., error rate = errors / total), guard against division by zero with `and total > 0` or filter appropriately. This is the most common issue in new PRs — check every ratio query.
- Filter out system/template databases explicitly in DB queries (e.g., PostgreSQL: add `datid!="0"` alongside `datname!~"template.*|postgres"`).
- Never use `rate()` on a gauge metric — use `deriv()` instead. `rate()` is for monotonically increasing counters only.
- When using `increase()` for ratio calculations, prefer `rate()` instead — `increase()` can produce incorrect results when counters reset mid-window.
- When filtering gRPC error codes, don't use `grpc_code!="OK"` — this includes normal application responses like `NotFound`, `AlreadyExists`, and `Cancelled`. Filter to actual errors: `grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"`.
- When computing ratios with `rate()` on a metric that is itself already a normalized rate (e.g., Oracle's `v$waitclassmetric`), applying `rate()` computes the rate-of-change of a rate, which is not meaningful.
- When a multi-label metric is used in a binary operation with a metric that has fewer labels, use `ignoring(extra_label)` to avoid join failures. Known example: `systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max`.
- When a query groups by labels (e.g., `by (le, worker)`), consider the cardinality impact — hundreds of label values means hundreds of independent alerts.
- Ensure `{{ $value | humanizeDuration }}` is only used on values in seconds. If the metric is in milliseconds, divide by 1000 first or use `{{ $value | humanize }}ms`.
- Avoid using `up{job=~"exporter-name"} == 0` or `absent(up{job=~"exporter-name"})` to detect whether a service is down. When targets are managed via service discovery or a job reaches multiple targets, a disappeared target causes the `up` series to become stale and vanish rather than drop to 0, so the alert never fires. Prefer application-level or cluster-level metrics instead (e.g., "number of consul cluster members < 3", "PostgreSQL primary node absent").

### Thresholds
- Alert thresholds are inherently arbitrary and depend on workload. Use `comments:` to note this when a threshold is a rough default.
- When threshold values in a PR seem unreasonable (too high or too low), challenge them with real-world reasoning or exporter docs.
- Watch for thresholds that are so high they only catch catastrophic scenarios and miss real problems. Examples: Go goroutine spike at 100/s (misses gradual leaks), Ruby major GC at 5/s (only fires if app is non-functional), Python gen2 GC at >1/s (extremely rare).
- Watch for thresholds that will fire on normal healthy operation. Examples: Memcached at 90% memory is desired (it's a cache), Flink TaskManager at 90% JVM heap is normal, cache hit rate < 80% is common for cold caches.
- For SNMP bandwidth utilization, `ifSpeed` (Gauge32) maxes at ~4.29 Gbps. For 10G+ interfaces, use `ifHighSpeed * 1000000` instead.
- For alerts using `> 0` on counters with `rate()` or `increase()`, consider whether a single event truly warrants alerting. In most cases, a small threshold (e.g., `> 0.05` for rate, `> 3` for increase) better distinguishes real problems from transient noise.

### Comments
- When an alert or its query needs explanation (e.g., non-obvious PromQL logic, threshold rationale, edge cases), use the rule-level `comments:` field. Use multiline comments when needed.
- Use the exporter-level `comments:` field for notes that apply to all rules under that exporter (e.g., exporter version requirements, known quirks, setup prerequisites).
- Comments are rendered as YAML `#` comments in the output, so they are visible to users who copy-paste the rules.

### Descriptions
- Keep descriptions short, factual, and actionable.
- Include what is happening ("Disk is almost full") and why it matters or what to check.
- Use `{{ $labels.instance }}`, `{{ $value }}`, and other template variables in descriptions when useful.
- If the description says "average" but the query uses `histogram_quantile(0.95, ...)`, fix the description to say "p95" (or vice versa).
- When alerting on rates or ratios that may not be intuitive, include `{{ $value }}` in the description so operators can see the actual number.

### Structure
- Some services have multiple exporters (e.g., MongoDB has `percona/mongodb_exporter` and `dcu/mongodb_exporter`). Place rules under the correct exporter.
- Search for duplicates before adding a new rule — a similar alert may already exist under a different exporter or with different thresholds.
- The `slug` field must be unique per exporter and is used for download URLs.

## Reference Sources for Cross-Checking Alerts

Use these sources to criticize and validate PromQL queries, compare thresholds, and find inspiration for new rules.

Every time you consume an external resource to change a PromQL query, please compare before/after and explain why you think the external source is right.

### Official project mixins (alerts maintained by the project itself)
- https://github.com/prometheus/node_exporter/tree/master/docs/node-mixin/alerts
- https://github.com/prometheus/prometheus/tree/main/documentation/prometheus-mixin
- https://github.com/prometheus/alertmanager/tree/main/doc/alertmanager-mixin
- https://github.com/prometheus/snmp_exporter/tree/main/snmp-mixin
- https://github.com/prometheus/mysqld_exporter/tree/main/mysqld-mixin
- https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin
- https://github.com/prometheus-community/elasticsearch_exporter (mixin via Grafana docs)
- https://github.com/etcd-io/etcd/tree/main/contrib/mixin
- https://github.com/thanos-io/thanos/tree/main/mixin (also: examples/alerts/)
- https://github.com/grafana/loki/tree/main/production/loki-mixin (also: promtail-mixin/)
- https://github.com/grafana/mimir/tree/main/operations/mimir-mixin
- https://github.com/grafana/tempo/tree/main/operations/tempo-mixin
- https://github.com/grafana/grafana/tree/main/grafana-mixin
- https://github.com/ceph/ceph/tree/main/monitoring/ceph-mixin (in-tree; also https://github.com/ceph/ceph-mixins)
- https://github.com/jaegertracing/jaeger/tree/main/monitoring/jaeger-mixin
- https://github.com/kubernetes-monitoring/kubernetes-mixin (includes runbook.md)
- https://github.com/kubernetes/kube-state-metrics/tree/main/jsonnet/kube-state-metrics-mixin
- https://github.com/prometheus-operator/prometheus-operator/tree/main/jsonnet/mixin
- https://github.com/prometheus-operator/kube-prometheus
- https://github.com/cortexproject/cortex-jsonnet
- https://github.com/gluster/gluster-mixins

### Standalone mixin repositories
- https://github.com/povilasv/coredns-mixin
- https://github.com/adinhodovic/rabbitmq-mixin
- https://github.com/adinhodovic/blackbox-exporter-mixin
- https://github.com/adinhodovic/django-mixin
- https://github.com/adinhodovic/argo-cd-mixin
- https://github.com/adinhodovic/ingress-nginx-mixin
- https://github.com/adinhodovic/kubernetes-autoscaling-mixin
- https://github.com/metalmatze/kube-cockroachdb (CockroachDB on Kubernetes)
- https://github.com/bitnami-labs/sealed-secrets (sealed-secrets mixin)
- https://github.com/lukas-vlcek/elasticsearch-mixin (includes runbook.md)
- https://github.com/adinhodovic/postgresql-mixin
- https://github.com/imusmanmalik/cert-manager-mixin
- https://gitlab.com/uneeq-oss/cert-manager-mixin (alternative cert-manager mixin)
- https://github.com/uneeq-oss/spinnaker-mixin
- https://github.com/metalmatze/slo-libsonnet (SLO alerting/recording rules generation library)

### Grafana jsonnet-libs (93 mixins — browse for specific services)
- https://github.com/grafana/jsonnet-libs
- Notable mixins with alerts: consul, memcached, elasticsearch, haproxy, clickhouse, opensearch, redis, mongodb, kafka, nginx, rabbitmq, jvm, vault, envoy, istio, jenkins, caddy, cloudflare, docker, traefik, windows, snmp, argocd, nomad, pgbouncer, minio, ceph, and 60+ more.

### Mixin aggregators
- https://monitoring.mixins.dev/ (central registry of all monitoring mixins)
- https://github.com/monitoring-mixins/website/blob/master/mixins.json (machine-readable list of all mixins with source URLs)
- https://github.com/nlamirault/monitoring-mixins (hub aggregating many mixins)

### GitLab monitoring & infrastructure
- https://gitlab.com/gitlab-com/runbooks (GitLab.com SRE runbooks — production alert rules, runbook docs, alertmanager config)
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules (production Mimir alerting rules organized by tenant/environment)
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules-jsonnet (jsonnet sources for GitLab alerting rules)
- https://gitlab.com/gitlab-org/omnibus-gitlab/-/tree/master/files/gitlab-cookbooks/monitoring/templates/rules (default Prometheus rules shipped with GitLab Omnibus)

### Community alert collections
- https://github.com/jpweber/prometheus-alert-rules
- https://github.com/bdossantos/prometheus-alert-rules
- https://github.com/giantswarm/prometheus-rules
- https://github.com/last9/awesome-prometheus-toolkit
- https://github.com/warpnet/awesome-prometheus (meta-list of Prometheus resources)


================================================
FILE: CONTRIBUTING.md
================================================

# Contributing

## Adding alerting rule

If you don't have time to write a PR, just copy and paste some alerts into an issue. We will format it accordingly.

Rules are here: `_data/rules.yml`.

### Guidelines

Please ensure your pull request adheres to the following guidelines:

- Search previous suggestions before making a new one, as yours may be a duplicate.
- Keep descriptions short and simple, but descriptive.
- Description must be factual (the "what?") and should provide root cause suggestions (the "why?"), for faster resolution.
- Queries must be tested on latest exporter version.

## Improving GitHub page

### Run locally

```
gem install bundler
bundle install
jekyll serve
```

Or with Docker:

```
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
```

Or with Docker Compose:

```
docker compose up -d
```


================================================
FILE: Gemfile
================================================
# Ruby dependencies for building the site locally.
source 'https://rubygems.org'
# github-pages gem, version 232 or newer, loaded in the :jekyll_plugins group
gem 'github-pages', '>= 232', group: :jekyll_plugins
# webrick >= 1.8 and < 2.0 (presumably the HTTP server behind `jekyll serve` — confirm)
gem 'webrick', '~> 1.8'

================================================
FILE: LICENSE
================================================
Creative Commons Attribution 4.0 International License (CC BY 4.0)

http://creativecommons.org/licenses/by/4.0/


================================================
FILE: README.md
================================================
# 👋 Awesome Prometheus Alerts [![Awesome](https://awesome.re/badge-flat.svg)](https://awesome.re)

> Most alerting rules are common to every Prometheus setup. We need a place to find them all. 🤘 🚨 📊

Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**

<div align="center">
  <hr>
  <sup><b>Sponsored by:</b></sup>
  <br>
  <a href="https://cast.ai/samuel">
    <div>
      <img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-cast-ai.png" width="200" alt="Cast AI">
    </div>
    <div>
      Cut Kubernetes & AI costs, boost application stability.
    </div>
  </a>
  <br>
  <a href="https://betterstack.com">
    <div>
      <img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-betterstack.png" width="200" alt="Better Stack">
    </div>
    <div>
      Better Stack lets you centralize, search, and visualize your logs.
    </div>
  </a>
  <hr>
</div>

## ✨ Contents

- [Rules](#-rules)
- [Contributing](#-contributing)
- [Improvements](#-improvements)
- [Help us](#-show-your-support)
- [License](#-license)

## 🚨 Rules

#### Basic resource monitoring

- [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
- [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
- [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
- [IPMI](https://samber.github.io/awesome-prometheus-alerts/rules#ipmi)
- [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
- [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
- [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
- [VMWare](https://samber.github.io/awesome-prometheus-alerts/rules#vmware)
- [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve)
- [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
- [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
- [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter)
- [Systemd](https://samber.github.io/awesome-prometheus-alerts/rules#systemd)

#### Databases

- [MySQL](https://samber.github.io/awesome-prometheus-alerts/rules#mysql)
- [PostgreSQL](https://samber.github.io/awesome-prometheus-alerts/rules#postgresql)
- [SQL Server](https://samber.github.io/awesome-prometheus-alerts/rules#sql-server)
- [Oracle Database](https://samber.github.io/awesome-prometheus-alerts/rules#oracle-database)
- [Patroni](https://samber.github.io/awesome-prometheus-alerts/rules#patroni)
- [PGBouncer](https://samber.github.io/awesome-prometheus-alerts/rules#pgbouncer)
- [Redis](https://samber.github.io/awesome-prometheus-alerts/rules#redis)
- [Memcached](https://samber.github.io/awesome-prometheus-alerts/rules#memcached)
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
- [CouchDB](https://samber.github.io/awesome-prometheus-alerts/rules#couchdb)
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)

#### Message brokers

- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
- [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)

#### Proxies, load balancers and service meshes

- [Nginx](https://samber.github.io/awesome-prometheus-alerts/rules#nginx)
- [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
- [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
- [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
- [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
- [Envoy](https://samber.github.io/awesome-prometheus-alerts/rules#envoy)
- [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd)
- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)

#### Runtimes

- [PHP-FPM](https://samber.github.io/awesome-prometheus-alerts/rules#php-fpm)
- [JVM](https://samber.github.io/awesome-prometheus-alerts/rules#jvm)
- [Golang](https://samber.github.io/awesome-prometheus-alerts/rules#golang)
- [Ruby](https://samber.github.io/awesome-prometheus-alerts/rules#ruby)
- [Python](https://samber.github.io/awesome-prometheus-alerts/rules#python)
- [Sidekiq](https://samber.github.io/awesome-prometheus-alerts/rules#sidekiq)

#### Data engineering

- [Apache Flink](https://samber.github.io/awesome-prometheus-alerts/rules#apache-flink)
- [Apache Spark](https://samber.github.io/awesome-prometheus-alerts/rules#apache-spark)
- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)

#### Orchestrators

- [Kubernetes](https://samber.github.io/awesome-prometheus-alerts/rules#kubernetes)
- [Nomad](https://samber.github.io/awesome-prometheus-alerts/rules#nomad)
- [Consul](https://samber.github.io/awesome-prometheus-alerts/rules#consul)
- [Etcd](https://samber.github.io/awesome-prometheus-alerts/rules#etcd)
- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack)

#### CI/CD

- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
- [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
- [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd)
- [GitLab CI](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab-ci)
- [Spinnaker](https://samber.github.io/awesome-prometheus-alerts/rules#spinnaker)

#### Network and security

- [SpeedTest](https://samber.github.io/awesome-prometheus-alerts/rules#speedtest)
- [SSL/TLS](https://samber.github.io/awesome-prometheus-alerts/rules#ssl/tls)
- [cert-manager](https://samber.github.io/awesome-prometheus-alerts/rules#cert-manager)
- [Juniper](https://samber.github.io/awesome-prometheus-alerts/rules#juniper)
- [CoreDNS](https://samber.github.io/awesome-prometheus-alerts/rules#coredns)
- [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch)
- [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault)
- [Keycloak](https://samber.github.io/awesome-prometheus-alerts/rules#keycloak)
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
- [SNMP](https://samber.github.io/awesome-prometheus-alerts/rules#snmp)
- [Cilium](https://samber.github.io/awesome-prometheus-alerts/rules#cilium)
- [WireGuard](https://samber.github.io/awesome-prometheus-alerts/rules#wireguard)

#### Storage

- [Ceph](https://samber.github.io/awesome-prometheus-alerts/rules#ceph)
- [ZFS](https://samber.github.io/awesome-prometheus-alerts/rules#zfs)
- [OpenEBS](https://samber.github.io/awesome-prometheus-alerts/rules#openebs)
- [Minio](https://samber.github.io/awesome-prometheus-alerts/rules#minio)

#### Cloud providers

- [AWS CloudWatch](https://samber.github.io/awesome-prometheus-alerts/rules#aws-cloudwatch)
- [Google Cloud Stackdriver](https://samber.github.io/awesome-prometheus-alerts/rules#google-cloud-stackdriver)
- [DigitalOcean](https://samber.github.io/awesome-prometheus-alerts/rules#digitalocean)
- [Azure](https://samber.github.io/awesome-prometheus-alerts/rules#azure)

#### Observability

- [Thanos](https://samber.github.io/awesome-prometheus-alerts/rules#thanos)
- [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
- [Grafana Tempo](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-tempo)
- [Grafana Mimir](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-mimir)
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
- [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector)
- [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger)

#### Other

- [APC UPS](https://samber.github.io/awesome-prometheus-alerts/rules#apc-ups)
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)

## 🤝 Contributing

Contributions from the community (you!) are most welcome!

There are many ways to contribute: writing code, alerting rules, documentation, reporting issues, discussing better error tracking...

[Instructions here](CONTRIBUTING.md)

## 🏋️ Improvements

- Create an alert rule builder in Jekyll for custom alerts (severity, thresholds, instances...)
- Add resolution suggestions to rule descriptions, for faster incident resolution ([#85](https://github.com/samber/awesome-prometheus-alerts/issues/85)).

## 💫 Show your support

Give a ⭐️ if this project helped you!

[![support us](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/samber)

## 📝 License

[![CC4](https://mirrors.creativecommons.org/presskit/cc.srr.primary.svg)](https://creativecommons.org/licenses/by/4.0/legalcode)

Licensed under the Creative Commons Attribution 4.0 International License; see the LICENSE file for more details.


================================================
FILE: _config.yml
================================================
theme: jekyll-theme-cayman

title: Awesome Prometheus alerts
description: Collection of alerting rules

repository: samber/awesome-prometheus-alerts

baseurl: /awesome-prometheus-alerts


================================================
FILE: _data/rules.yml
================================================
#
# The following yaml cannot be copy-pasted to Prometheus configuration.
#     Please navigate to https://samber.github.io/awesome-prometheus-alerts/rules instead.
#
# Contributing guidelines:
#      https://github.com/samber/awesome-prometheus-alerts/blob/master/CONTRIBUTING.md
#

groups:
  - name: Basic resource monitoring
    services:
      - name: Prometheus self-monitoring
        exporters:
          - slug: embedded-exporter
            rules:
              - name: Prometheus job missing
                description: A Prometheus job has disappeared
                query: 'absent(up{job="prometheus"})'
                severity: warning
              - name: Prometheus target missing
                description: A Prometheus target has disappeared. An exporter might be crashed.
                query: "up == 0 unless on(job) (sum by (job) (up) == 0)"
                severity: critical
                for: 1m
                comments: |
                  Only fire if at least one target in the job is still up.
                  If all targets are down, PrometheusJobMissing or PrometheusAllTargetsMissing will fire instead.
              - name: Prometheus all targets missing
                description: A Prometheus job no longer has any living target.
                query: "sum by (job) (up) == 0"
                severity: critical
                for: 1m
              - name: Prometheus target missing with warmup time
                description: "Allow a job time to start up (10 minutes) before alerting that it's down."
                query: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
                severity: critical
                for: 1m
              - name: Prometheus configuration reload failure
                description: Prometheus configuration reload error
                query: "prometheus_config_last_reload_successful != 1"
                severity: warning
              - name: Prometheus too many restarts
                description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
                query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
                severity: warning
              - name: Prometheus AlertManager job missing
                description: A Prometheus AlertManager job has disappeared
                query: 'absent(up{job="alertmanager"})'
                severity: warning
              - name: Prometheus AlertManager configuration reload failure
                description: AlertManager configuration reload error
                query: "alertmanager_config_last_reload_successful != 1"
                severity: warning
              - name: Prometheus AlertManager config not synced
                description: Configurations of AlertManager cluster instances are out of sync
                query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
                severity: warning
              - name: Prometheus AlertManager E2E dead man switch
                description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager."
                query: "vector(1)"
                severity: critical
              - name: Prometheus not connected to alertmanager
                description: Prometheus cannot connect to the Alertmanager
                query: "prometheus_notifications_alertmanagers_discovered < 1"
                severity: critical
              - name: Prometheus rule evaluation failures
                description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
                query: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
                severity: critical
              - name: Prometheus template text expansion failures
                description: "Prometheus encountered {{ $value }} template text expansion failures"
                query: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
                severity: critical
              - name: Prometheus rule evaluation slow
                description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query."
                query: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
                severity: warning
                for: 5m
              - name: Prometheus notifications backlog
                description: The Prometheus notification queue has not been empty for 10 minutes
                query: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
                severity: warning
              - name: Prometheus AlertManager notification failing
                description: "Alertmanager is failing to send notifications ({{ $value }} notifications/s)"
                query: "rate(alertmanager_notifications_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus target empty
                description: Prometheus has no target in service discovery
                query: "prometheus_sd_discovered_targets == 0"
                severity: critical
              - name: Prometheus target scraping slow
                description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.
                query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
                severity: warning
                for: 5m
              - name: Prometheus large scrape
                description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)"
                query: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
                severity: warning
                for: 5m
              - name: Prometheus target scrape duplicate
                description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)"
                query: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3"
                severity: warning
              - name: Prometheus TSDB checkpoint creation failures
                description: "Prometheus encountered {{ $value }} checkpoint creation failures"
                query: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB checkpoint deletion failures
                description: "Prometheus encountered {{ $value }} checkpoint deletion failures"
                query: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB compactions failed
                description: "Prometheus encountered {{ $value }} TSDB compaction failures"
                query: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB head truncations failed
                description: "Prometheus encountered {{ $value }} TSDB head truncation failures"
                query: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB reload failures
                description: "Prometheus encountered {{ $value }} TSDB reload failures"
                query: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB WAL corruptions
                description: "Prometheus encountered {{ $value }} TSDB WAL corruptions"
                query: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB WAL truncations failed
                description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures"
                query: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus timeseries cardinality
                description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
                query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
                severity: warning

      - name: Host and hardware
        exporters:
          - name: node-exporter
            slug: node-exporter
            doc_url: https://github.com/prometheus/node_exporter
            rules:
              - name: Host out of memory
                description: Node memory is filling up (< 10% left)
                query: "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)"
                severity: warning
                for: 2m
              - name: Host memory under memory pressure
                description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s)."
                query: "(rate(node_vmstat_pgmajfault[5m]) > 1000)"
                severity: warning
              - name: Host Memory is underutilized
                description: "Node memory usage has been < 20% for 1 week. Consider reducing the amount of memory. (instance {{ $labels.instance }})"
                query: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8"
                severity: info
                comments: |
                  You may want to increase the Alertmanager 'repeat_interval' for this type of alert to daily or weekly.
              - name: Host unusual network throughput in
                description: Host receive bandwidth is high (>80%).
                query: "((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0"
                severity: warning
              - name: Host unusual network throughput out
                description: Host transmit bandwidth is high (>80%)
                query: "((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0"
                severity: warning
              - name: Host disk IO utilization high
                description: Disk utilization is high (> 80%)
                query: "(rate(node_disk_io_time_seconds_total[5m]) > .80)"
                severity: warning
              - name: Host out of disk space
                description: Disk is almost full (< 10% left)
                query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
                severity: critical
                comments: |
                  Please add ignored mountpoints in node_exporter parameters like
                  "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|run)($|/)".
                  Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
                for: 2m
              - name: Host disk may fill in 24 hours
                description: Filesystem will likely run out of space within the next 24 hours.
                query: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
                severity: warning
                comments: |
                  Please add ignored mountpoints in node_exporter parameters like
                  "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|run)($|/)".
                  Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
                for: 2m
              - name: Host out of inodes
                description: Disk is almost running out of available inodes (< 10% left)
                query: "(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0"
                severity: critical
                for: 2m
              - name: Host filesystem device error
                description: "Error stat-ing the {{ $labels.mountpoint }} filesystem"
                query: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
                severity: critical
                for: 2m
              - name: Host inodes may fill in 24 hours
                description: Filesystem will likely run out of inodes within the next 24 hours at current write rate
                query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
                severity: warning
                for: 2m
              - name: Host unusual disk read latency
                description: Disk latency is growing (read operations > 100ms)
                query: "(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)"
                severity: warning
                for: 2m
              - name: Host unusual disk write latency
                description: Disk latency is growing (write operations > 100ms)
                query: "(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)"
                severity: warning
                for: 2m
              - name: Host high CPU load
                description: CPU load is > 80%
                query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
                severity: warning
                for: 10m
              - name: Host CPU is underutilized
                description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
                query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
                severity: info
                for: 1w
                comments: |
                  You may want to increase the Alertmanager 'repeat_interval' for this type of alert to daily or weekly.
              - name: Host CPU steal noisy neighbor
                description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.
                query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
                severity: warning
              - name: Host CPU high iowait
                description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
                query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
                severity: warning
              - name: Host unusual disk IO
                description: "Disk IO utilization is > 80%. Check storage for issues or increase IOPS capabilities."
                query: "rate(node_disk_io_time_seconds_total[5m]) > 0.8"
                severity: warning
                for: 5m
              - name: Host context switching high
                description: Context switching is growing on the node (twice the daily average during the last 15m)
                query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
                severity: warning
                comments: |
                  x2 context switches is an arbitrary number.
                  The alert threshold depends on the nature of the application.
                  Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
              - name: Host swap is filling up
                description: Swap is filling up (>80%)
                query: "((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0"
                severity: warning
                for: 2m
              - name: Host systemd service crashed
                description: "systemd service {{ $labels.name }} crashed"
                query: '(node_systemd_unit_state{state="failed"} == 1)'
                severity: warning
              - name: Host physical component too hot
                description: "Physical hardware component too hot"
                query: "node_hwmon_temp_celsius > node_hwmon_temp_max_celsius"
                severity: warning
                for: 5m
              - name: Host node overtemperature alarm
                description: "Physical node temperature alarm triggered"
                query: "((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))"
                severity: critical
              - name: Host software RAID insufficient drives
                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
                query: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
                comments: |
                  Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin.
                severity: critical
              - name: Host software RAID disk failure
                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
                query: '(node_md_disks{state="failed"} > 0)'
                severity: warning
                for: 2m
              - name: Host kernel version deviations
                description: Kernel version for {{ $labels.instance }} has changed.
                query: "changes(node_uname_info[1h]) > 0"
                severity: info
              - name: Host OOM kill detected
                description: OOM kill detected
                query: "(increase(node_vmstat_oom_kill[30m]) > 0)"
                severity: warning
                comments: |
                  When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger.
              - name: Host EDAC Correctable Errors detected
                description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
                query: "(increase(node_edac_correctable_errors_total[1m]) > 0)"
                severity: info
              - name: Host EDAC Uncorrectable Errors detected
                description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
                query: "(node_edac_uncorrectable_errors_total > 0)"
                severity: warning
              - name: Host Network Receive Errors
                description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ printf "%.2f" $value }} (errors per packet) over the last two minutes.'
                query: "(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0"
                severity: warning
                for: 2m
              - name: Host Network Transmit Errors
                description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ printf "%.2f" $value }} (errors per packet) over the last two minutes.'
                query: "(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0"
                severity: warning
                for: 2m
              - name: Host Network Bond Degraded
                description: 'Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".'
                query: "((node_bonding_active - node_bonding_slaves) != 0)"
                severity: warning
                for: 2m
              - name: Host conntrack limit
                description: "The number of conntrack entries is approaching the limit (> 80%)"
                query: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0"
                severity: warning
                for: 5m
              - name: Host clock skew
                description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host."
                query: "((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))"
                severity: warning
                for: 10m
              - name: Host clock not synchronising
                description: "Clock not synchronising. Ensure NTP is configured on this host."
                query: "(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)"
                severity: warning
                for: 2m

      - name: S.M.A.R.T Device Monitoring
        exporters:
          - name: smartctl-exporter
            slug: smartctl-exporter
            doc_url: https://github.com/prometheus-community/smartctl_exporter
            rules:
              - name: SMART device temperature warning
                description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C
                query: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
                severity: warning
              - name: SMART device temperature critical
                description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C
                query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
                severity: critical
              - name: SMART device temperature over trip value
                # Compares the 10m peak of the "current" temperature series with the
                # "drive_trip" series of the same device/instance.
                description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}
                query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
                severity: critical
              - name: SMART device temperature nearing trip value
                # Same comparison as the "over trip value" rule, but against 80% of the
                # "drive_trip" series.
                description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}
                query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
                severity: warning
              - name: SMART status
                # Alerts on any smart_status value other than 1.
                description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}
                query: "smartctl_device_smart_status != 1"
                severity: critical
              - name: SMART critical warning
                # Alerts as soon as the critical_warning value is non-zero.
                description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}
                query: "smartctl_device_critical_warning > 0"
                severity: critical
              - name: SMART media errors
                # Alerts as soon as the media_errors value is non-zero.
                description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}
                query: "smartctl_device_media_errors > 0"
                severity: critical
              - name: SMART Wearout Indicator
                # Alerts when the available spare value drops below the reported threshold.
                description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}
                query: "smartctl_device_available_spare < smartctl_device_available_spare_threshold"
                severity: critical

      - name: IPMI
        exporters:
          - name: prometheus-community/ipmi_exporter
            slug: ipmi-exporter
            doc_url: https://github.com/prometheus-community/ipmi_exporter
            rules:
              - name: IPMI collector down
                description: "IPMI collector {{ $labels.collector }} on {{ $labels.instance }} failed to scrape sensor data. Check FreeIPMI tools and BMC connectivity."
                query: 'ipmi_up == 0'
                severity: warning
                for: 5m
                comments: |
                  The ipmi_up metric is per-collector. A value of 0 means the collector could not retrieve data from the BMC.
              - name: IPMI temperature sensor warning
                description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_temperature_state == 1'
                severity: warning
                for: 5m
                comments: |
                  State values: 0=nominal, 1=warning, 2=critical. Thresholds are defined in the BMC firmware.
              - name: IPMI temperature sensor critical
                description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Immediate attention required to prevent hardware damage."
                query: 'ipmi_temperature_state == 2'
                severity: critical
              - name: IPMI fan speed sensor warning
                description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_fan_speed_state == 1'
                severity: warning
                for: 5m
              - name: IPMI fan speed sensor critical
                description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. A fan may have failed."
                query: 'ipmi_fan_speed_state == 2'
                severity: critical
              - name: IPMI fan speed zero
                description: "IPMI fan {{ $labels.name }} on {{ $labels.instance }} reports 0 RPM. The fan may have failed."
                query: 'ipmi_fan_speed_rpm == 0'
                severity: critical
                for: 5m
              - name: IPMI voltage sensor warning
                description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_voltage_state == 1'
                severity: warning
                for: 5m
              - name: IPMI voltage sensor critical
                description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Power supply or motherboard issue possible."
                query: 'ipmi_voltage_state == 2'
                severity: critical
              - name: IPMI current sensor warning
                description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_current_state == 1'
                severity: warning
                for: 5m
              - name: IPMI current sensor critical
                description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state."
                query: 'ipmi_current_state == 2'
                severity: critical
              - name: IPMI power sensor warning
                description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_power_state == 1'
                severity: warning
                for: 5m
              - name: IPMI power sensor critical
                description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state."
                query: 'ipmi_power_state == 2'
                severity: critical
              - name: IPMI generic sensor critical
                description: "IPMI sensor {{ $labels.name }} (type={{ $labels.type }}) on {{ $labels.instance }} is in critical state."
                query: 'ipmi_sensor_state == 2'
                severity: critical
                for: 5m
                comments: |
                  Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts.
              - name: IPMI chassis power off
                description: "IPMI reports chassis power is off on {{ $labels.instance }}. The server may have shut down unexpectedly."
                query: 'ipmi_chassis_power_state == 0'
                severity: critical
              - name: IPMI chassis drive fault
                description: "IPMI reports a drive fault on {{ $labels.instance }}. Check disk health."
                query: 'ipmi_chassis_drive_fault_state == 0'
                severity: critical
                comments: |
                  The metric uses inverted logic: 1=no fault, 0=fault detected.
              - name: IPMI chassis cooling fault
                description: "IPMI reports a cooling/fan fault on {{ $labels.instance }}. Check fans and airflow."
                query: 'ipmi_chassis_cooling_fault_state == 0'
                severity: critical
                comments: |
                  The metric uses inverted logic: 1=no fault, 0=fault detected.
              - name: IPMI SEL almost full
                description: "IPMI System Event Log on {{ $labels.instance }} has only {{ printf \"%.0f\" $value }} bytes free. Clear the SEL to prevent loss of new events."
                query: 'ipmi_sel_free_space_bytes < 512'
                severity: warning
                for: 5m
                comments: |
                  SEL storage is typically very limited (e.g., 16KB). When full, new events may be dropped.

      - name: Docker containers
        exporters:
          - name: google/cAdvisor
            slug: google-cadvisor
            doc_url: https://github.com/google/cadvisor
            rules:
              - name: Container killed
                description: A container has disappeared
                query: "time() - container_last_seen > 60"
                severity: warning
                comments: |
                  This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
              - name: Container absent
                description: A container is absent for 5 min
                query: "absent(container_last_seen)"
                severity: warning
                for: 5m
                comments: |
                  This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
              - name: Container High CPU utilization
                description: 'Container CPU utilization is above 80% (current: {{ $value | printf "%.2f" }}%)'
                query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
                comments: |
                  Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard.
                severity: warning
                for: 2m
              - name: Container High Memory usage
                description: Container Memory usage is above 80%
                query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
                severity: warning
                comments: See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
                for: 2m
              - name: Container Volume usage
                # The query is based on filesystem inode counters (free vs. total), summed per
                # instance. The same {name!=""} selector is applied to the numerator, the
                # denominator and the > 0 guard so that all three aggregate the same series.
                description: Container Volume inode usage is above 80%
                query: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total{name!=""}) BY (instance) > 0'
                severity: warning
                for: 2m
              - name: Container high throttle rate
                description: "Container is being throttled ({{ $value | humanizePercentage }})"
                query: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
                severity: warning
                for: 5m
              - name: Container high low change CPU usage
                description: This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.
                query: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
                severity: info
              - name: Container Low CPU utilization
                description: 'Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf "%.2f" }}%)'
                query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
                severity: info
                for: 7d
              - name: Container Low Memory usage
                description: Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.
                query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
                severity: info
                for: 7d

      - name: Blackbox
        exporters:
          - name: prometheus/blackbox_exporter
            slug: blackbox-exporter
            doc_url: https://github.com/prometheus/blackbox_exporter
            rules:
              - name: Blackbox probe failed
                description: Probe failed
                query: probe_success == 0
                severity: critical
              - name: Blackbox configuration reload failure
                description: Blackbox configuration reload failure
                query: "blackbox_exporter_config_last_reload_successful != 1"
                severity: warning
              - name: Blackbox slow probe
                description: Blackbox probe took more than 1s to complete
                query: "probe_duration_seconds > 1"
                severity: warning
                for: 1m
              - name: Blackbox probe HTTP failure
                description: HTTP status code is not 200-399
                query: "probe_http_status_code <= 199 OR probe_http_status_code >= 400"
                severity: critical
              - name: Blackbox SSL certificate will expire soon
                description: SSL certificate expires in less than 20 days
                query: "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20"
                severity: warning
              - name: Blackbox SSL certificate will expire very soon
                description: SSL certificate expires in less than 3 days
                query: "0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3"
                severity: critical
              - name: Blackbox SSL certificate expired
                description: SSL certificate has expired already
                query: "round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0"
                severity: critical
                comments: |
                  For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
                  need to enable insecure_skip_verify. Note that this will disable
                  certificate validation.
                  See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
              - name: Blackbox probe slow HTTP
                description: HTTP request took more than 1s
                query: "probe_http_duration_seconds > 1"
                severity: warning
                for: 1m
              - name: Blackbox probe slow ping
                description: Blackbox ping took more than 1s
                query: "probe_icmp_duration_seconds > 1"
                severity: warning
                for: 1m

      - name: Windows Server
        exporters:
          - name: prometheus-community/windows_exporter
            slug: windows-exporter
            doc_url: https://github.com/prometheus-community/windows_exporter
            rules:
              - name: Windows Server collector Error
                description: "Collector {{ $labels.collector }} was not successful"
                query: "windows_exporter_collector_success == 0"
                severity: critical
              - name: Windows Server service Status
                description: Windows Service state is not OK
                query: 'windows_service_status{status="ok"} != 1'
                severity: critical
                for: 1m
              - name: Windows Server CPU Usage
                description: CPU Usage is more than 80%
                query: '100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80'
                severity: warning
              - name: Windows Server memory Usage
                description: Memory usage is more than 90%
                query: "100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90"
                severity: warning
                for: 2m
              - name: Windows Server disk Space Usage
                description: Disk usage is more than 80%
                query: "100 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80 and windows_logical_disk_size_bytes > 0"
                severity: critical
                for: 2m

      - name: VMware
        exporters:
          - name: pryorda/vmware_exporter
            slug: pryorda-vmware-exporter
            doc_url: https://github.com/pryorda/vmware_exporter
            rules:
              - name: Virtual Machine Memory Warning
                description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
                query: "vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90"
                severity: warning
                for: 5m
              - name: Virtual Machine Memory Critical
                description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
                query: "vmware_vm_mem_usage_average / 100 >= 90"
                severity: critical
                for: 1m
              - name: High Number of Snapshots
                description: "High snapshots number on {{ $labels.instance }}: {{ $value }}"
                query: "vmware_vm_snapshots > 3"
                severity: warning
                for: 30m
              - name: Outdated Snapshots
                description: 'Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days'
                query: "(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3"
                severity: warning
                for: 5m

      - name: Proxmox VE
        exporters:
          - name: prometheus-pve/prometheus-pve-exporter
            slug: prometheus-pve-exporter
            doc_url: https://github.com/prometheus-pve/prometheus-pve-exporter
            rules:
              - name: PVE node down
                description: 'Proxmox VE node {{ $labels.id }} is down.'
                query: 'pve_up{id=~"node/.*"} == 0'
                severity: critical
                for: 2m
              - name: PVE VM/CT down
                description: 'Proxmox VE guest {{ $labels.id }} is not running.'
                query: 'pve_up{id=~"(qemu|lxc)/.*"} == 0'
                severity: warning
                for: 5m
                comments: |
                  This alert triggers for all VMs and containers that are not running.
                  You may want to filter by specific guests using the `id` label, or exclude
                  intentionally stopped guests with additional label matchers.
              - name: PVE high CPU usage
                description: 'Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_cpu_usage_ratio * 100 > 90'
                severity: warning
                for: 5m
              - name: PVE high memory usage
                description: 'Proxmox VE memory usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0'
                severity: warning
                for: 5m
              - name: PVE storage filling up
                description: 'Proxmox VE storage {{ $labels.id }} is above 80% used. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 80 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
                severity: warning
                for: 5m
              - name: PVE storage almost full
                description: 'Proxmox VE storage {{ $labels.id }} is above 95% used. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 95 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
                severity: critical
                for: 2m
              - name: PVE guest not backed up
                description: '{{ $value }} Proxmox VE guest(s) are not covered by any backup job.'
                query: 'pve_not_backed_up_total > 0'
                severity: warning
              - name: PVE replication failed
                description: 'Proxmox VE replication for {{ $labels.id }} has {{ $value }} failed sync(s).'
                query: 'pve_replication_failed_syncs > 0'
                severity: warning
              - name: PVE cluster not quorate
                description: 'Proxmox VE cluster has lost quorum.'
                query: 'pve_cluster_info{quorate="0"} == 1'
                severity: critical
                comments: |
                  Loss of quorum means the cluster cannot make decisions about VM placement
                  and fencing. This requires immediate attention.

      - name: Netdata
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://github.com/netdata/netdata/blob/master/backends/prometheus/README.md
            rules:
              - name: Netdata high cpu usage
                description: Netdata high CPU usage (> 80%)
                query: 'netdata_cpu_cpu_percentage_average{dimension="idle"} < 20'
                severity: warning
                for: 5m
                comments: |
                  This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%.
              - name: Netdata CPU steal noisy neighbor
                description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
                query: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10'
                severity: warning
                for: 5m
              - name: Netdata high memory usage
                # Share of RAM in the free/cached dimensions relative to the sum of all
                # dimensions. Aggregating "without (dimension)" is required: dividing the
                # metric by itself series-by-series always yields 100 and never alerts.
                # NOTE(review): assumes the dimensions of this metric add up to total RAM — confirm against your Netdata export.
                description: Netdata high memory usage (> 80%)
                query: '100 * sum without (dimension) (netdata_system_ram_MiB_average{dimension=~"free|cached"}) / sum without (dimension) (netdata_system_ram_MiB_average) < 20 and sum without (dimension) (netdata_system_ram_MiB_average) > 0'
                severity: warning
                for: 5m
              - name: Netdata low disk space
                # Share of disk space in the avail/cached dimensions relative to the sum of
                # all dimensions, per remaining label set. Aggregating "without (dimension)"
                # is required: dividing the metric by itself series-by-series always yields
                # 100 and never alerts.
                # NOTE(review): assumes the dimensions of this metric add up to total disk size — confirm against your Netdata export.
                description: Netdata low disk space (> 80%)
                query: '100 * sum without (dimension) (netdata_disk_space_GB_average{dimension=~"avail|cached"}) / sum without (dimension) (netdata_disk_space_GB_average) < 20 and sum without (dimension) (netdata_disk_space_GB_average) > 0'
                severity: warning
                for: 5m
              - name: Netdata predicted disk full
                description: Netdata predicted disk full in 24 hours
                query: 'predict_linear(netdata_disk_space_GB_average{dimension=~"avail|cached"}[3h], 24 * 3600) < 0'
                severity: warning
              - name: Netdata MD mismatch cnt unsynchronized blocks
                description: RAID Array have unsynchronized blocks
                query: "netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024"
                severity: warning
                for: 2m
              - name: Netdata disk reallocated sectors
                description: "Disk reallocated sectors detected ({{ $value }} sectors)"
                query: "increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0"
                severity: info
              - name: Netdata disk current pending sector
                description: Disk current pending sector
                query: "netdata_smartd_log_current_pending_sector_count_sectors_average > 0"
                severity: warning
              - name: Netdata reported uncorrectable disk sectors
                description: "Reported uncorrectable disk sectors ({{ $value }} sectors)"
                query: "increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0"
                severity: warning

      - name: eBPF
        exporters:
          - name: cloudflare/ebpf_exporter
            slug: ebpf-exporter
            doc_url: https://github.com/cloudflare/ebpf_exporter
            rules:
              - name: eBPF exporter program not attached
                description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})"
                query: 'ebpf_exporter_ebpf_program_attached == 0'
                severity: warning
                for: 5m
                comments: |
                  The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running.
              - name: eBPF exporter decoder errors
                description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})"
                query: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
                severity: warning
                for: 5m
              - name: eBPF exporter no enabled configs
                description: "eBPF exporter has no enabled configurations. No eBPF programs are being run. (instance {{ $labels.instance }})"
                query: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)'
                severity: warning
                for: 5m

      - name: Process Exporter
        exporters:
          - name: ncabatoff/process-exporter
            slug: process-exporter
            doc_url: https://github.com/ncabatoff/process-exporter
            rules:
              - name: Process exporter group down
                description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_num_procs == 0'
                severity: warning
                for: 5m
              - name: Process exporter high memory usage
                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group.
              - name: Process exporter high CPU usage
                # The per-mode series are summed ("without (mode)") so the threshold applies
                # to the group's total CPU time, matching the core-equivalent wording below.
                # NOTE(review): relies on the exporter splitting this counter by a `mode` label (user/system) — confirm for your exporter version; the aggregation is a no-op if the label is absent.
                description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})"
                query: 'sum without (mode) (rate(namedprocess_namegroup_cpu_seconds_total[5m])) * 100 > 80'
                severity: warning
                for: 5m
                comments: |
                  Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. Threshold of 80% is per-core. Adjust based on expected workload.
              - name: Process exporter high file descriptor usage
                description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
                severity: warning
                for: 5m
              - name: Process exporter file descriptors exhausted
                description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
                severity: critical
                for: 2m
              - name: Process exporter high swap usage
                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 512MB is arbitrary. Adjust per group and environment.
              - name: Process exporter zombie processes
                description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_states{state="Zombie"} > 5'
                severity: warning
                for: 5m
              - name: Process exporter high context switching
                description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})"
                query: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000'
                severity: warning
                for: 5m
                comments: |
                  Filters to voluntary switches only — involuntary switches are normal under CPU contention. Threshold of 50000/s is a rough default. Adjust based on workload.
              - name: Process exporter high disk write IO
                description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})"
                query: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 100MB/s is arbitrary. Adjust per group.
              - name: Process exporter process restarting
                description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})"
                query: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
                severity: info
                comments: |
                  Detects restarts by watching for changes in the oldest process start time within the group.

      - name: Systemd
        exporters:
          - name: prometheus-community/systemd_exporter
            slug: systemd-exporter
            doc_url: https://github.com/prometheus-community/systemd_exporter
            rules:
              - name: Systemd unit failed
                description: "Systemd unit {{ $labels.name }} has entered failed state. (instance {{ $labels.instance }})"
                query: 'systemd_unit_state{state="failed"} == 1'
                severity: warning
                for: 5m
              - name: Systemd unit inactive
                description: "Systemd unit {{ $labels.name }} is inactive. (instance {{ $labels.instance }})"
                query: 'systemd_unit_state{state="inactive", type="service", name=~"your-critical-service.+"} == 1'
                severity: warning
                for: 5m
                comments: |
                  Many units are legitimately inactive. You must adjust the name=~ filter to match your critical services.
              - name: Systemd service crash looping
                description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})"
                query: 'increase(systemd_service_restart_total[1h]) > 5'
                severity: critical
                for: 5m
              - name: Systemd unit tasks near limit
                description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})"
                query: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0'
                severity: warning
                for: 5m
              - name: Systemd socket refused connections
                description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})"
                query: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
                severity: warning
                for: 2m
              - name: Systemd socket high connections
                description: "Systemd socket {{ $labels.name }} has {{ $value }} active connections. (instance {{ $labels.instance }})"
                query: 'systemd_socket_current_connections > 100'
                severity: warning
                for: 2m
                comments: |
                  Threshold of 100 connections is arbitrary. Adjust to your workload.
              - name: Systemd timer missed trigger
                description: "Systemd timer {{ $labels.name }} has not triggered for over 24 hours. (instance {{ $labels.instance }})"
                query: '(time() - systemd_timer_last_trigger_seconds) / 3600 > 24 and systemd_timer_last_trigger_seconds > 0'
                severity: warning
                for: 5m
                comments: |
                  Triggers if timer hasn't fired in 24 hours. Adjust threshold per timer schedule.

  - name: Databases
    services:
      - name: MySQL
        exporters:
          - name: prometheus/mysqld_exporter
            slug: mysqld-exporter
            doc_url: https://github.com/prometheus/mysqld_exporter
            rules:
              - name: MySQL down
                description: MySQL instance is down on {{ $labels.instance }}
                query: "mysql_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: MySQL too many connections (> 80%)
                description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}"
                query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0"
                severity: warning
                for: 2m
              - name: MySQL high prepared statements utilization (> 80%)
                description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}"
                query: "max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0"
                severity: warning
                for: 2m
              - name: MySQL high threads running
                description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}"
                query: "max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0"
                severity: warning
                for: 2m
              - name: MySQL Slave IO thread not running
                description: "MySQL Slave IO thread not running on {{ $labels.instance }}"
                query: "( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: MySQL Slave SQL thread not running
                description: "MySQL Slave SQL thread not running on {{ $labels.instance }}"
                query: "( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: MySQL Slave replication lag
                description: "MySQL replication lag on {{ $labels.instance }}"
                query: "( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30"
                severity: critical
                for: 1m
              - name: MySQL slow queries
                description: "MySQL server mysql has some new slow query ({{ $value }} in the last minute)."
                query: increase(mysql_global_status_slow_queries[1m]) > 0
                severity: warning
                for: 2m
              - name: MySQL InnoDB log waits
                description: "MySQL innodb log writes stalling ({{ $value }} waits/s)"
                query: rate(mysql_global_status_innodb_log_waits[15m]) > 10
                severity: warning
              - name: MySQL restarted
                description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
                query: "mysql_global_status_uptime < 60"
                severity: info
              - name: MySQL High QPS
                description: MySQL is being overload with unusual QPS (> 10k QPS).
                query: "irate(mysql_global_status_questions[1m]) > 10000"
                severity: info
                for: 2m
              - name: MySQL too many open files
                description: MySQL has too many open files, consider increase variables open_files_limit on {{ $labels.instance }}.
                query: "mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0"
                severity: warning
                for: 2m
              - name: MySQL InnoDB Force Recovery is enabled
                description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}"
                query: "mysql_global_variables_innodb_force_recovery != 0"
                severity: warning
                for: 2m
              - name: MySQL InnoDB history_len too long
                description: "MySQL history_len (undo log) too long on {{ $labels.instance }}"
                query: "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000"
                severity: warning
                for: 2m

      - name: PostgreSQL
        exporters:
          - name: prometheus-community/postgres_exporter
            slug: postgres-exporter
            doc_url: https://github.com/prometheus-community/postgres_exporter
            rules:
              - name: Postgresql down
                description: Postgresql instance is down
                query: "pg_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Postgresql restarted
                description: Postgresql restarted
                query: "time() - pg_postmaster_start_time_seconds < 60"
                severity: critical
              - name: Postgresql exporter error
                description: Postgresql exporter is showing errors. A query may be buggy in query.yaml
                query: "pg_exporter_last_scrape_error > 0"
                severity: critical
              # Fires for tables whose deleted + updated tuple count exceeds the autovacuum
              # threshold and whose last autovacuum is more than 10 days old.
              # The left side is per-table (it carries relname, see the description) while the
              # pg_settings_* threshold is a server-level setting, so the comparison has to be
              # an explicit many-to-one match on the target labels; with default one-to-one
              # matching the label sets never agree and the rule can never fire.
              - name: Postgresql table not auto vacuumed
                description: Table {{ $labels.relname }} has not been auto vacuumed for 10 days
                query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > on (instance, job, server) group_left pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
                severity: warning
              # Fires for tables whose deleted + updated tuple count exceeds the autoanalyze
              # threshold and whose last autoanalyze is more than 10 days old.
              # The left side is per-table (it carries relname, see the description) while the
              # pg_settings_* threshold is a server-level setting, so the comparison has to be
              # an explicit many-to-one match on the target labels; with default one-to-one
              # matching the label sets never agree and the rule can never fire.
              - name: Postgresql table not auto analyzed
                description: Table {{ $labels.relname }} has not been auto analyzed for 10 days
                query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > on (instance, job, server) group_left pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
                severity: warning
              - name: Postgresql too many connections
                description: PostgreSQL instance has too many connections (> 80%).
                query: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
                severity: warning
                for: 2m
              - name: Postgresql not enough connections
                description: PostgreSQL instance should have more connections (> 5)
                query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
                severity: critical
                for: 2m
              - name: Postgresql dead locks
                description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)"
                query: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
                severity: warning
              - name: Postgresql high rollback rate
                description: Ratio of transactions being aborted compared to committed is > 2 %
                query: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
                severity: warning
              - name: Postgresql commit rate low
                description: Postgresql seems to be processing very few transactions
                query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
                severity: critical
                for: 2m
              - name: Postgresql low XID consumption
                description: Postgresql seems to be consuming transaction IDs very slowly
                query: "rate(pg_txid_current[1m]) < 5"
                severity: warning
                for: 2m
              # Fires for inactive replication slots, but only on servers that are not replicas.
              # The slot metric is per-slot while pg_replication_is_replica is per-server, so the
              # "and" must be restricted to the target labels; with default matching the label
              # sets differ and the intersection is always empty.
              - name: Postgresql unused replication slot
                description: Unused Replication Slots
                query: "(pg_replication_slots_active == 0) and on (instance, job, server) (pg_replication_is_replica == 0)"
                severity: warning
                for: 1m
              - name: Postgresql too many dead tuples
                description: PostgreSQL dead tuples is too large
                query: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 and (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0"
                severity: warning
                for: 2m
              - name: Postgresql configuration changed
                description: Postgres Database configuration change has occurred
                query: '{__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} != ON(__name__, instance) {__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} OFFSET 5m'
                severity: info
              - name: Postgresql SSL compression active
                description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
                query: "sum by (instance) (pg_stat_ssl_compression) > 0"
                severity: warning
              # Ratio of held locks to the lock-table capacity
              # (max_locks_per_transaction * max_connections), alerting above 20%.
              # Both sides are aggregated by the same (instance, job, server) labels, like the
              # "Postgresql too many connections" rule, so that the division and the zero-guard
              # actually match: "sum by (instance)" alone drops the job label that the
              # pg_settings_* series still carry, which makes one-to-one matching return nothing.
              - name: Postgresql too many locks acquired
                description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
                query: "(sum by (instance, job, server) (pg_locks_count) / min by (instance, job, server) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and min by (instance, job, server) (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0"
                severity: critical
                for: 2m
              - name: Postgresql bloat index high (> 80%)
                description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`"
                query: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
                severity: warning
                for: 1h
                comments: |
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql bloat table high (> 80%)
                description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`"
                query: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
                severity: warning
                for: 1h
                comments: |
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql invalid index
                description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
                query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
                severity: warning
                for: 6h
                comments: |
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql replication lag
                description: The PostgreSQL replication lag is high (> 5s)
                query: "pg_replication_lag_seconds > 5"
                severity: warning
                for: 30s

      - name: SQL Server
        exporters:
          - name: Ozarklake/prometheus-mssql-exporter
            slug: ozarklake-mssql-exporter
            doc_url: https://github.com/Ozarklake/prometheus-mssql-exporter
            rules:
              - name: SQL Server down
                description: SQL server instance is down
                query: mssql_up == 0
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: SQL Server deadlock
                description: SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)
                query: mssql_deadlocks > 5
                severity: warning
                for: 1m

      - name: Oracle Database
        exporters:
          - name: iamseth/oracledb_exporter
            slug: iamseth-oracledb-exporter
            doc_url: https://github.com/iamseth/oracledb_exporter
            rules:
              - name: Oracle DB down
                description: Oracle Database instance is down on {{ $labels.instance }}
                query: "oracledb_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Oracle DB sessions reaching limit (> 85%)
                description: "Oracle Database session utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_resource_current_utilization{resource_name=\"sessions\"} / oracledb_resource_limit_value{resource_name=\"sessions\"} * 100 > 85 and oracledb_resource_limit_value{resource_name=\"sessions\"} > 0"
                severity: warning
                for: 5m
                comments: |
                  Threshold is workload-dependent. Adjust 85% to suit your environment.
              - name: Oracle DB processes reaching limit (> 85%)
                description: "Oracle Database process utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_resource_current_utilization{resource_name=\"processes\"} / oracledb_resource_limit_value{resource_name=\"processes\"} * 100 > 85 and oracledb_resource_limit_value{resource_name=\"processes\"} > 0"
                severity: warning
                for: 5m
                comments: |
                  Threshold is workload-dependent. Adjust 85% to suit your environment.
              - name: Oracle DB tablespace reaching capacity (> 85%)
                description: "Oracle Database tablespace {{ $labels.tablespace }} is above 85% usage on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_tablespace_used_percent > 85"
                severity: warning
                for: 5m
              - name: Oracle DB tablespace full (> 95%)
                description: "Oracle Database tablespace {{ $labels.tablespace }} is critically full on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_tablespace_used_percent > 95"
                severity: critical
                for: 5m
              - name: Oracle DB high user rollbacks
                description: "Oracle Database on {{ $labels.instance }} has a high rollback rate ({{ $value }}% of transactions are rolled back)"
                query: "rate(oracledb_activity_user_rollbacks[5m]) / (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) * 100 > 20 and (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) > 0"
                severity: warning
                for: 5m
                comments: |
                  A high rollback rate (>20%) often indicates application-level issues such as deadlocks, constraint violations, or poorly designed transactions.
              - name: Oracle DB too many active sessions
                description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})"
                query: "oracledb_sessions_value{status=\"ACTIVE\", type=\"USER\"} > 200"
                severity: warning
                for: 5m
                comments: |
                  Threshold is highly workload-dependent. Adjust 200 to suit your environment.
              - name: Oracle DB high wait time (user I/O)
                description: "Oracle Database on {{ $labels.instance }} is experiencing high user I/O wait time"
                query: "oracledb_wait_time_user_io > 300"
                severity: warning
                for: 5m
                comments: |
                  The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time.

      - name: Patroni
        exporters:
          - name: Embedded exporter (Patroni >= 2.1.0)
            slug: embedded-exporter-patroni
            doc_url: https://patroni.readthedocs.io/en/latest/rest_api.html?highlight=prometheus#monitoring-endpoint
            rules:
              - name: Patroni has no Leader
                description: A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}
                query: (max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.

      - name: PGBouncer
        exporters:
          - name: spreaker/prometheus-pgbouncer-exporter
            slug: spreaker-pgbouncer-exporter
            doc_url: https://github.com/spreaker/prometheus-pgbouncer-exporter
            rules:
              - name: PGBouncer active connections
                description: PGBouncer pools are filling up
                query: "pgbouncer_pools_server_active_connections > 200"
                severity: warning
                for: 2m
              - name: PGBouncer errors
                description: PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.
                query: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10'
                severity: warning
              - name: PGBouncer max connections
                description: The number of PGBouncer client connections has reached max_client_conn.
                query: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
                severity: critical

      - name: Redis
        exporters:
          - name: oliver006/redis_exporter
            slug: oliver006-redis-exporter
            doc_url: https://github.com/oliver006/redis_exporter
            rules:
              - name: Redis down
                description: Redis instance is down
                query: "redis_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Redis missing master
                description: Redis cluster has no node marked as master.
                query: '(count(redis_instance_info{role="master"}) or vector(0)) < 1'
                severity: critical
              - name: Redis too many masters
                description: Redis cluster has too many nodes marked as master.
                query: 'count(redis_instance_info{role="master"}) > 1'
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Redis disconnected slaves
                description: Redis not replicating for all slaves. Consider reviewing the redis replication status.
                query: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0"
                severity: critical
              - name: Redis replication broken
                description: Redis instance lost a slave
                query: "delta(redis_connected_slaves[1m]) < 0"
                severity: critical
              - name: Redis cluster flapping
                description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
                query: "changes(redis_connected_slaves[1m]) > 1"
                severity: critical
                for: 2m
              - name: Redis missing backup
                description: Redis has not been backed up for 48 hours
                query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48"
                severity: critical
              - name: Redis out of system memory
                description: Redis is running out of system memory (> 90%)
                query: "redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 and redis_total_system_memory_bytes > 0"
                severity: warning
                for: 2m
                comments: |
                  The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
              - name: Redis out of configured maxmemory
                description: Redis is running out of configured maxmemory (> 90%)
                query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0"
                severity: warning
                for: 2m
              - name: Redis too many connections
                description: Redis is running out of connections (> 90% used)
                query: "redis_connected_clients / redis_config_maxclients * 100 > 90 and redis_config_maxclients > 0"
                severity: warning
                for: 2m
              - name: Redis not enough connections
                description: Redis instance should have more connections (> 5)
                query: "redis_connected_clients < 5"
                severity: warning
                for: 2m
              - name: Redis rejected connections
                description: Some connections to Redis has been rejected
                query: "increase(redis_rejected_connections_total[1m]) > 5"
                severity: warning

      - name: Memcached
        exporters:
          - name: prometheus/memcached_exporter
            slug: memcached-exporter
            doc_url: https://github.com/prometheus/memcached_exporter
            rules:
              - name: Memcached down
                description: Memcached instance is down on {{ $labels.instance }}
                query: "memcached_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Memcached connection limit approaching (> 80%)
                description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "(memcached_current_connections / memcached_max_connections * 100) > 80 and memcached_max_connections > 0"
                severity: warning
                for: 2m
              - name: Memcached connection limit approaching (> 95%)
                description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "(memcached_current_connections / memcached_max_connections * 100) > 95 and memcached_max_connections > 0"
                severity: critical
                for: 2m
              - name: Memcached out of memory errors
                description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}"
                query: "sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0"
                severity: warning
                for: 5m
              - name: Memcached memory usage high (> 90%)
                description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "(memcached_current_bytes / memcached_limit_bytes * 100) > 90 and memcached_limit_bytes > 0"
                severity: warning
                for: 5m
                comments: |
                  High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
              - name: Memcached high eviction rate
                description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)"
                query: "rate(memcached_items_evicted_total[5m]) > 10"
                severity: warning
                for: 5m
                comments: |
                  A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage. Threshold of 10 evictions/s is a rough default — adjust based on your workload.
              # Hit ratio of "get" commands over 5m: hits / (hits + misses) * 100, alerting below 80%.
              # The hit and miss series differ in their "status" label, so every binary operation
              # between them needs ignoring (status); with default matching hits + misses is an
              # empty vector and the alert can never fire.
              # The trailing "> 0" guard keeps the rule silent when there is no get traffic.
              - name: Memcached low cache hit rate (< 80%)
                description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: '(rate(memcached_commands_total{command="get", status="hit"}[5m]) / ignoring (status) (rate(memcached_commands_total{command="get", status="hit"}[5m]) + ignoring (status) rate(memcached_commands_total{command="get", status="miss"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command="get", status="hit"}[5m]) + ignoring (status) rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0'
                severity: warning
                for: 10m
                comments: |
                  A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. Threshold of 80% is a rough default — adjust based on your workload and access patterns.
              - name: Memcached connections rejected
                description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
                query: "increase(memcached_connections_rejected_total[5m]) > 0"
                severity: warning
                for: 5m
              - name: Memcached items too large
                description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
                query: "increase(memcached_item_too_large_total[5m]) > 0"
                severity: info
                for: 5m

      - name: MongoDB
        exporters:
          - name: percona/mongodb_exporter
            slug: percona-mongodb-exporter
            doc_url: https://github.com/percona/mongodb_exporter
            rules:
              - name: MongoDB Down
                description: MongoDB instance is down
                query: "mongodb_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Mongodb replica member unhealthy
                description: MongoDB replica member is not healthy
                query: "mongodb_rs_members_health == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: MongoDB replication lag (Percona)
                description: Mongodb replication lag is more than 10s
                query: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
                severity: critical
              - name: MongoDB replication headroom
                description: MongoDB replication headroom is <= 0
                query: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
                severity: critical
                comments: |
                  This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
              - name: MongoDB number cursors open (Percona)
                description: Too many cursors opened by MongoDB for clients (> 10k)
                query: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
                severity: warning
                for: 2m
              - name: MongoDB cursors timeouts (Percona)
                description: "Too many cursors are timing out ({{ $value }} in the last minute)"
                query: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"
                severity: warning
                for: 2m
              # Share of used connections: current / (current + available) * 100, alerting above 80%.
              # The "current" and "available" series differ in their conn_type label, so they must
              # be combined with ignoring (conn_type); with default matching current + available
              # is an empty vector and the alert can never fire.
              # The trailing "> 0" guard avoids evaluating the ratio against a zero total.
              - name: MongoDB too many connections (Percona)
                description: Too many connections (> 80%)
                query: 'mongodb_ss_connections{conn_type="current"} / ignoring (conn_type) (mongodb_ss_connections{conn_type="current"} + ignoring (conn_type) mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + ignoring (conn_type) mongodb_ss_connections{conn_type="available"}) > 0'
                severity: warning
                for: 2m

          # Alert rules for the dcu/mongodb_exporter metric names (mongodb_replset_*,
          # mongodb_metrics_*, mongodb_connections).
          - name: dcu/mongodb_exporter
            slug: dcu-mongodb-exporter
            doc_url: https://github.com/dcu/mongodb_exporter
            rules:
              # Average PRIMARY optime minus average SECONDARY optime, compared with 10.
              # Both sides are fully aggregated, so the result carries no labels.
              - name: MongoDB replication lag (DCU)
                description: Mongodb replication lag is more than 10s
                query: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
                severity: critical
              # The five "Status N" rules match one numeric value of mongodb_replset_member_state
              # each; the description of every rule states what that value stands for.
              - name: MongoDB replication Status 3
                description: MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync
                query: "mongodb_replset_member_state == 3"
                severity: critical
              - name: MongoDB replication Status 6
                description: MongoDB Replication set member as seen from another member of the set, is not yet known
                query: "mongodb_replset_member_state == 6"
                severity: critical
              - name: MongoDB replication Status 8
                description: MongoDB Replication set member as seen from another member of the set, is unreachable
                query: "mongodb_replset_member_state == 8"
                severity: critical
              - name: MongoDB replication Status 9
                description: MongoDB Replication set member is actively performing a rollback. Data is not available for reads
                query: "mongodb_replset_member_state == 9"
                severity: critical
              - name: MongoDB replication Status 10
                description: MongoDB Replication set member was once in a replica set but was subsequently removed
                query: "mongodb_replset_member_state == 10"
                severity: critical
              # More than 10000 open cursors for 2 minutes.
              - name: MongoDB number cursors open (DCU)
                description: Too many cursors opened by MongoDB for clients (> 10k)
                query: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
                severity: warning
                for: 2m
              # More than 100 cursor timeouts within a 1m window, sustained for 2 minutes.
              - name: MongoDB cursors timeouts (DCU)
                description: "Too many cursors are timing out ({{ $value }} in the last minute)"
                query: "increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100"
                severity: warning
                for: 2m
              # current / (current + available) as a percentage; the trailing `and ... > 0`
              # clause drops series whose denominator is zero.
              - name: MongoDB too many connections (DCU)
                description: Too many connections (> 80%)
                query: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0'
                severity: warning
                for: 2m
          # Alert rules for the mgob backup scheduler metrics.
          - name: stefanprodan/mgob
            slug: stefanprodan-mgob-exporter
            doc_url: https://github.com/stefanprodan/mgob
            rules:
              # Fires when the series labelled status="500" changed value at least once in the
              # last hour; there is no `for`, so it fires on the first evaluation that matches.
              - name: Mgob backup failed
                description: MongoDB backup has failed
                query: 'changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0'
                severity: critical

      # Elasticsearch service entry; rules target prometheus-community/elasticsearch_exporter metrics.
      - name: Elasticsearch
        exporters:
          - name: prometheus-community/elasticsearch_exporter
            slug: prometheus-community-elasticsearch-exporter
            doc_url: https://github.com/prometheus-community/elasticsearch_exporter
            rules:
              # Heap rules: used / max for area="heap" as a percentage. The `and max > 0`
              # clause drops series whose denominator is zero. Critical at 90%, warning at 80%.
              - name: Elasticsearch Heap Usage Too High
                description: "The heap usage is over 90%"
                query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
                severity: critical
                for: 2m
              - name: Elasticsearch Heap Usage warning
                description: "The heap usage is over 80%"
                query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
                severity: warning
                for: 2m
              # Disk rules are expressed on the *available* share: available/size below 10%
              # (critical) or 20% (warning) corresponds to the 90% / 80% usage in the descriptions.
              # The critical rule has no `for`, the warning rule waits 2 minutes.
              - name: Elasticsearch disk out of space
                description: The disk usage is over 90%
                query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0"
                severity: critical
              - name: Elasticsearch disk space low
                description: The disk usage is over 80%
                query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0"
                severity: warning
                for: 2m
              # Cluster colour rules: one series per colour, value 1 selects the active colour.
              # Neither rule has a `for`, so they fire on the first matching evaluation.
              - name: Elasticsearch Cluster Red
                description: Elastic Cluster Red status
                query: 'elasticsearch_cluster_health_status{color="red"} == 1'
                severity: critical
              - name: Elasticsearch Cluster Yellow
                description: Elastic Cluster Yellow status
                query: 'elasticsearch_cluster_health_status{color="yellow"} == 1'
                severity: warning
              # Node-count rules compare against the literal 3; that expected cluster size is
              # hard-coded in the query and has to match the deployment.
              - name: Elasticsearch Healthy Nodes
                description: "Missing node in Elasticsearch cluster"
                query: "elasticsearch_cluster_health_number_of_nodes < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Elasticsearch Healthy Data Nodes
                description: "Missing data node in Elasticsearch cluster"
                query: "elasticsearch_cluster_health_number_of_data_nodes < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              # Relocating / initializing shards each come as a pair sharing one query:
              # an immediate `info` rule and a `warning` rule that requires 15 minutes.
              - name: Elasticsearch relocating shards
                description: "Elasticsearch is relocating shards"
                query: "elasticsearch_cluster_health_relocating_shards > 0"
                severity: info
              - name: Elasticsearch relocating shards too long
                description: "Elasticsearch has been relocating shards for 15min"
                query: "elasticsearch_cluster_health_relocating_shards > 0"
                severity: warning
                for: 15m
              - name: Elasticsearch initializing shards
                description: "Elasticsearch is initializing shards"
                query: "elasticsearch_cluster_health_initializing_shards > 0"
                severity: info
              - name: Elasticsearch initializing shards too long
                description: "Elasticsearch has been initializing shards for 15 min"
                query: "elasticsearch_cluster_health_initializing_shards > 0"
                severity: warning
                for: 15m
              # Any unassigned shard for 2 minutes is critical.
              - name: Elasticsearch unassigned shards
                description: "Elasticsearch has unassigned shards"
                query: "elasticsearch_cluster_health_unassigned_shards > 0"
                severity: critical
                for: 2m
              # Any pending cluster task that persists for 15 minutes.
              - name: Elasticsearch pending tasks
                description: "Elasticsearch has pending tasks. Cluster works slowly."
                query: "elasticsearch_cluster_health_number_of_pending_tasks > 0"
                severity: warning
                for: 15m
              # Fewer than one indexed document over 10 minutes, restricted to series labelled
              # es_data_node="true".
              - name: Elasticsearch no new documents
                description: "No new documents for 10 min!"
                query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
                severity: warning
              # Mean time per indexing operation: rate of accumulated indexing time divided by
              # rate of indexing operations; threshold 0.0005 (the description reports the value
              # with an "s" suffix). The `and rate(...) > 0` clause skips idle series.
              - name: Elasticsearch High Indexing Latency
                description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)."
                query: "rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0"
                severity: warning
                for: 10m
              # Summed per-second indexing rate across all matching series above the literal 10000.
              - name: Elasticsearch High Indexing Rate
                description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
                query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000"
                severity: warning
                for: 5m
              # Summed per-second search query rate across all matching series above the literal 100.
              - name: Elasticsearch High Query Rate
                description: "The query rate on Elasticsearch cluster is higher than the threshold."
                query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100"
                severity: warning
                for: 5m
              # Mean time per search query, built the same way as the indexing latency rule;
              # threshold 1. Note the time metric here has no _total suffix.
              - name: Elasticsearch High Query Latency
                description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)."
                query: "rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0"
                severity: warning
                for: 5m

      # Meilisearch service entry; metrics come from the server's embedded exporter.
      - name: Meilisearch
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://github.com/orgs/meilisearch/discussions/625
            rules:
              # Fires for every series whose document count is exactly 0; the description
              # interpolates the `index` label of the matching series.
              - name: Meilisearch index is empty
                description: Meilisearch index {{ $labels.index }} has zero documents
                query: "meilisearch_index_docs_count == 0"
                severity: warning
              # meilisearch_http_response_time_seconds is exposed as a histogram, so only its
              # _bucket / _sum / _count series exist and the bare metric name selects nothing.
              # Alert on the mean response time over 5m (_sum rate / _count rate) instead; the
              # `and ... > 0` clause skips series that served no requests in the window.
              - name: Meilisearch http response time
                description: Meilisearch http response time is too high
                query: "rate(meilisearch_http_response_time_seconds_sum[5m]) / rate(meilisearch_http_response_time_seconds_count[5m]) > 0.5 and rate(meilisearch_http_response_time_seconds_count[5m]) > 0"
                severity: warning

      # Cassandra service entry. Two exporters are covered; rules that exist for both carry the
      # exporter name in parentheses in their `name`.
      - name: Cassandra
        exporters:
          - name: instaclustr/cassandra-exporter
            slug: instaclustr-cassandra-exporter
            doc_url: https://github.com/instaclustr/cassandra-exporter
            rules:
              # Descriptions in this exporter interpolate the cassandra_cluster label.
              # Several rules list `for` before `severity`; key order inside a rule mapping
              # carries no meaning.
              - name: "Cassandra Node is unavailable"
                description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}"
                query: "cassandra_endpoint_active < 1"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              # More than 100 estimated pending compactions on a table series; no `for`.
              - name: "Cassandra many compaction tasks are pending"
                description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}"
                query: "cassandra_table_estimated_pending_compactions > 100"
                severity: warning
              - name: "Cassandra commitlog pending tasks (Instaclustr)"
                description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}"
                query: "cassandra_commit_log_pending_tasks > 15"
                for: 2m
                severity: warning
              # The two blocked-task rules read the same metric and differ only in the `pool` label.
              - name: "Cassandra compaction executor blocked tasks (Instaclustr)"
                description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}"
                query: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
                for: 2m
                severity: warning
              - name: "Cassandra flush writer blocked tasks (Instaclustr)"
                description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}"
                query: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
                for: 2m
                severity: warning
              # Per-second rate of client request timeouts, summed per cluster and instance,
              # above 5 for 2 minutes.
              - name: "Cassandra connection timeouts total (Instaclustr)"
                description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}"
                query: "sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5"
                for: 2m
                severity: critical
              # changes() counts value changes in the window; this needs more than one change
              # within 1m.
              - name: "Cassandra storage exceptions (Instaclustr)"
                description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}"
                query: "changes(cassandra_storage_exceptions_total[1m]) > 1"
                severity: critical
              # Average of the quantile="0.99" series per instance/cluster/keyspace above 100.
              - name: "Cassandra tombstone dump (Instaclustr)"
                description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}"
                query: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
                for: 2m
                severity: critical
              # Unavailable-exception rules: any change of the counter within 1m, split by the
              # `operation` label (write / read), sustained for 2 minutes.
              - name: "Cassandra client request unavailable write (Instaclustr)"
                description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}"
                query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
                for: 2m
                severity: critical
              - name: "Cassandra client request unavailable read (Instaclustr)"
                description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}"
                query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
                for: 2m
                severity: critical
              # Failure rules: any increase of the failure counter within 1m, split by
              # `operation`, sustained for 2 minutes.
              - name: "Cassandra client request write failure (Instaclustr)"
                description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
                query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
                for: 2m
                severity: critical
              - name: "Cassandra client request read failure (Instaclustr)"
                description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
                query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
                for: 2m
                severity: critical

          # Alert rules for criteo/cassandra_exporter. Every query reads the single metric
          # `cassandra_stats` and selects the value of interest through its `name` label, which
          # holds a colon-separated path.
          - name: criteo/cassandra_exporter
            slug: criteo-cassandra-exporter
            doc_url: https://github.com/criteo/cassandra_exporter
            rules:
              # More than 3 value changes of the total-hints counter within 1m; no `for`.
              - name: Cassandra hints count
                description: Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3'
                severity: critical
              - name: Cassandra compaction task pending
                description: Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100'
                severity: warning
                for: 2m
              # 99th percentile series compared with the literal 100000; the unit is not stated
              # in this file - NOTE(review): confirm against the exporter.
              - name: Cassandra viewwrite latency
                description: High viewwrite latency on {{ $labels.instance }} cassandra node
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000'
                severity: warning
                for: 2m
              # Per-second rate of authentication failures above 5 for 2 minutes.
              - name: Cassandra authentication failures
                description: Increase of Cassandra authentication failures
                query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
                severity: warning
                for: 2m
              # Down-endpoint count summed per service/group/cluster/env; those grouping labels
              # must exist on the scraped series for the grouping to be meaningful.
              - name: Cassandra node down
                description: Cassandra node down
                query: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Cassandra commitlog pending tasks (Criteo)
                description: Unexpected number of Cassandra commitlog pending tasks
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15'
                severity: warning
                for: 2m
              # Blocked-task rules fire on any currently blocked task (> 0) for 2 minutes.
              - name: Cassandra compaction executor blocked tasks (Criteo)
                description: Some Cassandra compaction executor tasks are blocked
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0'
                severity: warning
                for: 2m
              - name: Cassandra flush writer blocked tasks (Criteo)
                description: Some Cassandra flush writer tasks are blocked
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0'
                severity: warning
                for: 2m
              # Repair rules read the antientropystage thread pool: pending > 2, blocked > 0.
              - name: Cassandra repair pending tasks
                description: Some Cassandra repair tasks are pending
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2'
                severity: warning
                for: 2m
              - name: Cassandra repair blocked tasks
                description: Some Cassandra repair tasks are blocked
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0'
                severity: warning
                for: 2m
              # Per-second rate of connection timeouts above 5 for 2 minutes.
              - name: Cassandra connection timeouts total (Criteo)
                description: Some connection between nodes are ending in timeout
                query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
                severity: critical
                for: 2m
              # More than one value change of the storage exception counter within 1m; no `for`.
              - name: Cassandra storage exceptions (Criteo)
                description: Something is going wrong with cassandra storage
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1'
                severity: critical
              # 99th percentile of the tombstone-scanned histogram above 1000; no `for`.
              - name: Cassandra tombstone dump (Criteo)
                description: Too much tombstones scanned in queries
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000'
                severity: critical
              # Unavailable rules: any change of the write/read unavailables counter within 1m.
              - name: Cassandra client request unavailable write (Criteo)
                description: Write failures have occurred because too many nodes are unavailable
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0'
                severity: critical
              - name: Cassandra client request unavailable read (Criteo)
                description: Read failures have occurred because too many nodes are unavailable
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0'
                severity: critical
              # Failure rules compare the exporter-provided oneminuterate series directly with 0,
              # so no rate()/increase() wrapper is applied here.
              - name: Cassandra client request write failure (Criteo)
                description: A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
                severity: critical
              - name: Cassandra client request read failure (Criteo)
                description: A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
                severity: critical
              # Hit rate is compared as a fraction (.85), matching the 85% in the description.
              - name: Cassandra cache hit rate key cache
                description: Key cache hit rate is below 85%
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85'
                severity: critical
                for: 2m

      # ClickHouse service entry; metrics come from the server's embedded Prometheus endpoint
      # (ClickHouseMetrics_*, ClickHouseAsyncMetrics_*, ClickHouseProfileEvents_*, ClickHouseErrorMetric_*).
      - name: Clickhouse
        exporters:
          - name: Embedded Exporter
            slug: embedded-exporter
            doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics
            rules:
              # Relies on the scrape job being named "clickhouse" (see the rule's own comment).
              - name: ClickHouse node down
                description: "No metrics received from ClickHouse exporter for over 2 minutes."
                query: 'up{job="clickhouse"} == 0'
                severity: critical
                for: 2m
                comments: |
                  Adjust the job label to match your Prometheus configuration.
              # Memory rules: CGroupMemoryUsed / CGroupMemoryTotal as a percentage; the
              # `and ... > 0` clause drops series whose total is zero. Critical 90%, warning 80%.
              - name: ClickHouse Memory Usage Critical
                description: "Memory usage is critically high, over 90%."
                query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0"
                severity: critical
                for: 5m
              - name: ClickHouse Memory Usage Warning
                description: "Memory usage is over 80%."
                query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0"
                severity: warning
                for: 5m
              # Disk rules: available / (available + used) as a percentage. The disk name is part
              # of the metric name (_default, _backups), so each disk needs its own rule.
              - name: ClickHouse Disk Space Low on Default
                description: "Disk space on default is below 20%."
                query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0"
                severity: warning
                for: 2m
              - name: ClickHouse Disk Space Critical on Default
                description: "Disk space on default disk is critically low, below 10%."
                query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0"
                severity: critical
                for: 2m
              - name: ClickHouse Disk Space Low on Backups
                description: "Disk space on backups is below 20%."
                query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0"
                severity: warning
                for: 2m
              # ClickHouseErrorMetric_* series are cumulative error counts since server start, so
              # an `== 1` comparison only matches while the total is exactly one (and then never
              # clears until the next error or a restart). Alert on new occurrences with
              # increase(), as the AUTHENTICATION_FAILED / RESOURCE_ACCESS_DENIED rules of this
              # exporter already do.
              - name: ClickHouse Replica Errors
                description: "Critical replica errors detected, either all replicas are stale or lost."
                query: "increase(ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE[5m]) > 0 or increase(ClickHouseErrorMetric_ALL_REPLICAS_LOST[5m]) > 0"
                severity: critical

              - name: ClickHouse No Available Replicas
                description: "No available replicas in ClickHouse."
                query: "increase(ClickHouseErrorMetric_NO_AVAILABLE_REPLICA[5m]) > 0"
                severity: critical

              - name: ClickHouse No Live Replicas
                description: "There are too few live replicas available, risking data loss and service disruption."
                query: "increase(ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS[5m]) > 0"
                severity: critical

              # Connection-count rules use literal thresholds (400, 50) that each rule's own
              # comment asks the operator to adjust.
              - name: ClickHouse High TCP Connections
                description: "High number of TCP connections, indicating heavy client or inter-cluster communication."
                query: "ClickHouseMetrics_TCPConnection > 400"
                severity: warning
                for: 5m
                comments: |
                  Please replace the threshold with an appropriate value
              - name: ClickHouse Interserver Connection Issues
                description: "High number of interserver connections may indicate replication or distributed query handling issues."
                query: "ClickHouseMetrics_InterserverConnection > 50"
                severity: warning
                for: 5m
                comments: |
                  Adjust the threshold based on your cluster size and expected replication traffic.
              # Fires whenever the session gauge differs from 1 (zero as well as more than one)
              # for 3 minutes.
              - name: ClickHouse ZooKeeper Connection Issues
                description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination."
                query: "ClickHouseMetrics_ZooKeeperSession != 1"
                severity: warning
                for: 3m
              # The two security rules need more than 3 new errors within 5 minutes and are
              # informational only.
              - name: ClickHouse Authentication Failures
                description: "Authentication failures detected, indicating potential security issues or misconfiguration."
                query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3"
                severity: info

              - name: ClickHouse Access Denied Errors
                description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts."
                query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3"
                severity: info

              # Insert rules: any increase of the rejected (1m window) or delayed (5m window)
              # insert counters.
              - name: ClickHouse rejected insert queries
                description: "INSERTs rejected due to too many active data parts. Reduce insert frequency."
                query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0"
                severity: warning
                for: 1m
              - name: ClickHouse delayed insert queries
                description: "INSERTs delayed due to high number of active parts."
                query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0"
                severity: warning
                for: 2m
              - name: ClickHouse zookeeper hardware exception
                description: "Zookeeper hardware exception: network issues communicating with ZooKeeper"
                query: "increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0"
                severity: critical
                for: 1m
              # 100*1024*1024 bytes per second (100 MiB/s) in either direction; the two
              # directions are combined with `or`.
              - name: ClickHouse high network usage
                description: High network usage. ClickHouse network usage exceeds 100MB/s.
                query: "rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024"
                severity: warning
                for: 2m
                comments: |
                  Please replace the threshold with an appropriate value
              - name: ClickHouse distributed rejected inserts
                description: "INSERTs into Distributed tables rejected due to pending bytes limit."
                query: "increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0"
                severity: critical
                for: 2m

      # CouchDB service entry; rules target gesellix/couchdb-prometheus-exporter metrics.
      - name: CouchDB
        exporters:
          - name: gesellix/couchdb-prometheus-exporter
            slug: gesellix-couchdb-prometheus-exporter
            doc_url: https://github.com/gesellix/couchdb-prometheus-exporter
            rules:
              # Either of the two "up" gauges reading 0 for 2 minutes.
              - name: CouchDB node down
                description: CouchDB node is not responding (node_up metric is 0) for more than 2 minutes
                query: "couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0"
                severity: critical
                for: 2m
              # Compares atom memory in use against 90% of the atom memory metric.
              - name: CouchDB atom memory usage critical
                description: Atom memory usage is above 90% of limit
                query: "couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom"
                severity: critical
                for: 5m
              # The capacity limits in the next two rules are literals in the query (1000 open
              # databases, 65535 file descriptors) rather than metrics - presumably the
              # configured server limits; adjust them to the deployment (TODO confirm).
              - name: CouchDB open databases critical
                description: Number of open databases exceeds 90% of node capacity
                query: "couchdb_httpd_open_databases > 0.9 * 1000"
                severity: critical
                for: 5m
              - name: CouchDB open OS files critical
                description: CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files
                query: "couchdb_httpd_open_os_files > 0.9 * 65535"
                severity: critical
                for: 5m
              # The status-code series carry a `code` label that the request counter lacks, so a
              # plain division finds no matching label sets and returns nothing. Summing the 5xx
              # rates `without (code)` removes that label (and adds up all 5xx codes) so both
              # sides match one-to-one. The trailing `and ... > 0` skips idle nodes.
              - name: CouchDB 5xx error ratio high
                description: More than 5% of HTTP requests are returning 5xx errors
                query: "sum without (code) (rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0"
                severity: critical
                for: 5m
              # Per-second rate of temporary view reads above 100, sustained for 5 minutes.
              - name: CouchDB temporary view read rate critical
                description: Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation
                query: "rate(couchdb_httpd_temporary_view_reads[5m]) > 100"
                severity: critical
                for: 5m
              # Mango rules: per-second rates over 5m compared with literal thresholds (50, 5, 1000).
              - name: CouchDB Mango queries scanning too many docs
                description: Some Mango queries are scanning too many documents, consider adding indexes
                query: "rate(couchdb_mango_too_many_docs_scanned[5m]) > 50"
                severity: warning
                for: 5m
              - name: CouchDB Mango queries failed due to invalid index
                description: Some Mango queries failed to execute because the index was missing or invalid
                query: "rate(couchdb_mango_query_invalid_index[5m]) > 5"
                severity: warning
                for: 5m
              - name: CouchDB Mango docs examined high
                description: High number of documents examined per Mango queries, consider indexing
                query: "rate(couchdb_mango_docs_examined[5m]) > 1000"
                severity: warning
                for: 5m
              # Replicator death / failed-start rules: any increase of the respective counter
              # within 5 minutes, held for 1 minute.
              - name: CouchDB Replicator manager died
                description: Replication manager process has crashed
                query: "increase(couchdb_replicator_changes_manager_deaths[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB Replicator queue process died
                description: Replication queue process has crashed
                query: "increase(couchdb_replicator_changes_queue_deaths[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB Replicator reader process died
                description: Replication reader process has crashed
                query: "increase(couchdb_replicator_changes_reader_deaths[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB Replicator failed to start
                description: One or more replication tasks failed to start
                query: "increase(couchdb_replicator_failed_starts[5m]) > 0"
                severity: critical
                for: 1m
              # Stability gauge reading 0 for 2 minutes.
              - name: CouchDB replication cluster unstable
                description: The replication cluster is unstable, replication may be interrupted
                query: "couchdb_replicator_cluster_is_stable == 0"
                severity: critical
                for: 2m
              # More than 5 read failures within 5 minutes, sustained for 5 minutes.
              - name: CouchDB replication read failures
                description: Replication changes feed has failed reads more than 5 times in 5 minutes
                query: "increase(couchdb_replicator_changes_read_failures[5m]) > 5"
                severity: warning
                for: 5m
              - name: CouchDB file descriptors high
                description: Process is using more than 85% of allowed file descriptors
                query: "process_open_fds / process_max_fds > 0.85 and process_max_fds > 0"
                severity: warning
                for: 5m
              - name: CouchDB process restarted
                description: CouchDB process has restarted recently
                query: "changes(process_start_time_seconds[1h]) > 0"
                severity: info
                for: 1m
              - name: CouchDB critical log entries
                description: Critical or error log entries detected in the last 5 minutes
                query: "increase(couchdb_server_couch_log{level=~\"error|critical\"}[5m]) > 0"
                severity: critical
                for: 1m

      - name: Solr
        exporters:
          - name: embedded exporter
            slug: embedded-exporter
            doc_url: https://solr.apache.org/guide/8_11/monitoring-solr-with-prometheus-and-grafana.html
            rules:
              - name: Solr update errors
                description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1"
                severity: critical
              - name: Solr query errors
                description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1'
                severity: warning
                for: 5m
              - name: Solr replication errors
                description: Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1'
                severity: critical
              - name: Solr low live node count
                description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: "solr_collections_live_nodes < 2"
                severity: critical

  - name: Message brokers
    services:
      - name: RabbitMQ
        exporters:
          - name: rabbitmq/rabbitmq-prometheus
            slug: rabbitmq-exporter
            doc_url: https://github.com/rabbitmq/rabbitmq-prometheus
            rules:
              - name: RabbitMQ node down
                description: Less than 3 nodes running in RabbitMQ cluster
                query: "sum(rabbitmq_build_info) < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: RabbitMQ node not distributed
                description: Distribution link state is not 'up'
                query: "erlang_vm_dist_node_state < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: RabbitMQ instances different versions
                description: Running different version of RabbitMQ in the same cluster, can lead to failure.
                query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1"
                severity: warning
                for: 1h
              - name: RabbitMQ memory high
                description: A node use more than 90% of allocated RAM
                query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0"
                severity: warning
                for: 2m
              - name: RabbitMQ file descriptors usage
                description: A node use more than 90% of file descriptors
                query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0"
                severity: warning
                for: 2m
              - name: RabbitMQ too many ready messages
                # The query aggregates BY (queue), so `queue` is the only label left on the alert;
                # `instance` would always render empty in the description.
                description: RabbitMQ too many ready messages in queue {{ $labels.queue }}
                query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000"
                severity: warning
                for: 1m
              - name: RabbitMQ too many unack messages
                description: Too many unacknowledged messages
                query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000"
                severity: warning
                for: 1m
              - name: RabbitMQ too many connections
                description: The total connections of a node is too high
                query: "rabbitmq_connections > 1000"
                severity: warning
                for: 2m
              - name: RabbitMQ no queue consumer
                description: A queue has less than 1 consumer
                query: "rabbitmq_queue_consumers < 1"
                severity: warning
                for: 1m # allows a short service restart
              - name: RabbitMQ unroutable messages
                description: A queue has unroutable messages ({{ $value }} in the last 1m)
                query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0"
                severity: warning
                for: 2m

          - name: kbudde/rabbitmq-exporter
            slug: kbudde-rabbitmq-exporter
            doc_url: https://github.com/kbudde/rabbitmq_exporter
            rules:
              - name: RabbitMQ down
                description: RabbitMQ node down
                query: "rabbitmq_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: RabbitMQ cluster down
                description: Less than 3 nodes running in RabbitMQ cluster
                query: "sum(rabbitmq_running) < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: RabbitMQ cluster partition
                description: Cluster partition
                query: "rabbitmq_partitions > 0"
                severity: critical
              - name: RabbitMQ out of memory
                description: Memory available for RabbitMQ is low (< 10%)
                query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0"
                severity: warning
                for: 2m
              - name: RabbitMQ instance too many connections
                description: RabbitMQ instance has too many connections (> 1000)
                query: "rabbitmq_connectionsTotal > 1000"
                severity: warning
                for: 2m
              - name: RabbitMQ dead letter queue filling up
                description: Dead letter queue is filling up (> 10 msgs)
                query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
                severity: warning
                for: 1m
                comments: |
                  Indicate the queue name in dedicated label.
              - name: RabbitMQ too many messages in queue
                description: Queue is filling up (> 1000 msgs)
                query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
                severity: warning
                for: 2m
                comments: |
                  Indicate the queue name in dedicated label.
              - name: RabbitMQ slow queue consuming
                description: Queue messages are consumed slowly (> 60s)
                query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
                severity: warning
                for: 2m
                comments: |
                  Indicate the queue name in dedicated label.
              - name: RabbitMQ no consumer
                description: Queue has no consumer
                query: "rabbitmq_queue_consumers == 0"
                severity: critical
                for: 5m
                comments: |
                  Allows a short service restart.
              - name: RabbitMQ too many consumers
                description: Queue should have only 1 consumer
                query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
                severity: critical
                comments: |
                  Indicate the queue name in dedicated label.
              - name: RabbitMQ inactive exchange
                description: Exchange receive less than 5 msgs per second
                query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
                severity: warning
                comments: |
                  Indicate the exchange name in dedicated label.
                for: 2m

      - name: Zookeeper
        exporters:
          - name: cloudflare/kafka_zookeeper_exporter
            slug: cloudflare-kafka-zookeeper-exporter
            doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter
            # No alert rules are defined for this exporter yet: the `rules` key below is empty (null).
            rules:
          - name: dabealu/zookeeper-exporter
            slug: dabealu-zookeeper-exporter
            doc_url: https://github.com/dabealu/zookeeper-exporter
            rules:
              - name: Zookeeper Down
                description: "Zookeeper down on instance {{ $labels.instance }}"
                query: "zk_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Zookeeper missing leader
                description: "Zookeeper cluster has no node marked as leader"
                query: "sum(zk_server_leader) == 0"
                severity: critical
              - name: Zookeeper Too Many Leaders
                description: "Zookeeper cluster has too many nodes marked as leader"
                query: "sum(zk_server_leader) > 1"
                severity: critical
              - name: Zookeeper Not Ok
                description: "Zookeeper instance is not ok"
                query: "zk_ruok == 0"
                severity: warning
                for: 3m

      - name: Kafka
        exporters:
          - name: danielqsj/kafka_exporter
            slug: danielqsj-kafka-exporter
            doc_url: https://github.com/danielqsj/kafka_exporter
            rules:
              - name: Kafka topics replicas
                description: Kafka topic in-sync partition
                query: "min(kafka_topic_partition_in_sync_replica) by (topic) < 3"
                severity: critical
              - name: Kafka consumer group lag
                description: Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)
                query: "sum(kafka_consumergroup_lag) by (consumergroup) > 10000"
                severity: warning
                for: 1m
          - name: linkedin/Burrow
            slug: linkedin-kafka-exporter
            doc_url: https://github.com/linkedin/Burrow
            rules:
              - name: Kafka topic offset decreased
                description: Kafka topic offset has decreased
                query: "delta(kafka_burrow_partition_current_offset[1m]) < 0"
                severity: warning
              - name: Kafka consumer lag
                description: Kafka consumer has a 30 minutes and increasing lag
                query: "kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0"
                severity: warning
                for: 15m

      - name: Pulsar
        exporters:
          - name: embedded exporter
            slug: embedded-exporter
            doc_url: https://pulsar.apache.org/docs/reference-metrics/
            rules:
              - name: Pulsar subscription high number of backlog entries
                description: "The number of subscription backlog entries is over 5k"
                query: sum(pulsar_subscription_back_log) by (subscription) > 5000
                for: 1h
                severity: warning
              - name: Pulsar subscription very high number of backlog entries
                description: "The number of subscription backlog entries is over 100k"
                query: sum(pulsar_subscription_back_log) by (subscription) > 100000
                for: 1h
                severity: critical
              - name: Pulsar topic large backlog storage size
                description: "The topic backlog storage size is over 5 GB"
                query: sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024
                for: 1h
                severity: warning
              - name: Pulsar topic very large backlog storage size
                description: "The topic backlog storage size is over 20 GB"
                query: sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024
                for: 1h
                severity: critical
              - name: Pulsar high write latency
                description: "Messages cannot be written in a timely fashion"
                query: sum(pulsar_storage_write_latency_overflow > 0) by (topic)
                for: 1h
                severity: critical
              - name: Pulsar large message payload
                description: "Observing large message payload (> 1MB)"
                query: sum(pulsar_entry_size_overflow > 0) by (topic)
                for: 1h
                severity: warning
              - name: Pulsar high ledger disk usage
                description: "Observing Ledger Disk Usage (> 75%)"
                query: sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75
                for: 1h
                severity: critical
              - name: Pulsar read only bookies
                description: "Observing Readonly Bookies"
                query: count(bookie_SERVER_STATUS{} == 0) by (pod)
                for: 5m
                severity: critical
              - name: Pulsar high number of function errors
                # rate() yields a per-second average over the 1m window, so the threshold of 10
                # means 10 errors per second (not per minute).
                description: "Observing more than 10 Function errors per second"
                query: sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10
                for: 1m
                severity: critical
              - name: Pulsar high number of sink errors
                # rate() yields a per-second average over the 1m window, so the threshold of 10
                # means 10 errors per second (not per minute).
                description: "Observing more than 10 Sink errors per second"
                query: sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10
                for: 1m
                severity: critical

      - name: Nats
        exporters:
          - name: nats-io/prometheus-nats-exporter
            slug: nats-exporter
            doc_url: https://github.com/nats-io/prometheus-nats-exporter
            rules:
              - name: Nats high routes count
                description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
                query: "gnatsd_varz_routes > 10"
                severity: warning
                for: 3m
              - name: Nats high memory usage
                description: NATS server memory usage is above 200MB for {{ $labels.instance }}
                query: "gnatsd_varz_mem > 200 * 1024 * 1024"
                severity: warning
                for: 5m
              - name: Nats slow consumers
                description: There are slow consumers in NATS for {{ $labels.instance }}
                query: "gnatsd_varz_slow_consumers > 0"
                severity: critical
                for: 3m
              - name: Nats server down
                description: NATS server has been down for more than 5 minutes
                # absent() only matches when the `up` series does not exist at all (e.g. target removed).
                # A target that is still configured but failing scrapes reports up == 0, so check both.
                query: 'up{job="nats"} == 0 or absent(up{job="nats"})'
                severity: critical
                for: 5m
              - name: Nats high CPU usage
                description: NATS server is using more than 80% CPU for the last 5 minutes
                query: "gnatsd_varz_cpu > 80"
                severity: warning
                for: 5m
                comments: |
                  gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
              - name: Nats high number of connections
                description: NATS server has more than 1000 active connections
                query: "gnatsd_connz_num_connections > 1000"
                severity: warning
                for: 5m
              - name: Nats high JetStream store usage
                description: JetStream store usage is over 80%
                query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0"
                severity: warning
                for: 5m
              - name: Nats high JetStream memory usage
                description: JetStream memory usage is over 80%
                query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0"
                severity: warning
                for: 5m
              - name: Nats high number of subscriptions
                description: NATS server has more than 1000 active subscriptions
                query: "gnatsd_connz_subscriptions > 1000"
                severity: warning
                for: 5m
              - name: Nats high pending bytes
                description: NATS server has more than 100,000 pending bytes
                query: "gnatsd_connz_pending_bytes > 100000"
                severity: warning
                for: 5m
              - name: Nats too many errors
                description: NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes
                query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
                severity: warning
                for: 5m
              - name: Nats JetStream accounts exceeded
                description: JetStream has more than 100 active accounts
                query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
                severity: warning
                for: 5m
              - name: Nats leaf node connection issue
                description: No leaf node connections on {{ $labels.instance }}
                query: "gnatsd_varz_leafnodes == 0"
                severity: warning
                for: 5m

  - name: Proxies, load balancers and service meshes
    services:
      - name: Nginx
        exporters:
          - name: knyar/nginx-lua-prometheus
            slug: knyar-nginx-exporter
            doc_url: https://github.com/knyar/nginx-lua-prometheus
            rules:
              - name: Nginx high HTTP 4xx error rate
                description: Too many HTTP requests with status 4xx (> 5%)
                query: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: Nginx high HTTP 5xx error rate
                description: Too many HTTP requests with status 5xx (> 5%)
                query: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: Nginx latency high
                description: Nginx p99 latency is higher than 3 seconds
                query: "histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3"
                severity: warning
                for: 2m

      - name: Apache
        exporters:
          - name: Lusitaniae/apache_exporter
            slug: lusitaniae-apache-exporter
            doc_url: https://github.com/Lusitaniae/apache_exporter
            rules:
              - name: Apache down
                description: Apache down
                query: "apache_up == 0"
                severity: critical
              - name: Apache workers load
                description: Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}
                query: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0'
                severity: warning
                for: 2m
              - name: Apache restart
                description: Apache has just been restarted.
                query: "apache_uptime_seconds_total / 60 < 1"
                severity: warning

      - name: HaProxy
        exporters:
          - name: Embedded exporter (HAProxy >= v2)
            slug: embedded-exporter-v2
            doc_url: https://github.com/haproxy/haproxy/tree/master/contrib/prometheus-exporter
            rules:
              - name: HAProxy high HTTP 4xx error rate backend
                # The query aggregates by (proxy), so `proxy` is the only label available to the template.
                description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}
                query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate backend
                # The query aggregates by (proxy), so `proxy` is the only label available to the template.
                description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}
                query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 4xx error rate server
                description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
                query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate server
                description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
                query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy server response errors
                description: Too many response errors to {{ $labels.server }} server (> 5%).
                query: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy backend connection errors
                # The query aggregates by (proxy), so `proxy` is the only label available to the template.
                description: Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.
                query: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100
                severity: critical
                for: 1m
              - name: HAProxy server connection errors
                description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
                # Aggregate per server (not per proxy) so the alert is raised for the failing server
                # and the `server` label used in the description is preserved.
                query: (sum by (server) (rate(haproxy_server_connection_errors_total[1m]))) > 100
                severity: critical
              - name: HAProxy backend max active session > 80%
                description: Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf "%.2f"}}%
                query: ((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80
                severity: warning
                for: 2m
              - name: HAProxy pending requests
                description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
                query: sum by (proxy) (haproxy_backend_current_queue) > 0
                comments: |
                  haproxy_backend_current_queue is a gauge (current queue depth), not a counter.
                severity: warning
                for: 2m
              - name: HAProxy HTTP slowing down
                description: Average request time is increasing - {{ $value | printf "%.2f"}}
                query: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1
                severity: warning
                for: 1m
              - name: HAProxy retry high
                description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
                query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10
                severity: warning
                for: 2m
              - name: HAproxy has no alive backends
                description: HAProxy has no alive active or backup backends for {{ $labels.proxy }}
                query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0
                severity: critical
              - name: HAProxy frontend security blocked requests
                description: HAProxy is blocking requests for security reason
                query: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10
                severity: warning
                for: 2m
              - name: HAProxy server healthcheck failure
                description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
                query: increase(haproxy_server_check_failures_total[1m]) > 0
                severity: warning
                for: 1m
          - name: prometheus/haproxy_exporter (HAProxy < v2)
            slug: haproxy-exporter-v1
            doc_url: https://github.com/prometheus/haproxy_exporter
            rules:
              - name: HAProxy down
                description: HAProxy down
                query: "haproxy_up == 0"
                severity: critical
              - name: HAProxy high HTTP 4xx error rate backend (v1)
                description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
                # The ratio is scaled by 100 so that the `> 5` threshold is a percentage;
                # an unscaled 0..1 fraction could never exceed 5.
                query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate backend (v1)
                description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
                # The ratio is scaled by 100 so that the `> 5` threshold is a percentage;
                # an unscaled 0..1 fraction could never exceed 5.
                query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 4xx error rate server (v1)
                description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
                query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate server (v1)
                description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
                query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: HAProxy server response errors (v1)
                description: Too many response errors to {{ $labels.server }} server (> 5%).
                query: "sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0"
                severity: critical
                for: 1m
              - name: HAProxy backend connection errors (v1)
                description: Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.
                query: "sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100"
                severity: critical
                for: 1m
              - name: HAProxy server connection errors (v1)
                description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
                query: "sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100"
                severity: critical
              - name: HAProxy backend max active session
                description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
                query: "((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0"
                severity: warning
                for: 2m
              - name: HAProxy pending requests (v1)
                description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend
                query: "sum by (backend) (haproxy_backend_current_queue) > 0"
                severity: warning
                for: 2m
              - name: HAProxy HTTP slowing down (v1)
                description: Average request time is increasing
                query: "avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1"
                severity: warning
                for: 1m
              - name: HAProxy retry high (v1)
                description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend
                query: "sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10"
                severity: warning
                for: 2m
              - name: HAProxy backend down
                description: HAProxy backend is down
                query: "haproxy_backend_up == 0"
                severity: critical
              - name: HAProxy server down
                description: HAProxy server is down
                query: "haproxy_server_up == 0"
                severity: critical
              - name: HAProxy frontend security blocked requests (v1)
                description: HAProxy is blocking requests for security reason
                query: "sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10"
                severity: warning
                for: 2m
              - name: HAProxy server healthcheck failure (v1)
                description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
                query: "increase(haproxy_server_check_failures_total[1m]) > 0"
                severity: warning
                for: 1m

      - name: Traefik
        # Two rule sets are provided because the metric families differ:
        # the v2 rules query traefik_service_* and the v1 rules query traefik_backend_*.
        exporters:
          - name: Embedded exporter v2
            slug: embedded-exporter-v2
            doc_url: https://docs.traefik.io/observability/metrics/prometheus/
            rules:
              # count() counts series regardless of their value and yields no result
              # when there are no series, so `count(...) == 0` can never fire.
              # sum() is used instead: assuming the gauge is 1 for a healthy server
              # and 0 otherwise, the sum is 0 only when every server of a service is down.
              - name: Traefik service down
                description: All Traefik services are down
                query: "sum(traefik_service_server_up) by (service) == 0"
                severity: critical
              # Share of 4xx responses per service over 3m, as a percentage.
              # The trailing `and ... > 0` restricts the alert to services with traffic.
              - name: Traefik high HTTP 4xx error rate service
                description: Traefik service 4xx error rate is above 5%
                query: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0'
                severity: critical
                for: 1m
              # Same ratio as above, for 5xx responses.
              - name: Traefik high HTTP 5xx error rate service
                description: Traefik service 5xx error rate is above 5%
                query: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0'
                severity: critical
                for: 1m
          - name: Embedded exporter v1
            slug: embedded-exporter-v1
            doc_url: https://docs.traefik.io/observability/metrics/prometheus/
            rules:
              # Same reasoning as the v2 "service down" rule: `count(...) == 0` is
              # never true, so the 0/1 gauge is summed per backend instead.
              - name: Traefik backend down
                description: All Traefik backends are down
                query: "sum(traefik_backend_server_up) by (backend) == 0"
                severity: critical
              # Share of 4xx responses per backend over 3m, as a percentage.
              # The trailing `and ... > 0` restricts the alert to backends with traffic.
              - name: Traefik high HTTP 4xx error rate backend
                description: Traefik backend 4xx error rate is above 5%
                query: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0'
                severity: critical
                for: 1m
              # Same ratio as above, for 5xx responses.
              - name: Traefik high HTTP 5xx error rate backend
                description: Traefik backend 5xx error rate is above 5%
                query: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0'
                severity: critical
                for: 1m

      - name: Caddy
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://caddyserver.com/docs/metrics
            rules:
              # count() counts series regardless of their value and yields no result
              # when there are no series, so `count(...) == 0` can never fire.
              # sum() is used instead: assuming the gauge is 1 for a healthy upstream
              # and 0 otherwise, the sum is 0 only when the upstream is unhealthy everywhere.
              - name: Caddy Reverse Proxy Down
                description: "All Caddy reverse proxies are down"
                query: "sum(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0"
                severity: critical

              # Share of 4xx responses per instance over 3m, as a percentage.
              # The trailing `and ... > 0` restricts the alert to instances with traffic.
              - name: Caddy high HTTP 4xx error rate service
                description: "Caddy service 4xx error rate is above 5%"
                query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
                severity: critical
                for: 1m
              # Same ratio as above, for 5xx responses.
              - name: Caddy high HTTP 5xx error rate service
                description: "Caddy service 5xx error rate is above 5%"
                query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
                severity: critical
                for: 1m

      - name: Envoy
        exporters:
          - name: Built-in metrics
            slug: embedded-exporter
            doc_url: https://www.envoyproxy.io/docs/envoy/latest/configuration/observability/statistics
            rules:
              # --- Server-level rules ---
              - name: Envoy server not live
                description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}"
                query: "envoy_server_live != 1"
                severity: critical
                for: 1m
              # Allocated memory as a percentage of heap size; the `and ... > 0`
              # clause skips series whose heap size is reported as 0.
              - name: Envoy high memory usage
                description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}"
                query: "envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0"
                severity: warning
                for: 5m
              # --- Downstream (client-facing) rules ---
              # Error-rate rules divide the per-class response rate by the completed
              # request rate per instance, and only apply when requests were completed.
              - name: Envoy high downstream HTTP 5xx error rate
                description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
                query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
                severity: critical
                for: 1m
              - name: Envoy high downstream HTTP 4xx error rate
                description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
                query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
                severity: warning
                for: 5m
              - name: Envoy downstream connections overflowing
                description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_listener_downstream_cx_overflow[5m]) > 5"
                severity: warning
              # --- Cluster (upstream) rules ---
              - name: Envoy cluster membership empty
                description: "Envoy cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} has no healthy members"
                query: "envoy_cluster_membership_healthy == 0"
                severity: critical
                for: 1m
              # Healthy members as a percentage of total members; clusters with no
              # members at all are excluded by the `and ... > 0` clause.
              - name: Envoy cluster membership degraded
                description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy"
                query: "envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0"
                severity: warning
                for: 5m
              - name: Envoy high cluster upstream connection failures
                description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10"
                severity: warning
                for: 5m
              - name: Envoy high cluster upstream request timeout rate
                description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                query: "rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0"
                severity: warning
                for: 5m
              - name: Envoy high cluster upstream 5xx error rate
                description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                query: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
                severity: critical
                for: 1m
              - name: Envoy cluster health check failures
                description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_cluster_health_check_failure[5m]) > 5"
                severity: warning
                for: 5m
              - name: Envoy cluster outlier detection ejections active
                description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                query: "envoy_cluster_outlier_detection_ejections_active > 0"
                severity: info
                for: 5m
              # --- Listener / TLS rules ---
              - name: Envoy listener SSL connection errors
                description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_listener_ssl_connection_error[5m]) > 5"
                severity: warning
              - name: Envoy global downstream connections overflowing
                description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5"
                severity: critical
              - name: Envoy SSL certificate expiring soon
                description: "SSL certificate loaded by Envoy on {{ $labels.instance }} expires in less than 7 days"
                query: "envoy_server_days_until_first_cert_expiring < 7"
                severity: warning
              # Envoy gauges are unsigned, so this value cannot drop below 0 and a
              # `< 0` comparison could never fire. The gauge is expected to bottom out
              # at 0 once a certificate has expired (NOTE(review): confirm against the
              # Envoy version in use), hence the `<= 0` comparison.
              - name: Envoy SSL certificate expired
                description: "SSL certificate loaded by Envoy on {{ $labels.instance }} has expired"
                query: "envoy_server_days_until_first_cert_expiring <= 0"
                severity: critical
              # Fires when either the connection or the request circuit breaker
              # of the default priority reports open.
              - name: Envoy cluster circuit breaker tripped
                description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                query: "envoy_cluster_circuit_breakers_default_cx_open == 1 or envoy_cluster_circuit_breakers_default_rq_open == 1"
                severity: critical
              - name: Envoy no healthy upstream
                description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0"
                severity: critical
              # Despite the name, this compares an absolute count over 5m
              # (increase), not a ratio of timed-out requests.
              - name: Envoy high downstream request timeout rate
                description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_http_downstream_rq_timeout[5m]) > 5"
                severity: warning
                for: 5m

      # Linkerd service mesh: a single rule alerting on the proxy-reported failure ratio.
      - name: Linkerd
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://linkerd.io/2/tasks/exporting-metrics/
            rules:
              # Failure responses as a percentage of all responses over 1m, grouped by
              # the deployment/statefulset/daemonset labels. The description concatenates
              # the same three labels. The trailing `and ... > 0` clause restricts the
              # alert to workloads that actually received responses in the window.
              - name: Linkerd high error rate
                description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%"
                query: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0'
                comments: |
                  Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}.
                severity: warning
                for: 1m

      - name: Istio
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/
            rules:
              - name: Istio Kubernetes gateway availability drop
                description: Gateway pods have dropped. Inbound traffic will likely be affected.
                query: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2'
                severity: warning
                for: 1m
              - name: Istio Pilot high total request rate
                description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.
                query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0"
                severity: warning
                for: 1m
              - name: Istio Mixer Prometheus dispatches low
                description: Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.
                query: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
                severity: warning
                for: 1m
              - name: Istio high total request rate
                description: Global request rate in the service mesh is unusually high.
                query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
                severity: warning
                for: 2m
              - name: Istio low total request rate
                description: Global request rate in the service mesh is unusually low.
                query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
                severity: warning
                for: 2m
              - name: Istio high 4xx error rate
                description: High percentage of HTTP 4xx responses in Ist

Download .txt

gitextract_epx1dxil/

├── .github/
│   ├── FUNDING.yml
│   ├── dependabot.yml
│   └── workflows/
│       ├── dist.yml
│       └── test.yml
├── .gitignore
├── .travis.yml
├── CLAUDE.md
├── CONTRIBUTING.md
├── Gemfile
├── LICENSE
├── README.md
├── _config.yml
├── _data/
│   └── rules.yml
├── _layouts/
│   └── default.html
├── alertmanager.md
├── assets/
│   ├── css/
│   │   └── app.css
│   └── js/
│       └── app.js
├── blackbox-exporter.md
├── dist/
│   ├── rules/
│   │   ├── apache/
│   │   │   └── lusitaniae-apache-exporter.yml
│   │   ├── apache-flink/
│   │   │   └── flink-prometheus-reporter.yml
│   │   ├── apache-spark/
│   │   │   └── spark-prometheus.yml
│   │   ├── apc-ups/
│   │   │   └── apcupsd_exporter.yml
│   │   ├── argocd/
│   │   │   └── embedded-exporter.yml
│   │   ├── aws-cloudwatch/
│   │   │   └── prometheus-cloudwatch-exporter.yml
│   │   ├── azure/
│   │   │   └── azure-metrics-exporter.yml
│   │   ├── blackbox/
│   │   │   └── blackbox-exporter.yml
│   │   ├── caddy/
│   │   │   └── embedded-exporter.yml
│   │   ├── cassandra/
│   │   │   ├── criteo-cassandra-exporter.yml
│   │   │   └── instaclustr-cassandra-exporter.yml
│   │   ├── ceph/
│   │   │   └── embedded-exporter.yml
│   │   ├── cert-manager/
│   │   │   └── embedded-exporter.yml
│   │   ├── cilium/
│   │   │   └── embedded-exporter.yml
│   │   ├── clickhouse/
│   │   │   └── embedded-exporter.yml
│   │   ├── cloudflare/
│   │   │   └── lablabs-cloudflare-exporter.yml
│   │   ├── consul/
│   │   │   └── consul-exporter.yml
│   │   ├── coredns/
│   │   │   └── embedded-exporter.yml
│   │   ├── cortex/
│   │   │   └── embedded-exporter.yml
│   │   ├── couchdb/
│   │   │   └── gesellix-couchdb-prometheus-exporter.yml
│   │   ├── digitalocean/
│   │   │   └── digitalocean-exporter.yml
│   │   ├── docker-containers/
│   │   │   └── google-cadvisor.yml
│   │   ├── ebpf/
│   │   │   └── ebpf-exporter.yml
│   │   ├── elasticsearch/
│   │   │   └── prometheus-community-elasticsearch-exporter.yml
│   │   ├── envoy/
│   │   │   └── embedded-exporter.yml
│   │   ├── etcd/
│   │   │   └── embedded-exporter.yml
│   │   ├── fluxcd/
│   │   │   └── embedded-exporter.yml
│   │   ├── freeswitch/
│   │   │   └── znerol-freeswitch-exporter.yml
│   │   ├── gitlab-ci/
│   │   │   ├── gitaly.yml
│   │   │   ├── gitlab-built-in-exporter.yml
│   │   │   └── workhorse.yml
│   │   ├── golang/
│   │   │   └── golang-exporter.yml
│   │   ├── google-cloud-stackdriver/
│   │   │   └── stackdriver-exporter.yml
│   │   ├── grafana-alloy/
│   │   │   └── embedded-exporter.yml
│   │   ├── grafana-mimir/
│   │   │   └── embedded-exporter.yml
│   │   ├── grafana-tempo/
│   │   │   └── embedded-exporter.yml
│   │   ├── graph-node/
│   │   │   └── embedded-exporter.yml
│   │   ├── hadoop/
│   │   │   └── jmx_exporter.yml
│   │   ├── haproxy/
│   │   │   ├── embedded-exporter-v2.yml
│   │   │   └── haproxy-exporter-v1.yml
│   │   ├── hashicorp-vault/
│   │   │   └── embedded-exporter.yml
│   │   ├── host-and-hardware/
│   │   │   └── node-exporter.yml
│   │   ├── ipmi/
│   │   │   └── ipmi-exporter.yml
│   │   ├── istio/
│   │   │   └── embedded-exporter.yml
│   │   ├── jaeger/
│   │   │   └── embedded-exporter.yml
│   │   ├── jenkins/
│   │   │   └── metric-plugin.yml
│   │   ├── juniper/
│   │   │   └── czerwonk-junos-exporter.yml
│   │   ├── jvm/
│   │   │   └── jvm-exporter.yml
│   │   ├── kafka/
│   │   │   ├── danielqsj-kafka-exporter.yml
│   │   │   └── linkedin-kafka-exporter.yml
│   │   ├── keycloak/
│   │   │   └── aerogear-keycloak-metrics-spi.yml
│   │   ├── kubernetes/
│   │   │   └── kubestate-exporter.yml
│   │   ├── linkerd/
│   │   │   └── embedded-exporter.yml
│   │   ├── loki/
│   │   │   └── embedded-exporter.yml
│   │   ├── meilisearch/
│   │   │   └── embedded-exporter.yml
│   │   ├── memcached/
│   │   │   └── memcached-exporter.yml
│   │   ├── minio/
│   │   │   └── embedded-exporter.yml
│   │   ├── mongodb/
│   │   │   ├── dcu-mongodb-exporter.yml
│   │   │   ├── percona-mongodb-exporter.yml
│   │   │   └── stefanprodan-mgob-exporter.yml
│   │   ├── mysql/
│   │   │   └── mysqld-exporter.yml
│   │   ├── nats/
│   │   │   └── nats-exporter.yml
│   │   ├── netdata/
│   │   │   └── embedded-exporter.yml
│   │   ├── nginx/
│   │   │   └── knyar-nginx-exporter.yml
│   │   ├── nomad/
│   │   │   └── embedded-exporter.yml
│   │   ├── openebs/
│   │   │   └── embedded-exporter.yml
│   │   ├── openstack/
│   │   │   └── openstack-exporter.yml
│   │   ├── opentelemetry-collector/
│   │   │   └── embedded-exporter.yml
│   │   ├── oracle-database/
│   │   │   └── iamseth-oracledb-exporter.yml
│   │   ├── patroni/
│   │   │   └── embedded-exporter-patroni.yml
│   │   ├── pgbouncer/
│   │   │   └── spreaker-pgbouncer-exporter.yml
│   │   ├── php-fpm/
│   │   │   └── bakins-fpm-exporter.yml
│   │   ├── postgresql/
│   │   │   └── postgres-exporter.yml
│   │   ├── process-exporter/
│   │   │   └── process-exporter.yml
│   │   ├── prometheus-self-monitoring/
│   │   │   └── embedded-exporter.yml
│   │   ├── promtail/
│   │   │   └── embedded-exporter.yml
│   │   ├── proxmox-ve/
│   │   │   └── prometheus-pve-exporter.yml
│   │   ├── pulsar/
│   │   │   └── embedded-exporter.yml
│   │   ├── python/
│   │   │   └── python-exporter.yml
│   │   ├── rabbitmq/
│   │   │   ├── kbudde-rabbitmq-exporter.yml
│   │   │   └── rabbitmq-exporter.yml
│   │   ├── redis/
│   │   │   └── oliver006-redis-exporter.yml
│   │   ├── ruby/
│   │   │   └── ruby-exporter.yml
│   │   ├── s.m.a.r.t-device-monitoring/
│   │   │   └── smartctl-exporter.yml
│   │   ├── sidekiq/
│   │   │   └── strech-sidekiq-exporter.yml
│   │   ├── snmp/
│   │   │   └── snmp-exporter.yml
│   │   ├── solr/
│   │   │   └── embedded-exporter.yml
│   │   ├── speedtest/
│   │   │   └── nlamirault-speedtest-exporter.yml
│   │   ├── spinnaker/
│   │   │   └── embedded-exporter.yml
│   │   ├── sql-server/
│   │   │   └── ozarklake-mssql-exporter.yml
│   │   ├── ssl/
│   │   │   └── tls/
│   │   │       └── ribbybibby-ssl-exporter.yml
│   │   ├── systemd/
│   │   │   └── systemd-exporter.yml
│   │   ├── thanos/
│   │   │   ├── thanos-bucket-replicate.yml
│   │   │   ├── thanos-compactor.yml
│   │   │   ├── thanos-component-absent.yml
│   │   │   ├── thanos-query.yml
│   │   │   ├── thanos-receiver.yml
│   │   │   ├── thanos-ruler.yml
│   │   │   ├── thanos-sidecar.yml
│   │   │   └── thanos-store.yml
│   │   ├── traefik/
│   │   │   ├── embedded-exporter-v1.yml
│   │   │   └── embedded-exporter-v2.yml
│   │   ├── vmware/
│   │   │   └── pryorda-vmware-exporter.yml
│   │   ├── windows-server/
│   │   │   └── windows-exporter.yml
│   │   ├── wireguard/
│   │   │   └── mindflavor-prometheus-wireguard-exporter.yml
│   │   ├── zfs/
│   │   │   ├── node-exporter.yml
│   │   │   └── zfs_exporter.yml
│   │   └── zookeeper/
│   │       ├── cloudflare-kafka-zookeeper-exporter.yml
│   │       └── dabealu-zookeeper-exporter.yml
│   └── template.yml
├── docker-compose.yml
├── index.md
├── package.json
├── rules.md
└── sleep-peacefully.md

Download .json

Condensed preview — 133 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (929K chars).

[
  {
    "path": ".github/FUNDING.yml",
    "chars": 37,
    "preview": "github: [samber]\nko_fi: samuelberthe\n"
  },
  {
    "path": ".github/dependabot.yml",
    "chars": 123,
    "preview": "---\nversion: 2\nupdates:\n  - package-ecosystem: \"github-actions\"\n    directory: \"/\"\n    schedule:\n      interval: \"monthl"
  },
  {
    "path": ".github/workflows/dist.yml",
    "chars": 2449,
    "preview": "name: Publish\n\non:\n  workflow_dispatch:\n  push:\n    branches:\n      - master\n\npermissions:\n  contents: write\n\njobs:\n  pu"
  },
  {
    "path": ".github/workflows/test.yml",
    "chars": 1916,
    "preview": "name: Promtool check\n\non:\n  pull_request:\n  push:\n    branches:\n      - master\n\njobs:\n  promtool-check:\n    name: Check "
  },
  {
    "path": ".gitignore",
    "chars": 106,
    "preview": "_site/\n.sass-cache/\n.jekyll-cache/\n.jekyll-metadata\n_data/rules.json\ntest/rules/\n/node_modules\n.worktrees/"
  },
  {
    "path": ".travis.yml",
    "chars": 38,
    "preview": "language: node_js\nnode_js:\n  - 'node'\n"
  },
  {
    "path": "CLAUDE.md",
    "chars": 17705,
    "preview": "# CLAUDE.md\n\nThis file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.\n\n## "
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 859,
    "preview": "\n# Contributing\n\n## Adding alerting rule\n\nIf you don't have time to write a PR, just copy and paste some alerts into an "
  },
  {
    "path": "Gemfile",
    "chars": 106,
    "preview": "source 'https://rubygems.org'\ngem 'github-pages', '>= 232', group: :jekyll_plugins\ngem 'webrick', '~> 1.8'"
  },
  {
    "path": "LICENSE",
    "chars": 112,
    "preview": "Creative Commons Attribution 4.0 International License (CC BY 4.0)\n\nhttp://creativecommons.org/licenses/by/4.0/\n"
  },
  {
    "path": "README.md",
    "chars": 9726,
    "preview": "# 👋 Awesome Prometheus Alerts [![Awesome](https://awesome.re/badge-flat.svg)](https://awesome.re)\n\n> Most alerting rules"
  },
  {
    "path": "_config.yml",
    "chars": 186,
    "preview": "theme: jekyll-theme-cayman\n\ntitle: Awesome Prometheus alerts\ndescription: Collection of alerting rules\n\nrepository: samb"
  },
  {
    "path": "_data/rules.yml",
    "chars": 369597,
    "preview": "#\n# The following yaml cannot be copy-pasted to Prometheus configuration.\n#     Please navigate to https://samber.github"
  },
  {
    "path": "_layouts/default.html",
    "chars": 5345,
    "preview": "<!DOCTYPE html>\n<html lang=\"{{ site.lang | default: \"en-US\" }}\">\n\n<head>\n  <meta charset=\"UTF-8\">\n  {% seo %}\n  <meta na"
  },
  {
    "path": "alertmanager.md",
    "chars": 4066,
    "preview": "<h1 style=\"text-align: center;\">\n  Global configuration\n</h1>\n\nIf you notice a delay between an event and the first noti"
  },
  {
    "path": "assets/css/app.css",
    "chars": 5869,
    "preview": "a.anchor {\n    font-size: 15px;\n    vertical-align: middle;\n    color: darkblue;\n    display: inline-block;\n    padding-"
  },
  {
    "path": "assets/js/app.js",
    "chars": 648,
    "preview": "$(function () {\n    var clipboardRules = new ClipboardJS('.clipboard-single', {\n        text: function (trigger) {\n     "
  },
  {
    "path": "blackbox-exporter.md",
    "chars": 4357,
    "preview": "\n<h1 style=\"text-align: center;\">\n  Blackbox exporter\n</h1>\n\n## Wordwide probes\n\n<a href=\"https://github.com/prometheus/"
  },
  {
    "path": "dist/rules/apache/lusitaniae-apache-exporter.yml",
    "chars": 1169,
    "preview": "groups:\n\n- name: LusitaniaeApacheExporter\n\n  \n  rules:\n\n    - alert: ApacheDown\n      expr: 'apache_up == 0'\n      for: "
  },
  {
    "path": "dist/rules/apache-flink/flink-prometheus-reporter.yml",
    "chars": 6013,
    "preview": "groups:\n\n- name: FlinkPrometheusReporter\n\n  \n  rules:\n\n    - alert: FlinkJobIsNotRunning\n      expr: 'flink_jobmanager_n"
  },
  {
    "path": "dist/rules/apache-spark/spark-prometheus.yml",
    "chars": 4540,
    "preview": "groups:\n\n- name: SparkPrometheus\n\n  # Spark exposes metrics via two built-in endpoints:\n  # - PrometheusServlet: master/"
  },
  {
    "path": "dist/rules/apc-ups/apcupsd_exporter.yml",
    "chars": 2237,
    "preview": "groups:\n\n- name: Apcupsd_exporter\n\n  \n  rules:\n\n    - alert: ApcUpsBatteryNearlyEmpty\n      expr: 'apcupsd_battery_charg"
  },
  {
    "path": "dist/rules/argocd/embedded-exporter.yml",
    "chars": 817,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: ArgocdServiceNotSynced\n      expr: 'argocd_app_info{sync_st"
  },
  {
    "path": "dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml",
    "chars": 7159,
    "preview": "groups:\n\n- name: PrometheusCloudwatchExporter\n\n  # CloudWatch metrics are exported as aws_{namespace}_{metric_name}_{sta"
  },
  {
    "path": "dist/rules/azure/azure-metrics-exporter.yml",
    "chars": 2925,
    "preview": "groups:\n\n- name: AzureMetricsExporter\n\n  # The exporter uses azurerm_resource_metric as the default metric name for forw"
  },
  {
    "path": "dist/rules/blackbox/blackbox-exporter.yml",
    "chars": 3696,
    "preview": "groups:\n\n- name: BlackboxExporter\n\n  \n  rules:\n\n    - alert: BlackboxProbeFailed\n      expr: 'probe_success == 0'\n      "
  },
  {
    "path": "dist/rules/caddy/embedded-exporter.yml",
    "chars": 1589,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: CaddyReverseProxyDown\n      expr: 'count(caddy_reverse_prox"
  },
  {
    "path": "dist/rules/cassandra/criteo-cassandra-exporter.yml",
    "chars": 8748,
    "preview": "groups:\n\n- name: CriteoCassandraExporter\n\n  \n  rules:\n\n    - alert: CassandraHintsCount\n      expr: 'changes(cassandra_s"
  },
  {
    "path": "dist/rules/cassandra/instaclustr-cassandra-exporter.yml",
    "chars": 5815,
    "preview": "groups:\n\n- name: InstaclustrCassandraExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert."
  },
  {
    "path": "dist/rules/ceph/embedded-exporter.yml",
    "chars": 4737,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: CephState\n      expr: 'ceph_health_status != 0'\n      for: "
  },
  {
    "path": "dist/rules/cert-manager/embedded-exporter.yml",
    "chars": 2341,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: Cert-managerAbsent\n      expr: 'absent(up{job=\"cert-manager"
  },
  {
    "path": "dist/rules/cilium/embedded-exporter.yml",
    "chars": 16260,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    # Metric name depends on Cilium version. Use cilium_unreachable_node"
  },
  {
    "path": "dist/rules/clickhouse/embedded-exporter.yml",
    "chars": 9012,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    # Adjust the job label to match your Prometheus configuration.\n    -"
  },
  {
    "path": "dist/rules/cloudflare/lablabs-cloudflare-exporter.yml",
    "chars": 1215,
    "preview": "groups:\n\n- name: LablabsCloudflareExporter\n\n  \n  rules:\n\n    - alert: CloudflareHttp4xxErrorRate\n      expr: '(sum by(zo"
  },
  {
    "path": "dist/rules/consul/consul-exporter.yml",
    "chars": 1168,
    "preview": "groups:\n\n- name: ConsulExporter\n\n  \n  rules:\n\n    - alert: ConsulServiceHealthcheckFailed\n      expr: 'consul_catalog_se"
  },
  {
    "path": "dist/rules/coredns/embedded-exporter.yml",
    "chars": 387,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: CorednsPanicCount\n      expr: 'increase(coredns_panics_tota"
  },
  {
    "path": "dist/rules/cortex/embedded-exporter.yml",
    "chars": 2731,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: CortexRulerConfigurationReloadFailure\n      expr: 'cortex_r"
  },
  {
    "path": "dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml",
    "chars": 7412,
    "preview": "groups:\n\n- name: GesellixCouchdbPrometheusExporter\n\n  \n  rules:\n\n    - alert: CouchdbNodeDown\n      expr: 'couchdb_httpd"
  },
  {
    "path": "dist/rules/digitalocean/digitalocean-exporter.yml",
    "chars": 4350,
    "preview": "groups:\n\n- name: DigitaloceanExporter\n\n  \n  rules:\n\n    - alert: DigitaloceanDropletDown\n      expr: 'digitalocean_dropl"
  },
  {
    "path": "dist/rules/docker-containers/google-cadvisor.yml",
    "chars": 5318,
    "preview": "groups:\n\n- name: GoogleCadvisor\n\n  \n  rules:\n\n    # This rule can be very noisy in dynamic infra with legitimate contain"
  },
  {
    "path": "dist/rules/ebpf/ebpf-exporter.yml",
    "chars": 1624,
    "preview": "groups:\n\n- name: EbpfExporter\n\n  \n  rules:\n\n    # The exporter uses loose attachment: if a program fails to load (missin"
  },
  {
    "path": "dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml",
    "chars": 8288,
    "preview": "groups:\n\n- name: PrometheusCommunityElasticsearchExporter\n\n  \n  rules:\n\n    - alert: ElasticsearchHeapUsageTooHigh\n     "
  },
  {
    "path": "dist/rules/envoy/embedded-exporter.yml",
    "chars": 9533,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: EnvoyServerNotLive\n      expr: 'envoy_server_live != 1'\n   "
  },
  {
    "path": "dist/rules/etcd/embedded-exporter.yml",
    "chars": 6390,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: EtcdInsufficientMembers\n      expr: 'count(etcd_server_id) "
  },
  {
    "path": "dist/rules/fluxcd/embedded-exporter.yml",
    "chars": 1911,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: FluxKustomizationFailure\n      expr: 'gotk_resource_info{re"
  },
  {
    "path": "dist/rules/freeswitch/znerol-freeswitch-exporter.yml",
    "chars": 1274,
    "preview": "groups:\n\n- name: ZnerolFreeswitchExporter\n\n  \n  rules:\n\n    - alert: FreeswitchDown\n      expr: 'freeswitch_up == 0'\n   "
  },
  {
    "path": "dist/rules/gitlab-ci/gitaly.yml",
    "chars": 3548,
    "preview": "groups:\n\n- name: Gitaly\n\n  \n  rules:\n\n    # grpc_code!=\"OK\" includes non-error codes like NotFound, AlreadyExists. Consi"
  },
  {
    "path": "dist/rules/gitlab-ci/gitlab-built-in-exporter.yml",
    "chars": 11318,
    "preview": "groups:\n\n- name: GitlabBuiltInExporter\n\n  \n  rules:\n\n    # Queued connections indicate Puma workers are saturated.\n    #"
  },
  {
    "path": "dist/rules/gitlab-ci/workhorse.yml",
    "chars": 1795,
    "preview": "groups:\n\n- name: Workhorse\n\n  \n  rules:\n\n    # Workhorse sits in front of Puma and handles Git HTTP, file uploads, and p"
  },
  {
    "path": "dist/rules/golang/golang-exporter.yml",
    "chars": 4737,
    "preview": "groups:\n\n- name: GolangExporter\n\n  \n  rules:\n\n    # Threshold is a rough default. High-concurrency servers may legitimat"
  },
  {
    "path": "dist/rules/google-cloud-stackdriver/stackdriver-exporter.yml",
    "chars": 2555,
    "preview": "groups:\n\n- name: StackdriverExporter\n\n  # Self-monitoring metrics use the stackdriver_monitoring_* prefix.\n  # All self-"
  },
  {
    "path": "dist/rules/grafana-alloy/embedded-exporter.yml",
    "chars": 502,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: GrafanaAlloyServiceDown\n      expr: 'count by (instance) (a"
  },
  {
    "path": "dist/rules/grafana-mimir/embedded-exporter.yml",
    "chars": 25602,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  # Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. Th"
  },
  {
    "path": "dist/rules/grafana-tempo/embedded-exporter.yml",
    "chars": 9591,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: TempoDistributorUnhealthy\n      expr: 'max by (job) (tempo_"
  },
  {
    "path": "dist/rules/graph-node/embedded-exporter.yml",
    "chars": 2592,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: ProviderFailedBecauseNet_versionFailed\n      expr: 'eth_rpc"
  },
  {
    "path": "dist/rules/hadoop/jmx_exporter.yml",
    "chars": 4056,
    "preview": "groups:\n\n- name: Jmx_exporter\n\n  \n  rules:\n\n    - alert: HadoopNameNodeDown\n      expr: 'up{job=\"hadoop-namenode\"} == 0'"
  },
  {
    "path": "dist/rules/haproxy/embedded-exporter-v2.yml",
    "chars": 7134,
    "preview": "groups:\n\n- name: EmbeddedExporterV2\n\n  \n  rules:\n\n    - alert: HaproxyHighHttp4xxErrorRateBackend\n      expr: '((sum by "
  },
  {
    "path": "dist/rules/haproxy/haproxy-exporter-v1.yml",
    "chars": 7654,
    "preview": "groups:\n\n- name: HaproxyExporterV1\n\n  \n  rules:\n\n    - alert: HaproxyDown\n      expr: 'haproxy_up == 0'\n      for: 0m\n  "
  },
  {
    "path": "dist/rules/hashicorp-vault/embedded-exporter.yml",
    "chars": 1614,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: VaultSealed\n      expr: 'vault_core_unsealed == 0'\n      fo"
  },
  {
    "path": "dist/rules/host-and-hardware/node-exporter.yml",
    "chars": 16691,
    "preview": "groups:\n\n- name: NodeExporter\n\n  \n  rules:\n\n    - alert: HostOutOfMemory\n      expr: '(node_memory_MemAvailable_bytes / "
  },
  {
    "path": "dist/rules/ipmi/ipmi-exporter.yml",
    "chars": 7461,
    "preview": "groups:\n\n- name: IpmiExporter\n\n  \n  rules:\n\n    # The ipmi_up metric is per-collector. A value of 0 means the collector "
  },
  {
    "path": "dist/rules/istio/embedded-exporter.yml",
    "chars": 4944,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: IstioKubernetesGatewayAvailabilityDrop\n      expr: 'min(kub"
  },
  {
    "path": "dist/rules/jaeger/embedded-exporter.yml",
    "chars": 5163,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: JaegerAgentHttpServerErrors\n      expr: '100 * sum(rate(jae"
  },
  {
    "path": "dist/rules/jenkins/metric-plugin.yml",
    "chars": 3957,
    "preview": "groups:\n\n- name: MetricPlugin\n\n  \n  rules:\n\n    - alert: JenkinsNodeOffline\n      expr: 'jenkins_node_offline_value > 0'"
  },
  {
    "path": "dist/rules/juniper/czerwonk-junos-exporter.yml",
    "chars": 1162,
    "preview": "groups:\n\n- name: CzerwonkJunosExporter\n\n  \n  rules:\n\n    - alert: JuniperSwitchDown\n      expr: 'junos_up == 0'\n      fo"
  },
  {
    "path": "dist/rules/jvm/jvm-exporter.yml",
    "chars": 5455,
    "preview": "groups:\n\n- name: JvmExporter\n\n  \n  rules:\n\n    - alert: JvmMemoryFillingUp\n      expr: '(sum by (instance)(jvm_memory_us"
  },
  {
    "path": "dist/rules/kafka/danielqsj-kafka-exporter.yml",
    "chars": 828,
    "preview": "groups:\n\n- name: DanielqsjKafkaExporter\n\n  \n  rules:\n\n    - alert: KafkaTopicsReplicas\n      expr: 'min(kafka_topic_part"
  },
  {
    "path": "dist/rules/kafka/linkedin-kafka-exporter.yml",
    "chars": 1126,
    "preview": "groups:\n\n- name: LinkedinKafkaExporter\n\n  \n  rules:\n\n    - alert: KafkaTopicOffsetDecreased\n      expr: 'delta(kafka_bur"
  },
  {
    "path": "dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml",
    "chars": 4489,
    "preview": "groups:\n\n- name: AerogearKeycloakMetricsSpi\n\n  \n  rules:\n\n    # Threshold of 5% is a rough default. Adjust based on your"
  },
  {
    "path": "dist/rules/kubernetes/kubestate-exporter.yml",
    "chars": 18814,
    "preview": "groups:\n\n- name: KubestateExporter\n\n  \n  rules:\n\n    - alert: KubernetesNodeNotReady\n      expr: 'kube_node_status_condi"
  },
  {
    "path": "dist/rules/linkerd/embedded-exporter.yml",
    "chars": 799,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    # Linkerd does not expose request_errors_total. Errors are tracked v"
  },
  {
    "path": "dist/rules/loki/embedded-exporter.yml",
    "chars": 2006,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: LokiProcessTooManyRestarts\n      expr: 'changes(process_sta"
  },
  {
    "path": "dist/rules/meilisearch/embedded-exporter.yml",
    "chars": 786,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: MeilisearchIndexIsEmpty\n      expr: 'meilisearch_index_docs"
  },
  {
    "path": "dist/rules/memcached/memcached-exporter.yml",
    "chars": 4972,
    "preview": "groups:\n\n- name: MemcachedExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    - aler"
  },
  {
    "path": "dist/rules/minio/embedded-exporter.yml",
    "chars": 1183,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: MinioClusterDiskOffline\n      expr: 'minio_cluster_drive_of"
  },
  {
    "path": "dist/rules/mongodb/dcu-mongodb-exporter.yml",
    "chars": 3897,
    "preview": "groups:\n\n- name: DcuMongodbExporter\n\n  \n  rules:\n\n    - alert: MongodbReplicationLag(dcu)\n      expr: 'avg(mongodb_repls"
  },
  {
    "path": "dist/rules/mongodb/percona-mongodb-exporter.yml",
    "chars": 3442,
    "preview": "groups:\n\n- name: PerconaMongodbExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    -"
  },
  {
    "path": "dist/rules/mongodb/stefanprodan-mgob-exporter.yml",
    "chars": 402,
    "preview": "groups:\n\n- name: StefanprodanMgobExporter\n\n  \n  rules:\n\n    - alert: MgobBackupFailed\n      expr: 'changes(mgob_schedule"
  },
  {
    "path": "dist/rules/mysql/mysqld-exporter.yml",
    "chars": 6358,
    "preview": "groups:\n\n- name: MysqldExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    - alert: "
  },
  {
    "path": "dist/rules/nats/nats-exporter.yml",
    "chars": 5033,
    "preview": "groups:\n\n- name: NatsExporter\n\n  \n  rules:\n\n    - alert: NatsHighRoutesCount\n      expr: 'gnatsd_varz_routes > 10'\n     "
  },
  {
    "path": "dist/rules/netdata/embedded-exporter.yml",
    "chars": 3916,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    # This is a gauge metric (not a counter). Checking idle < 20% means "
  },
  {
    "path": "dist/rules/nginx/knyar-nginx-exporter.yml",
    "chars": 1437,
    "preview": "groups:\n\n- name: KnyarNginxExporter\n\n  \n  rules:\n\n    - alert: NginxHighHttp4xxErrorRate\n      expr: 'sum(rate(nginx_htt"
  },
  {
    "path": "dist/rules/nomad/embedded-exporter.yml",
    "chars": 1308,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: NomadJobFailed\n      expr: 'nomad_nomad_job_summary_failed "
  },
  {
    "path": "dist/rules/openebs/embedded-exporter.yml",
    "chars": 410,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: OpenebsUsedPoolCapacity\n      expr: 'openebs_used_pool_capa"
  },
  {
    "path": "dist/rules/openstack/openstack-exporter.yml",
    "chars": 9861,
    "preview": "groups:\n\n- name: OpenstackExporter\n\n  \n  rules:\n\n    - alert: OpenstackExporterDown\n      expr: 'up{job=~\".*openstack.*\""
  },
  {
    "path": "dist/rules/opentelemetry-collector/embedded-exporter.yml",
    "chars": 6615,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  # OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by defau"
  },
  {
    "path": "dist/rules/oracle-database/iamseth-oracledb-exporter.yml",
    "chars": 4642,
    "preview": "groups:\n\n- name: IamsethOracledbExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    "
  },
  {
    "path": "dist/rules/patroni/embedded-exporter-patroni.yml",
    "chars": 570,
    "preview": "groups:\n\n- name: EmbeddedExporterPatroni\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    "
  },
  {
    "path": "dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml",
    "chars": 1312,
    "preview": "groups:\n\n- name: SpreakerPgbouncerExporter\n\n  \n  rules:\n\n    - alert: PgbouncerActiveConnections\n      expr: 'pgbouncer_"
  },
  {
    "path": "dist/rules/php-fpm/bakins-fpm-exporter.yml",
    "chars": 491,
    "preview": "groups:\n\n- name: BakinsFpmExporter\n\n  \n  rules:\n\n    - alert: Php-fpmMax-childrenReached\n      expr: 'sum(increase(phpfp"
  },
  {
    "path": "dist/rules/postgresql/postgres-exporter.yml",
    "chars": 9697,
    "preview": "groups:\n\n- name: PostgresExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    - alert"
  },
  {
    "path": "dist/rules/process-exporter/process-exporter.yml",
    "chars": 5445,
    "preview": "groups:\n\n- name: ProcessExporter\n\n  \n  rules:\n\n    - alert: ProcessExporterGroupDown\n      expr: 'namedprocess_namegroup"
  },
  {
    "path": "dist/rules/prometheus-self-monitoring/embedded-exporter.yml",
    "chars": 12253,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: PrometheusJobMissing\n      expr: 'absent(up{job=\"prometheus"
  },
  {
    "path": "dist/rules/promtail/embedded-exporter.yml",
    "chars": 1220,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: PromtailRequestErrors\n      expr: '100 * sum(rate(promtail_"
  },
  {
    "path": "dist/rules/proxmox-ve/prometheus-pve-exporter.yml",
    "chars": 4019,
    "preview": "groups:\n\n- name: PrometheusPveExporter\n\n  \n  rules:\n\n    - alert: PveNodeDown\n      expr: 'pve_up{id=~\"node/.*\"} == 0'\n "
  },
  {
    "path": "dist/rules/pulsar/embedded-exporter.yml",
    "chars": 4105,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: PulsarSubscriptionHighNumberOfBacklogEntries\n      expr: 's"
  },
  {
    "path": "dist/rules/python/python-exporter.yml",
    "chars": 2449,
    "preview": "groups:\n\n- name: PythonExporter\n\n  \n  rules:\n\n    - alert: PythonGcObjectsUncollectable\n      expr: 'increase(python_gc_"
  },
  {
    "path": "dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml",
    "chars": 4463,
    "preview": "groups:\n\n- name: KbuddeRabbitmqExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    -"
  },
  {
    "path": "dist/rules/rabbitmq/rabbitmq-exporter.yml",
    "chars": 4113,
    "preview": "groups:\n\n- name: RabbitmqExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    - alert"
  },
  {
    "path": "dist/rules/redis/oliver006-redis-exporter.yml",
    "chars": 5008,
    "preview": "groups:\n\n- name: Oliver006RedisExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    -"
  },
  {
    "path": "dist/rules/ruby/ruby-exporter.yml",
    "chars": 2030,
    "preview": "groups:\n\n- name: RubyExporter\n\n  \n  rules:\n\n    # Threshold is a rough default. Adjust based on your application's norma"
  },
  {
    "path": "dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml",
    "chars": 3853,
    "preview": "groups:\n\n- name: SmartctlExporter\n\n  \n  rules:\n\n    - alert: SmartDeviceTemperatureWarning\n      expr: '(avg_over_time(s"
  },
  {
    "path": "dist/rules/sidekiq/strech-sidekiq-exporter.yml",
    "chars": 817,
    "preview": "groups:\n\n- name: StrechSidekiqExporter\n\n  \n  rules:\n\n    - alert: SidekiqQueueSize\n      expr: 'sidekiq_queue_size > 100"
  },
  {
    "path": "dist/rules/snmp/snmp-exporter.yml",
    "chars": 4361,
    "preview": "groups:\n\n- name: SnmpExporter\n\n  # These rules use standard IF-MIB and SNMPv2-MIB metrics. Metric names depend on your s"
  },
  {
    "path": "dist/rules/solr/embedded-exporter.yml",
    "chars": 1857,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: SolrUpdateErrors\n      expr: 'increase(solr_metrics_core_up"
  },
  {
    "path": "dist/rules/speedtest/nlamirault-speedtest-exporter.yml",
    "chars": 841,
    "preview": "groups:\n\n- name: NlamiraultSpeedtestExporter\n\n  \n  rules:\n\n    - alert: SpeedtestSlowInternetDownload\n      expr: 'avg_o"
  },
  {
    "path": "dist/rules/spinnaker/embedded-exporter.yml",
    "chars": 6745,
    "preview": "groups:\n\n- name: EmbeddedExporter\n\n  \n  rules:\n\n    - alert: SpinnakerCircuitBreakerOpen\n      expr: 'resilience4j_circu"
  },
  {
    "path": "dist/rules/sql-server/ozarklake-mssql-exporter.yml",
    "chars": 772,
    "preview": "groups:\n\n- name: OzarklakeMssqlExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n    -"
  },
  {
    "path": "dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml",
    "chars": 1544,
    "preview": "groups:\n\n- name: RibbybibbySslExporter\n\n  \n  rules:\n\n    - alert: SslCertificateProbeFailed\n      expr: 'ssl_probe_succe"
  },
  {
    "path": "dist/rules/systemd/systemd-exporter.yml",
    "chars": 3523,
    "preview": "groups:\n\n- name: SystemdExporter\n\n  \n  rules:\n\n    - alert: SystemdUnitFailed\n      expr: 'systemd_unit_state{state=\"fai"
  },
  {
    "path": "dist/rules/thanos/thanos-bucket-replicate.yml",
    "chars": 1473,
    "preview": "groups:\n\n- name: ThanosBucketReplicate\n\n  \n  rules:\n\n    - alert: ThanosBucketReplicateErrorRate\n      expr: '(sum by (j"
  },
  {
    "path": "dist/rules/thanos/thanos-compactor.yml",
    "chars": 2790,
    "preview": "groups:\n\n- name: ThanosCompactor\n\n  \n  rules:\n\n    - alert: ThanosCompactorMultipleRunning\n      expr: 'sum by (job) (up"
  },
  {
    "path": "dist/rules/thanos/thanos-component-absent.yml",
    "chars": 2430,
    "preview": "groups:\n\n- name: ThanosComponentAbsent\n\n  \n  rules:\n\n    - alert: ThanosCompactIsDown\n      expr: 'absent(up{job=~\".*tha"
  },
  {
    "path": "dist/rules/thanos/thanos-query.yml",
    "chars": 5607,
    "preview": "groups:\n\n- name: ThanosQuery\n\n  \n  rules:\n\n    - alert: ThanosQueryHttpRequestQueryErrorRateHigh\n      expr: '(sum by (j"
  },
  {
    "path": "dist/rules/thanos/thanos-receiver.yml",
    "chars": 4647,
    "preview": "groups:\n\n- name: ThanosReceiver\n\n  \n  rules:\n\n    - alert: ThanosReceiveHttpRequestErrorRateHigh\n      expr: '(sum by (j"
  },
  {
    "path": "dist/rules/thanos/thanos-ruler.yml",
    "chars": 6775,
    "preview": "groups:\n\n- name: ThanosRuler\n\n  \n  rules:\n\n    - alert: ThanosRuleQueueIsDroppingAlerts\n      expr: 'sum by (job, instan"
  },
  {
    "path": "dist/rules/thanos/thanos-sidecar.yml",
    "chars": 1138,
    "preview": "groups:\n\n- name: ThanosSidecar\n\n  \n  rules:\n\n    # Threshold of 0.05/s avoids firing on transient single-event spikes.\n "
  },
  {
    "path": "dist/rules/thanos/thanos-store.yml",
    "chars": 2798,
    "preview": "groups:\n\n- name: ThanosStore\n\n  \n  rules:\n\n    - alert: ThanosStoreGrpcErrorRate\n      expr: '(sum by (job) (rate(grpc_s"
  },
  {
    "path": "dist/rules/traefik/embedded-exporter-v1.yml",
    "chars": 1506,
    "preview": "groups:\n\n- name: EmbeddedExporterV1\n\n  \n  rules:\n\n    - alert: TraefikBackendDown\n      expr: 'count(traefik_backend_ser"
  },
  {
    "path": "dist/rules/traefik/embedded-exporter-v2.yml",
    "chars": 1506,
    "preview": "groups:\n\n- name: EmbeddedExporterV2\n\n  \n  rules:\n\n    - alert: TraefikServiceDown\n      expr: 'count(traefik_service_ser"
  },
  {
    "path": "dist/rules/vmware/pryorda-vmware-exporter.yml",
    "chars": 1670,
    "preview": "groups:\n\n- name: PryordaVmwareExporter\n\n  \n  rules:\n\n    - alert: VirtualMachineMemoryWarning\n      expr: 'vmware_vm_mem"
  },
  {
    "path": "dist/rules/windows-server/windows-exporter.yml",
    "chars": 2005,
    "preview": "groups:\n\n- name: WindowsExporter\n\n  \n  rules:\n\n    - alert: WindowsServerCollectorError\n      expr: 'windows_exporter_co"
  },
  {
    "path": "dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml",
    "chars": 2334,
    "preview": "groups:\n\n- name: MindflavorPrometheusWireguardExporter\n\n  \n  rules:\n\n    # The threshold of 300 seconds (5 minutes) is a"
  },
  {
    "path": "dist/rules/zfs/node-exporter.yml",
    "chars": 402,
    "preview": "groups:\n\n- name: NodeExporter\n\n  \n  rules:\n\n    - alert: ZfsOfflinePool\n      expr: 'node_zfs_zpool_state{state!=\"online"
  },
  {
    "path": "dist/rules/zfs/zfs_exporter.yml",
    "chars": 1327,
    "preview": "groups:\n\n- name: Zfs_exporter\n\n  \n  rules:\n\n    - alert: ZfsPoolOutOfSpace\n      expr: 'zfs_pool_free_bytes * 100 / zfs_"
  },
  {
    "path": "dist/rules/zookeeper/cloudflare-kafka-zookeeper-exporter.yml",
    "chars": 63,
    "preview": "groups:\n\n- name: CloudflareKafkaZookeeperExporter\n\n  \n  rules:\n"
  },
  {
    "path": "dist/rules/zookeeper/dabealu-zookeeper-exporter.yml",
    "chars": 1436,
    "preview": "groups:\n\n- name: DabealuZookeeperExporter\n\n  \n  rules:\n\n    # 1m delay allows a restart without triggering an alert.\n   "
  },
  {
    "path": "dist/template.yml",
    "chars": 1160,
    "preview": "groups:\n{% assign groupName = slug | split: '-' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | c"
  },
  {
    "path": "docker-compose.yml",
    "chars": 158,
    "preview": "version: '3'\n\nservices:\n\n  jekyll:\n    image: jekyll/jekyll:latest\n    command: jekyll serve\n    volumes:\n      - ./:/sr"
  },
  {
    "path": "index.md",
    "chars": 1158,
    "preview": "\n<style>\n.center-image\n{\n    margin: 0 auto;\n    display: block;\n}\n</style>\n\n\n![Prometheus logo](/assets/prometheus-logo"
  },
  {
    "path": "package.json",
    "chars": 110,
    "preview": "{\n\t  \"scripts\": {\n\t\t    \"test\": \"awesome-lint\"\n\t  },\n\t  \"devDependencies\": {\n\t\t    \"awesome-lint\": \"*\"\n\t  }\n}\n"
  },
  {
    "path": "rules.md",
    "chars": 4981,
    "preview": "<style>\n  ul {\n    list-style: none;\n  }\n</style>\n\n<!-- CAUTIONS -->\n<div style=\"padding: 20px 20px 10px 20px; border: s"
  },
  {
    "path": "sleep-peacefully.md",
    "chars": 3847,
    "preview": "<h1 style=\"text-align: center;\">\n  Sleep Peacefully\n</h1>\n\n## Alerting time window\n\nIn some applications, load and activ"
  }
]

About this extraction

This page contains the full source code of the samber/awesome-prometheus-alerts GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 133 files (873.6 KB), approximately 224.7k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo