Repository: samber/awesome-prometheus-alerts
Branch: master
Commit: 9d9c648cddd1
Files: 133
Total size: 873.6 KB

Directory structure:
gitextract_epx1dxil/

├── .github/
│   ├── FUNDING.yml
│   ├── dependabot.yml
│   └── workflows/
│       ├── dist.yml
│       └── test.yml
├── .gitignore
├── .travis.yml
├── CLAUDE.md
├── CONTRIBUTING.md
├── Gemfile
├── LICENSE
├── README.md
├── _config.yml
├── _data/
│   └── rules.yml
├── _layouts/
│   └── default.html
├── alertmanager.md
├── assets/
│   ├── css/
│   │   └── app.css
│   └── js/
│       └── app.js
├── blackbox-exporter.md
├── dist/
│   ├── rules/
│   │   ├── apache/
│   │   │   └── lusitaniae-apache-exporter.yml
│   │   ├── apache-flink/
│   │   │   └── flink-prometheus-reporter.yml
│   │   ├── apache-spark/
│   │   │   └── spark-prometheus.yml
│   │   ├── apc-ups/
│   │   │   └── apcupsd_exporter.yml
│   │   ├── argocd/
│   │   │   └── embedded-exporter.yml
│   │   ├── aws-cloudwatch/
│   │   │   └── prometheus-cloudwatch-exporter.yml
│   │   ├── azure/
│   │   │   └── azure-metrics-exporter.yml
│   │   ├── blackbox/
│   │   │   └── blackbox-exporter.yml
│   │   ├── caddy/
│   │   │   └── embedded-exporter.yml
│   │   ├── cassandra/
│   │   │   ├── criteo-cassandra-exporter.yml
│   │   │   └── instaclustr-cassandra-exporter.yml
│   │   ├── ceph/
│   │   │   └── embedded-exporter.yml
│   │   ├── cert-manager/
│   │   │   └── embedded-exporter.yml
│   │   ├── cilium/
│   │   │   └── embedded-exporter.yml
│   │   ├── clickhouse/
│   │   │   └── embedded-exporter.yml
│   │   ├── cloudflare/
│   │   │   └── lablabs-cloudflare-exporter.yml
│   │   ├── consul/
│   │   │   └── consul-exporter.yml
│   │   ├── coredns/
│   │   │   └── embedded-exporter.yml
│   │   ├── cortex/
│   │   │   └── embedded-exporter.yml
│   │   ├── couchdb/
│   │   │   └── gesellix-couchdb-prometheus-exporter.yml
│   │   ├── digitalocean/
│   │   │   └── digitalocean-exporter.yml
│   │   ├── docker-containers/
│   │   │   └── google-cadvisor.yml
│   │   ├── ebpf/
│   │   │   └── ebpf-exporter.yml
│   │   ├── elasticsearch/
│   │   │   └── prometheus-community-elasticsearch-exporter.yml
│   │   ├── envoy/
│   │   │   └── embedded-exporter.yml
│   │   ├── etcd/
│   │   │   └── embedded-exporter.yml
│   │   ├── fluxcd/
│   │   │   └── embedded-exporter.yml
│   │   ├── freeswitch/
│   │   │   └── znerol-freeswitch-exporter.yml
│   │   ├── gitlab-ci/
│   │   │   ├── gitaly.yml
│   │   │   ├── gitlab-built-in-exporter.yml
│   │   │   └── workhorse.yml
│   │   ├── golang/
│   │   │   └── golang-exporter.yml
│   │   ├── google-cloud-stackdriver/
│   │   │   └── stackdriver-exporter.yml
│   │   ├── grafana-alloy/
│   │   │   └── embedded-exporter.yml
│   │   ├── grafana-mimir/
│   │   │   └── embedded-exporter.yml
│   │   ├── grafana-tempo/
│   │   │   └── embedded-exporter.yml
│   │   ├── graph-node/
│   │   │   └── embedded-exporter.yml
│   │   ├── hadoop/
│   │   │   └── jmx_exporter.yml
│   │   ├── haproxy/
│   │   │   ├── embedded-exporter-v2.yml
│   │   │   └── haproxy-exporter-v1.yml
│   │   ├── hashicorp-vault/
│   │   │   └── embedded-exporter.yml
│   │   ├── host-and-hardware/
│   │   │   └── node-exporter.yml
│   │   ├── ipmi/
│   │   │   └── ipmi-exporter.yml
│   │   ├── istio/
│   │   │   └── embedded-exporter.yml
│   │   ├── jaeger/
│   │   │   └── embedded-exporter.yml
│   │   ├── jenkins/
│   │   │   └── metric-plugin.yml
│   │   ├── juniper/
│   │   │   └── czerwonk-junos-exporter.yml
│   │   ├── jvm/
│   │   │   └── jvm-exporter.yml
│   │   ├── kafka/
│   │   │   ├── danielqsj-kafka-exporter.yml
│   │   │   └── linkedin-kafka-exporter.yml
│   │   ├── keycloak/
│   │   │   └── aerogear-keycloak-metrics-spi.yml
│   │   ├── kubernetes/
│   │   │   └── kubestate-exporter.yml
│   │   ├── linkerd/
│   │   │   └── embedded-exporter.yml
│   │   ├── loki/
│   │   │   └── embedded-exporter.yml
│   │   ├── meilisearch/
│   │   │   └── embedded-exporter.yml
│   │   ├── memcached/
│   │   │   └── memcached-exporter.yml
│   │   ├── minio/
│   │   │   └── embedded-exporter.yml
│   │   ├── mongodb/
│   │   │   ├── dcu-mongodb-exporter.yml
│   │   │   ├── percona-mongodb-exporter.yml
│   │   │   └── stefanprodan-mgob-exporter.yml
│   │   ├── mysql/
│   │   │   └── mysqld-exporter.yml
│   │   ├── nats/
│   │   │   └── nats-exporter.yml
│   │   ├── netdata/
│   │   │   └── embedded-exporter.yml
│   │   ├── nginx/
│   │   │   └── knyar-nginx-exporter.yml
│   │   ├── nomad/
│   │   │   └── embedded-exporter.yml
│   │   ├── openebs/
│   │   │   └── embedded-exporter.yml
│   │   ├── openstack/
│   │   │   └── openstack-exporter.yml
│   │   ├── opentelemetry-collector/
│   │   │   └── embedded-exporter.yml
│   │   ├── oracle-database/
│   │   │   └── iamseth-oracledb-exporter.yml
│   │   ├── patroni/
│   │   │   └── embedded-exporter-patroni.yml
│   │   ├── pgbouncer/
│   │   │   └── spreaker-pgbouncer-exporter.yml
│   │   ├── php-fpm/
│   │   │   └── bakins-fpm-exporter.yml
│   │   ├── postgresql/
│   │   │   └── postgres-exporter.yml
│   │   ├── process-exporter/
│   │   │   └── process-exporter.yml
│   │   ├── prometheus-self-monitoring/
│   │   │   └── embedded-exporter.yml
│   │   ├── promtail/
│   │   │   └── embedded-exporter.yml
│   │   ├── proxmox-ve/
│   │   │   └── prometheus-pve-exporter.yml
│   │   ├── pulsar/
│   │   │   └── embedded-exporter.yml
│   │   ├── python/
│   │   │   └── python-exporter.yml
│   │   ├── rabbitmq/
│   │   │   ├── kbudde-rabbitmq-exporter.yml
│   │   │   └── rabbitmq-exporter.yml
│   │   ├── redis/
│   │   │   └── oliver006-redis-exporter.yml
│   │   ├── ruby/
│   │   │   └── ruby-exporter.yml
│   │   ├── s.m.a.r.t-device-monitoring/
│   │   │   └── smartctl-exporter.yml
│   │   ├── sidekiq/
│   │   │   └── strech-sidekiq-exporter.yml
│   │   ├── snmp/
│   │   │   └── snmp-exporter.yml
│   │   ├── solr/
│   │   │   └── embedded-exporter.yml
│   │   ├── speedtest/
│   │   │   └── nlamirault-speedtest-exporter.yml
│   │   ├── spinnaker/
│   │   │   └── embedded-exporter.yml
│   │   ├── sql-server/
│   │   │   └── ozarklake-mssql-exporter.yml
│   │   ├── ssl/
│   │   │   └── tls/
│   │   │       └── ribbybibby-ssl-exporter.yml
│   │   ├── systemd/
│   │   │   └── systemd-exporter.yml
│   │   ├── thanos/
│   │   │   ├── thanos-bucket-replicate.yml
│   │   │   ├── thanos-compactor.yml
│   │   │   ├── thanos-component-absent.yml
│   │   │   ├── thanos-query.yml
│   │   │   ├── thanos-receiver.yml
│   │   │   ├── thanos-ruler.yml
│   │   │   ├── thanos-sidecar.yml
│   │   │   └── thanos-store.yml
│   │   ├── traefik/
│   │   │   ├── embedded-exporter-v1.yml
│   │   │   └── embedded-exporter-v2.yml
│   │   ├── vmware/
│   │   │   └── pryorda-vmware-exporter.yml
│   │   ├── windows-server/
│   │   │   └── windows-exporter.yml
│   │   ├── wireguard/
│   │   │   └── mindflavor-prometheus-wireguard-exporter.yml
│   │   ├── zfs/
│   │   │   ├── node-exporter.yml
│   │   │   └── zfs_exporter.yml
│   │   └── zookeeper/
│   │       ├── cloudflare-kafka-zookeeper-exporter.yml
│   │       └── dabealu-zookeeper-exporter.yml
│   └── template.yml
├── docker-compose.yml
├── index.md
├── package.json
├── rules.md
└── sleep-peacefully.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/FUNDING.yml
================================================
github: [samber]
ko_fi: samuelberthe


================================================
FILE: .github/dependabot.yml
================================================
---
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "monthly"


================================================
FILE: .github/workflows/dist.yml
================================================
# Publish workflow: regenerates the downloadable rule files under dist/rules/
# from _data/rules.yml and commits them back to the branch when they changed.
name: Publish

# Runs on every push to master, or manually from the Actions tab.
on:
  workflow_dispatch:
  push:
    branches:
      - master

# The job pushes a commit, so the token needs write access to repository contents.
permissions:
  contents: write

jobs:
  publish:
    name: Publish
    # Only run when the repository owner is 'samber'; copies of this workflow
    # in repositories owned by anyone else are skipped.
    if: github.repository_owner == 'samber'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v6

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.4

      # yq is used below to convert _data/rules.yml to JSON so it can be queried with jq.
      - name: Set up yq
        uses: mikefarah/yq@v4

      # liquid is installed at a fixed version before liquid-cli.
      # NOTE(review): the reason for pinning 5.5.1 is not recorded here — confirm before bumping.
      - name: Install liquid
        run: |
         gem install liquid -v 5.5.1
         gem install liquid-cli 

      # Rebuilds dist/rules/ from scratch:
      #   - one directory per service, named after the lower-cased service name
      #     with spaces replaced by "-" (a "/" in the name is kept, which yields
      #     a nested directory);
      #   - one <slug>.yml per exporter, rendered from dist/template.yml with the
      #     exporter's JSON passed as the argument to `liquid`.
      # Services and exporters are base64-encoded so each one stays a single
      # word while iterating in the shell `for` loops.
      # The intermediate _data/rules.json is removed at the end.
      - name: Build rule configuration
        run: |
          cat _data/rules.yml | yq -I 0 -o json > _data/rules.json

          rm -rf dist/rules

          for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do
            subdir=dist/rules/$(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')
            mkdir -p "${subdir}"

            # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
    
            for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
              exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
              cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
              echo ${subdir}/${exporterName}.yml
            done
          done

          rm _data/rules.json

      # https://peterevans.dev/posts/github-actions-how-to-automate-code-formatting-in-pull-requests/
      # Exposes the number of changed/untracked paths as the step output `modified`.
      - name: Check for modified files
        id: git-check
        run: echo "modified=$(git status -s --porcelain | wc -l | awk '{$1=$1};1')" >> $GITHUB_OUTPUT
      # Commit and push only when the build step changed something in the working tree.
      - name: Push changes
        if: steps.git-check.outputs.modified != '0'
        run: |
          git config --global user.name 'samber'
          git config --global user.email 'samber@users.noreply.github.com'
          git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
          git add .
          git commit -m "Publish"
          git push


================================================
FILE: .github/workflows/test.yml
================================================
# Promtool check workflow: renders every exporter's rules from _data/rules.yml
# into test/rules/ and validates the generated files with `promtool check rules`.
name: Promtool check

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  promtool-check:
    name: Check alert rules syntax
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v6

      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.4

      # yq converts _data/rules.yml to JSON so it can be queried with jq.
      - name: Set up yq
        uses: mikefarah/yq@v4

      # Same liquid version as the Publish workflow, so rules are validated with
      # the renderer that generates the published files.
      - name: Install liquid
        run: |
          gem install liquid -v 5.5.1
          gem install liquid-cli

      # Renders one <slug>.yml per exporter from dist/template.yml into
      # test/rules/<service>/ (a scratch directory, ignored by git).
      - name: Build rule configuration
        run: |
          cat _data/rules.yml | yq -I 0 -o json > _data/rules.json

          for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do
            # One flat directory per service: lower-cased, with "/" and spaces
            # replaced by "-". Flattening "/" matters: a service name containing
            # a slash would otherwise create a nested directory (the published
            # tree has dist/rules/ssl/tls/), which the promtool glob
            # test/rules/*/*.yml below does not match, so those rules would
            # silently go unchecked.
            subdir=test/rules/$(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | gsub("/"; "-") | split(" ") | join("-")')
            mkdir -p "${subdir}"

            # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))

            for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
              exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
              # The exporter's JSON is passed as the argument to `liquid` when rendering the template.
              cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
              echo ${subdir}/${exporterName}.yml
            done
          done

          rm _data/rules.json

      # Every generated file sits exactly one directory below test/rules/,
      # so this single-level glob covers all of them.
      - name: Check Prometheus alert rules
        uses: peimanja/promtool-github-actions@master
        with:
          promtool_actions_subcommand: 'rules'
          promtool_actions_files: 'test/rules/*/*.yml'
          promtool_actions_comment: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .gitignore
================================================
_site/
.sass-cache/
.jekyll-cache/
.jekyll-metadata
_data/rules.json
test/rules/
/node_modules
.worktrees/

================================================
FILE: .travis.yml
================================================
language: node_js
node_js:
  - 'node'


================================================
FILE: CLAUDE.md
================================================
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

A curated collection of ~940 Prometheus alerting rules covering 90+ services across 100+ exporters, organized in 7 categories: basic resource monitoring (Prometheus, host/hardware, SMART, Docker, Blackbox, Windows, VMware, Netdata), databases and brokers (MySQL, PostgreSQL, Redis, MongoDB, RabbitMQ, Elasticsearch, Cassandra, Clickhouse, Kafka, etc.), reverse proxies and load balancers (Nginx, Apache, HaProxy, Traefik, Caddy), runtimes (PHP-FPM, JVM, Sidekiq), orchestrators (Kubernetes, Nomad, Consul, Etcd, Istio, ArgoCD, FluxCD), network/security/storage (Ceph, ZFS, Minio, SSL/TLS, CoreDNS, Vault, Cloudflare), and observability tools (Thanos, Loki, Cortex, OpenTelemetry Collector, Jenkins).

All rules are stored in a single YAML data file (`_data/rules.yml`) and rendered as a Jekyll-based GitHub Pages site at https://samber.github.io/awesome-prometheus-alerts. The site provides copy-pasteable Prometheus alert snippets and downloadable rule files per exporter.

The project is community-driven. Most contributions are PRs adding or updating rules in `_data/rules.yml`. Files in `dist/rules/` are auto-generated on merge — never edit them manually.

## Architecture

- **`_data/rules.yml`** — The single source of truth for all alerting rules. This is the main file contributors edit. It is NOT a valid Prometheus config; the site renders each rule into copy-pasteable Prometheus alert format.
- **`rules.md`** — Jekyll template that iterates over `_data/rules.yml` and renders the rules page with copy buttons and formatted YAML blocks.
- **`alertmanager.md`** — Static page with Prometheus/AlertManager configuration examples.
- **`_layouts/default.html`** — Site layout (Jekyll theme: cayman).
- **`_config.yml`** — Jekyll configuration.
- **`dist/rules/`** — Pre-built downloadable rule files organized by service/exporter (referenced in the site for `wget` commands).

## Rules YAML Structure

Services are listed in README.md.

`_data/rules.yml` hierarchy:
```
groups:
  - name: "<category>"        # e.g. "Basic resource monitoring"
    services:
      - name: "<service>"     # e.g. "Host and hardware"
        exporters:
          - name: "<exporter>"
            slug: "<slug>"          # used for download URLs
            doc_url: "<url>"        # optional link to exporter docs
            comments:               # optional, exporter-level multiline notes rendered before rules
              "<comment>"
            rules:
              - name: "<alert name>"
                description: "<text>"
                query: "<PromQL>"
                severity: warning|critical|info
                for: "<duration>"   # optional, defaults to 0m
                comments:           # optional, rendered as multiline YAML comments
                  "<comment>"
```

Services are grouped by category. If you are not sure which category a service belongs to, ask the developer.

## Running Locally

```bash
# With Ruby/Bundler
gem install bundler
bundle install
jekyll serve

# With Docker Compose
docker compose up -d

# With Docker directly
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
```

Site serves at http://localhost:4000/awesome-prometheus-alerts.

## Contributing Rules

All rule changes go in `_data/rules.yml`. Each rule needs: `name`, `description`, `query` (valid PromQL), and `severity`. The `for` field is optional. Descriptions should be factual ("what") and include root cause hints ("why"). Queries must be tested against the latest exporter version. Never modify files in `dist/` — they are auto-generated on merge.

## Query Validation

- When adding or updating an alert, verify that the PromQL query references metric series that actually exist in the related exporter. Check the exporter's documentation or source code to confirm series names.
- If a metric series has been deprecated or removed in a newer version of the exporter, update the query to use the replacement series, or remove the rule if no replacement exists. Known examples: `kube_hpa_*` renamed to `kube_horizontalpodautoscaler_*` in kube-state-metrics 2.x; `node_hwmon_temp_alarm` does not exist (correct: `node_hwmon_temp_crit_alarm_celsius`); node-exporter CLI flags get renamed across versions.
- When writing or reviewing a query, search the internet (exporter docs, GitHub issues, changelogs) to validate correctness and catch outdated series names. When you are not sure about a metric name, always search the internet to confirm it exists and is spelled correctly before using it.
- Pay special attention to metric naming conventions: many exporters add `_total` suffixes for counters and `_seconds_total` for time-based counters. Verify the exact name from source code, not just docs. Known examples: Spark's PrometheusResource adds `_total` and `_seconds_total` suffixes (e.g., `metrics_executor_failedTasks_total`, not `metrics_executor_failedTasks`); Oracle's `oracledb_sessions_value` not `oracledb_sessions_activity`.
- Verify that label names used in `{{ $labels.xxx }}` template variables actually exist on the metric. Check the exporter source code for the exact label names. Known examples: cloudflare/ebpf_exporter uses `id` not `name` for programs, and `config` not `name` for decoder errors.
- When a metric uses info-style patterns (value always 1, information carried in labels), `== 0` will never be true — the metric simply won't exist. Use `absent()` instead. Known example: `ebpf_exporter_enabled_configs`.
- Some metrics are version-dependent. When a metric was renamed or removed in a newer version, add a comment noting the version requirement. Known examples: `go_memstats_gc_cpu_fraction` removed in client_golang v1.12+; cert-manager renamed `certmanager_http_acme_client_request_count` to `certmanager_acme_client_request_count` in v1.19+.
- Verify the unit of a metric before setting thresholds. Some metrics use milliseconds while descriptions assume seconds. Known example: Keycloak's `keycloak_request_duration` is in milliseconds, so `> 2` means 2ms not 2s.
- Some exporters expose labels that differ between services even within the same ecosystem. Known example: OpenStack Neutron uses `adminState="up"` while Nova and Cinder use `adminState="enabled"`.
- When an official mixin exists for a service, compare thresholds and time windows against it. Known deviations to watch for: Mimir store-gateway sync uses 1800s (not 600s), Mimir compactor skipped blocks uses `[24h]` (not `[5m]`), Tempo normalizes outstanding blocks per worker.

## Common Review Pitfalls (learned from PR history)

These are the most frequent issues raised during code review on this repo:

### Severity levels
- `critical` = requires immediate human attention. Do not use for informational/security notifications.
- `warning` = needs attention soon but not urgent.
- `info` = awareness only (e.g., config changes, underutilized resources).
- Authentication failures, security notifications, and config-change detections are typically `info`, not `critical`.

### `for` duration
- Omit `for` when the default (0m) is intentional and appropriate — do not add `for: 0m` explicitly.
- Add a `for` duration (e.g., `for: 2m` or `for: 5m`) to tolerate brief unavailability from restarts or transient spikes. Most "service down" rules should have at least `for: 1m`–`2m`.
- Do not blanket-change all `for: 0m` to `for: 1m` — it depends on the alert's semantics and the range window used in `increase()`/`rate()`.

### Query design
- Prefer symptom-based alerts over cause-based alerts to reduce alert fatigue. Example: "service is unreachable" is better than "specific internal counter changed". Metrics like heap object count, allocation rate, or free heap slots are causes, not symptoms — prefer GC duration, latency, or error rate alerts instead.
- Don't add unnecessary aggregation (`avg()`, `avg_over_time()`) on metrics that are local to a single node/instance. Only aggregate when the alert is cluster-wide.
- Don't redundantly combine `min_over_time(...[1m])` with `for: 2m` — pick one mechanism for smoothing. The same applies to `avg_over_time(...[5m])` with `for: 5m`.
- Remove unnecessary label filters (e.g., `job="cassandra"` or `cluster=~".*"`) that add noise without value.
- Verify comparison operators match the intent — e.g., "high snapshot count" must use `> N`, not `< N`.
- When dividing counters (e.g., error rate = errors / total), guard against division by zero with `and total > 0` or filter appropriately. This is the most common issue in new PRs — check every ratio query.
- Filter out system/template databases explicitly in DB queries (e.g., PostgreSQL: add `datid!="0"` alongside `datname!~"template.*|postgres"`).
- Never use `rate()` on a gauge metric — use `deriv()` instead. `rate()` is for monotonically increasing counters only.
- When using `increase()` for ratio calculations, prefer `rate()` instead — `increase()` can produce incorrect results when counters reset mid-window.
- When filtering gRPC error codes, don't use `grpc_code!="OK"` — this includes normal application responses like `NotFound`, `AlreadyExists`, and `Cancelled`. Filter to actual errors: `grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"`.
- When computing ratios with `rate()` on a metric that is itself already a normalized rate (e.g., Oracle's `v$waitclassmetric`), applying `rate()` computes the rate-of-change of a rate, which is not meaningful.
- When a multi-label metric is used in a binary operation with a metric that has fewer labels, use `ignoring(extra_label)` to avoid join failures. Known example: `systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max`.
- When a query groups by labels (e.g., `by (le, worker)`), consider the cardinality impact — hundreds of label values means hundreds of independent alerts.
- Ensure `{{ $value | humanizeDuration }}` is only used on values in seconds. If the metric is in milliseconds, divide by 1000 first or use `{{ $value | humanize }}ms`.
- Avoid using `up{job=~"exporter-name"} == 0` or `absent(up{job=~"exporter-name"})` to detect whether a service is down. When targets are managed via service discovery or a job reaches multiple targets, a disappeared target causes the `up` series to become stale and vanish rather than drop to 0, so the alert never fires. Prefer application-level or cluster-level metrics instead (e.g., "number of consul cluster members < 3", "PostgreSQL primary node absent").

### Thresholds
- Alert thresholds are inherently arbitrary and depend on workload. Use `comments:` to note this when a threshold is a rough default.
- When threshold values in a PR seem unreasonable (too high or too low), challenge them with real-world reasoning or exporter docs.
- Watch for thresholds that are so high they only catch catastrophic scenarios and miss real problems. Examples: Go goroutine spike at 100/s (misses gradual leaks), Ruby major GC at 5/s (only fires if app is non-functional), Python gen2 GC at >1/s (extremely rare).
- Watch for thresholds that will fire on normal healthy operation. Examples: Memcached at 90% memory is desired (it's a cache), Flink TaskManager at 90% JVM heap is normal, cache hit rate < 80% is common for cold caches.
- For SNMP bandwidth utilization, `ifSpeed` (Gauge32) maxes at ~4.29 Gbps. For 10G+ interfaces, use `ifHighSpeed * 1000000` instead.
- For alerts using `> 0` on counters with `rate()` or `increase()`, consider whether a single event truly warrants alerting. In most cases, a small threshold (e.g., `> 0.05` for rate, `> 3` for increase) better distinguishes real problems from transient noise.

### Comments
- When an alert or its query needs explanation (e.g., non-obvious PromQL logic, threshold rationale, edge cases), use the rule-level `comments:` field. Use multiline comments when needed.
- Use the exporter-level `comments:` field for notes that apply to all rules under that exporter (e.g., exporter version requirements, known quirks, setup prerequisites).
- Comments are rendered as YAML `#` comments in the output, so they are visible to users who copy-paste the rules.

### Descriptions
- Keep descriptions short, factual, and actionable.
- Include what is happening ("Disk is almost full") and why it matters or what to check.
- Use `{{ $labels.instance }}`, `{{ $value }}`, and other template variables in descriptions when useful.
- If the description says "average" but the query uses `histogram_quantile(0.95, ...)`, fix the description to say "p95" (or vice versa).
- When alerting on rates or ratios that may not be intuitive, include `{{ $value }}` in the description so operators can see the actual number.

### Structure
- Some services have multiple exporters (e.g., MongoDB has `percona/mongodb_exporter` and `dcu/mongodb_exporter`). Place rules under the correct exporter.
- Search for duplicates before adding a new rule — a similar alert may already exist under a different exporter or with different thresholds.
- The `slug` field must be unique per exporter and is used for download URLs.

## Reference Sources for Cross-Checking Alerts

Use these sources to critique and validate PromQL queries, compare thresholds, and find inspiration for new rules.

Every time you use an external resource to change a PromQL query, compare the query before and after the change and explain why you think the external source is right.

### Official project mixins (alerts maintained by the project itself)
- https://github.com/prometheus/node_exporter/tree/master/docs/node-mixin/alerts
- https://github.com/prometheus/prometheus/tree/main/documentation/prometheus-mixin
- https://github.com/prometheus/alertmanager/tree/main/doc/alertmanager-mixin
- https://github.com/prometheus/snmp_exporter/tree/main/snmp-mixin
- https://github.com/prometheus/mysqld_exporter/tree/main/mysqld-mixin
- https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin
- https://github.com/prometheus-community/elasticsearch_exporter (mixin via Grafana docs)
- https://github.com/etcd-io/etcd/tree/main/contrib/mixin
- https://github.com/thanos-io/thanos/tree/main/mixin (also: examples/alerts/)
- https://github.com/grafana/loki/tree/main/production/loki-mixin (also: promtail-mixin/)
- https://github.com/grafana/mimir/tree/main/operations/mimir-mixin
- https://github.com/grafana/tempo/tree/main/operations/tempo-mixin
- https://github.com/grafana/grafana/tree/main/grafana-mixin
- https://github.com/ceph/ceph/tree/main/monitoring/ceph-mixin (in-tree; also https://github.com/ceph/ceph-mixins)
- https://github.com/jaegertracing/jaeger/tree/main/monitoring/jaeger-mixin
- https://github.com/kubernetes-monitoring/kubernetes-mixin (includes runbook.md)
- https://github.com/kubernetes/kube-state-metrics/tree/main/jsonnet/kube-state-metrics-mixin
- https://github.com/prometheus-operator/prometheus-operator/tree/main/jsonnet/mixin
- https://github.com/prometheus-operator/kube-prometheus
- https://github.com/cortexproject/cortex-jsonnet
- https://github.com/gluster/gluster-mixins

### Standalone mixin repositories
- https://github.com/povilasv/coredns-mixin
- https://github.com/adinhodovic/rabbitmq-mixin
- https://github.com/adinhodovic/blackbox-exporter-mixin
- https://github.com/adinhodovic/django-mixin
- https://github.com/adinhodovic/argo-cd-mixin
- https://github.com/adinhodovic/ingress-nginx-mixin
- https://github.com/adinhodovic/kubernetes-autoscaling-mixin
- https://github.com/metalmatze/kube-cockroachdb (CockroachDB on Kubernetes)
- https://github.com/bitnami-labs/sealed-secrets (sealed-secrets mixin)
- https://github.com/lukas-vlcek/elasticsearch-mixin (includes runbook.md)
- https://github.com/adinhodovic/postgresql-mixin
- https://github.com/imusmanmalik/cert-manager-mixin
- https://gitlab.com/uneeq-oss/cert-manager-mixin (alternative cert-manager mixin)
- https://github.com/uneeq-oss/spinnaker-mixin
- https://github.com/metalmatze/slo-libsonnet (SLO alerting/recording rules generation library)

### Grafana jsonnet-libs (93 mixins — browse for specific services)
- https://github.com/grafana/jsonnet-libs
- Notable mixins with alerts: consul, memcached, elasticsearch, haproxy, clickhouse, opensearch, redis, mongodb, kafka, nginx, rabbitmq, jvm, vault, envoy, istio, jenkins, caddy, cloudflare, docker, traefik, windows, snmp, argocd, nomad, pgbouncer, minio, ceph, and 60+ more.

### Mixin aggregators
- https://monitoring.mixins.dev/ (central registry of all monitoring mixins)
- https://github.com/monitoring-mixins/website/blob/master/mixins.json (machine-readable list of all mixins with source URLs)
- https://github.com/nlamirault/monitoring-mixins (hub aggregating many mixins)

### GitLab monitoring & infrastructure
- https://gitlab.com/gitlab-com/runbooks (GitLab.com SRE runbooks — production alert rules, runbook docs, alertmanager config)
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules (production Mimir alerting rules organized by tenant/environment)
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules-jsonnet (jsonnet sources for GitLab alerting rules)
- https://gitlab.com/gitlab-org/omnibus-gitlab/-/tree/master/files/gitlab-cookbooks/monitoring/templates/rules (default Prometheus rules shipped with GitLab Omnibus)

### Community alert collections
- https://github.com/jpweber/prometheus-alert-rules
- https://github.com/bdossantos/prometheus-alert-rules
- https://github.com/giantswarm/prometheus-rules
- https://github.com/last9/awesome-prometheus-toolkit
- https://github.com/warpnet/awesome-prometheus (meta-list of Prometheus resources)


================================================
FILE: CONTRIBUTING.md
================================================

# Contributing

## Adding alerting rule

If you don't have time to write a PR, just copy and paste some alerts into an issue. We will format it accordingly.

Rules are here: `_data/rules.yml`.

### Guidelines

Please ensure your pull request adheres to the following guidelines:

- Search previous suggestions before making a new one, as yours may be a duplicate.
- Keep descriptions short and simple, but descriptive.
- Description must be factual (the "what?") and should provide root cause suggestions (the "why?"), for faster resolution.
- Queries must be tested on the latest exporter version.

## Improving GitHub page

### Run locally

```
gem install bundler
bundle install
jekyll serve
```

Or with Docker:

```
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
```

Or with Docker Compose:

```
docker compose up -d
```


================================================
FILE: Gemfile
================================================
# Gems for building the Jekyll site locally (`bundle install`, then `jekyll serve`).
source 'https://rubygems.org'
# GitHub Pages gem, declared in the :jekyll_plugins group.
gem 'github-pages', '>= 232', group: :jekyll_plugins
# NOTE(review): presumably needed by `jekyll serve` on Ruby versions that no longer bundle webrick — confirm.
gem 'webrick', '~> 1.8'

================================================
FILE: LICENSE
================================================
Creative Commons Attribution 4.0 International License (CC BY 4.0)

http://creativecommons.org/licenses/by/4.0/


================================================
FILE: README.md
================================================
# 👋 Awesome Prometheus Alerts [![Awesome](https://awesome.re/badge-flat.svg)](https://awesome.re)

> Most alerting rules are common to every Prometheus setup. We need a place to find them all. 🤘 🚨 📊

Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**

<div align="center">
  <hr>
  <sup><b>Sponsored by:</b></sup>
  <br>
  <a href="https://cast.ai/samuel">
    <div>
      <img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-cast-ai.png" width="200" alt="Cast AI">
    </div>
    <div>
      Cut Kubernetes & AI costs, boost application stability.
    </div>
  </a>
  <br>
  <a href="https://betterstack.com">
    <div>
      <img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-betterstack.png" width="200" alt="Better Stack">
    </div>
    <div>
      Better Stack lets you centralize, search, and visualize your logs.
    </div>
  </a>
  <hr>
</div>

## ✨ Contents

- [Rules](#-rules)
- [Contributing](#-contributing)
- [Improvements](#-improvements)
- [Help us](#-show-your-support)
- [License](#-license)

## 🚨 Rules

#### Basic resource monitoring

- [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
- [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
- [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
- [IPMI](https://samber.github.io/awesome-prometheus-alerts/rules#ipmi)
- [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
- [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
- [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
- [VMWare](https://samber.github.io/awesome-prometheus-alerts/rules#vmware)
- [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve)
- [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
- [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
- [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter)
- [Systemd](https://samber.github.io/awesome-prometheus-alerts/rules#systemd)

#### Databases

- [MySQL](https://samber.github.io/awesome-prometheus-alerts/rules#mysql)
- [PostgreSQL](https://samber.github.io/awesome-prometheus-alerts/rules#postgresql)
- [SQL Server](https://samber.github.io/awesome-prometheus-alerts/rules#sql-server)
- [Oracle Database](https://samber.github.io/awesome-prometheus-alerts/rules#oracle-database)
- [Patroni](https://samber.github.io/awesome-prometheus-alerts/rules#patroni)
- [PGBouncer](https://samber.github.io/awesome-prometheus-alerts/rules#pgbouncer)
- [Redis](https://samber.github.io/awesome-prometheus-alerts/rules#redis)
- [Memcached](https://samber.github.io/awesome-prometheus-alerts/rules#memcached)
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
- [CouchDB](https://samber.github.io/awesome-prometheus-alerts/rules#couchdb)
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)

#### Message brokers

- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
- [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)

#### Proxies, load balancers and service meshes

- [Nginx](https://samber.github.io/awesome-prometheus-alerts/rules#nginx)
- [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
- [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
- [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
- [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
- [Envoy](https://samber.github.io/awesome-prometheus-alerts/rules#envoy)
- [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd)
- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)

#### Runtimes

- [PHP-FPM](https://samber.github.io/awesome-prometheus-alerts/rules#php-fpm)
- [JVM](https://samber.github.io/awesome-prometheus-alerts/rules#jvm)
- [Golang](https://samber.github.io/awesome-prometheus-alerts/rules#golang)
- [Ruby](https://samber.github.io/awesome-prometheus-alerts/rules#ruby)
- [Python](https://samber.github.io/awesome-prometheus-alerts/rules#python)
- [Sidekiq](https://samber.github.io/awesome-prometheus-alerts/rules#sidekiq)

#### Data engineering

- [Apache Flink](https://samber.github.io/awesome-prometheus-alerts/rules#apache-flink)
- [Apache Spark](https://samber.github.io/awesome-prometheus-alerts/rules#apache-spark)
- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)

#### Orchestrators

- [Kubernetes](https://samber.github.io/awesome-prometheus-alerts/rules#kubernetes)
- [Nomad](https://samber.github.io/awesome-prometheus-alerts/rules#nomad)
- [Consul](https://samber.github.io/awesome-prometheus-alerts/rules#consul)
- [Etcd](https://samber.github.io/awesome-prometheus-alerts/rules#etcd)
- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack)

#### CI/CD

- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
- [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
- [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd)
- [GitLab CI](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab-ci)
- [Spinnaker](https://samber.github.io/awesome-prometheus-alerts/rules#spinnaker)

#### Network and security

- [SpeedTest](https://samber.github.io/awesome-prometheus-alerts/rules#speedtest)
- [SSL/TLS](https://samber.github.io/awesome-prometheus-alerts/rules#ssl/tls)
- [cert-manager](https://samber.github.io/awesome-prometheus-alerts/rules#cert-manager)
- [Juniper](https://samber.github.io/awesome-prometheus-alerts/rules#juniper)
- [CoreDNS](https://samber.github.io/awesome-prometheus-alerts/rules#coredns)
- [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch)
- [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault)
- [Keycloak](https://samber.github.io/awesome-prometheus-alerts/rules#keycloak)
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
- [SNMP](https://samber.github.io/awesome-prometheus-alerts/rules#snmp)
- [Cilium](https://samber.github.io/awesome-prometheus-alerts/rules#cilium)
- [WireGuard](https://samber.github.io/awesome-prometheus-alerts/rules#wireguard)

#### Storage

- [Ceph](https://samber.github.io/awesome-prometheus-alerts/rules#ceph)
- [ZFS](https://samber.github.io/awesome-prometheus-alerts/rules#zfs)
- [OpenEBS](https://samber.github.io/awesome-prometheus-alerts/rules#openebs)
- [Minio](https://samber.github.io/awesome-prometheus-alerts/rules#minio)

#### Cloud providers

- [AWS CloudWatch](https://samber.github.io/awesome-prometheus-alerts/rules#aws-cloudwatch)
- [Google Cloud Stackdriver](https://samber.github.io/awesome-prometheus-alerts/rules#google-cloud-stackdriver)
- [DigitalOcean](https://samber.github.io/awesome-prometheus-alerts/rules#digitalocean)
- [Azure](https://samber.github.io/awesome-prometheus-alerts/rules#azure)

#### Observability

- [Thanos](https://samber.github.io/awesome-prometheus-alerts/rules#thanos)
- [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
- [Grafana Tempo](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-tempo)
- [Grafana Mimir](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-mimir)
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
- [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector)
- [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger)

#### Other

- [APC UPS](https://samber.github.io/awesome-prometheus-alerts/rules#apc-ups)
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)

## 🤝 Contributing

Contributions from the community (you!) are most welcome!

There are many ways to contribute: writing code, alerting rules, documentation, reporting issues, discussing better error tracking...

[Instructions here](CONTRIBUTING.md)

## 🏋️ Improvements

- Create an alert rule builder in Jekyll for custom alerts (severity, thresholds, instances...)
- Add resolution suggestions to rule descriptions, for faster incident resolution ([#85](https://github.com/samber/awesome-prometheus-alerts/issues/85)).

## 💫 Show your support

Give a ⭐️ if this project helped you!

[![support us](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/samber)

## 📝 License

[![CC4](https://mirrors.creativecommons.org/presskit/cc.srr.primary.svg)](https://creativecommons.org/licenses/by/4.0/legalcode)

Licensed under the Creative Commons Attribution 4.0 International License; see the LICENSE file for more details.


================================================
FILE: _config.yml
================================================
# Theme applied to the generated site.
theme: jekyll-theme-cayman

# Site title and short description.
title: Awesome Prometheus alerts
description: Collection of alerting rules

# GitHub repository (owner/name) this site belongs to.
repository: samber/awesome-prometheus-alerts

# URL path prefix the site is served under.
baseurl: /awesome-prometheus-alerts


================================================
FILE: _data/rules.yml
================================================
#
# The following yaml cannot be copy-pasted to Prometheus configuration.
#     Please navigate to https://samber.github.io/awesome-prometheus-alerts/rules instead.
#
# Contributing guidelines:
#      https://github.com/samber/awesome-prometheus-alerts/blob/master/CONTRIBUTING.md
#

groups:
  - name: Basic resource monitoring
    services:
      - name: Prometheus self-monitoring
        exporters:
          - slug: embedded-exporter
            rules:
              # No `up` series exists at all for job="prometheus".
              - name: Prometheus job missing
                description: A Prometheus job has disappeared
                query: 'absent(up{job="prometheus"})'
                severity: warning
              # Individual targets that are down, excluding jobs whose summed `up` is 0
              # (the `unless on(job)` clause), as explained in `comments` below.
              - name: Prometheus target missing
                description: A Prometheus target has disappeared. An exporter might be crashed.
                query: "up == 0 unless on(job) (sum by (job) (up) == 0)"
                severity: critical
                for: 1m
                comments: |
                  Only fire if at least one target in the job is still up.
                  If all targets are down, PrometheusJobMissing or PrometheusAllTargetsMissing will fire instead.
              # The per-job sum of `up` is 0, i.e. every target of the job is down.
              - name: Prometheus all targets missing
                description: A Prometheus job no longer has any living target.
                query: "sum by (job) (up) == 0"
                severity: critical
                for: 1m
              # Down targets on instances whose node_time_seconds - node_boot_time_seconds
              # exceeds 600 seconds; the two sides are joined with `on (instance)`.
              - name: Prometheus target missing with warmup time
                description: "Allow a job time to start up (10 minutes) before alerting that it's down."
                query: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
                severity: critical
                for: 1m
              # prometheus_config_last_reload_successful reports anything other than 1.
              - name: Prometheus configuration reload failure
                description: Prometheus configuration reload error
                query: "prometheus_config_last_reload_successful != 1"
                severity: warning
              # process_start_time_seconds changed more than twice within 15 minutes
              # for the prometheus, pushgateway or alertmanager jobs.
              - name: Prometheus too many restarts
                description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
                query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
                severity: warning
              # No `up` series exists at all for job="alertmanager".
              - name: Prometheus AlertManager job missing
                description: A Prometheus AlertManager job has disappeared
                query: 'absent(up{job="alertmanager"})'
                severity: warning
              # alertmanager_config_last_reload_successful reports anything other than 1.
              - name: Prometheus AlertManager configuration reload failure
                description: AlertManager configuration reload error
                query: "alertmanager_config_last_reload_successful != 1"
                severity: warning
              # More than one distinct alertmanager_config_hash value is being reported.
              - name: Prometheus AlertManager config not synced
                description: Configurations of AlertManager cluster instances are out of sync
                query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
                severity: warning
              # vector(1) always returns a sample, so this alert fires permanently by design.
              - name: Prometheus AlertManager E2E dead man switch
                description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager."
                query: "vector(1)"
                severity: critical
              # Fewer than one Alertmanager has been discovered by this Prometheus.
              - name: Prometheus not connected to alertmanager
                description: Prometheus cannot connect to the alertmanager
                query: "prometheus_notifications_alertmanagers_discovered < 1"
                severity: critical
              # The rule evaluation failure counter increased during the last 3 minutes.
              - name: Prometheus rule evaluation failures
                description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
                query: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
                severity: critical
              # The template text expansion failure counter increased during the last 3 minutes.
              - name: Prometheus template text expansion failures
                description: "Prometheus encountered {{ $value }} template text expansion failures"
                query: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
                severity: critical
              # The last evaluation of a rule group took longer than that group's interval,
              # and this has held for 5 minutes.
              - name: Prometheus rule evaluation slow
                description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query."
                query: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
                severity: warning
                for: 5m
              # The minimum queue length over 10 minutes is above 0, i.e. the queue never drained.
              - name: Prometheus notifications backlog
                description: The Prometheus notification queue has not been empty for 10 minutes
                query: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
                severity: warning
              # The per-second rate of failed Alertmanager notifications (1m window) is above 0.
              - name: Prometheus AlertManager notification failing
                description: "Alertmanager is failing to send notifications ({{ $value }} notifications/s)"
                query: "rate(alertmanager_notifications_failed_total[1m]) > 0"
                severity: critical
              # Service discovery reports zero discovered targets.
              - name: Prometheus target empty
                description: Prometheus has no target in service discovery
                query: "prometheus_sd_discovered_targets == 0"
                severity: critical
              # The 0.9 quantile of the actual scrape interval length is more than 5% above
              # its 0.5 quantile (series matched on interval, instance and job), for 5 minutes.
              - name: Prometheus target scraping slow
                description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.
                query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
                severity: warning
                for: 5m
              # More than 10 scrapes exceeded the sample limit within 10 minutes.
              - name: Prometheus large scrape
                description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)"
                query: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
                severity: warning
                for: 5m
              # The duplicate-timestamp rejection counter increased by more than 3 within 5 minutes.
              - name: Prometheus target scrape duplicate
                description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)"
                query: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3"
                severity: warning
              # TSDB health: each rule below fires as soon as the corresponding
              # failure/corruption counter increased during the last minute.
              - name: Prometheus TSDB checkpoint creation failures
                description: "Prometheus encountered {{ $value }} checkpoint creation failures"
                query: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB checkpoint deletion failures
                description: "Prometheus encountered {{ $value }} checkpoint deletion failures"
                query: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB compactions failed
                description: "Prometheus encountered {{ $value }} TSDB compactions failures"
                query: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB head truncations failed
                description: "Prometheus encountered {{ $value }} TSDB head truncation failures"
                query: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB reload failures
                description: "Prometheus encountered {{ $value }} TSDB reload failures"
                query: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB WAL corruptions
                description: "Prometheus encountered {{ $value }} TSDB WAL corruptions"
                query: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
                severity: critical
              - name: Prometheus TSDB WAL truncations failed
                description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures"
                query: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
                severity: critical
              # Counts series per metric name and fires above 10000. label_replace copies
              # __name__ into a `name` label, which the description reads as {{ $labels.name }}.
              - name: Prometheus timeseries cardinality
                description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
                query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
                severity: warning

      - name: Host and hardware
        exporters:
          - name: node-exporter
            slug: node-exporter
            doc_url: https://github.com/prometheus/node_exporter
            rules:
              # MemAvailable is below 10% of MemTotal, for 2 minutes.
              - name: Host out of memory
                description: Node memory is filling up (< 10% left)
                query: "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)"
                severity: warning
                for: 2m
              # More than 1000 major page faults per second (5m rate).
              - name: Host memory under memory pressure
                description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s)."
                query: "(rate(node_vmstat_pgmajfault[5m]) > 1000)"
                severity: warning
              # The lowest MemFree value seen over one week is still above 80% of MemTotal.
              - name: Host Memory is underutilized
                description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
                query: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8"
                severity: info
                comments: |
                  You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
              # Receive rate (5m) above 80% of node_network_speed_bytes; the trailing
              # `and ... > 0` keeps only interfaces that report a positive speed.
              - name: Host unusual network throughput in
                description: Host receive bandwidth is high (>80%).
                query: "((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0"
                severity: warning
              # Same check as above for the transmit direction.
              - name: Host unusual network throughput out
                description: Host transmit bandwidth is high (>80%)
                query: "((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0"
                severity: warning
              # Per-second rate of node_disk_io_time_seconds_total (5m window) above 0.8.
              - name: Host disk IO utilization high
                description: Disk utilization is high (> 80%)
                query: "(rate(node_disk_io_time_seconds_total[5m]) > .80)"
                severity: warning
              # Available space is below 10% of the filesystem size, only on filesystems that
              # are not read-only; fuse*, tmpfs, cifs and nfs filesystem types are excluded.
              - name: Host out of disk space
                description: Disk is almost full (< 10% left)
                query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
                severity: critical
                comments: |
                  Please add ignored mountpoints in node_exporter parameters like
                  "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|run)($|/)"
                  (this flag was named "--collector.filesystem.ignored-mount-points" before node_exporter 1.3).
                  Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
                for: 2m
              # Linear extrapolation of the last 3 hours of available space reaches 0 or less
              # within 86400 seconds (24h), while some space is still available now.
              - name: Host disk may fill in 24 hours
                description: Filesystem will likely run out of space within the next 24 hours.
                query: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
                severity: warning
                comments: |
                  Please add ignored mountpoints in node_exporter parameters like
                  "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|run)($|/)"
                  (this flag was named "--collector.filesystem.ignored-mount-points" before node_exporter 1.3).
                  Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
                for: 2m
              # Free inodes below 10% of total inodes, on filesystems that are not read-only
              # and that report a positive inode count.
              - name: Host out of inodes
                description: Disk is almost running out of available inodes (< 10% left)
                query: "(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0"
                severity: critical
                for: 2m
              # node_filesystem_device_error equals 1; fuse*, tmpfs, cifs and nfs types are excluded.
              - name: Host filesystem device error
                description: "Error stat-ing the {{ $labels.mountpoint }} filesystem"
                query: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
                severity: critical
                for: 2m
              # Linear extrapolation of the last hour of free inodes reaches 0 or less
              # within 86400 seconds (24h), while some inodes are still free now.
              - name: Host inodes may fill in 24 hours
                description: Filesystem will likely run out of inodes within the next 24 hours at current write rate
                query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
                severity: warning
                for: 2m
              # Read time divided by completed reads (1m rates) above 0.1 s per operation;
              # the `and ... > 0` clause skips disks with no completed reads.
              - name: Host unusual disk read latency
                description: Disk latency is growing (read operations > 100ms)
                query: "(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)"
                severity: warning
                for: 2m
              # Same check as above for write operations.
              - name: Host unusual disk write latency
                description: Disk latency is growing (write operations > 100ms)
                query: "(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)"
                severity: warning
                for: 2m
              # 1 minus the idle rate averaged across CPUs (5m window) is above 0.80, for 10 minutes.
              - name: Host high CPU load
                description: CPU load is > 80%
                query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
                severity: warning
                for: 10m
              # The lowest per-CPU idle rate (1h window) is above 0.8, held for one week (`for: 1w`).
              - name: Host CPU is underutilized
                description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
                query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
                severity: info
                for: 1w
                comments: |
                  You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
              # Steal-mode rate averaged across CPUs, expressed as a percentage, is above 10.
              - name: Host CPU steal noisy neighbor
                description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
                query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
                severity: warning
              # iowait-mode rate averaged across CPUs (5m window) is above 0.10.
              - name: Host CPU high iowait
                description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
                query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
                severity: warning
              # Per-second rate of node_disk_io_time_seconds_total (5m window) above 0.8,
              # sustained for 5 minutes.
              - name: Host unusual disk IO
                description: "Disk IO utilization >80%. Check storage for issues or increase IOPS capabilities."
                query: "rate(node_disk_io_time_seconds_total[5m]) > 0.8"
                severity: warning
                for: 5m
              # Per-CPU context switch rate over 15m divided by the same rate over 1d is above 2.
              # The CPU count is the number of node_cpu_seconds_total{mode="idle"} series.
              - name: Host context switching high
                description: Context switching is growing on the node (twice the daily average during the last 15m)
                query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
                severity: warning
                comments: |
                  x2 context switches is an arbitrary number.
                  The alert threshold depends on the nature of the application.
                  Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
              # Used swap (1 - SwapFree / SwapTotal) above 80%, only where SwapTotal is positive.
              - name: Host swap is filling up
                description: Swap is filling up (>80%)
                query: "((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0"
                severity: warning
                for: 2m
              # A systemd unit reports state="failed" with value 1.
              - name: Host systemd service crashed
                description: "systemd service {{ $labels.name }} crashed"
                query: '(node_systemd_unit_state{state="failed"} == 1)'
                severity: warning
              # A hwmon temperature reading exceeds the max temperature reported for the same series labels.
              - name: Host physical component too hot
                description: "Physical hardware component too hot"
                query: "node_hwmon_temp_celsius > node_hwmon_temp_max_celsius"
                severity: warning
                for: 5m
              # Either of the two hwmon alarm metrics equals 1.
              - name: Host node overtemperature alarm
                description: "Physical node temperature alarm triggered"
                query: "((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))"
                severity: critical
              # Required disks minus active disks is positive for an MD array.
              - name: Host software RAID insufficient drives
                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
                query: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
                comments: |
                  Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin.
                severity: critical
              # At least one disk of an MD array is in state="failed", for 2 minutes.
              - name: Host software RAID disk failure
                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
                query: '(node_md_disks{state="failed"} > 0)'
                severity: warning
                for: 2m
              # node_uname_info carries the kernel version in its `release` label, so a kernel
              # change appears as a NEW series rather than as a value change of an existing one.
              # changes() works per series and would therefore never see it; instead, count the
              # distinct `release` values observed per instance during the last hour.
              - name: Host kernel version deviations
                description: Kernel version for {{ $labels.instance }} has changed.
                query: "count by (instance) (count by (instance, release) (count_over_time(node_uname_info[1h]))) > 1"
                severity: info
              # The OOM kill counter increased during the last 30 minutes; the reason for the
              # long window is given in `comments` below.
              - name: Host OOM kill detected
                description: OOM kill detected
                query: "(increase(node_vmstat_oom_kill[30m]) > 0)"
                severity: warning
                comments: |
                  When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger.
              # The correctable error counter increased during the last minute; the description
              # states the same 1m window that the query uses.
              - name: Host EDAC Correctable Errors detected
                description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
                query: "(increase(node_edac_correctable_errors_total[1m]) > 0)"
                severity: info
              # Compares the raw counter with 0 (no time window), so $value is the cumulative
              # count and the description does not mention a time range.
              - name: Host EDAC Uncorrectable Errors detected
                description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
                query: "(node_edac_uncorrectable_errors_total > 0)"
                severity: warning
              # Receive errors divided by received packets (2m rates) above 0.01, only where
              # packets were received. $value is that ratio, not an error count.
              - name: Host Network Receive Errors
                description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ printf "%.2f" $value }} (errors / packets) over the last two minutes.'
                query: "(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0"
                severity: warning
                for: 2m
              # Transmit errors divided by transmitted packets (2m rates) above 0.01, only where
              # packets were transmitted. $value is that ratio, not an error count.
              - name: Host Network Transmit Errors
                description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ printf "%.2f" $value }} (errors / packets) over the last two minutes.'
                query: "(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0"
                severity: warning
                for: 2m
              # node_bonding_active differs from node_bonding_slaves, for 2 minutes.
              - name: Host Network Bond Degraded
                description: 'Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".'
                query: "((node_bonding_active - node_bonding_slaves) != 0)"
                severity: warning
                for: 2m
              # Conntrack entries above 80% of the limit, only where the limit is positive.
              - name: Host conntrack limit
                description: "The number of conntrack is approaching limit"
                query: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0"
                severity: warning
                for: 5m
              # Offset beyond +/-0.05 s whose 5m derivative is not heading back toward zero
              # (>= 0 for a positive offset, <= 0 for a negative one), for 10 minutes.
              - name: Host clock skew
                description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host."
                query: "((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))"
                severity: warning
                for: 10m
              # Sync status reached 0 within the last minute and the max error is at least 16 s.
              - name: Host clock not synchronising
                description: "Clock not synchronising. Ensure NTP is configured on this host."
                query: "(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)"
                severity: warning
                for: 2m

      - name: S.M.A.R.T Device Monitoring
        exporters:
          - name: smartctl-exporter
            slug: smartctl-exporter
            doc_url: https://github.com/prometheus-community/smartctl_exporter
            rules:
              # 5-minute average of the current temperature above 60, only for devices with no
              # drive_trip temperature series (`unless on (instance, device)`).
              - name: SMART device temperature warning
                description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C
                query: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
                severity: warning
              # 5-minute maximum of the current temperature above 70, with the same
              # restriction to devices that have no drive_trip series.
              - name: SMART device temperature critical
                description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C
                query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
                severity: critical
              - name: SMART device temperature over trip value
                description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }})
                query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
                severity: critical
              - name: SMART device temperature nearing trip value
                description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }})
                query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
                severity: warning
              - name: SMART status
                description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }})
                query: "smartctl_device_smart_status != 1"
                severity: critical
              - name: SMART critical warning
                description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})
                query: "smartctl_device_critical_warning > 0"
                severity: critical
              - name: SMART media errors
                description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})
                query: "smartctl_device_media_errors > 0"
                severity: critical
              - name: SMART Wearout Indicator
                description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})
                query: "smartctl_device_available_spare < smartctl_device_available_spare_threshold"
                severity: critical

      # Hardware sensor and chassis alerts built on ipmi_exporter metrics.
      # The per-sensor *_state rules compare against 1 (warning) and 2 (critical);
      # the state value legend is given in the temperature warning rule's comments.
      # Warning-level state rules wait 5m; most critical-level state rules have no `for`.
      - name: IPMI
        exporters:
          - name: prometheus-community/ipmi_exporter
            slug: ipmi-exporter
            doc_url: https://github.com/prometheus-community/ipmi_exporter
            rules:
              - name: IPMI collector down
                description: "IPMI collector {{ $labels.collector }} on {{ $labels.instance }} failed to scrape sensor data. Check FreeIPMI tools and BMC connectivity."
                query: 'ipmi_up == 0'
                severity: warning
                for: 5m
                comments: |
                  The ipmi_up metric is per-collector. A value of 0 means the collector could not retrieve data from the BMC.
              - name: IPMI temperature sensor warning
                description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_temperature_state == 1'
                severity: warning
                for: 5m
                comments: |
                  State values: 0=nominal, 1=warning, 2=critical. Thresholds are defined in the BMC firmware.
              - name: IPMI temperature sensor critical
                description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Immediate attention required to prevent hardware damage."
                query: 'ipmi_temperature_state == 2'
                severity: critical
              - name: IPMI fan speed sensor warning
                description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_fan_speed_state == 1'
                severity: warning
                for: 5m
              - name: IPMI fan speed sensor critical
                description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. A fan may have failed."
                query: 'ipmi_fan_speed_state == 2'
                severity: critical
              # Independent of the fan state rules: looks at the raw RPM reading.
              - name: IPMI fan speed zero
                description: "IPMI fan {{ $labels.name }} on {{ $labels.instance }} reports 0 RPM. The fan may have failed."
                query: 'ipmi_fan_speed_rpm == 0'
                severity: critical
                for: 5m
              - name: IPMI voltage sensor warning
                description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_voltage_state == 1'
                severity: warning
                for: 5m
              - name: IPMI voltage sensor critical
                description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Power supply or motherboard issue possible."
                query: 'ipmi_voltage_state == 2'
                severity: critical
              - name: IPMI current sensor warning
                description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_current_state == 1'
                severity: warning
                for: 5m
              - name: IPMI current sensor critical
                description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state."
                query: 'ipmi_current_state == 2'
                severity: critical
              - name: IPMI power sensor warning
                description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state."
                query: 'ipmi_power_state == 1'
                severity: warning
                for: 5m
              - name: IPMI power sensor critical
                description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state."
                query: 'ipmi_power_state == 2'
                severity: critical
              - name: IPMI generic sensor critical
                description: "IPMI sensor {{ $labels.name }} (type={{ $labels.type }}) on {{ $labels.instance }} is in critical state."
                query: 'ipmi_sensor_state == 2'
                severity: critical
                for: 5m
                comments: |
                  Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts.
              # Chassis-level rules: all three fire on a value of 0 and have no `for`.
              - name: IPMI chassis power off
                description: "IPMI reports chassis power is off on {{ $labels.instance }}. The server may have shut down unexpectedly."
                query: 'ipmi_chassis_power_state == 0'
                severity: critical
              - name: IPMI chassis drive fault
                description: "IPMI reports a drive fault on {{ $labels.instance }}. Check disk health."
                query: 'ipmi_chassis_drive_fault_state == 0'
                severity: critical
                comments: |
                  The metric uses inverted logic: 1=no fault, 0=fault detected.
              - name: IPMI chassis cooling fault
                description: "IPMI reports a cooling/fan fault on {{ $labels.instance }}. Check fans and airflow."
                query: 'ipmi_chassis_cooling_fault_state == 0'
                severity: critical
                comments: |
                  The metric uses inverted logic: 1=no fault, 0=fault detected.
              # Fires when fewer than 512 bytes of SEL space remain for 5m.
              - name: IPMI SEL almost full
                description: "IPMI System Event Log on {{ $labels.instance }} has only {{ printf \"%.0f\" $value }} bytes free. Clear the SEL to prevent loss of new events."
                query: 'ipmi_sel_free_space_bytes < 512'
                severity: warning
                for: 5m
                comments: |
                  SEL storage is typically very limited (e.g., 16KB). When full, new events may be dropped.

      # Container-level alerts built on cAdvisor metrics.
      # Two label conventions appear below: rules grouped by (pod, container) select
      # on container!="", rules grouped by (instance, name) select on name!="".
      - name: Docker containers
        exporters:
          - name: google/cAdvisor
            slug: google-cadvisor
            doc_url: https://github.com/google/cadvisor
            rules:
              # Fires when a container's last-seen timestamp is more than 60s old.
              - name: Container killed
                description: A container has disappeared
                query: "time() - container_last_seen > 60"
                severity: warning
                comments: |
                  This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
              # Fires when no container_last_seen series exists at all for 5m.
              - name: Container absent
                description: A container is absent for 5 min
                query: "absent(container_last_seen)"
                severity: warning
                for: 5m
                comments: |
                  This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
              # CPU usage rate divided by the CPU limit (quota / period), as a percentage.
              - name: Container High CPU utilization
                description: 'Container CPU utilization is above 80% (current: {{ $value | printf "%.2f" }}%)'
                query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
                comments: |
                  Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard.
                severity: warning
                for: 2m
              # Working set divided by the memory limit; `> 0` inside the sum drops
              # series whose limit is zero.
              - name: Container High Memory usage
                description: Container Memory usage is above 80%
                query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
                severity: warning
                comments: See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
                for: 2m
              # Measures inode usage (container_fs_inodes_*), aggregated per instance.
              # The same name!="" selector is applied to the free count, the total and
              # the guard so that all three sums cover the same set of series.
              - name: Container Volume usage
                description: Container Volume inode usage is above 80%
                query: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total{name!=""}) BY (instance) > 0'
                severity: warning
                for: 2m
              # Share of CFS periods that were throttled, above 25% for 5m.
              - name: Container high throttle rate
                description: "Container is being throttled ({{ $value | humanizePercentage }})"
                query: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
                severity: warning
                for: 5m
              # Compares the current 1m CPU rate with rates taken at `offset 1m`
              # (over 1m and over 5m windows) and fires on an absolute difference above 25.
              - name: Container high low change CPU usage
                description: This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.
                query: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
                severity: info
              # Right-sizing hints: same ratios as the "High" rules, below 20% for 7d.
              - name: Container Low CPU utilization
                description: 'Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf "%.2f" }}%)'
                query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
                severity: info
                for: 7d
              - name: Container Low Memory usage
                description: Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.
                query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
                severity: info
                for: 7d

      # Probe-based alerts built on blackbox_exporter metrics.
      - name: Blackbox
        exporters:
          - name: prometheus/blackbox_exporter
            slug: blackbox-exporter
            doc_url: https://github.com/prometheus/blackbox_exporter
            rules:
              - name: Blackbox probe failed
                description: Probe failed
                query: probe_success == 0
                severity: critical
              - name: Blackbox configuration reload failure
                description: Blackbox configuration reload failure
                query: "blackbox_exporter_config_last_reload_successful != 1"
                severity: warning
              # Total probe duration above 1s for 1m (any probe type).
              - name: Blackbox slow probe
                description: Blackbox probe took more than 1s to complete
                query: "probe_duration_seconds > 1"
                severity: warning
                for: 1m
              # Status codes up to 199 or from 400 upwards are treated as failures.
              - name: Blackbox probe HTTP failure
                description: HTTP status code is not 200-399
                query: "probe_http_status_code <= 199 OR probe_http_status_code >= 400"
                severity: critical
              # The three certificate rules below split days-until-expiry (rounded to
              # 0.1 day, computed from the last sample seen within 10m) into
              # non-overlapping ranges: [3, 20), [0, 3) and below 0.
              - name: Blackbox SSL certificate will expire soon
                description: SSL certificate expires in less than 20 days
                query: "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20"
                severity: warning
              - name: Blackbox SSL certificate will expire very soon
                description: SSL certificate expires in less than 3 days
                query: "0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3"
                severity: critical
              - name: Blackbox SSL certificate expired
                description: SSL certificate has expired already
                query: "round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0"
                severity: critical
                comments: |
                  For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
                  need to enable insecure_skip_verify. Note that this will disable
                  certificate validation.
                  See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
              - name: Blackbox probe slow HTTP
                description: HTTP request took more than 1s
                query: "probe_http_duration_seconds > 1"
                severity: warning
                for: 1m
              - name: Blackbox probe slow ping
                description: Blackbox ping took more than 1s
                query: "probe_icmp_duration_seconds > 1"
                severity: warning
                for: 1m

      # Host-level alerts built on windows_exporter metrics.
      - name: Windows Server
        exporters:
          - name: prometheus-community/windows_exporter
            slug: windows-exporter
            doc_url: https://github.com/prometheus-community/windows_exporter
            rules:
              - name: Windows Server collector Error
                description: "Collector {{ $labels.collector }} was not successful"
                query: "windows_exporter_collector_success == 0"
                severity: critical
              # Fires when the status="ok" series of a service is not 1 for 1m.
              - name: Windows Server service Status
                description: Windows Service state is not OK
                query: 'windows_service_status{status="ok"} != 1'
                severity: critical
                for: 1m
              # 100 minus the per-instance average idle rate over 2m, as a percentage.
              - name: Windows Server CPU Usage
                description: CPU Usage is more than 80%
                query: '100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80'
                severity: warning
              # Used memory percentage derived from free bytes over physical bytes.
              - name: Windows Server memory Usage
                description: Memory usage is more than 90%
                query: "100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90"
                severity: warning
                for: 2m
              # Used disk percentage per logical disk; the `> 0` clause skips
              # disks that report a size of zero.
              - name: Windows Server disk Space Usage
                description: Disk usage is more than 80%
                query: "100 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80 and windows_logical_disk_size_bytes > 0"
                severity: critical
                for: 2m

      # Virtual machine alerts built on pryorda/vmware_exporter metrics.
      - name: VMware
        exporters:
          - name: pryorda/vmware_exporter
            slug: pryorda-vmware-exporter
            doc_url: https://github.com/pryorda/vmware_exporter
            rules:
              # The memory rules divide vmware_vm_mem_usage_average by 100 before
              # comparing against percent thresholds.
              # NOTE(review): presumably the metric is in hundredths of a percent; verify against the exporter.
              # Warning covers [80, 90); critical takes over from 90 upwards.
              - name: Virtual Machine Memory Warning
                description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
                query: "vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90"
                severity: warning
                for: 5m
              - name: Virtual Machine Memory Critical
                description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
                query: "vmware_vm_mem_usage_average / 100 >= 90"
                severity: critical
                for: 1m
              # More than 3 snapshots on a VM for 30m.
              - name: High Number of Snapshots
                description: "High snapshots number on {{ $labels.instance }}: {{ $value }}"
                query: "vmware_vm_snapshots > 3"
                severity: warning
                for: 30m
              # Snapshot age in days (seconds since the snapshot timestamp / 86400), 3 or more.
              - name: Outdated Snapshots
                description: 'Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days'
                query: "(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3"
                severity: warning
                for: 5m

      # Proxmox VE alerts built on prometheus-pve-exporter metrics.
      # Several rules use the prefix of the `id` label to select the resource kind:
      # node/..., qemu/... or lxc/... for guests, and storage/....
      - name: Proxmox VE
        exporters:
          - name: prometheus-pve/prometheus-pve-exporter
            slug: prometheus-pve-exporter
            doc_url: https://github.com/prometheus-pve/prometheus-pve-exporter
            rules:
              - name: PVE node down
                description: 'Proxmox VE node {{ $labels.id }} is down.'
                query: 'pve_up{id=~"node/.*"} == 0'
                severity: critical
                for: 2m
              - name: PVE VM/CT down
                description: 'Proxmox VE guest {{ $labels.id }} is not running.'
                query: 'pve_up{id=~"(qemu|lxc)/.*"} == 0'
                severity: warning
                for: 5m
                comments: |
                  This alert triggers for all VMs and containers that are not running.
                  You may want to filter by specific guests using the `id` label, or exclude
                  intentionally stopped guests with additional label matchers.
              # The ratio is scaled to a percentage; no id filter, so it applies to
              # every series that exposes pve_cpu_usage_ratio.
              - name: PVE high CPU usage
                description: 'Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_cpu_usage_ratio * 100 > 90'
                severity: warning
                for: 5m
              # The `> 0` clause skips series that report a memory size of zero.
              - name: PVE high memory usage
                description: 'Proxmox VE memory usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0'
                severity: warning
                for: 5m
              # Two-tier storage alerting: warning above 80% for 5m, critical above 95% for 2m.
              - name: PVE storage filling up
                description: 'Proxmox VE storage {{ $labels.id }} is above 80% used. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 80 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
                severity: warning
                for: 5m
              - name: PVE storage almost full
                description: 'Proxmox VE storage {{ $labels.id }} is above 95% used. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 95 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
                severity: critical
                for: 2m
              - name: PVE guest not backed up
                description: '{{ $value }} Proxmox VE guest(s) are not covered by any backup job.'
                query: 'pve_not_backed_up_total > 0'
                severity: warning
              - name: PVE replication failed
                description: 'Proxmox VE replication for {{ $labels.id }} has {{ $value }} failed sync(s).'
                query: 'pve_replication_failed_syncs > 0'
                severity: warning
              # Matches the cluster info series carrying the label quorate="0".
              - name: PVE cluster not quorate
                description: 'Proxmox VE cluster has lost quorum.'
                query: 'pve_cluster_info{quorate="0"} == 1'
                severity: critical
                comments: |
                  Loss of quorum means the cluster cannot make decisions about VM placement
                  and fencing. This requires immediate attention.

      # Host alerts built on the metrics exposed by Netdata's Prometheus endpoint.
      # Each chart is exported as one series per `dimension` label value.
      - name: Netdata
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://github.com/netdata/netdata/blob/master/backends/prometheus/README.md
            rules:
              - name: Netdata high cpu usage
                description: Netdata high CPU usage (> 80%)
                query: 'netdata_cpu_cpu_percentage_average{dimension="idle"} < 20'
                severity: warning
                for: 5m
                comments: |
                  This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%.
              - name: Netdata CPU steal noisy neighbor
                description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
                query: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10'
                severity: warning
                for: 5m
              # Share of RAM in the free/cached dimensions relative to the sum of all
              # dimensions. Both sides are aggregated per instance before dividing:
              # a plain binary operation between two selectors of the same metric
              # matches each series with itself and always yields ~100.
              - name: Netdata high memory usage
                description: Netdata high memory usage (> 80%)
                query: 'sum by (instance) (netdata_system_ram_MiB_average{dimension=~"free|cached"}) / sum by (instance) (netdata_system_ram_MiB_average) * 100 < 20 and sum by (instance) (netdata_system_ram_MiB_average) > 0'
                severity: warning
                for: 5m
              # Same approach per disk chart: available space relative to the sum of
              # all dimensions of that chart, grouped by (instance, chart, family) so
              # each disk is evaluated on its own.
              - name: Netdata low disk space
                description: Netdata low disk space (> 80%)
                query: 'sum by (instance, chart, family) (netdata_disk_space_GB_average{dimension=~"avail|cached"}) / sum by (instance, chart, family) (netdata_disk_space_GB_average) * 100 < 20 and sum by (instance, chart, family) (netdata_disk_space_GB_average) > 0'
                severity: warning
                for: 5m
              # Linear extrapolation of the last 3h of available space, 24h ahead.
              - name: Netdata predicted disk full
                description: Netdata predicted disk full in 24 hours
                query: 'predict_linear(netdata_disk_space_GB_average{dimension=~"avail|cached"}[3h], 24 * 3600) < 0'
                severity: warning
              - name: Netdata MD mismatch cnt unsynchronized blocks
                description: RAID Array have unsynchronized blocks
                query: "netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024"
                severity: warning
                for: 2m
              # Fires on any growth of the reallocated sector count within 1m.
              - name: Netdata disk reallocated sectors
                description: "Disk reallocated sectors detected ({{ $value }} sectors)"
                query: "increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0"
                severity: info
              - name: Netdata disk current pending sector
                description: Disk current pending sector
                query: "netdata_smartd_log_current_pending_sector_count_sectors_average > 0"
                severity: warning
              # Fires on any growth of the offline uncorrectable sector count within 2m.
              - name: Netdata reported uncorrectable disk sectors
                description: "Reported uncorrectable disk sectors ({{ $value }} sectors)"
                query: "increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0"
                severity: warning

      # Self-monitoring alerts for cloudflare/ebpf_exporter. All rules wait 5m.
      - name: eBPF
        exporters:
          - name: cloudflare/ebpf_exporter
            slug: ebpf-exporter
            doc_url: https://github.com/cloudflare/ebpf_exporter
            rules:
              - name: eBPF exporter program not attached
                description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})"
                query: 'ebpf_exporter_ebpf_program_attached == 0'
                severity: warning
                for: 5m
                comments: |
                  The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running.
              # Any non-zero decoder error rate over 5m.
              - name: eBPF exporter decoder errors
                description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})"
                query: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
                severity: warning
                for: 5m
              # Fires both when the metric is 0 and when the metric is missing entirely
              # (the absent() branch).
              - name: eBPF exporter no enabled configs
                description: "eBPF exporter has no enabled configurations. No eBPF programs are being run. (instance {{ $labels.instance }})"
                query: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)'
                severity: warning
                for: 5m

      # Per process-group alerts built on ncabatoff/process-exporter metrics.
      # Series are keyed by the `groupname` label used in the descriptions.
      - name: Process Exporter
        exporters:
          - name: ncabatoff/process-exporter
            slug: process-exporter
            doc_url: https://github.com/ncabatoff/process-exporter
            rules:
              - name: Process exporter group down
                description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_num_procs == 0'
                severity: warning
                for: 5m
              - name: Process exporter high memory usage
                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group.
              - name: Process exporter high CPU usage
                description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})"
                query: 'rate(namedprocess_namegroup_cpu_seconds_total[5m]) * 100 > 80'
                severity: warning
                for: 5m
                comments: |
                  Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. Threshold of 80% is per-core. Adjust based on expected workload.
              # Two-tier file descriptor alerting on the same ratio:
              # warning above 0.8 for 5m, critical above 0.95 for 2m.
              - name: Process exporter high file descriptor usage
                description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
                severity: warning
                for: 5m
              - name: Process exporter file descriptors exhausted
                description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
                severity: critical
                for: 2m
              - name: Process exporter high swap usage
                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 512MB is arbitrary. Adjust per group and environment.
              # More than 5 processes of the group in the Zombie state for 5m.
              - name: Process exporter zombie processes
                description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_states{state="Zombie"} > 5'
                severity: warning
                for: 5m
              - name: Process exporter high context switching
                description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})"
                query: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000'
                severity: warning
                for: 5m
                comments: |
                  Filters to voluntary switches only — involuntary switches are normal under CPU contention. Threshold of 50000/s is a rough default. Adjust based on workload.
              - name: Process exporter high disk write IO
                description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})"
                query: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 100MB/s is arbitrary. Adjust per group.
              # The `num_procs > 0` clause limits this to groups that currently have
              # at least one process.
              - name: Process exporter process restarting
                description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})"
                query: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
                severity: info
                comments: |
                  Detects restarts by watching for changes in the oldest process start time within the group.

      - name: Systemd
        exporters:
          - name: prometheus-community/systemd_exporter
            slug: systemd-exporter
            doc_url: https://github.com/prometheus-community/systemd_exporter
            rules:
              - name: Systemd unit failed
                description: "Systemd unit {{ $labels.name }} has entered failed state. (instance {{ $labels.instance }})"
                query: 'systemd_unit_state{state="failed"} == 1'
                severity: warning
                for: 5m
              - name: Systemd unit inactive
                description: "Systemd unit {{ $labels.name }} is inactive. (instance {{ $labels.instance }})"
                query: 'systemd_unit_state{state="inactive", type="service", name=~"your-critical-service.+"} == 1'
                severity: warning
                for: 5m
                comments: |
                  Many units are legitimately inactive. You must adjust the name=~ filter to match your critical services.
              - name: Systemd service crash looping
                description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})"
                query: 'increase(systemd_service_restart_total[1h]) > 5'
                severity: critical
                for: 5m
              - name: Systemd unit tasks near limit
                description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})"
                # Ratio of current tasks to the task maximum, alerting above 0.9 (90%).
                # `ignoring(type)` excludes the `type` label from series matching so the
                # two metrics pair up even when that label differs between them.
                # The trailing `systemd_unit_tasks_max > 0` clause drops units whose
                # maximum is 0, which would otherwise divide by zero.
                query: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0'
                severity: warning
                for: 5m
              - name: Systemd socket refused connections
                description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})"
                query: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
                severity: warning
                for: 2m
              - name: Systemd socket high connections
                description: "Systemd socket {{ $labels.name }} has {{ $value }} active connections. (instance {{ $labels.instance }})"
                query: 'systemd_socket_current_connections > 100'
                severity: warning
                for: 2m
                comments: |
                  Threshold of 100 connections is arbitrary. Adjust to your workload.
              - name: Systemd timer missed trigger
                description: "Systemd timer {{ $labels.name }} has not triggered for over 24 hours. (instance {{ $labels.instance }})"
                # Seconds since the last trigger are divided by 3600 to get hours and
                # compared against 24. The `> 0` clause skips timers whose last-trigger
                # timestamp is 0 (presumably timers that have never fired — confirm
                # against the exporter).
                query: '(time() - systemd_timer_last_trigger_seconds) / 3600 > 24 and systemd_timer_last_trigger_seconds > 0'
                severity: warning
                for: 5m
                comments: |
                  Triggers if timer hasn't fired in 24 hours. Adjust threshold per timer schedule.

  - name: Databases
    services:
      - name: MySQL
        exporters:
          - name: prometheus/mysqld_exporter
            slug: mysqld-exporter
            doc_url: https://github.com/prometheus/mysqld_exporter
            rules:
              - name: MySQL down
                description: MySQL instance is down on {{ $labels.instance }}
                query: "mysql_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: MySQL too many connections (> 80%)
                description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}"
                query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0"
                severity: warning
                for: 2m
              - name: MySQL high prepared statements utilization (> 80%)
                description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}"
                query: "max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0"
                severity: warning
                for: 2m
              - name: MySQL high threads running
                description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}"
                query: "max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0"
                severity: warning
                for: 2m
              - name: MySQL Slave IO thread not running
                description: "MySQL Slave IO thread not running on {{ $labels.instance }}"
                # `and ON (instance) mysql_slave_status_master_server_id > 0` keeps only
                # instances that report a non-zero master server id, so the `== 0` check
                # is applied solely to those (presumably configured replicas — confirm).
                query: "( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: MySQL Slave SQL thread not running
                description: "MySQL Slave SQL thread not running on {{ $labels.instance }}"
                # `and ON (instance) mysql_slave_status_master_server_id > 0` keeps only
                # instances that report a non-zero master server id, so the `== 0` check
                # is applied solely to those (presumably configured replicas — confirm).
                query: "( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: MySQL Slave replication lag
                description: "MySQL replication lag on {{ $labels.instance }}"
                # Lag is seconds_behind_master minus sql_delay, so the sql_delay value
                # (presumably an intentionally configured replication delay — confirm)
                # is not counted as lag. Only instances reporting a non-zero master
                # server id are evaluated; the alert threshold is 30 seconds.
                query: "( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30"
                severity: critical
                for: 1m
              - name: MySQL slow queries
                description: "MySQL server has new slow queries ({{ $value }} in the last minute)."
                query: increase(mysql_global_status_slow_queries[1m]) > 0
                severity: warning
                for: 2m
              - name: MySQL InnoDB log waits
                description: "MySQL innodb log writes stalling ({{ $value }} waits/s)"
                query: rate(mysql_global_status_innodb_log_waits[15m]) > 10
                severity: warning
              - name: MySQL restarted
                description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
                query: "mysql_global_status_uptime < 60"
                severity: info
              - name: MySQL High QPS
                description: MySQL is being overloaded with unusual QPS (> 10k QPS).
                query: "irate(mysql_global_status_questions[1m]) > 10000"
                severity: info
                for: 2m
              - name: MySQL too many open files
                description: MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}.
                query: "mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0"
                severity: warning
                for: 2m
              - name: MySQL InnoDB Force Recovery is enabled
                description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}"
                query: "mysql_global_variables_innodb_force_recovery != 0"
                severity: warning
                for: 2m
              - name: MySQL InnoDB history_len too long
                description: "MySQL history_len (undo log) too long on {{ $labels.instance }}"
                query: "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000"
                severity: warning
                for: 2m

      - name: PostgreSQL
        exporters:
          - name: prometheus-community/postgres_exporter
            slug: postgres-exporter
            doc_url: https://github.com/prometheus-community/postgres_exporter
            rules:
              - name: Postgresql down
                description: Postgresql instance is down
                query: "pg_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Postgresql restarted
                description: Postgresql restarted
                query: "time() - pg_postmaster_start_time_seconds < 60"
                severity: critical
              - name: Postgresql exporter error
                description: Postgresql exporter is showing errors. A query may be buggy in query.yaml
                query: "pg_exporter_last_scrape_error > 0"
                severity: critical
              - name: Postgresql table not auto vacuumed
                description: Table {{ $labels.relname }} has not been auto vacuumed for 10 days
                # The pg_stat_user_tables_* series are per table (they carry `relname`,
                # used in the description), while pg_settings_autovacuum_vacuum_threshold
                # is a single instance-wide series. Without an explicit matching clause
                # the label sets never line up and the comparison returns nothing, so
                # the threshold is joined with `on (instance) group_left`.
                # The second clause requires the last autovacuum to be older than 10 days.
                query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > on (instance) group_left pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
                severity: warning
              - name: Postgresql table not auto analyzed
                description: Table {{ $labels.relname }} has not been auto analyzed for 10 days
                # The pg_stat_user_tables_* series are per table (they carry `relname`,
                # used in the description), while pg_settings_autovacuum_analyze_threshold
                # is a single instance-wide series. Without an explicit matching clause
                # the label sets never line up and the comparison returns nothing, so
                # the threshold is joined with `on (instance) group_left`.
                # The second clause requires the last autoanalyze to be older than 10 days.
                query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > on (instance) group_left pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
                severity: warning
              - name: Postgresql too many connections
                description: PostgreSQL instance has too many connections (> 80%).
                # Both sides are aggregated by the same labels (instance, job, server) so
                # that their label sets are identical and the comparison can match:
                # total activity count versus 80% of max_connections.
                query: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
                severity: warning
                for: 2m
              - name: Postgresql not enough connections
                description: PostgreSQL instance should have more connections (> 5)
                query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
                severity: critical
                for: 2m
              - name: Postgresql dead locks
                description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)"
                query: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
                severity: warning
              - name: Postgresql high rollback rate
                description: Ratio of transactions being aborted compared to committed is > 2 %
                query: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
                severity: warning
              - name: Postgresql commit rate low
                description: Postgresql seems to be processing very few transactions
                query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
                severity: critical
                for: 2m
              - name: Postgresql low XID consumption
                description: Postgresql seems to be consuming transaction IDs very slowly
                query: "rate(pg_txid_current[1m]) < 5"
                severity: warning
                for: 2m
              - name: Postgresql unused replication slot
                description: Unused Replication Slots
                # Fires for inactive replication slots, but only on instances that are
                # not replicas. The slot series are presumably per slot (extra labels
                # such as the slot name — confirm against the exporter) whereas
                # pg_replication_is_replica is per instance, so the `and` must match
                # `on (instance)`; with default matching the differing label sets
                # would never intersect.
                query: "(pg_replication_slots_active == 0) and on (instance) (pg_replication_is_replica == 0)"
                severity: warning
                for: 1m
              - name: Postgresql too many dead tuples
                description: PostgreSQL dead tuples is too large
                # Only tables with more than 10000 dead tuples are kept by the inner
                # `> 10000` filter; of those, the alert fires when dead tuples make up
                # at least 10% of (live + dead). The trailing clause excludes tables
                # whose total tuple count is 0 to avoid dividing by zero.
                query: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 and (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0"
                severity: warning
                for: 2m
              - name: Postgresql configuration changed
                description: Postgres Database configuration change has occurred
                # Selects every series whose metric name starts with `pg_settings_`
                # (except pg_settings_transaction_read_only) and compares it with the
                # same selection 5 minutes earlier (`OFFSET 5m`), pairing series on
                # metric name and instance. Any series whose value differs is returned.
                query: '{__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} != ON(__name__, instance) {__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} OFFSET 5m'
                severity: info
              - name: Postgresql SSL compression active
                description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
                query: "sum by (instance) (pg_stat_ssl_compression) > 0"
                severity: warning
              - name: Postgresql too many locks acquired
                description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
                # Lock capacity is max_locks_per_transaction * max_connections; the alert
                # fires when the summed lock count exceeds 20% of it.
                # `sum by (instance)` leaves only the `instance` label on the left-hand
                # side, while the pg_settings_* product keeps all of its labels, so both
                # the division and the zero-capacity guard must match `on (instance)`;
                # with default matching the label sets differ and nothing is returned.
                query: "((sum by (instance) (pg_locks_count)) / on (instance) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and on (instance) (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0"
                severity: critical
                for: 2m
              - name: Postgresql bloat index high (> 80%)
                description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`"
                query: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
                severity: warning
                for: 1h
                comments: |
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql bloat table high (> 80%)
                description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`"
                query: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
                severity: warning
                for: 1h
                comments: |
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql invalid index
                description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
                query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
                severity: warning
                for: 6h
                comments: |
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql replication lag
                description: The PostgreSQL replication lag is high (> 5s)
                query: "pg_replication_lag_seconds > 5"
                severity: warning
                for: 30s

      - name: SQL Server
        exporters:
          - name: Ozarklake/prometheus-mssql-exporter
            slug: ozarklake-mssql-exporter
            doc_url: https://github.com/Ozarklake/prometheus-mssql-exporter
            rules:
              - name: SQL Server down
                description: SQL server instance is down
                query: mssql_up == 0
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: SQL Server deadlock
                description: SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)
                query: mssql_deadlocks > 5
                severity: warning
                for: 1m

      - name: Oracle Database
        exporters:
          - name: iamseth/oracledb_exporter
            slug: iamseth-oracledb-exporter
            doc_url: https://github.com/iamseth/oracledb_exporter
            rules:
              - name: Oracle DB down
                description: Oracle Database instance is down on {{ $labels.instance }}
                query: "oracledb_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Oracle DB sessions reaching limit (> 85%)
                description: "Oracle Database session utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_resource_current_utilization{resource_name=\"sessions\"} / oracledb_resource_limit_value{resource_name=\"sessions\"} * 100 > 85 and oracledb_resource_limit_value{resource_name=\"sessions\"} > 0"
                severity: warning
                for: 5m
                comments: |
                  Threshold is workload-dependent. Adjust 85% to suit your environment.
              - name: Oracle DB processes reaching limit (> 85%)
                description: "Oracle Database process utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_resource_current_utilization{resource_name=\"processes\"} / oracledb_resource_limit_value{resource_name=\"processes\"} * 100 > 85 and oracledb_resource_limit_value{resource_name=\"processes\"} > 0"
                severity: warning
                for: 5m
                comments: |
                  Threshold is workload-dependent. Adjust 85% to suit your environment.
              - name: Oracle DB tablespace reaching capacity (> 85%)
                description: "Oracle Database tablespace {{ $labels.tablespace }} is above 85% usage on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_tablespace_used_percent > 85"
                severity: warning
                for: 5m
              - name: Oracle DB tablespace full (> 95%)
                description: "Oracle Database tablespace {{ $labels.tablespace }} is critically full on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_tablespace_used_percent > 95"
                severity: critical
                for: 5m
              - name: Oracle DB high user rollbacks
                description: "Oracle Database on {{ $labels.instance }} has a high rollback rate ({{ $value }}% of transactions are rolled back)"
                query: "rate(oracledb_activity_user_rollbacks[5m]) / (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) * 100 > 20 and (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) > 0"
                severity: warning
                for: 5m
                comments: |
                  A high rollback rate (>20%) often indicates application-level issues such as deadlocks, constraint violations, or poorly designed transactions.
              - name: Oracle DB too many active sessions
                description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})"
                query: "oracledb_sessions_value{status=\"ACTIVE\", type=\"USER\"} > 200"
                severity: warning
                for: 5m
                comments: |
                  Threshold is highly workload-dependent. Adjust 200 to suit your environment.
              - name: Oracle DB high wait time (user I/O)
                description: "Oracle Database on {{ $labels.instance }} is experiencing high user I/O wait time"
                query: "oracledb_wait_time_user_io > 300"
                severity: warning
                for: 5m
                comments: |
                  The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time.

      - name: Patroni
        exporters:
          - name: Embedded exporter (Patroni >= 2.1.0)
            slug: embedded-exporter-patroni
            doc_url: https://patroni.readthedocs.io/en/latest/rest_api.html?highlight=prometheus#monitoring-endpoint
            rules:
              - name: Patroni has no Leader
                description: A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}
                # For each `scope`, take the maximum of patroni_primary and of
                # patroni_standby_leader across all series. The alert fires only when
                # both maxima are below 1, i.e. no series in that scope reports either.
                query: (max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.

      - name: PGBouncer
        exporters:
          - name: spreaker/prometheus-pgbouncer-exporter
            slug: spreaker-pgbouncer-exporter
            doc_url: https://github.com/spreaker/prometheus-pgbouncer-exporter
            rules:
              - name: PGBouncer active connections
                description: PGBouncer pools are filling up
                query: "pgbouncer_pools_server_active_connections > 200"
                severity: warning
                for: 2m
              - name: PGBouncer errors
                description: PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.
                query: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10'
                severity: warning
              - name: PGBouncer max connections
                description: The number of PGBouncer client connections has reached max_client_conn.
                query: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
                severity: critical

      - name: Redis
        exporters:
          - name: oliver006/redis_exporter
            slug: oliver006-redis-exporter
            doc_url: https://github.com/oliver006/redis_exporter
            rules:
              - name: Redis down
                description: Redis instance is down
                query: "redis_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Redis missing master
                description: Redis cluster has no node marked as master.
                # count() over an empty selection returns no series at all, so
                # `or vector(0)` substitutes a 0 in that case; otherwise the `< 1`
                # comparison would have nothing to evaluate when no master exists.
                query: '(count(redis_instance_info{role="master"}) or vector(0)) < 1'
                severity: critical
              - name: Redis too many masters
                description: Redis cluster has too many nodes marked as master.
                query: 'count(redis_instance_info{role="master"}) > 1'
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Redis disconnected slaves
                description: Redis not replicating for all slaves. Consider reviewing the redis replication status.
                # After dropping `instance` and `job`, count() is the number of
                # redis_connected_slaves series in the group and sum() is the total
                # number of connected slaves they report. The alert fires when the
                # series count exceeds that total by more than one (count - sum - 1 > 0).
                query: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0"
                severity: critical
              - name: Redis replication broken
                description: Redis instance lost a slave
                query: "delta(redis_connected_slaves[1m]) < 0"
                severity: critical
              - name: Redis cluster flapping
                description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
                query: "changes(redis_connected_slaves[1m]) > 1"
                severity: critical
                for: 2m
              - name: Redis missing backup
                description: Redis has not been backed up for 48 hours
                query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48"
                severity: critical
              - name: Redis out of system memory
                description: Redis is running out of system memory (> 90%)
                query: "redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 and redis_total_system_memory_bytes > 0"
                severity: warning
                for: 2m
                comments: |
                  The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
              - name: Redis out of configured maxmemory
                description: Redis is running out of configured maxmemory (> 90%)
                query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0"
                severity: warning
                for: 2m
              - name: Redis too many connections
                description: Redis is running out of connections (> 90% used)
                query: "redis_connected_clients / redis_config_maxclients * 100 > 90 and redis_config_maxclients > 0"
                severity: warning
                for: 2m
              - name: Redis not enough connections
                description: Redis instance should have more connections (> 5)
                query: "redis_connected_clients < 5"
                severity: warning
                for: 2m
              - name: Redis rejected connections
                description: Some connections to Redis have been rejected
                query: "increase(redis_rejected_connections_total[1m]) > 5"
                severity: warning

      - name: Memcached
        exporters:
          - name: prometheus/memcached_exporter
            slug: memcached-exporter
            doc_url: https://github.com/prometheus/memcached_exporter
            rules:
              - name: Memcached down
                description: Memcached instance is down on {{ $labels.instance }}
                query: "memcached_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Memcached connection limit approaching (> 80%)
                description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "(memcached_current_connections / memcached_max_connections * 100) > 80 and memcached_max_connections > 0"
                severity: warning
                for: 2m
              - name: Memcached connection limit approaching (> 95%)
                description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "(memcached_current_connections / memcached_max_connections * 100) > 95 and memcached_max_connections > 0"
                severity: critical
                for: 2m
              - name: Memcached out of memory errors
                description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}"
                query: "sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0"
                severity: warning
                for: 5m
              - name: Memcached memory usage high (> 90%)
                description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "(memcached_current_bytes / memcached_limit_bytes * 100) > 90 and memcached_limit_bytes > 0"
                severity: warning
                for: 5m
                comments: |
                  High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
              - name: Memcached high eviction rate
                description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)"
                query: "rate(memcached_items_evicted_total[5m]) > 10"
                severity: warning
                for: 5m
                comments: |
                  A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage. Threshold of 10 evictions/s is a rough default — adjust based on your workload.
              - name: Memcached low cache hit rate (< 80%)
                description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
                # Hit rate = get hits / (get hits + get misses) * 100.
                # The hit and miss series differ in their `status` label, so adding them
                # directly never matches and yields an empty result. Both numerator and
                # denominator are therefore aggregated `without (status)` so their label
                # sets are identical. The trailing clause skips instances with no get
                # traffic, where the ratio would be 0/0.
                query: '(sum without (status) (rate(memcached_commands_total{command="get", status="hit"}[5m])) / sum without (status) (rate(memcached_commands_total{command="get", status=~"hit|miss"}[5m])) * 100) < 80 and sum without (status) (rate(memcached_commands_total{command="get", status=~"hit|miss"}[5m])) > 0'
                severity: warning
                for: 10m
                comments: |
                  A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. Threshold of 80% is a rough default — adjust based on your workload and access patterns.
              - name: Memcached connections rejected
                description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
                query: "increase(memcached_connections_rejected_total[5m]) > 0"
                severity: warning
                for: 5m
              - name: Memcached items too large
                description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
                query: "increase(memcached_item_too_large_total[5m]) > 0"
                severity: info
                for: 5m

      - name: MongoDB
        exporters:
          - name: percona/mongodb_exporter
            slug: percona-mongodb-exporter
            doc_url: https://github.com/percona/mongodb_exporter
            rules:
              - name: MongoDB Down
                description: MongoDB instance is down
                query: "mongodb_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Mongodb replica member unhealthy
                description: MongoDB replica member is not healthy
                query: "mongodb_rs_members_health == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: MongoDB replication lag (Percona)
                description: Mongodb replication lag is more than 10s
                # Subtracts each SECONDARY member's optimeDate from the PRIMARY's within
                # the same replica `set`; `group_right` keeps one result per secondary.
                # The division by 1000 suggests optimeDate is in milliseconds — confirm
                # against the exporter version in use.
                query: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
                severity: critical
              - name: MongoDB replication headroom
                description: MongoDB replication headroom is <= 0
                query: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
                severity: critical
                comments: |
                  This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
              - name: MongoDB number cursors open (Percona)
                description: Too many cursors opened by MongoDB for clients (> 10k)
                query: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
                severity: warning
                for: 2m
              - name: MongoDB cursors timeouts (Percona)
                description: "Too many cursors are timing out ({{ $value }} in the last minute)"
                query: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"
                severity: warning
                for: 2m
              - name: MongoDB too many connections (Percona)
                description: Too many connections (> 80%)
                # Usage = current / (current + available) * 100.
                # The `current` and `available` series differ in their `conn_type` label,
                # so adding them directly never matches and yields an empty result.
                # Both sides are aggregated `without (conn_type)` so their label sets are
                # identical. The trailing clause avoids dividing by zero.
                query: 'sum without (conn_type) (mongodb_ss_connections{conn_type="current"}) / sum without (conn_type) (mongodb_ss_connections{conn_type=~"current|available"}) * 100 > 80 and sum without (conn_type) (mongodb_ss_connections{conn_type=~"current|available"}) > 0'
                severity: warning
                for: 2m

          - name: dcu/mongodb_exporter
            slug: dcu-mongodb-exporter
            doc_url: https://github.com/dcu/mongodb_exporter
            rules:
              - name: MongoDB replication lag (DCU)
                description: Mongodb replication lag is more than 10s
                query: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
                severity: critical
              - name: MongoDB replication Status 3
                description: MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync
                query: "mongodb_replset_member_state == 3"
                severity: critical
              - name: MongoDB replication Status 6
                description: MongoDB Replication set member as seen from another member of the set, is not yet known
                query: "mongodb_replset_member_state == 6"
                severity: critical
              - name: MongoDB replication Status 8
                description: MongoDB Replication set member as seen from another member of the set, is unreachable
                query: "mongodb_replset_member_state == 8"
                severity: critical
              - name: MongoDB replication Status 9
                description: MongoDB Replication set member is actively performing a rollback. Data is not available for reads
                query: "mongodb_replset_member_state == 9"
                severity: critical
              - name: MongoDB replication Status 10
                description: MongoDB Replication set member was once in a replica set but was subsequently removed
                query: "mongodb_replset_member_state == 10"
                severity: critical
              - name: MongoDB number cursors open (DCU)
                description: Too many cursors opened by MongoDB for clients (> 10k)
                query: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
                severity: warning
                for: 2m
              - name: MongoDB cursors timeouts (DCU)
                description: "Too many cursors are timing out ({{ $value }} in the last minute)"
                query: "increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100"
                severity: warning
                for: 2m
              - name: MongoDB too many connections (DCU)
                description: Too many connections (> 80%)
                query: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0'
                severity: warning
                for: 2m
          - name: stefanprodan/mgob
            slug: stefanprodan-mgob-exporter
            doc_url: https://github.com/stefanprodan/mgob
            rules:
              - name: Mgob backup failed
                description: MongoDB backup has failed
                query: 'changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0'
                severity: critical

      - name: Elasticsearch
        exporters:
          - name: prometheus-community/elasticsearch_exporter
            slug: prometheus-community-elasticsearch-exporter
            doc_url: https://github.com/prometheus-community/elasticsearch_exporter
            rules:
              - name: Elasticsearch Heap Usage Too High
                description: "The heap usage is over 90%"
                query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
                severity: critical
                for: 2m
              - name: Elasticsearch Heap Usage warning
                description: "The heap usage is over 80%"
                query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
                severity: warning
                for: 2m
              - name: Elasticsearch disk out of space
                description: The disk usage is over 90%
                query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0"
                severity: critical
              - name: Elasticsearch disk space low
                description: The disk usage is over 80%
                query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0"
                severity: warning
                for: 2m
              - name: Elasticsearch Cluster Red
                description: Elastic Cluster Red status
                query: 'elasticsearch_cluster_health_status{color="red"} == 1'
                severity: critical
              - name: Elasticsearch Cluster Yellow
                description: Elastic Cluster Yellow status
                query: 'elasticsearch_cluster_health_status{color="yellow"} == 1'
                severity: warning
              # Fires when the reported node count drops below 3. The value 3 is a
              # hard-coded expected cluster size; adjust it to the number of nodes
              # your cluster is supposed to have.
              - name: Elasticsearch Healthy Nodes
                description: "Missing node in Elasticsearch cluster"
                query: "elasticsearch_cluster_health_number_of_nodes < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              # Fires when the reported data-node count drops below 3. The value 3 is
              # a hard-coded expected number of data nodes; adjust it to your cluster.
              - name: Elasticsearch Healthy Data Nodes
                description: "Missing data node in Elasticsearch cluster"
                query: "elasticsearch_cluster_health_number_of_data_nodes < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Elasticsearch relocating shards
                description: "Elasticsearch is relocating shards"
                query: "elasticsearch_cluster_health_relocating_shards > 0"
                severity: info
              - name: Elasticsearch relocating shards too long
                description: "Elasticsearch has been relocating shards for 15min"
                query: "elasticsearch_cluster_health_relocating_shards > 0"
                severity: warning
                for: 15m
              - name: Elasticsearch initializing shards
                description: "Elasticsearch is initializing shards"
                query: "elasticsearch_cluster_health_initializing_shards > 0"
                severity: info
              - name: Elasticsearch initializing shards too long
                description: "Elasticsearch has been initializing shards for 15 min"
                query: "elasticsearch_cluster_health_initializing_shards > 0"
                severity: warning
                for: 15m
              - name: Elasticsearch unassigned shards
                description: "Elasticsearch has unassigned shards"
                query: "elasticsearch_cluster_health_unassigned_shards > 0"
                severity: critical
                for: 2m
              - name: Elasticsearch pending tasks
                description: "Elasticsearch has pending tasks. Cluster works slowly."
                query: "elasticsearch_cluster_health_number_of_pending_tasks > 0"
                severity: warning
                for: 15m
              - name: Elasticsearch no new documents
                description: "No new documents for 10 min!"
                query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
                severity: warning
              - name: Elasticsearch High Indexing Latency
                description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)."
                query: "rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0"
                severity: warning
                for: 10m
              - name: Elasticsearch High Indexing Rate
                description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
                query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000"
                severity: warning
                for: 5m
              - name: Elasticsearch High Query Rate
                description: "The query rate on Elasticsearch cluster is higher than the threshold."
                query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100"
                severity: warning
                for: 5m
              - name: Elasticsearch High Query Latency
                description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)."
                query: "rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0"
                severity: warning
                for: 5m

      - name: Meilisearch
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://github.com/orgs/meilisearch/discussions/625
            rules:
              - name: Meilisearch index is empty
                description: Meilisearch index {{ $labels.index }} has zero documents
                query: "meilisearch_index_docs_count == 0"
                severity: warning
              # meilisearch_http_response_time_seconds is a histogram: only the
              # _bucket/_sum/_count series exist, so the bare metric name selects
              # nothing. Alert on the 95th percentile computed from the buckets over
              # the last 5 minutes, per instance.
              - name: Meilisearch http response time
                description: Meilisearch http response time is too high
                query: "histogram_quantile(0.95, sum by (instance, le) (rate(meilisearch_http_response_time_seconds_bucket[5m]))) > 0.5"
                severity: warning

      - name: Cassandra
        exporters:
          # Alert rules for the Instaclustr Cassandra exporter (cassandra_* metrics).
          # Keys follow the order used elsewhere in this file:
          # name, description, query, severity, for, comments.
          - name: instaclustr/cassandra-exporter
            slug: instaclustr-cassandra-exporter
            doc_url: https://github.com/instaclustr/cassandra-exporter
            rules:
              - name: Cassandra Node is unavailable
                description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}"
                query: "cassandra_endpoint_active < 1"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Cassandra many compaction tasks are pending
                description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}"
                query: "cassandra_table_estimated_pending_compactions > 100"
                severity: warning
              - name: Cassandra commitlog pending tasks (Instaclustr)
                description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}"
                query: "cassandra_commit_log_pending_tasks > 15"
                severity: warning
                for: 2m
              - name: Cassandra compaction executor blocked tasks (Instaclustr)
                description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}"
                query: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
                severity: warning
                for: 2m
              - name: Cassandra flush writer blocked tasks (Instaclustr)
                description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}"
                query: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
                severity: warning
                for: 2m
              - name: Cassandra connection timeouts total (Instaclustr)
                description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}"
                query: "sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5"
                severity: critical
                for: 2m
              - name: Cassandra storage exceptions (Instaclustr)
                description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}"
                query: "changes(cassandra_storage_exceptions_total[1m]) > 1"
                severity: critical
              - name: Cassandra tombstone dump (Instaclustr)
                description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}"
                query: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
                severity: critical
                for: 2m
              - name: Cassandra client request unavailable write (Instaclustr)
                description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}"
                query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
                severity: critical
                for: 2m
              - name: Cassandra client request unavailable read (Instaclustr)
                description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}"
                query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
                severity: critical
                for: 2m
              - name: Cassandra client request write failure (Instaclustr)
                description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
                query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
                severity: critical
                for: 2m
              - name: Cassandra client request read failure (Instaclustr)
                description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
                query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
                severity: critical
                for: 2m

          - name: criteo/cassandra_exporter
            slug: criteo-cassandra-exporter
            doc_url: https://github.com/criteo/cassandra_exporter
            rules:
              - name: Cassandra hints count
                description: Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3'
                severity: critical
              - name: Cassandra compaction task pending
                description: Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100'
                severity: warning
                for: 2m
              - name: Cassandra viewwrite latency
                description: High viewwrite latency on {{ $labels.instance }} cassandra node
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000'
                severity: warning
                for: 2m
              - name: Cassandra authentication failures
                description: Increase of Cassandra authentication failures
                query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
                severity: warning
                for: 2m
              - name: Cassandra node down
                description: Cassandra node down
                query: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Cassandra commitlog pending tasks (Criteo)
                description: Unexpected number of Cassandra commitlog pending tasks
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15'
                severity: warning
                for: 2m
              - name: Cassandra compaction executor blocked tasks (Criteo)
                description: Some Cassandra compaction executor tasks are blocked
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0'
                severity: warning
                for: 2m
              - name: Cassandra flush writer blocked tasks (Criteo)
                description: Some Cassandra flush writer tasks are blocked
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0'
                severity: warning
                for: 2m
              - name: Cassandra repair pending tasks
                description: Some Cassandra repair tasks are pending
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2'
                severity: warning
                for: 2m
              - name: Cassandra repair blocked tasks
                description: Some Cassandra repair tasks are blocked
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0'
                severity: warning
                for: 2m
              - name: Cassandra connection timeouts total (Criteo)
                description: Some connection between nodes are ending in timeout
                query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
                severity: critical
                for: 2m
              - name: Cassandra storage exceptions (Criteo)
                description: Something is going wrong with cassandra storage
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1'
                severity: critical
              - name: Cassandra tombstone dump (Criteo)
                description: Too much tombstones scanned in queries
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000'
                severity: critical
              - name: Cassandra client request unavailable write (Criteo)
                description: Write failures have occurred because too many nodes are unavailable
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0'
                severity: critical
              - name: Cassandra client request unavailable read (Criteo)
                description: Read failures have occurred because too many nodes are unavailable
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0'
                severity: critical
              # Fires whenever the one-minute rate of failed write requests is non-zero.
              - name: Cassandra client request write failure (Criteo)
                description: A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
                severity: critical
              # Fires whenever the one-minute rate of failed read requests is non-zero.
              # NOTE(review): the "batch sizes are too large" hint is worded identically
              # to the write-failure rule — confirm it also applies to read requests.
              - name: Cassandra client request read failure (Criteo)
                description: A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
                severity: critical
              - name: Cassandra cache hit rate key cache
                description: Key cache hit rate is below 85%
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85'
                severity: critical
                for: 2m

      - name: Clickhouse
        exporters:
          - name: Embedded Exporter
            slug: embedded-exporter
            doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics
            rules:
              - name: ClickHouse node down
                description: "No metrics received from ClickHouse exporter for over 2 minutes."
                query: 'up{job="clickhouse"} == 0'
                severity: critical
                for: 2m
                comments: |
                  Adjust the job label to match your Prometheus configuration.
              - name: ClickHouse Memory Usage Critical
                description: "Memory usage is critically high, over 90%."
                query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0"
                severity: critical
                for: 5m
              - name: ClickHouse Memory Usage Warning
                description: "Memory usage is over 80%."
                query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0"
                severity: warning
                for: 5m
              - name: ClickHouse Disk Space Low on Default
                description: "Disk space on default is below 20%."
                query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0"
                severity: warning
                for: 2m
              - name: ClickHouse Disk Space Critical on Default
                description: "Disk space on default disk is critically low, below 10%."
                query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0"
                severity: critical
                for: 2m
              - name: ClickHouse Disk Space Low on Backups
                description: "Disk space on backups is below 20%."
                query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0"
                severity: warning
                for: 2m
              # ClickHouseErrorMetric_* series are cumulative error counts, so an
              # exact "== 1" comparison only matches while the count is exactly one.
              # Alert on any growth in the last 5 minutes instead, consistent with the
              # other ClickHouseErrorMetric_* rules in this file that use increase().
              - name: ClickHouse Replica Errors
                description: "Critical replica errors detected, either all replicas are stale or lost."
                query: "increase(ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE[5m]) > 0 or increase(ClickHouseErrorMetric_ALL_REPLICAS_LOST[5m]) > 0"
                severity: critical

              # ClickHouseErrorMetric_* series are cumulative error counts, so alert on
              # new occurrences in the last 5 minutes rather than on an exact value of 1.
              - name: ClickHouse No Available Replicas
                description: "No available replicas in ClickHouse."
                query: "increase(ClickHouseErrorMetric_NO_AVAILABLE_REPLICA[5m]) > 0"
                severity: critical

              # ClickHouseErrorMetric_* series are cumulative error counts, so alert on
              # new occurrences in the last 5 minutes rather than on an exact value of 1.
              - name: ClickHouse No Live Replicas
                description: "There are too few live replicas available, risking data loss and service disruption."
                query: "increase(ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS[5m]) > 0"
                severity: critical

              - name: ClickHouse High TCP Connections
                description: "High number of TCP connections, indicating heavy client or inter-cluster communication."
                query: "ClickHouseMetrics_TCPConnection > 400"
                severity: warning
                for: 5m
                comments: |
                  Please replace the threshold with an appropriate value
              - name: ClickHouse Interserver Connection Issues
                description: "High number of interserver connections may indicate replication or distributed query handling issues."
                query: "ClickHouseMetrics_InterserverConnection > 50"
                severity: warning
                for: 5m
                comments: |
                  Adjust the threshold based on your cluster size and expected replication traffic.
              - name: ClickHouse ZooKeeper Connection Issues
                description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination."
                query: "ClickHouseMetrics_ZooKeeperSession != 1"
                severity: warning
                for: 3m
              - name: ClickHouse Authentication Failures
                description: "Authentication failures detected, indicating potential security issues or misconfiguration."
                query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3"
                severity: info

              - name: ClickHouse Access Denied Errors
                description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts."
                query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3"
                severity: info

              - name: ClickHouse rejected insert queries
                description: "INSERTs rejected due to too many active data parts. Reduce insert frequency."
                query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0"
                severity: warning
                for: 1m
              - name: ClickHouse delayed insert queries
                description: "INSERTs delayed due to high number of active parts."
                query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0"
                severity: warning
                for: 2m
              - name: ClickHouse zookeeper hardware exception
                description: "Zookeeper hardware exception: network issues communicating with ZooKeeper"
                query: "increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0"
                severity: critical
                for: 1m
              - name: ClickHouse high network usage
                description: High network usage. ClickHouse network usage exceeds 100MB/s.
                query: "rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024"
                severity: warning
                for: 2m
                comments: |
                  Please replace the threshold with an appropriate value
              - name: ClickHouse distributed rejected inserts
                description: "INSERTs into Distributed tables rejected due to pending bytes limit."
                query: "increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0"
                severity: critical
                for: 2m

      - name: CouchDB
        exporters:
          - name: gesellix/couchdb-prometheus-exporter
            slug: gesellix-couchdb-prometheus-exporter
            doc_url: https://github.com/gesellix/couchdb-prometheus-exporter
            rules:
              - name: CouchDB node down
                description: CouchDB node is not responding (node_up metric is 0) for more than 2 minutes
                query: "couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0"
                severity: critical
                for: 2m
              - name: CouchDB atom memory usage critical
                description: Atom memory usage is above 90% of limit
                query: "couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom"
                severity: critical
                for: 5m
              # Compares the open-database gauge against 90% of a hard-coded capacity
              # of 1000.
              # NOTE(review): confirm 1000 matches the max open databases configured
              # on your nodes, and adjust the constant otherwise.
              - name: CouchDB open databases critical
                description: Number of open databases exceeds 90% of node capacity
                query: "couchdb_httpd_open_databases > 0.9 * 1000"
                severity: critical
                for: 5m
              # Compares the open-OS-files gauge against 90% of a hard-coded limit of
              # 65535.
              # NOTE(review): confirm 65535 matches the file descriptor limit of the
              # CouchDB process on your hosts, and adjust the constant otherwise.
              - name: CouchDB open OS files critical
                description: CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files
                query: "couchdb_httpd_open_os_files > 0.9 * 65535"
                severity: critical
                for: 5m
              # The status-code series are selected by a `code` label, while the
              # request series is used without it. PromQL binary operators match on
              # identical label sets, so the `code` label is aggregated away first;
              # this also adds up all 5xx codes into one error rate per node.
              # NOTE(review): assumes couchdb_httpd_requests carries no `code` label.
              - name: CouchDB 5xx error ratio high
                description: More than 5% of HTTP requests are returning 5xx errors
                query: "sum without (code) (rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0"
                severity: critical
                for: 5m
              - name: CouchDB temporary view read rate critical
                description: Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation
                query: "rate(couchdb_httpd_temporary_view_reads[5m]) > 100"
                severity: critical
                for: 5m
              - name: CouchDB Mango queries scanning too many docs
                description: Some Mango queries are scanning too many documents, consider adding indexes
                query: "rate(couchdb_mango_too_many_docs_scanned[5m]) > 50"
                severity: warning
                for: 5m
              - name: CouchDB Mango queries failed due to invalid index
                description: Some Mango queries failed to execute because the index was missing or invalid
                query: "rate(couchdb_mango_query_invalid_index[5m]) > 5"
                severity: warning
                for: 5m
              - name: CouchDB Mango docs examined high
                description: High number of documents examined per Mango queries, consider indexing
                query: "rate(couchdb_mango_docs_examined[5m]) > 1000"
                severity: warning
                for: 5m
              - name: CouchDB Replicator manager died
                description: Replication manager process has crashed
                query: "increase(couchdb_replicator_changes_manager_deaths[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB Replicator queue process died
                description: Replication queue process has crashed
                query: "increase(couchdb_replicator_changes_queue_deaths[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB Replicator reader process died
                description: Replication reader process has crashed
                query: "increase(couchdb_replicator_changes_reader_deaths[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB Replicator failed to start
                description: One or more replication tasks failed to start
                query: "increase(couchdb_replicator_failed_starts[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB replication cluster unstable
                description: The replication cluster is unstable, replication may be interrupted
                query: "couchdb_replicator_cluster_is_stable == 0"
                severity: critical
                for: 2m
              - name: CouchDB replication read failures
                description: Replication changes feed has failed reads more than 5 times in 5 minutes
                query: "increase(couchdb_replicator_changes_read_failures[5m]) > 5"
                severity: warning
                for: 5m
              - name: CouchDB file descriptors high
                description: Process is using more than 85% of allowed file descriptors
                # The trailing `and process_max_fds > 0` guard drops series whose limit is reported as 0.
                # NOTE(review): the query has no job/instance selector, so it matches every scraped
                # target exposing process_open_fds, not only CouchDB — consider scoping it.
                query: "process_open_fds / process_max_fds > 0.85 and process_max_fds > 0"
                severity: warning
                for: 5m
              - name: CouchDB process restarted
                description: CouchDB process has restarted recently
                # Matches when the process start timestamp changed at least once within the last hour.
                # NOTE(review): no job/instance selector is applied, so any scraped target exposing
                # process_start_time_seconds can trigger this — consider scoping it to CouchDB.
                query: "changes(process_start_time_seconds[1h]) > 0"
                severity: info
                for: 1m
              - name: CouchDB critical log entries
                description: Critical or error log entries detected in the last 5 minutes
                query: "increase(couchdb_server_couch_log{level=~\"error|critical\"}[5m]) > 0"
                severity: critical
                for: 1m

      - name: Solr
        exporters:
          - name: embedded exporter
            slug: embedded-exporter
            doc_url: https://solr.apache.org/guide/8_11/monitoring-solr-with-prometheus-and-grafana.html
            rules:
              - name: Solr update errors
                description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1"
                severity: critical
              - name: Solr query errors
                description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1'
                severity: warning
                for: 5m
              - name: Solr replication errors
                description: Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1'
                severity: critical
              - name: Solr low live node count
                description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: "solr_collections_live_nodes < 2"
                severity: critical

  - name: Message brokers
    services:
      - name: RabbitMQ
        exporters:
          - name: rabbitmq/rabbitmq-prometheus
            slug: rabbitmq-exporter
            doc_url: https://github.com/rabbitmq/rabbitmq-prometheus
            rules:
              - name: RabbitMQ node down
                description: Less than 3 nodes running in RabbitMQ cluster
                query: "sum(rabbitmq_build_info) < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: RabbitMQ node not distributed
                description: Distribution link state is not 'up'
                query: "erlang_vm_dist_node_state < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: RabbitMQ instances different versions
                description: Running different version of RabbitMQ in the same cluster, can lead to failure.
                query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1"
                severity: warning
                for: 1h
              - name: RabbitMQ memory high
                description: A node use more than 90% of allocated RAM
                query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0"
                severity: warning
                for: 2m
              - name: RabbitMQ file descriptors usage
                description: A node use more than 90% of file descriptors
                query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0"
                severity: warning
                for: 2m
              - name: RabbitMQ too many ready messages
                # The query aggregates BY (queue), so `queue` is the only label left for templating.
                description: RabbitMQ too many ready messages in queue {{ $labels.queue }}
                query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000"
                severity: warning
                for: 1m
              - name: RabbitMQ too many unack messages
                description: Too many unacknowledged messages
                query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000"
                severity: warning
                for: 1m
              - name: RabbitMQ too many connections
                description: The total connections of a node is too high
                query: "rabbitmq_connections > 1000"
                severity: warning
                for: 2m
              - name: RabbitMQ no queue consumer
                description: A queue has less than 1 consumer
                query: "rabbitmq_queue_consumers < 1"
                severity: warning
                for: 1m # allows a short service restart
              - name: RabbitMQ unroutable messages
                description: A queue has unroutable messages ({{ $value }} in the last 1m)
                query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0"
                severity: warning
                for: 2m

          - name: kbudde/rabbitmq-exporter
            slug: kbudde-rabbitmq-exporter
            doc_url: https://github.com/kbudde/rabbitmq_exporter
            rules:
              - name: RabbitMQ down
                description: RabbitMQ node down
                query: "rabbitmq_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: RabbitMQ cluster down
                description: Less than 3 nodes running in RabbitMQ cluster
                query: "sum(rabbitmq_running) < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: RabbitMQ cluster partition
                description: Cluster partition
                query: "rabbitmq_partitions > 0"
                severity: critical
              - name: RabbitMQ out of memory
                description: Memory available for RabbitMQ is low (< 10%)
                query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0"
                severity: warning
                for: 2m
              - name: RabbitMQ instance too many connections
                description: RabbitMQ instance has too many connections (> 1000)
                query: "rabbitmq_connectionsTotal > 1000"
                severity: warning
                for: 2m
              - name: RabbitMQ dead letter queue filling up
                description: Dead letter queue is filling up (> 10 msgs)
                query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
                severity: warning
                for: 1m
                comments: |
                  Indicate the queue name in dedicated label.
              - name: RabbitMQ too many messages in queue
                description: Queue is filling up (> 1000 msgs)
                query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
                severity: warning
                for: 2m
                comments: |
                  Indicate the queue name in dedicated label.
              - name: RabbitMQ slow queue consuming
                description: Queue messages are consumed slowly (> 60s)
                query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
                severity: warning
                for: 2m
                comments: |
                  Indicate the queue name in dedicated label.
              - name: RabbitMQ no consumer
                description: Queue has no consumer
                query: "rabbitmq_queue_consumers == 0"
                severity: critical
                for: 5m
                comments: |
                  Allows a short service restart.
              - name: RabbitMQ too many consumers
                description: Queue should have only 1 consumer
                query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
                severity: critical
                comments: |
                  Indicate the queue name in dedicated label.
              - name: RabbitMQ inactive exchange
                description: Exchange receive less than 5 msgs per second
                query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
                severity: warning
                comments: |
                  Indicate the exchange name in dedicated label.
                for: 2m

      - name: Zookeeper
        exporters:
          - name: cloudflare/kafka_zookeeper_exporter
            slug: cloudflare-kafka-zookeeper-exporter
            doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter
            # No alert rules are listed for this exporter: the key below is empty and parses as null.
            rules:
          - name: dabealu/zookeeper-exporter
            slug: dabealu-zookeeper-exporter
            doc_url: https://github.com/dabealu/zookeeper-exporter
            rules:
              - name: Zookeeper Down
                description: "Zookeeper down on instance {{ $labels.instance }}"
                query: "zk_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Zookeeper missing leader
                description: "Zookeeper cluster has no node marked as leader"
                query: "sum(zk_server_leader) == 0"
                severity: critical
              - name: Zookeeper Too Many Leaders
                description: "Zookeeper cluster has too many nodes marked as leader"
                query: "sum(zk_server_leader) > 1"
                severity: critical
              - name: Zookeeper Not Ok
                description: "Zookeeper instance is not ok"
                query: "zk_ruok == 0"
                severity: warning
                for: 3m

      - name: Kafka
        exporters:
          - name: danielqsj/kafka_exporter
            slug: danielqsj-kafka-exporter
            doc_url: https://github.com/danielqsj/kafka_exporter
            rules:
              - name: Kafka topics replicas
                description: Kafka topic in-sync partition
                query: "min(kafka_topic_partition_in_sync_replica) by (topic) < 3"
                severity: critical
              - name: Kafka consumer group lag
                description: Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)
                query: "sum(kafka_consumergroup_lag) by (consumergroup) > 10000"
                severity: warning
                for: 1m
          - name: linkedin/Burrow
            slug: linkedin-kafka-exporter
            doc_url: https://github.com/linkedin/Burrow
            rules:
              - name: Kafka topic offset decreased
                description: Kafka topic offset has decreased
                query: "delta(kafka_burrow_partition_current_offset[1m]) < 0"
                severity: warning
              - name: Kafka consumer lag
                description: Kafka consumer has a 30 minutes and increasing lag
                # Lag = topic partition offset minus the consumer's current offset, joined on
                # (partition, cluster, topic). The expression matches when the lag is not smaller than
                # the same difference 15m earlier AND is still above zero. Together with `for: 15m`
                # this presumably accounts for the 30 minutes mentioned in the description.
                query: "kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0"
                severity: warning
                for: 15m

      - name: Pulsar
        exporters:
          - name: embedded exporter
            slug: embedded-exporter
            doc_url: https://pulsar.apache.org/docs/reference-metrics/
            rules:
              - name: Pulsar subscription high number of backlog entries
                description: "The number of subscription backlog entries is over 5k"
                query: sum(pulsar_subscription_back_log) by (subscription) > 5000
                for: 1h
                severity: warning
              - name: Pulsar subscription very high number of backlog entries
                description: "The number of subscription backlog entries is over 100k"
                query: sum(pulsar_subscription_back_log) by (subscription) > 100000
                for: 1h
                severity: critical
              - name: Pulsar topic large backlog storage size
                description: "The topic backlog storage size is over 5 GB"
                query: sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024
                for: 1h
                severity: warning
              - name: Pulsar topic very large backlog storage size
                description: "The topic backlog storage size is over 20 GB"
                query: sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024
                for: 1h
                severity: critical
              - name: Pulsar high write latency
                description: "Messages cannot be written in a timely fashion"
                query: sum(pulsar_storage_write_latency_overflow > 0) by (topic)
                for: 1h
                severity: critical
              - name: Pulsar large message payload
                description: "Observing large message payload (> 1MB)"
                query: sum(pulsar_entry_size_overflow > 0) by (topic)
                for: 1h
                severity: warning
              - name: Pulsar high ledger disk usage
                description: "Observing Ledger Disk Usage (> 75%)"
                query: sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75
                for: 1h
                severity: critical
              - name: Pulsar read only bookies
                description: "Observing Readonly Bookies"
                query: count(bookie_SERVER_STATUS{} == 0) by (pod)
                for: 5m
                severity: critical
              - name: Pulsar high number of function errors
                description: "Observing more than 10 Function errors per minute"
                # Sums user and system exception rates per function name.
                # NOTE(review): rate() yields a per-second average, so "> 10" is 10 errors per second,
                # not per minute as the description states — confirm which threshold is intended.
                query: sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10
                for: 1m
                severity: critical
              - name: Pulsar high number of sink errors
                description: "Observing more than 10 Sink errors per minute"
                # NOTE(review): rate() yields a per-second average, so "> 10" is 10 errors per second,
                # not per minute as the description states — confirm which threshold is intended.
                query: sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10
                for: 1m
                severity: critical

      - name: Nats
        exporters:
          - name: nats-io/prometheus-nats-exporter
            slug: nats-exporter
            doc_url: https://github.com/nats-io/prometheus-nats-exporter
            rules:
              - name: Nats high routes count
                description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
                query: "gnatsd_varz_routes > 10"
                severity: warning
                for: 3m
              - name: Nats high memory usage
                description: NATS server memory usage is above 200MB for {{ $labels.instance }}
                query: "gnatsd_varz_mem > 200 * 1024 * 1024"
                severity: warning
                for: 5m
              - name: Nats slow consumers
                description: There are slow consumers in NATS for {{ $labels.instance }}
                query: "gnatsd_varz_slow_consumers > 0"
                severity: critical
                for: 3m
              - name: Nats server down
                description: NATS server has been down for more than 5 minutes
                # absent() matches only when no `up` series with job="nats" exists at all; a target that
                # is still listed but failing scrapes (up == 0) does not match this expression.
                # The job label value is hard-coded: adjust it to the job name used in your scrape config.
                query: 'absent(up{job="nats"})'
                severity: critical
                for: 5m
              - name: Nats high CPU usage
                description: NATS server is using more than 80% CPU for the last 5 minutes
                query: "gnatsd_varz_cpu > 80"
                severity: warning
                for: 5m
                comments: |
                  gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
              - name: Nats high number of connections
                description: NATS server has more than 1000 active connections
                query: "gnatsd_connz_num_connections > 1000"
                severity: warning
                for: 5m
              - name: Nats high JetStream store usage
                description: JetStream store usage is over 80%
                query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0"
                severity: warning
                for: 5m
              - name: Nats high JetStream memory usage
                description: JetStream memory usage is over 80%
                query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0"
                severity: warning
                for: 5m
              - name: Nats high number of subscriptions
                description: NATS server has more than 1000 active subscriptions
                query: "gnatsd_connz_subscriptions > 1000"
                severity: warning
                for: 5m
              - name: Nats high pending bytes
                description: NATS server has more than 100,000 pending bytes
                query: "gnatsd_connz_pending_bytes > 100000"
                severity: warning
                for: 5m
              - name: Nats too many errors
                description: NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes
                query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
                severity: warning
                for: 5m
              - name: Nats JetStream accounts exceeded
                description: JetStream has more than 100 active accounts
                query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
                severity: warning
                for: 5m
              - name: Nats leaf node connection issue
                description: No leaf node connections on {{ $labels.instance }}
                query: "gnatsd_varz_leafnodes == 0"
                severity: warning
                for: 5m

  - name: Proxies, load balancers and service meshes
    services:
      - name: Nginx
        exporters:
          - name: knyar/nginx-lua-prometheus
            slug: knyar-nginx-exporter
            doc_url: https://github.com/knyar/nginx-lua-prometheus
            rules:
              - name: Nginx high HTTP 4xx error rate
                description: Too many HTTP requests with status 4xx (> 5%)
                query: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: Nginx high HTTP 5xx error rate
                description: Too many HTTP requests with status 5xx (> 5%)
                query: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: Nginx latency high
                description: Nginx p99 latency is higher than 3 seconds
                query: "histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3"
                severity: warning
                for: 2m

      - name: Apache
        exporters:
          - name: Lusitaniae/apache_exporter
            slug: lusitaniae-apache-exporter
            doc_url: https://github.com/Lusitaniae/apache_exporter
            rules:
              - name: Apache down
                description: Apache down
                query: "apache_up == 0"
                severity: critical
              - name: Apache workers load
                description: Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}
                query: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0'
                severity: warning
                for: 2m
              - name: Apache restart
                description: Apache has just been restarted.
                query: "apache_uptime_seconds_total / 60 < 1"
                severity: warning

      - name: HaProxy
        exporters:
          - name: Embedded exporter (HAProxy >= v2)
            slug: embedded-exporter-v2
            doc_url: https://github.com/haproxy/haproxy/tree/master/contrib/prometheus-exporter
            rules:
              - name: HAProxy high HTTP 4xx error rate backend
                # The query aggregates by (proxy), so `proxy` is the only label available to the template.
                description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}
                # Percentage of 4xx responses per proxy; the trailing `and ... > 0` guard skips proxies with no traffic.
                query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate backend
                # The query aggregates by (proxy), so `proxy` is the only label available to the template.
                description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}
                # Percentage of 5xx responses per proxy; the trailing `and ... > 0` guard skips proxies with no traffic.
                query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 4xx error rate server
                description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
                query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate server
                description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
                query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy server response errors
                description: Too many response errors to {{ $labels.server }} server (> 5%).
                query: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0
                severity: critical
                for: 1m
              - name: HAProxy backend connection errors
                # The query aggregates by (proxy), so `proxy` is the only label available to the template.
                description: Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.
                query: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100
                severity: critical
                for: 1m
              - name: HAProxy server connection errors
                description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
                # Aggregate per server (like the other per-server rules of this exporter) so that the
                # `server` label used in the description survives the sum.
                query: (sum by (server) (rate(haproxy_server_connection_errors_total[1m]))) > 100
                severity: critical
              - name: HAProxy backend max active session > 80%
                description: Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf "%.2f"}}%
                query: ((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80
                severity: warning
                for: 2m
              - name: HAProxy pending requests
                description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
                query: sum by (proxy) (haproxy_backend_current_queue) > 0
                comments: |
                  haproxy_backend_current_queue is a gauge (current queue depth), not a counter.
                severity: warning
                for: 2m
              - name: HAProxy HTTP slowing down
                description: Average request time is increasing - {{ $value | printf "%.2f"}}
                query: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1
                severity: warning
                for: 1m
              - name: HAProxy retry high
                description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
                query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10
                severity: warning
                for: 2m
              - name: HAproxy has no alive backends
                description: HAProxy has no alive active or backup backends for {{ $labels.proxy }}
                query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0
                severity: critical
              - name: HAProxy frontend security blocked requests
                description: HAProxy is blocking requests for security reason
                query: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10
                severity: warning
                for: 2m
              - name: HAProxy server healthcheck failure
                description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
                query: increase(haproxy_server_check_failures_total[1m]) > 0
                severity: warning
                for: 1m
          - name: prometheus/haproxy_exporter (HAProxy < v2)
            slug: haproxy-exporter-v1
            doc_url: https://github.com/prometheus/haproxy_exporter
            rules:
              - name: HAProxy down
                description: HAProxy down
                query: "haproxy_up == 0"
                severity: critical
              - name: HAProxy high HTTP 4xx error rate backend (v1)
                description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
                # The numerator is scaled by 100 so the "> 5" threshold means 5 percent; without it the
                # 0..1 ratio could never exceed 5. The trailing `and ... > 0` guard skips idle backends.
                query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate backend (v1)
                description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
                # The numerator is scaled by 100 so the "> 5" threshold means 5 percent; without it the
                # 0..1 ratio could never exceed 5. The trailing `and ... > 0` guard skips idle backends.
                query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 4xx error rate server (v1)
                description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
                query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate server (v1)
                description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
                query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
                severity: critical
                for: 1m
              - name: HAProxy server response errors (v1)
                description: Too many response errors to {{ $labels.server }} server (> 5%).
                query: "sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0"
                severity: critical
                for: 1m
              # Backend connection-error rate, summed per backend over a 1m window.
              # NOTE(review): "sum by (backend)" keeps only the backend label, so the
              # $labels.fqdn reference in the description has no label left to render — confirm.
              - name: HAProxy backend connection errors (v1)
                description: Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.
                query: "sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100"
                severity: critical
                for: 1m
              # Same check per server; this rule has no `for:` duration.
              - name: HAProxy server connection errors (v1)
                description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
                query: "sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100"
                severity: critical
              # Current sessions as a percentage of the session limit, per backend.
              # The trailing "and ... > 0" clause skips backends whose summed limit is 0.
              - name: HAProxy backend max active session
                description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
                query: "((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0"
                severity: warning
                for: 2m
              # Any non-empty backend queue sustained for 2m.
              - name: HAProxy pending requests (v1)
                description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend
                query: "sum by (backend) (haproxy_backend_current_queue) > 0"
                severity: warning
                for: 2m
              # Average total request time per backend above 1 (the metric name ends in _seconds).
              - name: HAProxy HTTP slowing down (v1)
                description: Average request time is increasing
                query: "avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1"
                severity: warning
                for: 1m
              # Retry-warning rate per backend above 10/s.
              - name: HAProxy retry high (v1)
                description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend
                query: "sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10"
                severity: warning
                for: 2m
              # The two "down" rules compare the raw up gauges to 0 and have no `for:` duration.
              - name: HAProxy backend down
                description: HAProxy backend is down
                query: "haproxy_backend_up == 0"
                severity: critical
              - name: HAProxy server down
                description: HAProxy server is down
                query: "haproxy_server_up == 0"
                severity: critical
              # Denied-request rate per frontend above 10/s over a 2m window.
              - name: HAProxy frontend security blocked requests (v1)
                description: HAProxy is blocking requests for security reason
                query: "sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10"
                severity: warning
                for: 2m
              # Not aggregated: evaluated per series, so all original labels remain available.
              - name: HAProxy server healthcheck failure (v1)
                description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
                query: "increase(haproxy_server_check_failures_total[1m]) > 0"
                severity: warning
                for: 1m

      # Traefik alert rules, split by metric naming generation:
      # v2 exposes traefik_service_* metrics, v1 exposes traefik_backend_* metrics.
      - name: Traefik
        exporters:
          - name: Embedded exporter v2
            slug: embedded-exporter-v2
            doc_url: https://docs.traefik.io/observability/metrics/prometheus/
            rules:
              # Uses sum() rather than count(): count() yields the number of series in a
              # group and is never 0 for an existing group, so "count(...) == 0" cannot fire.
              # The summed up-gauge is 0 only when every server of the service reports 0.
              - name: Traefik service down
                description: All Traefik services are down
                query: "sum(traefik_service_server_up) by (service) == 0"
                severity: critical
              # Error-ratio rules: the trailing "and ... > 0" clause skips services without traffic.
              - name: Traefik high HTTP 4xx error rate service
                description: Traefik service 4xx error rate is above 5%
                query: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0'
                severity: critical
                for: 1m
              - name: Traefik high HTTP 5xx error rate service
                description: Traefik service 5xx error rate is above 5%
                query: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0'
                severity: critical
                for: 1m
          - name: Embedded exporter v1
            slug: embedded-exporter-v1
            doc_url: https://docs.traefik.io/observability/metrics/prometheus/
            rules:
              # sum() instead of count() for the same reason as the v2 rule:
              # a count over existing series can never equal 0.
              - name: Traefik backend down
                description: All Traefik backends are down
                query: "sum(traefik_backend_server_up) by (backend) == 0"
                severity: critical
              - name: Traefik high HTTP 4xx error rate backend
                description: Traefik backend 4xx error rate is above 5%
                query: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0'
                severity: critical
                for: 1m
              - name: Traefik high HTTP 5xx error rate backend
                description: Traefik backend 5xx error rate is above 5%
                query: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0'
                severity: critical
                for: 1m

      # Caddy alert rules based on the metrics exposed by Caddy itself.
      - name: Caddy
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://caddyserver.com/docs/metrics
            rules:
              # Uses sum() rather than count(): count() yields the number of series in a
              # group and is never 0 for an existing group, so "count(...) == 0" cannot fire.
              # The summed health gauge is 0 only when no series for that upstream reports healthy.
              - name: Caddy Reverse Proxy Down
                description: "All Caddy reverse proxies are down"
                query: "sum(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0"
                severity: critical
              # Error-ratio rules: the trailing "and ... > 0" clause skips instances without traffic.
              - name: Caddy high HTTP 4xx error rate service
                description: "Caddy service 4xx error rate is above 5%"
                query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
                severity: critical
                for: 1m
              - name: Caddy high HTTP 5xx error rate service
                description: "Caddy service 5xx error rate is above 5%"
                query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
                severity: critical
                for: 1m

      # Envoy alert rules: server liveness/memory, downstream HTTP error ratios,
      # listener overflow/TLS errors, per-cluster upstream health, and certificate expiry.
      # Ratio rules carry a trailing "and <denominator> > 0" clause so idle targets are skipped.
      - name: Envoy
        exporters:
          - name: Built-in metrics
            slug: embedded-exporter
            doc_url: https://www.envoyproxy.io/docs/envoy/latest/configuration/observability/statistics
            rules:
              - name: Envoy server not live
                description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}"
                query: "envoy_server_live != 1"
                severity: critical
                for: 1m
              - name: Envoy high memory usage
                description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}"
                query: "envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0"
                severity: warning
                for: 5m
              - name: Envoy high downstream HTTP 5xx error rate
                description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
                query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
                severity: critical
                for: 1m
              - name: Envoy high downstream HTTP 4xx error rate
                description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)"
                query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
                severity: warning
                for: 5m
              - name: Envoy downstream connections overflowing
                description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_listener_downstream_cx_overflow[5m]) > 5"
                severity: warning
              - name: Envoy cluster membership empty
                description: "Envoy cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} has no healthy members"
                query: "envoy_cluster_membership_healthy == 0"
                severity: critical
                for: 1m
              - name: Envoy cluster membership degraded
                description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy"
                query: "envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0"
                severity: warning
                for: 5m
              - name: Envoy high cluster upstream connection failures
                description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10"
                severity: warning
                for: 5m
              - name: Envoy high cluster upstream request timeout rate
                description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                query: "rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0"
                severity: warning
                for: 5m
              - name: Envoy high cluster upstream 5xx error rate
                description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                query: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
                severity: critical
                for: 1m
              - name: Envoy cluster health check failures
                description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_cluster_health_check_failure[5m]) > 5"
                severity: warning
                for: 5m
              - name: Envoy cluster outlier detection ejections active
                description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                query: "envoy_cluster_outlier_detection_ejections_active > 0"
                severity: info
                for: 5m
              - name: Envoy listener SSL connection errors
                description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_listener_ssl_connection_error[5m]) > 5"
                severity: warning
              - name: Envoy global downstream connections overflowing
                description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5"
                severity: critical
              - name: Envoy SSL certificate expiring soon
                description: "SSL certificate loaded by Envoy on {{ $labels.instance }} expires in less than 7 days"
                query: "envoy_server_days_until_first_cert_expiring < 7"
                severity: warning
              # "<= 0" instead of "< 0": the gauge is an unsigned day count, so a strictly
              # negative comparison can never match and the alert would never fire.
              - name: Envoy SSL certificate expired
                description: "SSL certificate loaded by Envoy on {{ $labels.instance }} has expired or expires in less than 1 day"
                query: "envoy_server_days_until_first_cert_expiring <= 0"
                severity: critical
                comments: |
                  Envoy gauges are unsigned integers, so this metric cannot drop below 0.
                  The alert therefore triggers at 0 (certificate expired, or less than a full day of validity left).
              - name: Envoy cluster circuit breaker tripped
                description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                query: "envoy_cluster_circuit_breakers_default_cx_open == 1 or envoy_cluster_circuit_breakers_default_rq_open == 1"
                severity: critical
              - name: Envoy no healthy upstream
                description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0"
                severity: critical
              - name: Envoy high downstream request timeout rate
                description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)"
                query: "increase(envoy_http_downstream_rq_timeout[5m]) > 5"
                severity: warning
                for: 5m

      # Linkerd: a single rule on the failure share of response_total,
      # grouped by the workload labels deployment / statefulset / daemonset.
      - name: Linkerd
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://linkerd.io/2/tasks/exporting-metrics/
            rules:
              # The description concatenates the three workload labels, matching the three
              # grouping labels of the query. The trailing "and ... > 0" clause skips
              # workloads with no responses in the window.
              - name: Linkerd high error rate
                description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%"
                query: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0'
                comments: |
                  Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}.
                severity: warning
                for: 1m

      # Istio alert rules: ingress gateway replica count, Pilot push errors, Mixer dispatch
      # rate, mesh-wide request volume, HTTP error ratios and request latency.
      - name: Istio
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/
            rules:
              # Hard-codes deployment="istio-ingressgateway" in namespace "istio-system"
              # and a minimum of 2 available replicas.
              - name: Istio Kubernetes gateway availability drop
                description: Gateway pods have dropped. Inbound traffic will likely be affected.
                query: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2'
                severity: warning
                for: 1m
              # NOTE(review): the name says "high total request rate", but the query is the
              # ratio of pilot_xds_push_errors to pilot_xds_pushes, as the description states.
              - name: Istio Pilot high total request rate
                description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.
                query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0"
                severity: warning
                for: 1m
              - name: Istio Mixer Prometheus dispatches low
                description: Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.
                query: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
                severity: warning
                for: 1m
              # The two volume rules use fixed global thresholds (> 1000 and < 100 requests/s
              # summed over the whole mesh).
              - name: Istio high total request rate
                description: Global request rate in the service mesh is unusually high.
                query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
                severity: warning
                for: 2m
              - name: Istio low total request rate
                description: Global request rate in the service mesh is unusually low.
                query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
                severity: warning
                for: 2m
              # Mesh-wide error ratios; the trailing "and ... > 0" clause skips periods without traffic.
              - name: Istio high 4xx error rate
                description: High percentage of HTTP 4xx responses in Istio (> 5%).
                query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
                severity: warning
                for: 1m
              - name: Istio high 5xx error rate
                description: High percentage of HTTP 5xx responses in Istio (> 5%).
                query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
                severity: warning
                for: 1m
              # Mean latency per series: rate of the _sum divided by rate of the _count.
              - name: Istio high request latency
                description: Istio average requests execution is longer than 100ms.
                query: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0'
                severity: warning
                for: 1m
              # Unlike the rules above, this query has no reporter filter.
              - name: Istio latency 99 percentile
                description: Istio 1% slowest requests are longer than 1000ms.
                query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000"
                severity: warning
                for: 1m
              # No `for:` duration on this rule.
              - name: Istio Pilot Duplicate Entry
                description: Istio pilot duplicate entry error.
                query: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0"
                severity: critical

  - name: Runtimes
    services:
      # PHP-FPM: a single rule on how often the max-children limit was hit.
      - name: PHP-FPM
        exporters:
          - name: bakins/php-fpm-exporter
            slug: bakins-fpm-exporter
            doc_url: https://github.com/bakins/php-fpm-exporter
            rules:
              # Counts limit hits per instance over 5m and fires above 3; no `for:` duration.
              - name: PHP-FPM max-children reached
                description: PHP-FPM reached max children on {{ $labels.instance }} ({{ $value }} times in the last 5m)
                query: "sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 3"
                severity: warning

      # JVM alert rules: heap / non-heap usage, GC time and frequency, threads,
      # buffer pools, finalization backlog, file descriptors, class loading and JIT time.
      - name: JVM
        exporters:
          - name: java-client
            slug: jvm-exporter
            doc_url: https://github.com/prometheus/client_java
            rules:
              - name: JVM memory filling up
                description: JVM memory is filling up (> 80%)
                query: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80 and sum by (instance)(jvm_memory_max_bytes{area="heap"}) > 0'
                severity: warning
                for: 2m
              - name: JVM non-heap memory filling up
                description: JVM non-heap memory (metaspace/code cache) is filling up (> 80%)
                query: '(sum by (instance)(jvm_memory_used_bytes{area="nonheap"}) / (sum by (instance)(jvm_memory_max_bytes{area="nonheap"}) > 0)) * 100 > 80'
                severity: warning
                for: 2m
                comments: |
                  Many JVM configurations leave metaspace unbounded, in which case jvm_memory_max_bytes{area="nonheap"} is -1 and this alert will not fire.
                  The query filters out max_bytes <= 0 so that an unbounded or zero limit never produces a bogus ratio.
              - name: JVM GC time too high
                description: JVM is spending too much time in garbage collection (> 5% of wall clock time)
                query: 'sum by (instance)(rate(jvm_gc_collection_seconds_sum[5m])) > 0.05'
                severity: warning
                for: 5m
              - name: JVM threads deadlocked
                description: JVM has deadlocked threads
                query: 'jvm_threads_deadlocked > 0'
                severity: critical
                for: 1m
              - name: JVM thread count high
                description: JVM thread count is high (> 300), potential thread leak
                query: 'jvm_threads_current > 300'
                severity: warning
                for: 5m
              - name: JVM threads BLOCKED
                description: JVM has high number of BLOCKED threads, indicating lock contention
                query: 'jvm_threads_state{state="BLOCKED"} > 50'
                severity: warning
                for: 5m
              # The gc matcher is case-insensitive and includes "marksweep": label matchers are
              # case-sensitive and fully anchored, so the previous lower-case "old|major" pattern
              # did not match collector names such as "G1 Old Generation" or "PS MarkSweep".
              - name: JVM old gen GC frequency
                description: Frequent old/major GC cycles, indicating memory pressure
                query: 'rate(jvm_gc_collection_seconds_count{gc=~"(?i).*(old|major|marksweep).*"}[5m]) > 0.3'
                severity: warning
                for: 5m
                comments: |
                  This case-insensitive regex matches the old-generation collectors of G1 ("G1 Old Generation"), Parallel ("PS MarkSweep"), CMS ("ConcurrentMarkSweep") and Serial ("MarkSweepCompact"). It will not match ZGC or Shenandoah cycle names.
                  Adjust the gc label filter if you use a different collector.
              - name: JVM direct buffer pool filling up
                description: JVM direct buffer pool is filling up (> 90%)
                query: '(jvm_buffer_pool_used_bytes / jvm_buffer_pool_capacity_bytes) * 100 > 90 and jvm_buffer_pool_capacity_bytes > 0'
                severity: warning
                for: 5m
              - name: JVM objects pending finalization
                description: JVM has objects pending finalization, potential memory leak
                query: 'jvm_memory_objects_pending_finalization > 1000'
                severity: warning
                for: 5m
              - name: JVM file descriptors exhaustion
                description: JVM process is running out of file descriptors (> 90% used)
                query: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0'
                severity: warning
                for: 5m
                comments: |
                  process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not JVM-specific.
                  This alert will also fire for Go, Python, or any process exposing these metrics.
              - name: JVM class loading anomaly
                description: Rapid class loading detected, potential classloader leak
                query: 'rate(jvm_classes_loaded_total[5m]) > 100'
                severity: warning
                for: 5m
              - name: JVM compilation time spike
                description: Excessive JIT compilation time consuming CPU
                query: 'rate(jvm_compilation_time_seconds_total[5m]) > 0.1'
                severity: warning
                for: 5m

      # Go runtime alert rules built on the go_* metrics (goroutines, threads, GC, memstats).
      # Every rule here is severity "warning" with a 5m `for:` duration; most thresholds are
      # fixed absolute numbers, as the per-rule comments point out.
      - name: Golang
        exporters:
          - name: client_golang
            slug: golang-exporter
            doc_url: https://github.com/prometheus/client_golang
            rules:
              - name: Go goroutine count high
                description: Go application has too many goroutines (> 1000), potential goroutine leak
                query: 'go_goroutines > 1000'
                severity: warning
                for: 5m
                comments: |
                  Threshold is a rough default. High-concurrency servers may legitimately run thousands of goroutines. Adjust to match your baseline.
              - name: Go GC duration high
                description: Go GC pause duration is too high (max > 1s)
                query: 'go_gc_duration_seconds{quantile="1"} > 1'
                severity: warning
                for: 5m
                comments: |
                  quantile="1" is the maximum observed GC pause in the current summary window, not p99.
                  A single outlier pause can push this above 1s. The for: 5m ensures the max stays elevated.
              - name: Go memory usage high
                description: Go heap allocation is using most of the runtime's reserved memory (> 90%), indicating the process may need more memory or has a leak
                query: '(go_memstats_heap_alloc_bytes / go_memstats_sys_bytes) * 100 > 90'
                severity: warning
                for: 5m
                comments: |
                  go_memstats_sys_bytes is the total memory obtained from the OS by the Go runtime, not total host memory.
                  This ratio measures Go-internal memory utilization, not system-level memory pressure.
              - name: Go thread count high
                description: Go OS thread count is high (> 500), potential blocking syscall or CGo leak
                query: 'go_threads > 500'
                severity: warning
                for: 5m
                comments: |
                  Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline.
              - name: Go heap objects count high
                description: Go heap has too many live objects (> 10M), high GC pressure
                query: 'go_memstats_heap_objects > 10000000'
                severity: warning
                for: 5m
                comments: |
                  Threshold is a rough default. Adjust based on your application's normal object count.
              - name: Go GC CPU fraction high
                description: Go GC is consuming too much CPU (> 5%)
                query: 'go_memstats_gc_cpu_fraction > 0.05'
                severity: warning
                for: 5m
                comments: |
                  go_memstats_gc_cpu_fraction is deprecated since Go 1.20 and may return 0 in newer versions.
                  Consider using runtime/metrics-based alternatives if running Go >= 1.20.
              - name: Go goroutine spike
                description: Go goroutine count is growing rapidly
                # deriv() is the per-second slope of the gauge over the 5m window.
                query: 'deriv(go_goroutines[5m]) > 100'
                severity: warning
                for: 5m
              - name: Go heap fragmentation
                description: Go heap has high idle ratio (> 90%), indicating memory fragmentation
                # Plain ratio (0-1), unlike the percentage form used by "Go memory usage high".
                query: 'go_memstats_heap_idle_bytes / go_memstats_heap_sys_bytes > 0.9'
                severity: warning
                for: 5m
              - name: Go memory leak
                description: Go application has sustained high allocation rate (> 1GB/s), potential memory leak
                # Measures the allocation rate of the cumulative counter, not retained memory.
                query: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9'
                severity: warning
                for: 5m
              - name: Go stack memory high
                description: Go stack memory usage is high (> 1GB), likely excessive goroutines or deep recursion
                query: 'go_memstats_stack_inuse_bytes > 1e9'
                severity: warning
                for: 5m

      # Ruby alert rules on the ruby_* metrics: heap slots, major GC rate, RSS and
      # object allocation rate. All rules are severity "warning" with a 5m `for:` duration
      # and fixed absolute thresholds.
      - name: Ruby
        exporters:
          - name: prometheus_exporter
            slug: ruby-exporter
            doc_url: https://github.com/discourse/prometheus_exporter
            rules:
              - name: Ruby heap live slots high
                description: Ruby heap has too many live slots (> 500k), heap bloat
                query: 'ruby_heap_live_slots > 500000'
                severity: warning
                for: 5m
                comments: |
                  Threshold is a rough default. Adjust based on your application's normal heap size.
              - name: Ruby heap free slots high
                description: Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations
                query: 'ruby_heap_free_slots > 500000'
                severity: warning
                for: 5m
              - name: Ruby major GC rate high
                description: Ruby is performing too many major GC cycles, indicating memory pressure
                # Per-second rate of the counter averaged over 5m; threshold is 5 cycles/s.
                query: 'rate(ruby_major_gc_ops_total[5m]) > 5'
                severity: warning
                for: 5m
                comments: |
                  Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection.
              - name: Ruby RSS high
                description: Ruby process RSS is high (> 1GB)
                query: 'ruby_rss > 1e9'
                severity: warning
                for: 5m
              - name: Ruby allocated objects spike
                description: Ruby is allocating objects at a high rate
                # Threshold is 100000 objects/s, averaged over 5m.
                query: 'rate(ruby_allocated_objects_total[5m]) > 100000'
                severity: warning
                for: 5m

      # Python alert rules on python_gc_* and the generic process_* metrics.
      # All rules are severity "warning" with a 5m `for:` duration.
      - name: Python
        exporters:
          - name: client_python
            slug: python-exporter
            doc_url: https://github.com/prometheus/client_python
            rules:
              - name: Python GC objects uncollectable
                description: Python has uncollectable objects, potential memory leak via reference cycles
                # Any growth of the counter within the 5m window triggers the condition.
                query: 'increase(python_gc_objects_uncollectable_total[5m]) > 0'
                severity: warning
                for: 5m
              - name: Python GC collections high
                description: Python GC is collecting too many objects (> 10k/s), high allocation pressure
                query: 'rate(python_gc_objects_collected_total[5m]) > 10000'
                severity: warning
                for: 5m
              - name: Python file descriptors exhaustion
                description: Python process is running out of file descriptors (> 90% used)
                # The trailing "and process_max_fds > 0" clause skips targets reporting a zero limit.
                query: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0'
                severity: warning
                for: 5m
                comments: |
                  process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not Python-specific.
              - name: Python GC generation 2 collections high
                description: Python full GC (generation 2) is running too frequently, indicating memory pressure
                query: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1'
                severity: warning
                for: 5m
                comments: |
                  Gen2 collection rate > 1/s is very high. In most applications, gen2 runs are infrequent. Adjust threshold based on your workload.
              - name: Python virtual memory high
                description: Python process virtual memory is high (> 4GB)
                # process_virtual_memory_bytes carries no Python-specific selector in this query.
                query: 'process_virtual_memory_bytes > 4e9'
                severity: warning
                for: 5m
                comments: |
                  Threshold is a rough default. Adjust based on your application's expected memory footprint.

      # Sidekiq alert rules: per-queue backlog size and overall scheduling latency.
      - name: Sidekiq
        exporters:
          - name: Strech/sidekiq-prometheus-exporter
            slug: strech-sidekiq-exporter
            doc_url: https://github.com/Strech/sidekiq-prometheus-exporter
            rules:
              # Evaluated per series; the description uses the "name" label for the queue.
              - name: Sidekiq queue size
                description: Sidekiq queue {{ $labels.name }} is growing
                query: "sidekiq_queue_size > 100"
                severity: warning
                for: 1m
              # max() without grouping collapses all series into one, so the alert carries
              # no queue label. This rule has no `for:` duration.
              - name: Sidekiq scheduling latency too high
                description: Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing.
                query: "max(sidekiq_queue_latency) > 60"
                severity: critical

  - name: Data engineering
    services:
      - name: Apache Flink
        exporters:
          - name: Built-in Prometheus reporter
            slug: flink-prometheus-reporter
            doc_url: https://nightlies.apache.org/flink/flink-docs-stable/docs/deployment/metric_reporters/
            rules:
              # JobManager-level availability checks: running jobs, registered TaskManagers, free slots.
              - name: Flink job is not running
                description: "No Flink jobs are currently running. All jobs may have failed or been cancelled."
                query: "flink_jobmanager_numRunningJobs == 0"
                severity: critical
                for: 1m
              - name: Flink no TaskManagers registered
                description: "No TaskManagers are registered with the JobManager. The cluster has no processing capacity."
                query: "flink_jobmanager_numRegisteredTaskManagers == 0"
                severity: critical
                for: 1m
              - name: Flink all task slots used
                description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled."
                # Because the condition is "== 0", $value in the description is 0 whenever this fires.
                query: "flink_jobmanager_taskSlotsAvailable == 0"
                severity: warning
                for: 5m
                comments: |
                  This alert fires when there are no available task slots. Adjust the threshold if your cluster is expected to run at full capacity.
              - name: Flink job restart increasing
                description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes."
                # "> 1" means more than one restart within the 5m window is required.
                query: "increase(flink_jobmanager_job_numRestarts[5m]) > 1"
                severity: warning
                for: 5m
                comments: |
                  A single restart may be normal during deployments. Adjust threshold based on restart tolerance.
              - name: Flink checkpoint failures
                description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes."
                query: "increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1"
                severity: warning
                for: 5m
              - name: Flink checkpoint duration high
                description: "Flink job {{ $labels.job_name }} last checkpoint took {{ $value | humanizeDuration }} to complete."
                # Dividing by 1000 makes both the threshold (60) and $value expressed in seconds.
                query: "flink_jobmanager_job_lastCheckpointDuration / 1000 > 60"
                severity: warning
                for: 5m
                comments: |
                  Value is converted from milliseconds to seconds for correct humanizeDuration display.
                  Threshold is 60 seconds. Adjust based on your checkpoint interval and state size.
              - name: Flink task backpressured
                description: "Flink task {{ $labels.task_name }} in job {{ $labels.job_name }} is backpressured."
                query: "flink_taskmanager_job_task_isBackPressured == 1"
                severity: warning
                for: 5m
              - name: Flink task high backpressure time
                description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure."
                query: "flink_taskmanager_job_task_backPressuredTimeMsPerSecond > 500"
                severity: warning
                for: 5m
                comments: |
                  Fires when a task spends more than 500ms/sec backpressured. This indicates the task cannot keep up with upstream data rate.
              - name: Flink TaskManager heap memory high
                description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%."
                query: "flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9"
                severity: warning
                for: 5m
              - name: Flink JobManager heap memory high
                description: "Flink JobManager {{ $labels.instance }} heap memory usage is above 90%."
                query: "flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9"
                severity: warning
                for: 5m
              - name: Flink TaskManager GC time high
                description: "Flink TaskManager {{ $labels.instance }} is spending more than 10% of time in garbage collection."
                query: "rate(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100"
                severity: warning
                for: 5m
                comments: |
                  Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload.
              - name: Flink no records processed
                description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes."
                query: "rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0"
                severity: warning
                for: 5m
                comments: |
                  Only fires for tasks that have previously received records, to avoid false positives during startup.

      - name: Apache Spark
        exporters:
          - name: Built-in Prometheus (PrometheusServlet + PrometheusResource)
            slug: spark-prometheus
            doc_url: https://spark.apache.org/docs/latest/monitoring.html
            comments: |
              Spark exposes metrics via two built-in endpoints:
              - PrometheusServlet: master/worker/driver metrics at /metrics/prometheus/ (ports 8080, 8081, 4040)
              - PrometheusResource: executor metrics at /metrics/executors/prometheus/ (port 4040, requires spark.ui.prometheus.enabled=true in Spark 3.x)
              Metric names from PrometheusServlet include a dynamic namespace (application ID), making static PromQL queries challenging.
              Configuration: spark.metrics.conf.*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet
            rules:
              - name: Spark no alive workers
                description: "No Spark workers are alive. The cluster has no processing capacity."
                query: "metrics_master_aliveWorkers_Value == 0"
                severity: critical
                for: 1m
              - name: Spark too many waiting apps
                description: "Spark has {{ $value }} applications waiting for resources."
                query: "metrics_master_waitingApps_Value > 10"
                severity: warning
                for: 5m
                comments: |
                  Adjust the threshold based on your cluster's typical queuing behavior.
              - name: Spark worker memory exhausted
                description: "Spark worker {{ $labels.instance }} has no free memory ({{ $value }}MB free)."
                query: "metrics_worker_memFree_MB_Value == 0"
                severity: warning
                for: 2m
              - name: Spark worker cores exhausted
                description: "Spark worker {{ $labels.instance }} has no free cores."
                query: "metrics_worker_coresFree_Value == 0"
                severity: warning
                for: 5m
                comments: |
                  Fires when a worker has no free cores. This may be normal under high load but can indicate capacity issues.
              - name: Spark executor high GC time
                description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC."
                # Ratio of two cumulative counters exposed by PrometheusResource, both in seconds
                # and both carrying the `_seconds_total` suffix. The trailing guard skips executors
                # that have not accumulated any task time yet (avoids dividing by zero).
                query: "metrics_executor_totalGCTime_seconds_total / metrics_executor_totalDuration_seconds_total > 0.1 and metrics_executor_totalDuration_seconds_total > 0"
                severity: warning
                for: 5m
                comments: |
                  Fires when more than 10% of executor time is spent in garbage collection.
                  This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/).
              - name: Spark executor all tasks failing
                description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed)."
                query: "metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0"
                severity: critical
                for: 5m
              - name: Spark executor high task failure rate
                description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%."
                query: "metrics_executor_failedTasks_total / metrics_executor_totalTasks_total > 0.1 and metrics_executor_totalTasks_total > 0"
                severity: warning
                for: 5m
              - name: Spark executor high disk spill
                description: "Spark executor {{ $labels.executor_id }} is spilling data to disk. Consider increasing executor memory."
                query: "metrics_executor_diskUsed_bytes > 1e9"
                severity: warning
                for: 5m
                comments: |
                  diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default.
                  Disk spilling indicates insufficient memory for the workload.

      - name: Hadoop
        exporters:
          - name: hadoop/jmx_exporter
            slug: jmx_exporter
            doc_url: https://github.com/prometheus/jmx_exporter
            rules:
              # Alert rule for NameNode availability
              - name: Hadoop Name Node Down
                query: up{job="hadoop-namenode"} == 0
                for: 5m
                severity: critical
                description: "The Hadoop NameNode service is unavailable."

              # Alert rule for ResourceManager availability
              - name: Hadoop Resource Manager Down
                query: up{job="hadoop-resourcemanager"} == 0
                for: 5m
                severity: critical
                description: "The Hadoop ResourceManager service is unavailable."

              # Alert rule for DataNode status
              - name: Hadoop Data Node Out Of Service
                query: hadoop_datanode_last_heartbeat == 0
                for: 10m
                severity: warning
                description: "The Hadoop DataNode is not sending heartbeats."

              # Alert rule for low HDFS disk space
              - name: Hadoop HDFS Disk Space Low
                query: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 and hadoop_hdfs_bytes_total > 0
                for: 15m
                severity: warning
                description: "Available HDFS disk space is running low."

              # Alert rule for excessive MapReduce task failures
              - name: Hadoop Map Reduce Task Failures
                query: increase(hadoop_mapreduce_task_failures_total[1h]) > 100
                for: 10m
                severity: critical
                description: "There is an unusually high number of MapReduce task failures."

              # Alert rule for high ResourceManager memory usage
              - name: Hadoop Resource Manager Memory High
                query: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8
                for: 15m
                severity: warning
                description: "The Hadoop ResourceManager is approaching its memory limit."

              # Alert rule for high YARN container allocation failures
              - name: Hadoop YARN Container Allocation Failures
                query: increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10
                for: 10m
                severity: warning
                description: "There is a significant number of YARN container allocation failures."

              # Alert rule for excessive HBase region server region count
              - name: Hadoop HBase Region Count High
                query: hadoop_hbase_region_count > 5000
                for: 15m
                severity: warning
                description: "The HBase cluster has an unusually high number of regions."

              # Alert rule for low HBase region server heap space
              - name: Hadoop HBase Region Server Heap Low
                query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8
                for: 10m
                severity: warning
                description: "HBase Region Servers are running low on heap space."

              # Alert rule for high HBase Write Requests latency
              - name: Hadoop HBase Write Requests Latency High
                query: hadoop_hbase_write_requests_latency_seconds > 0.5
                for: 10m
                severity: warning
                description: "HBase Write Requests are experiencing high latency."

  - name: Orchestrators
    services:
      - name: Kubernetes
        exporters:
          - name: kube-state-metrics
            slug: kubestate-exporter
            doc_url: https://github.com/kubernetes/kube-state-metrics/tree/master/docs
            rules:
              - name: Kubernetes Node not ready
                description: Node {{ $labels.node }} has been unready for a long time
                query: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
                severity: critical
                for: 10m
              - name: Kubernetes Node scheduling disabled
                description: Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes.
                query: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1'
                severity: warning
                for: 30m
                comments: |
                  Nodes with scheduling disabled are not a problem in themselves.
                  This alert is useful to get warned when a node stays unschedulable for a long time.
              - name: Kubernetes Node memory pressure
                description: "Node {{ $labels.node }} has MemoryPressure condition"
                query: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
                severity: critical
                for: 2m
              - name: Kubernetes Node disk pressure
                description: "Node {{ $labels.node }} has DiskPressure condition"
                query: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
                severity: critical
                for: 2m
              - name: Kubernetes Node network unavailable
                description: "Node {{ $labels.node }} has NetworkUnavailable condition"
                query: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
                severity: critical
                for: 2m
              - name: Kubernetes Node out of pod capacity
                description: "Node {{ $labels.node }} is out of pod capacity"
                query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
                severity: warning
                for: 2m
              - name: Kubernetes Container oom killer
                description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes."
                query: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
                severity: warning
              - name: Kubernetes Job failed
                description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete"
                query: "kube_job_status_failed > 0"
                severity: warning
              - name: Kubernetes Job not starting
                description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes"
                query: "kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600"
                severity: warning
              - name: Kubernetes CronJob failing
                description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing"
                query: "(kube_cronjob_status_last_schedule_time > kube_cronjob_status_last_successful_time) AND (kube_cronjob_status_active == 0) AND (kube_cronjob_spec_suspend == 0)"
                severity: critical
              - name: Kubernetes CronJob suspended
                description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended"
                query: "kube_cronjob_spec_suspend != 0"
                severity: warning
              - name: Kubernetes PersistentVolumeClaim pending
                description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending"
                query: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
                severity: warning
                for: 2m
              - name: Kubernetes Volume out of disk space
                description: Volume is almost full (< 10% left)
                query: "kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 and kubelet_volume_stats_capacity_bytes > 0"
                severity: warning
                for: 2m
              - name: Kubernetes Volume full in four days
                description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available."
                # The left-hand side yields the percentage of space still available, which is what
                # the description prints as $value. The right-hand side keeps only the volumes whose
                # linear extrapolation over the last 6h reaches zero free bytes within four days.
                query: "(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100) and predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0"
                severity: critical
              - name: Kubernetes PersistentVolume error
                description: "Persistent volume {{ $labels.persistentvolume }} is in bad state"
                query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
                severity: critical
              - name: Kubernetes StatefulSet down
                description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down
                query: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0"
                severity: critical
                for: 1m
              - name: Kubernetes HPA scale inability
                description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale
                query: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
                severity: warning
                for: 2m
              - name: Kubernetes HPA metrics unavailability
                description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics
                query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
                severity: warning
              - name: Kubernetes HPA scale maximum
                description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods
                query: "(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)"
                severity: info
                for: 2m
              - name: Kubernetes HPA underutilized
                description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.
                # Aggregate by namespace as well as HPA name: the description renders
                # $labels.namespace, and HPAs sharing a name across namespaces must not be merged.
                query: "max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (namespace, horizontalpodautoscaler) > 3" # allow minimum 3 replicas running
                severity: info
              - name: Kubernetes Pod not healthy
                description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.
                query: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
                severity: critical
                for: 15m
              - name: Kubernetes pod crash looping
                description: Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping
                query: "increase(kube_pod_container_status_restarts_total[1m]) > 3"
                severity: warning
                for: 2m
              - name: Kubernetes ReplicaSet replicas mismatch
                description: ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch
                query: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas"
                severity: warning
                for: 10m
              - name: Kubernetes Deployment replicas mismatch
                description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch
                query: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available"
                severity: warning
                for: 10m
              - name: Kubernetes StatefulSet replicas mismatch
                description: StatefulSet does not match the expected number of replicas.
                query: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas"
                severity: warning
                for: 10m
              - name: Kubernetes Deployment generation mismatch
                description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.
                query: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation"
                severity: critical
                for: 10m
              - name: Kubernetes StatefulSet generation mismatch
                description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.
                query: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation"
                severity: critical
                for: 10m
              - name: Kubernetes StatefulSet update not rolled out
                description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
                query: "max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)"
                severity: warning
                for: 10m
              - name: Kubernetes DaemonSet rollout stuck
                description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready
                query: "(kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 and kube_daemonset_status_desired_number_scheduled > 0) or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0"
                severity: warning
                for: 10m
              - name: Kubernetes DaemonSet misscheduled
                description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run
                query: "kube_daemonset_status_number_misscheduled > 0"
                severity: critical
                for: 1m
              - name: Kubernetes CronJob too long
                description: CronJob {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 1h to complete.
                # `unless on (namespace, job_name)` drops every job that already reports a completion
                # time. `and absent(...)` cannot be used for this: absent() returns a single sample
                # without labels, which never matches the labelled job series.
                # The kube_job_* series identify the job through `job_name`, hence the label used
                # in the description.
                query: "kube_job_status_start_time > 0 and (time() - kube_job_status_start_time) > 3600 unless on (namespace, job_name) kube_job_status_completion_time"
                severity: warning
                comments: |
                  Threshold should be customized for each cronjob name.
              - name: Kubernetes Job slow completion
                description: Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.
                query: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0"
                severity: critical
                for: 12h
              - name: Kubernetes API server errors
                description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate"
                query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3 and sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) > 0'
                severity: critical
                for: 2m
              - name: Kubernetes API client errors
                description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate"
                query: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0'
                severity: critical
                for: 2m
              - name: Kubernetes client certificate expires next week
                description: A client certificate used to authenticate to the apiserver is expiring next week.
                query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
                severity: warning
              - name: Kubernetes client certificate expires soon
                description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
                query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
                severity: critical
              - name: Kubernetes API server latency
                description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}."
                query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
                severity: warning
                for: 2m

      # Nomad rules, based on the job summary metrics (failed / lost / queued) and on the
      # blocked evaluations metric. Every rule triggers as soon as its metric is above zero;
      # only the "queued" rule sets a `for` duration (2m).
      - name: Nomad
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              - name: Nomad job failed
                description: Nomad job failed
                query: "nomad_nomad_job_summary_failed > 0"
                severity: warning
              - name: Nomad job lost
                description: Nomad job lost
                query: "nomad_nomad_job_summary_lost > 0"
                severity: warning
              - name: Nomad job queued
                description: Nomad job queued
                query: "nomad_nomad_job_summary_queued > 0"
                severity: warning
                for: 2m
              - name: Nomad blocked evaluation
                description: Nomad blocked evaluation
                query: "nomad_nomad_blocked_evals_total_blocked > 0"
                severity: warning

      # Consul rules for prometheus/consul_exporter: service health checks, raft peer count
      # and node (agent) health status. All three rules are critical.
      - name: Consul
        exporters:
          - name: prometheus/consul_exporter
            slug: consul-exporter
            doc_url: https://github.com/prometheus/consul_exporter
            rules:
              - name: Consul service healthcheck failed
                description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`"
                query: "consul_catalog_service_node_healthy == 0"
                severity: critical
                for: 1m # allows a short service restart
              - name: Consul missing master node
                description: Numbers of consul raft peers should be 3, in order to preserve quorum.
                # NOTE(review): the threshold hard-codes a 3-peer cluster, as stated in the
                # description; presumably it must be raised for larger clusters - confirm.
                query: "consul_raft_peers < 3"
                severity: critical
              - name: Consul agent unhealthy
                description: A Consul agent is down
                query: 'consul_health_node_status{status="critical"} == 1'
                severity: critical

      - name: Etcd
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              - name: Etcd insufficient Members
                description: Etcd cluster should have an odd number of members
                query: "count(etcd_server_id) % 2 == 0"
                severity: critical
              - name: Etcd no Leader
                description: Etcd cluster has no leader
                query: "etcd_server_has_leader == 0"
                severity: critical
              - name: Etcd high number of leader changes
                description: "Etcd leader changed {{ $value }} times during 10 minutes"
                query: "increase(etcd_server_leader_changes_seen_total[10m]) > 2"
                severity: warning
              - name: Etcd high number of failed GRPC requests warning
                description: More than 1% GRPC request failure detected in Etcd
                query: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
                severity: warning
                for: 2m
                comments: |
                  Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
              - name: Etcd high number of failed GRPC requests critical
                description: More than 5% GRPC request failure detected in Etcd
                query: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
                severity: critical
                for: 2m
                comments: |
                  Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
              - name: Etcd GRPC requests slow
                description: GRPC requests slowing down, 99th percentile is over 0.15s
                query: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15'
                severity: warning
                for: 2m
              # HTTP failure ratio per method; the trailing "and ... > 0" clause keeps only
              # methods whose received-request rate is non-zero.
              - name: Etcd high number of failed HTTP requests warning
                description: More than 1% HTTP failure detected in Etcd
                query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0"
                severity: warning
                for: 2m
              # Same ratio as the warning rule above, with a 5% threshold and critical severity.
              - name: Etcd high number of failed HTTP requests critical
                description: More than 5% HTTP failure detected in Etcd
                query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0"
                severity: critical
                for: 2m
              # Latency rules: 99th percentile of a histogram computed over a 1m rate window,
              # compared against a fixed threshold in seconds.
              - name: Etcd HTTP requests slow
                description: HTTP requests slowing down, 99th percentile is over 0.15s
                query: "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15"
                severity: warning
                for: 2m
              - name: Etcd member communication slow
                description: Etcd member communication slowing down, 99th percentile is over 0.15s
                query: "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15"
                severity: warning
                for: 2m
              # Counter growth over the last hour; fires above 5 failed proposals.
              - name: Etcd high number of failed proposals
                description: "Etcd server got {{ $value }} failed proposals in the past hour"
                query: "increase(etcd_server_proposals_failed_total[1h]) > 5"
                severity: warning
                for: 2m
              # Disk latency rules: 99th percentile of WAL fsync and backend commit durations.
              - name: Etcd high fsync durations
                description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s
                query: "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5"
                severity: warning
                for: 2m
              - name: Etcd high commit durations
                description: Etcd commit duration increasing, 99th percentile is over 0.25s
                query: "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25"
                severity: warning
                for: 2m

      - name: OpenStack
        exporters:
          - name: openstack-exporter/openstack-exporter
            slug: openstack-exporter
            doc_url: https://github.com/openstack-exporter/openstack-exporter
            rules:
              # Exporter and agent availability. The agent rules only consider agents whose
              # adminState label marks them as enabled/up.
              - name: OpenStack exporter down
                description: The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.
                query: 'up{job=~".*openstack.*"} == 0'
                severity: critical
                for: 2m
              - name: OpenStack Nova agent down
                description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
                query: 'openstack_nova_agent_state{adminState="enabled"} == 0'
                severity: critical
                for: 2m
              - name: OpenStack Neutron agent down
                description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down"
                query: 'openstack_neutron_agent_state{adminState="up"} == 0'
                severity: critical
                for: 2m
              - name: OpenStack Cinder agent down
                description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
                query: 'openstack_cinder_agent_state{adminState="enabled"} == 0'
                severity: critical
                for: 2m
              # Hypervisor capacity: used / available ratios above 0.9. Each query is guarded by
              # "and <denominator> > 0" so only series with a positive denominator can fire.
              - name: OpenStack hypervisor high vCPU usage
                description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%"
                query: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0'
                severity: warning
                for: 5m
                comments: |
                  The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
              - name: OpenStack hypervisor high memory usage
                description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%"
                query: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0'
                severity: warning
                for: 5m
                comments: |
                  The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
              - name: OpenStack hypervisor high disk usage
                description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%"
                query: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0'
                severity: warning
                for: 5m
              # Per-tenant quota usage. The "<max> > 0" guard also filters out a max of -1,
              # which the comment on the vCPU rule describes as "unlimited quota".
              # These quota rules declare no "for" duration.
              - name: OpenStack Nova tenant vCPU quota nearly exhausted
                description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota"
                query: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0'
                severity: warning
                comments: |
                  A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
              - name: OpenStack Nova tenant memory quota nearly exhausted
                description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota"
                query: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0'
                severity: warning
              - name: OpenStack Nova tenant instance quota nearly exhausted
                description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota"
                query: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0'
                severity: warning
              - name: OpenStack Cinder tenant volume quota nearly exhausted
                description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota"
                query: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0'
                severity: warning
              # Cinder storage pools: free / total ratio below 0.1.
              - name: OpenStack Cinder pool low free capacity
                description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity"
                query: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0'
                severity: warning
                for: 5m
              # Neutron networking: the gauge-style rules fire on any non-zero value.
              - name: OpenStack Neutron floating IPs associated but not active
                description: "{{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state"
                query: 'openstack_neutron_floating_ips_associated_not_active > 0'
                severity: warning
                for: 5m
              - name: OpenStack Neutron routers not active
                description: "{{ $value }} Neutron routers are not in ACTIVE state"
                query: 'openstack_neutron_routers_not_active > 0'
                severity: warning
                for: 5m
              - name: OpenStack Neutron subnet IP pool exhaustion
                description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool"
                query: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0'
                severity: warning
              - name: OpenStack Neutron ports without IPs
                description: "{{ $value }} active ports have no IP addresses assigned"
                query: 'openstack_neutron_ports_no_ips > 0'
                severity: warning
                for: 5m
              # Load balancers: selects series whose operating_status label is not "ONLINE".
              - name: OpenStack load balancer not online
                description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}"
                query: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0'
                severity: warning
                for: 5m
              # The description reports the alert value as a number of instances, so the query
              # counts matching series instead of summing their sample values.
              # NOTE(review): openstack-exporter appears to export one series per server with the
              # status encoded as the sample value - confirm against the exporter's metric docs.
              - name: OpenStack Nova instances in ERROR state
                description: "{{ $value }} Nova instances are in ERROR state"
                query: 'count(openstack_nova_server_status{status="ERROR"}) > 0'
                severity: warning
                for: 5m
              # Matches every status label starting with "error" and fires on a non-zero value.
              - name: OpenStack Cinder volumes in error state
                description: "{{ $value }} Cinder volumes are in an error state"
                query: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0'
                severity: warning
                for: 5m
              # Usage is compared with total * allocation_ratio, i.e. the capacity after the
              # allocation ratio is applied; the guard requires a positive total.
              - name: OpenStack placement resource high usage
                description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation"
                query: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0'
                severity: warning
                for: 5m
                comments: |
                  This alert factors in the allocation ratio to compute effective capacity.
                  The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.

  - name: CI/CD
    services:
      - name: Jenkins
        exporters:
          - name: Metric plugin
            slug: metric-plugin
            doc_url: https://plugins.jenkins.io/prometheus/
            rules:
              # Node availability
              - name: Jenkins node offline
                description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
                query: "jenkins_node_offline_value > 0"
                severity: critical
                for: 5m
              - name: Jenkins no node online
                description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
                query: "jenkins_node_online_value == 0"
                severity: critical
              # Controller health and maintenance
              - name: Jenkins healthcheck
                description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
                query: "jenkins_health_check_score < 1"
                severity: critical
              - name: Jenkins outdated plugins
                description: "{{ $value }} plugins need update"
                query: "sum(jenkins_plugins_withUpdate) by (instance) > 3"
                severity: warning
                for: 1d
              # Build and job results
              - name: Jenkins builds health score
                description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
                query: "default_jenkins_builds_health_score < 1"
                severity: critical
              # jenkins_runs_failure_total is a counter (see the _total suffix), so its growth is
              # measured with increase(), which handles counter resets; delta() is meant for gauges.
              - name: Jenkins run failure total
                description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
                query: "increase(jenkins_runs_failure_total[1h]) > 100"
                severity: warning
              - name: Jenkins build tests failing
                description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
                query: "default_jenkins_builds_last_build_tests_failing > 0"
                severity: warning
              # Fires only on ordinal 2 (FAILURE); see the ordinal legend in the comments field.
              - name: Jenkins last build failed
                description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
                query: "default_jenkins_builds_last_build_result_ordinal == 2"
                severity: warning
                comments: |
                  * RUNNING  -1 true  - The build is still in progress.
                  * SUCCESS   0 true  - The build had no errors.
                  * UNSTABLE  1 true  - The build had some errors but they were not fatal. For example, some tests failed.
                  * FAILURE   2 false - The build had a fatal error.
                  * NOT_BUILT 3 false - The module was not built.
                  * ABORTED   4 false - The build was manually aborted.

      - name: ArgoCD
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://argo-cd.readthedocs.io/en/stable/operator-manual/metrics/
            rules:
              # Both rules select argocd_app_info series by label and must hold for 15m.
              # Selects applications whose sync_status label is anything other than "Synced".
              - name: ArgoCD service not synced
                description: Service {{ $labels.name }} run by argo is currently not in sync.
                query: 'argocd_app_info{sync_status!="Synced"} != 0'
                severity: warning
                for: 15m
              # Selects applications whose health_status label is anything other than "Healthy".
              - name: ArgoCD service unhealthy
                description: Service {{ $labels.name }} run by argo is currently not healthy.
                query: 'argocd_app_info{health_status!="Healthy"} != 0'
                severity: warning
                for: 15m

      - name: FluxCD
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://fluxcd.io/flux/monitoring/metrics/
            rules:
              # All rules select gotk_resource_info series with ready="False" for 15m and
              # differ only in the customresource_kind matcher.
              - name: Flux Kustomization Failure
                description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.
                query: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0'
                severity: warning
                for: 15m
              - name: Flux HelmRelease Failure
                description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.
                query: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0'
                severity: warning
                for: 15m
              # Source kinds: GitRepository, HelmRepository, Bucket, OCIRepository.
              - name: Flux Source Issue
                description: Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s).
                query: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0'
                severity: warning
                for: 15m
              # Image automation kinds: ImagePolicy, ImageRepository, ImageUpdateAutomation.
              - name: Flux Image Issue
                description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready.
                query: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0'
                severity: warning
                for: 15m

      - name: GitLab CI
        exporters:
          - name: GitLab built-in exporter
            slug: gitlab-built-in-exporter
            doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/
            rules:
              # Puma web server: queued connections, pool capacity and worker count
              - name: GitLab Puma high queued connections
                description: "GitLab Puma has {{ $value }} queued connections on {{ $labels.instance }}. Requests are waiting for an available worker thread."
                query: "puma_queued_connections > 5"
                severity: warning
                for: 5m
                comments: |
                  Queued connections indicate Puma workers are saturated.
                  Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb.
              # Pool capacity must stay at exactly 0 for the whole 5m "for" window to fire.
              - name: GitLab Puma no available pool capacity
                description: "GitLab Puma pool capacity on {{ $labels.instance }} has been at 0 for 5 minutes. All threads are busy."
                query: "puma_pool_capacity == 0"
                severity: critical
                for: 5m
              # Compares two series with matching labels: running workers vs. puma_workers.
              - name: GitLab Puma workers not running
                description: "GitLab Puma on {{ $labels.instance }} has {{ $value }} running workers out of expected total."
                query: "puma_running_workers < puma_workers"
                severity: warning
                for: 5m
              # HTTP request handling
              # Both queries aggregate by instance so the instance label referenced in the
              # descriptions survives aggregation (a bare sum() would drop every label).
              - name: GitLab high HTTP error rate
                description: "GitLab is returning more than 5% HTTP 5xx errors on {{ $labels.instance }}."
                query: 'sum by (instance) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (instance) (rate(http_requests_total[5m])) * 100 > 5 and sum by (instance) (rate(http_requests_total[5m])) > 0'
                severity: critical
                for: 5m
                comments: |
                  Threshold is 5% of all requests returning server errors.
                  Check GitLab logs at /var/log/gitlab/ for root cause.
              # "le" must stay in the grouping for histogram_quantile(); instance is kept alongside it.
              - name: GitLab high HTTP request latency
                description: "GitLab p95 HTTP request latency on {{ $labels.instance }} is above 10 seconds."
                query: "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10"
                severity: warning
                for: 5m
                comments: |
                  Threshold of 10s may need adjustment based on your instance size and workload.
              # Sidekiq background jobs
              # Aggregating queries keep the instance label because the descriptions reference it.
              - name: GitLab Sidekiq jobs failing
                description: "GitLab Sidekiq jobs are failing at a rate of {{ $value }} per second on {{ $labels.instance }}."
                query: "rate(sidekiq_jobs_failed_total[5m]) > 0.1"
                severity: warning
                for: 10m
                comments: |
                  This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
                  A sustained failure rate indicates background processing issues.
              # Running jobs are compared with 90% of the configured concurrency, per instance.
              - name: GitLab Sidekiq queue too large
                description: "GitLab Sidekiq has {{ $value }} running jobs, approaching concurrency limit on {{ $labels.instance }}."
                query: "sum by (instance) (sidekiq_running_jobs) >= sum by (instance) (sidekiq_concurrency) * 0.9"
                severity: warning
                for: 10m
                comments: |
                  When running jobs approach the concurrency limit, new jobs will queue up.
                  Consider scaling Sidekiq workers or increasing concurrency.
              # Evaluated per worker class and instance; both labels are shown in the description.
              - name: GitLab Sidekiq high job completion time
                description: "GitLab Sidekiq job p95 completion time for worker {{ $labels.worker }} on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }})."
                query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_completion_seconds_bucket[5m])) by (le, worker, instance)) > 300"
                severity: warning
                for: 10m
                comments: |
                  This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
              - name: GitLab Sidekiq high queue latency
                description: "GitLab Sidekiq jobs on {{ $labels.instance }} are waiting more than 60 seconds before being processed."
                query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_queue_duration_seconds_bucket[5m])) by (le, instance)) > 60"
                severity: warning
                for: 5m
                comments: |
                  This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
                  High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes.
              # Database connection pool: saturation, dead connections and waiting threads
              # busy / size expressed as a percentage; guarded so only pools with size > 0 can fire.
              - name: GitLab database connection pool saturation
                description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) is {{ $value }}% busy."
                query: "gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90 and gitlab_database_connection_pool_size > 0"
                severity: warning
                for: 5m
                comments: |
                  When the pool is near saturation, requests may block waiting for a connection.
                  Increase db_pool_size in gitlab.rb or investigate slow queries.
              # The two rules below fire on any non-zero value sustained for 5m.
              - name: GitLab database connection pool dead connections
                description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) has {{ $value }} dead connections."
                query: "gitlab_database_connection_pool_dead > 0"
                severity: warning
                for: 5m
              - name: GitLab database connection pool waiting
                description: "GitLab on {{ $labels.instance }} has {{ $value }} threads waiting for a database connection."
                query: "gitlab_database_connection_pool_waiting > 0"
                severity: warning
                for: 5m
              # CI/CD pipelines
              # The quantile is grouped by (le, instance) so the instance label used in the
              # description is still present on the alert.
              - name: GitLab CI pipeline creation slow
                description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds."
                query: "histogram_quantile(0.95, sum(rate(gitlab_ci_pipeline_creation_duration_seconds_bucket[5m])) by (le, instance)) > 30"
                severity: warning
                for: 5m
              # Any positive failure rate sustained for 10m fires this rule.
              - name: GitLab CI pipeline failures increasing
                description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s)."
                query: "rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0"
                severity: warning
                for: 10m
                comments: |
                  This metric may not exist in all GitLab versions. Verify against your GitLab installation.
              # More than 5 authentication failures within a 5m window.
              - name: GitLab CI runner authentication failures
                description: "GitLab CI runners are experiencing authentication failures on {{ $labels.instance }} ({{ $value }} failures)."
                query: "increase(gitlab_ci_runner_authentication_failure_total[5m]) > 5"
                severity: warning
                for: 5m
                comments: |
                  Frequent runner auth failures may indicate expired tokens or misconfigured runners.
              # Ruby process health
              # Both rules are restricted to jobs whose name contains "gitlab"; 2e+9 is a byte count.
              - name: GitLab high memory usage
                description: "GitLab process on {{ $labels.instance }} is using {{ $value | humanize1024 }}B of RSS memory."
                query: "process_resident_memory_bytes{job=~\".*gitlab.*\"} > 2e+9"
                severity: warning
                for: 10m
                comments: |
                  Threshold of 2GB may need adjustment based on your instance size.
                  High memory usage can lead to OOM kills and service disruptions.
              - name: GitLab Ruby heap fragmentation
                description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory."
                query: "ruby_gc_stat_ext_heap_fragmentation{job=~\".*gitlab.*\"} > 0.5"
                severity: warning
                for: 15m
                comments: |
                  Heap fragmentation above 50% means a significant amount of memory is wasted.
                  A Puma worker restart may help reclaim memory.
              # Uncaught errors: any positive error rate sustained for 5m
              - name: GitLab rack uncaught errors
                description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s)."
                query: "rate(rack_uncaught_errors_total[5m]) > 0"
                severity: warning
                for: 5m
              # Application version / deployment
              # Counts the distinct values of the version label on gitlab_build_info; more than one
              # means mixed versions. No "for" duration is declared on this rule.
              - name: GitLab version mismatch
                description: "Multiple GitLab versions are running across the fleet."
                query: 'count(count by (version) (gitlab_build_info)) > 1'
                severity: warning
                comments: |
                  This may happen during a rolling deployment. If it persists, investigate incomplete upgrades.
              # File descriptors: open / max as a percentage, guarded by process_max_fds > 0
              - name: GitLab high file descriptor usage
                description: "GitLab on {{ $labels.instance }} is using {{ $value }}% of available file descriptors."
                query: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80 and process_max_fds > 0'
                severity: warning
                for: 5m
              # Ruby threads: per-instance sum of running threads vs. 1.5x the expected maximum,
              # matched on the instance label only.
              - name: GitLab Ruby threads saturated
                description: "GitLab running threads on {{ $labels.instance }} have exceeded the expected maximum ({{ $value }})."
                query: "sum by (instance) (gitlab_ruby_threads_running_threads) > on(instance) gitlab_ruby_threads_max_expected_threads * 1.5"
                severity: warning
                for: 10m

          - name: Workhorse
            slug: workhorse
            doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/#gitlab-workhorse
            rules:
              # Error ratio is derived from the request-duration histogram's _count series, split by
              # the code label. Aggregation keeps instance because the description references it.
              - name: GitLab Workhorse high error rate
                description: "GitLab Workhorse on {{ $labels.instance }} is returning more than 10% HTTP 5xx errors."
                query: 'sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0'
                severity: critical
                for: 5m
                comments: |
                  Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying.
                  Threshold from GitLab Omnibus default rules: 10% for high-traffic instances.
              # Grouped by (le, instance): "le" is required by histogram_quantile(), instance by the description.
              - name: GitLab Workhorse high latency
                description: "GitLab Workhorse on {{ $labels.instance }} p95 request latency is above 10 seconds."
                query: "histogram_quantile(0.95, sum(rate(gitlab_workhorse_http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10"
                severity: warning
                for: 5m
              - name: GitLab Workhorse high in-flight requests
                description: "GitLab Workhorse on {{ $labels.instance }} has {{ $value }} in-flight requests."
                query: "gitlab_workhorse_http_in_flight_requests > 100"
                severity: warning
                for: 5m
                comments: |
                  Threshold of 100 may need adjustment based on instance size.

          - name: Gitaly
            slug: gitaly
            doc_url: https://docs.gitlab.com/administration/gitaly/monitoring/
            rules:
              # gRPC ratio rules: all selectors are pinned to job="gitaly". Aggregation keeps the
              # instance label because every description references it.
              - name: GitLab Gitaly high gRPC error rate
                description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors."
                query: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
                severity: warning
                for: 5m
                comments: |
                  grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise.
              - name: GitLab Gitaly resource exhausted
                description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%)."
                query: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
                severity: critical
                for: 5m
                comments: |
                  ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
                  concurrency limits. This directly impacts users trying to push, pull, or clone.
                  This alert is derived from the GitLab Omnibus default rules.
              # Only unary RPCs are considered (grpc_type="unary"); grouped by (le, instance).
              - name: GitLab Gitaly high RPC latency
                description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s)."
                query: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le, instance)) > 1'
                severity: warning
                for: 5m
              - name: GitLab Gitaly CPU throttled
                description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups."
                query: "rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0"
                severity: warning
                for: 5m
              # The two rules below declare no "for" duration and react to any increase within 5m.
              - name: GitLab Gitaly authentication failures
                description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }})."
                query: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0'
                severity: warning
              - name: GitLab Gitaly circuit breaker tripped
                description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing."
                query: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0'
                severity: critical
                comments: |
                  When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail.
                  Check Gitaly service health and logs.

      - name: Spinnaker
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://spinnaker.io/docs/setup/other_config/monitoring/
            rules:
              # Selects the series carrying state="open" and fires while its value is 1.
              - name: Spinnaker circuit breaker open
                description: "Circuit breaker {{ $labels.name }} is open on {{ $labels.instance }}, indicating repeated downstream failures."
                query: 'resilience4j_circuitbreaker_state{state="open"} == 1'
                severity: warning
                for: 5m
              # Orca work queue: depth, average message lag and dead-lettered messages
              - name: Spinnaker Orca queue backing up
                description: "Orca work queue has {{ $value }} messages ready for delivery but not yet picked up. Pipeline executions may be delayed."
                query: 'queue_ready_depth > 0'
                severity: warning
                for: 5m
                comments: |
                  In a healthy Spinnaker, queue_ready_depth should stay at or near 0.
                  Sustained non-zero values indicate Orca cannot keep up with incoming work.
              # Average lag over 5m: rate of the _sum series divided by rate of the _count series,
              # guarded so it only evaluates when the count rate is positive.
              - name: Spinnaker Orca queue message lag high
                description: "Orca queue message lag is {{ $value }}s. Pipeline stages are waiting too long before being processed."
                query: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30 and rate(queue_message_lag_seconds_count[5m]) > 0'
                severity: warning
                for: 5m
                comments: |
                  The 30s threshold is a rough default. Adjust based on your pipeline SLOs.
              # Any positive rate of dead-lettered messages sustained for 2m is critical.
              - name: Spinnaker dead messages
                description: "Orca is producing dead-lettered messages ({{ $value }} per second). These are tasks that exhausted all retries and will not be executed."
                query: 'rate(queue_dead_messages_total[5m]) > 0'
                severity: critical
                for: 2m
              # The query is a per-second rate over 5m, so the description reports the value as a
              # rate rather than as a number of executions.
              - name: Spinnaker zombie executions
                description: "Zombie pipeline executions detected ({{ $value }} per second). These are executions with no corresponding queue messages."
                query: 'rate(queue_zombies_total[5m]) > 0'
                severity: warning
                for: 5m
                comments: |
                  Zombies are pipeline executions that are running but have lost their queue entry.
                  See https://spinnaker.io/docs/guides/runbooks/orca-zombie-executions/
              # Fires on any non-zero threadpool_blockingQueueSize sustained for 5m.
              - name: Spinnaker thread pool exhaustion
                description: "Orca message handler thread pool has {{ $value }} blocked threads on {{ $labels.instance }}. Pipeline execution throughput is degraded."
                query: 'threadpool_blockingQueueSize > 0'
                severity: warning
                for: 5m
              # Polling monitors: grouped by (monitor, partition), the two labels used in the description.
              - name: Spinnaker polling monitor items over threshold
                description: "Igor polling monitor {{ $labels.monitor }} for {{ $labels.partition }} has exceeded its item threshold, preventing pipeline triggers."
                query: 'sum by (monitor, partition) (pollingMonitor_itemsOverThreshold) > 0'
                severity: critical
                for: 5m
                comments: |
                  When this threshold is exceeded, Igor stops triggering pipelines for the affected monitor.
                  See https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds
              - name: Spinnaker polling monitor failures
                description: "Igor polling monitor is experiencing failures ({{ $value }} per second). CI/SCM integrations may not trigger pipelines."
                query: 'rate(pollingMonitor_failed_total[5m]) > 0'
                severity: warning
                for: 5m
              # API error ratio per instance: share of controller invocations with status="5xx",
              # evaluated only when the instance has a positive invocation rate.
              - name: Spinnaker high API error rate
                description: "Spinnaker API 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}."
                query: 'sum by (instance) (rate(controller_invocations_total{status="5xx"}[5m])) / sum by (instance) (rate(controller_invocations_total[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total[5m])) > 0'
                severity: warning
                for: 5m
                comments: |
                  The 5% threshold is a rough default. Adjust based on your traffic patterns.
              - name: Spinnaker API rate limit throttling
                description: "Gate is actively throttling API requests on {{ $labels.instance }} ({{ $value }} throttled requests per second)."
                query: 'rate(rateLimitThrottling_total[5m]) > 0'
                severity: warning
                for: 2m
              # Same ratio as the API error rate rule, restricted to jobs whose name contains "clouddriver".
              - name: Spinnaker Clouddriver high error rate
                description: "Clouddriver 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}. Cloud operations may be failing."
                query: 'sum by (instance) (rate(controller_invocations_total{status="5xx", job=~".*clouddriver.*"}[5m])) / sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0'
                severity: warning
                for: 5m
              # Threshold is expressed in milliseconds, matching the "ms" unit in the description.
              - name: Spinnaker AWS rate limiting
                description: "Clouddriver is being rate-limited by AWS on {{ $labels.instance }} ({{ $value }}ms delay). Cloud operations will be slower."
                query: 'amazonClientProvider_rateLimitDelayMil > 1000'
                severity: warning
                for: 5m
                comments: |
                  This metric is specific to AWS cloud providers in Clouddriver.
                  The 1000ms threshold is a rough default. Adjust based on your AWS usage patterns.

  - name: Network and security
    services:
      - name: SpeedTest
        exporters:
          - name: Speedtest exporter
            slug: nlamirault-speedtest-exporter
            doc_url: https://github.com/nlamirault/speedtest_exporter
            rules:
              - name: SpeedTest Slow Internet Download
                description: Internet download speed is currently {{humanize $value}} Mbps.
                query: "avg_over_time(speedtest_download[10m]) < 100"
                severity: warning
              - name: SpeedTest Slow Internet Upload
                description: Internet upload speed is currently {{humanize $value}} Mbps.
                query: "avg_over_time(speedtest_upload[10m]) < 20"
                severity: warning

      - name: SSL/TLS
        exporters:
          - name: ssl_exporter
            slug: ribbybibby-ssl-exporter
            doc_url: https://github.com/ribbybibby/ssl_exporter
            rules:
              - name: SSL certificate probe failed
                description: Failed to fetch SSL information {{ $labels.instance }}
                query: ssl_probe_success == 0
                severity: critical
              # Fires when the OCSP responder reports an unknown status for the certificate
              # (the query matches ssl_ocsp_response_status == 2).
              - name: SSL certificate OCSP status unknown
                description: Failed to get the OCSP status {{ $labels.instance }}
                query: ssl_ocsp_response_status == 2
                severity: warning
              - name: SSL certificate revoked
                description: SSL certificate revoked {{ $labels.instance }}
                query: ssl_ocsp_response_status == 1
                severity: critical
              - name: SSL certificate expiry (< 7 days)
                description: "{{ $labels.instance }} Certificate is expiring in 7 days"
                query: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7
                severity: warning

      - name: cert-manager
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://cert-manager.io/docs/devops-tips/prometheus-metrics/
            rules:
              - name: Cert-Manager absent
                description: Cert-Manager has disappeared from Prometheus service discovery. New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.
                query: 'absent(up{job="cert-manager"})'
                severity: critical
                for: 10m
              - name: Cert-Manager certificate expiring soon
                description: The certificate {{ $labels.name }} is expiring in less than 21 days.
                query: 'avg by (exported_namespace, namespace, name) (certmanager_certificate_expiration_timestamp_seconds - time()) < (21 * 24 * 3600)'
                severity: warning
                for: 1h
                comments: |
                  Threshold of 21 days is a rough default. ACME certificates are typically renewed 30 days before expiry, so expiring within 21 days may indicate issuer misconfiguration.
              - name: Cert-Manager certificate not ready
                description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic."
                query: 'max by (name, exported_namespace, namespace, condition) (certmanager_certificate_ready_status{condition!="True"} == 1)'
                severity: critical
                for: 10m
              - name: Cert-Manager hitting ACME rate limits
                description: Cert-Manager is being rate-limited by the ACME provider. Certificate issuance and renewal may be blocked for up to a week.
                query: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0'
                severity: critical
                for: 5m
                comments: |
                  In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version.

      - name: Juniper
        exporters:
          - name: czerwonk/junos_exporter
            slug: czerwonk-junos-exporter
            doc_url: https://github.com/czerwonk/junos_exporter
            rules:
              - name: Juniper switch down
                description: The switch appears to be down
                query: junos_up == 0
                severity: critical
              - name: Juniper critical Bandwidth Usage 1GiB
                description: Interface is highly saturated. (> 0.90 Gbit/s)
                query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90"
                severity: critical
                for: 1m
              - name: Juniper warning Bandwidth Usage 1GiB
                description: Interface is getting saturated. (> 0.80 Gbit/s)
                query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80"
                severity: warning
                for: 1m

      - name: CoreDNS
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              - name: CoreDNS Panic Count
                description: Number of CoreDNS panics encountered
                query: "increase(coredns_panics_total[1m]) > 0"
                severity: critical

      - name: Freeswitch
        exporters:
          - name: znerol/prometheus-freeswitch-exporter
            slug: znerol-freeswitch-exporter
            doc_url: https://pypi.org/project/prometheus-freeswitch-exporter
            rules:
              - name: Freeswitch down
                description: Freeswitch is unresponsive
                query: "freeswitch_up == 0"
                severity: critical
              - name: Freeswitch Sessions Warning
                description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
                query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0"
                severity: warning
                for: 10m
              - name: Freeswitch Sessions Critical
                description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
                query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 90 and freeswitch_session_limit > 0"
                severity: critical
                for: 5m

      - name: Hashicorp Vault
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://github.com/hashicorp/vault/blob/master/website/content/docs/configuration/telemetry.mdx#prometheus
            rules:
              - name: Vault sealed
                description: "Vault instance is sealed on {{ $labels.instance }}"
                query: "vault_core_unsealed == 0"
                severity: critical
              - name: Vault too many pending tokens
                # avg() without a by-clause aggregates away every label, so the description
                # cannot reference $labels.instance. The value is a token count, not a percentage.
                description: 'Too many pending tokens: {{ $value | printf "%.2f" }}'
                query: "avg(vault_token_create_count - vault_token_store_count) > 0"
                severity: warning
                for: 5m
              - name: Vault too many infinity tokens
                # The value is the number of tokens with an infinite creation TTL, so it is
                # rendered as a plain count rather than a percentage.
                description: 'Too many infinity tokens {{ $labels.instance }}: {{ $value | printf "%.0f" }}'
                query: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3'
                severity: warning
                for: 5m
              - name: Vault cluster health
                # sum()/count() aggregates away all labels and yields a 0-1 ratio, so the
                # description renders it with humanizePercentage and does not reference an instance.
                # NOTE(review): if only one node of an HA cluster reports vault_core_active == 1,
                # this ratio is 1/N and may stay <= 0.5 on a healthy cluster - confirm the threshold.
                description: 'Vault cluster is not healthy: active node ratio is {{ $value | humanizePercentage }}'
                query: "sum(vault_core_active) / count(vault_core_active) <= 0.5"
                severity: critical

      - name: Keycloak
        exporters:
          - name: aerogear/keycloak-metrics-spi
            slug: aerogear-keycloak-metrics-spi
            doc_url: https://github.com/aerogear/keycloak-metrics-spi
            rules:
              - name: Keycloak high login failure rate
                description: "More than 5% of login attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)."
                query: '(sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])) / (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])))) * 100 > 5 and (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m]))) > 0'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 5% is a rough default. Adjust based on your user base and expected error rates.
                  A spike in failed logins may indicate a brute-force attack or misconfigured client.
              - name: Keycloak no successful logins
                description: "No successful logins in realm {{ $labels.realm }} for the last 15 minutes."
                query: 'sum by (realm) (rate(keycloak_logins_total[15m])) == 0 and (sum by (realm) (rate(keycloak_logins_total[15m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[15m]))) > 0'
                severity: critical
                for: 5m
                comments: Only fires when login attempts exist but none succeed — may indicate an authentication outage.
              - name: Keycloak high token refresh error rate
                description: "More than 10% of token refresh attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)."
                query: '(sum by (realm) (rate(keycloak_refresh_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_refresh_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_refresh_tokens_total[5m])) > 0'
                severity: warning
                for: 5m
                comments: Threshold of 10% is a rough default. High refresh token errors may indicate expired sessions or token store issues.
              - name: Keycloak high code-to-token exchange error rate
                description: "More than 10% of code-to-token exchanges are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)."
                query: '(sum by (realm) (rate(keycloak_code_to_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_code_to_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_code_to_tokens_total[5m])) > 0'
                severity: warning
                for: 5m
                comments: Threshold of 10% is a rough default. Code-to-token failures may indicate misconfigured OAuth clients or replay attacks.
              - name: Keycloak high registration failure rate
                description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)."
                query: '(sum by (realm) (rate(keycloak_registrations_errors_total[5m])) / sum by (realm) (rate(keycloak_registrations_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_registrations_total[5m])) > 0'
                severity: warning
                for: 5m
                comments: Threshold of 10% is a rough default.
              - name: Keycloak slow request response time
                description: "Keycloak {{ $labels.method }} requests are taking more than 2 seconds on average."
                query: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0'
                severity: warning
                for: 5m
                comments: |
                  keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default.

      - name: Cloudflare
        exporters:
          - name: lablabs/cloudflare-exporter
            slug: lablabs-cloudflare-exporter
            doc_url: https://github.com/lablabs/cloudflare-exporter
            rules:
              - name: Cloudflare http 4xx error rate
                description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})"
                query: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[15m])) > 0'
                severity: warning
              - name: Cloudflare http 5xx error rate
                description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})"
                query: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[5m])) > 0'
                severity: critical

      - name: SNMP
        exporters:
          - name: prometheus/snmp_exporter
            slug: snmp-exporter
            doc_url: https://github.com/prometheus/snmp_exporter
            comments: |
              These rules use standard IF-MIB and SNMPv2-MIB metrics. Metric names depend on your snmp.yml module configuration.
              Thresholds for bandwidth and error rates are rough defaults - adjust to your environment.
            rules:
              - name: SNMP target down
                description: "SNMP device {{ $labels.instance }} is unreachable."
                query: 'up{job=~"snmp.*"} == 0'
                severity: critical
                for: 5m
                comments: From the official snmp-mixin.
              - name: SNMP interface down
                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up."
                query: '(ifOperStatus{job=~"snmp.*"} == 2) and on(instance, job, ifIndex) (ifAdminStatus{job=~"snmp.*"} == 1)'
                severity: critical
                for: 2m
              - name: SNMP interface high inbound error rate
                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an inbound error rate above 5%."
                query: 'rate(ifInErrors{job=~"snmp.*"}[5m]) / (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0'
                severity: warning
                for: 5m
                comments: Threshold is a rough default. Adjust based on your network environment.
              - name: SNMP interface high outbound error rate
                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%."
                query: 'rate(ifOutErrors{job=~"snmp.*"}[5m]) / (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0'
                severity: warning
                for: 5m
                comments: Threshold is a rough default. Adjust based on your network environment.
              - name: SNMP interface high bandwidth usage inbound
                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%."
                query: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
                severity: warning
                for: 15m
                comments: |
                  Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
              - name: SNMP interface high bandwidth usage outbound
                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} outbound utilization is above 80%."
                query: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
                severity: warning
                for: 15m
                comments: |
                  Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
              - name: SNMP device restarted
                description: "SNMP device {{ $labels.instance }} has restarted (uptime < 5 minutes)."
                query: "sysUpTime / 100 < 300"
                severity: info
                comments: sysUpTime is in centiseconds (hundredths of a second).

      - name: Cilium
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://docs.cilium.io/en/stable/observability/metrics/
            rules:
              # Agent health
              - name: Cilium agent unreachable nodes
                description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health."
                query: "sum(cilium_unreachable_nodes{}) by (pod) > 0"
                severity: warning
                for: 15m
                comments: |
                  Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+).
              - name: Cilium agent unreachable health endpoints
                description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing."
                query: "sum(cilium_unreachable_health_endpoints{}) by (pod) > 0"
                severity: warning
                for: 15m
                comments: |
                  Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+).
              - name: Cilium agent failing controllers
                description: "Cilium agent {{ $labels.pod }} has {{ $value }} failing controller(s). Check cilium-agent logs for details."
                query: "sum(cilium_controllers_failing{}) by (pod) > 0"
                severity: warning
                for: 5m
                comments: |
                  Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+).
              # Endpoints
              - name: Cilium agent endpoint failures
                description: "Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state."
                query: 'sum(cilium_endpoint_state{endpoint_state="invalid"}) by (pod) > 0'
                severity: warning
                for: 5m
              - name: Cilium agent endpoint regeneration failures
                description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale."
                query: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0'
                severity: warning
                for: 5m
              - name: Cilium agent endpoint update failure
                description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }})."
                query: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0'
                severity: warning
                for: 5m
              - name: Cilium agent endpoint create failure
                description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking."
                query: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0'
                severity: info
                for: 5m
              # BPF maps
              - name: Cilium agent map operation failures
                description: "Cilium agent {{ $labels.pod }} has eBPF map operation failures on {{ $labels.map_name }}. Datapath may be degraded."
                query: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0'
                severity: warning
                for: 5m
              - name: Cilium agent BPF map pressure
                description: "Cilium agent {{ $labels.pod }} eBPF map {{ $labels.map_name }} is above 90% utilization. Map may become full."
                query: "cilium_bpf_map_pressure{} > 0.9"
                severity: warning
                for: 5m
                comments: Map pressure is a ratio from 0 to 1. At 1.0, the map is full and new entries will be dropped.
              # Conntrack and NAT
              - name: Cilium agent conntrack table full
                description: "Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks."
                query: 'sum(rate(cilium_drop_count_total{reason="CT: Map insertion failed"}[5m])) by (pod) > 0'
                severity: critical
                for: 5m
              - name: Cilium agent conntrack failed garbage collection
                description: "Cilium agent {{ $labels.pod }} conntrack garbage collection is failing. Stale entries may accumulate."
                query: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0'
                severity: warning
                for: 5m
              - name: Cilium agent NAT table full
                description: "Cilium agent {{ $labels.pod }} NAT table is full, causing masquerade failures. Increase NAT map size or investigate."
                query: 'sum(rate(cilium_drop_count_total{reason="No mapping for NAT masquerade"}[1m])) by (pod) > 0'
                severity: critical
                for: 5m
              # Packet drops
              - name: Cilium agent high denied rate
                description: "Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct."
                query: 'sum(rate(cilium_drop_count_total{reason="Policy denied"}[1m])) by (pod) > 0'
                severity: info
                for: 10m
                comments: Policy denials may be expected behavior. Investigate only if unexpected traffic is being blocked.
              - name: Cilium agent high drop rate
                description: "Cilium agent {{ $labels.pod }} is dropping packets for reason {{ $labels.reason }}. This indicates infrastructure issues."
                query: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0'
                severity: warning
                for: 5m
              # Policy
              - name: Cilium agent policy map pressure
                description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply."
                query: 'sum(cilium_bpf_map_pressure{map_name=~"cilium_policy_.*"}) by (pod) > 0.9'
                severity: warning
                for: 5m
              - name: Cilium agent policy import errors
                description: "Cilium agent {{ $labels.pod }} is failing to import network policies. Policy enforcement may be incomplete."
                query: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0'
                severity: warning
                for: 5m
              - name: Cilium agent policy implementation delay
                description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies."
                # histogram_quantile() must be fed the histogram's *_bucket series (the ones
                # carrying the `le` label); the bare metric name matches no series.
                query: "histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60"
                severity: warning
                for: 5m
                comments: Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity.
              # Identity
              - name: Cilium node-local high identity allocation
                description: "Cilium agent {{ $labels.pod }} node-local identity allocation is above 80%. Approaching the 65535 identity limit."
                query: '(sum(cilium_identity{type="node_local"}) by (pod) / (2^16-1)) > 0.8'
                severity: warning
                for: 5m
              - name: Cilium cluster high identity allocation
                description: "Cilium cluster-wide identity allocation is above 80%. Approaching the maximum identity limit."
                query: '(sum(cilium_identity{type="cluster_local"}) by () / (2^16-256)) > 0.8'
                severity: warning
                for: 5m
              # IPAM
              - name: Cilium operator exhausted IPAM IPs
                description: "Cilium operator has no available IPAM IPs. New pods will fail to schedule networking."
                query: 'sum(cilium_operator_ipam_ips{type="available"}) by () <= 0'
                severity: critical
                for: 5m
              - name: Cilium operator low available IPAM IPs
                description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion."
                query: 'sum(cilium_operator_ipam_ips{type!="available"}) by () / sum(cilium_operator_ipam_ips) by () > 0.9 and sum(cilium_operator_ipam_ips) by () > 0'
                severity: warning
                for: 5m
                comments: Threshold of 90% is a rough default. Adjust based on your pod churn rate and IP pool size.
              - name: Cilium operator IPAM interface creation failures
                description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted."
                query: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0'
                severity: warning
                for: 10m
                comments: |
                  Some Cilium versions may not have a status label on this metric. Verify against your Cilium version.
              # API and K8s client
              - name: Cilium agent API errors
                description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy."
                query: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0'
                severity: warning
                for: 5m
              - name: Cilium agent Kubernetes client errors
                description: "Cilium agent {{ $labels.pod }} is receiving errors from K8s API for endpoint {{ $labels.endpoint }} ({{ $labels.return_code }})."
                query: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0'
                severity: info
                for: 5m
              # ClusterMesh
              - name: Cilium ClusterMesh remote cluster not ready
                description: "Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}."
                query: "count(cilium_clustermesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0"
                severity: critical
                for: 5m
              - name: Cilium ClusterMesh remote cluster failing
                description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing."
                query: "sum(rate(cilium_clustermesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0"
                severity: critical
                for: 5m
              # KVStoreMesh
              - name: Cilium KVStoreMesh remote cluster not ready
                description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}."
                query: "count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0"
                severity: critical
                for: 5m
              - name: Cilium KVStoreMesh remote cluster failing
                description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures."
                query: "sum(rate(cilium_kvstoremesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0"
                severity: critical
                for: 5m
              - name: Cilium KVStoreMesh sync errors
                description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors."
                query: "sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0"
                severity: critical
                for: 5m
              # Hubble
              - name: Cilium Hubble lost events
                description: "Cilium Hubble on {{ $labels.pod }} is losing flow events. Observability data may be incomplete."
                query: "sum(rate(hubble_lost_events_total[5m])) by (pod) > 0"
                severity: warning
                for: 5m
              - name: Cilium Hubble high DNS error rate
                description: "Cilium Hubble on {{ $labels.pod }} is observing more than 10% DNS error responses."
                query: 'sum(rate(hubble_dns_responses_total{rcode!="No Error"}[5m])) by (pod) / sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0.1 and sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0'
                severity: warning
                for: 5m
                comments: Threshold of 10% is a rough default. Some DNS errors may be normal depending on your workload.

      # WireGuard alert rules. All three rules key off wireguard_latest_handshake_seconds,
      # which the queries treat as a Unix timestamp (compared against time()) with 0
      # meaning "no handshake yet".
      - name: WireGuard
        exporters:
          - name: MindFlavor/prometheus_wireguard_exporter
            slug: mindflavor-prometheus-wireguard-exporter
            doc_url: https://github.com/MindFlavor/prometheus_wireguard_exporter
            rules:
              # Last handshake is more than 300s old; peers with a zero timestamp are excluded.
              - name: WireGuard peer handshake too old
                description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has not had a handshake for over 5 minutes. The tunnel may be down."
                query: 'time() - wireguard_latest_handshake_seconds > 300 and wireguard_latest_handshake_seconds > 0'
                severity: warning
                for: 2m
                comments: |
                  The threshold of 300 seconds (5 minutes) is a rough default. WireGuard peers that are idle but reachable
                  typically re-handshake every 2 minutes. Adjust based on your keepalive interval.
                  The `> 0` guard excludes peers that have never completed a handshake (covered by a separate rule).
              # Complement of the rule above: handshake timestamp is exactly 0 for 5 minutes.
              - name: WireGuard peer handshake never established
                description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has never completed a handshake. Check peer configuration and network connectivity."
                query: 'wireguard_latest_handshake_seconds == 0'
                severity: critical
                for: 5m
                comments: |
                  This alert will fire for all offline mobile/laptop peers. Consider filtering by expected-online peers.
              # Combined sent+received byte rate over 15m is zero while the last handshake
              # is non-zero and younger than 300s.
              - name: WireGuard no traffic on peer
                description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has had no traffic for 15 minutes despite an active handshake."
                query: '(rate(wireguard_sent_bytes_total[15m]) + rate(wireguard_received_bytes_total[15m])) == 0 and wireguard_latest_handshake_seconds > 0 and (time() - wireguard_latest_handshake_seconds) < 300'
                severity: warning
                for: 15m
                comments: |
                  This alert fires when a peer has a recent handshake but zero traffic flow.
                  May indicate routing issues or a misconfigured allowed-ips.
                  Only useful if you expect continuous traffic on all peers.

  - name: Storage
    services:
      # Ceph alert rules: cluster health, monitor, OSD and placement-group (PG) checks.
      # NOTE(review): confirm every metric name below is exposed by the exporter linked in
      # doc_url; some (e.g. ceph_monitor_clock_skew_seconds) may come from a different exporter.
      - name: Ceph
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://docs.ceph.com/en/quincy/mgr/prometheus/
            rules:
              # Any non-zero health status fires immediately (no `for` delay).
              - name: Ceph State
                description: Ceph instance unhealthy
                query: "ceph_health_status != 0"
                severity: critical
              # abs() catches skew in either direction.
              - name: Ceph monitor clock skew
                description: Ceph monitor clock skew detected. Please check ntp and hardware clock settings
                query: "abs(ceph_monitor_clock_skew_seconds) > 0.2"
                severity: warning
                for: 2m
              - name: Ceph monitor low space
                description: Ceph monitor storage is low.
                query: "ceph_monitor_avail_percent < 10"
                severity: warning
                for: 2m
              - name: Ceph OSD Down
                description: Ceph Object Storage Daemon Down
                query: "ceph_osd_up == 0"
                severity: critical
              - name: Ceph high OSD latency
                description: "Ceph Object Storage Daemon latency is high. Please check that it isn't stuck in a weird state."
                query: "ceph_osd_perf_apply_latency_seconds > 5"
                severity: warning
                for: 1m
              - name: Ceph OSD low space
                description: Ceph Object Storage Daemon is running out of space. Please add more disks.
                query: "ceph_osd_utilization > 90"
                severity: warning
                for: 2m
              - name: Ceph OSD reweighted
                description: Ceph Object Storage Daemon takes too much time to resize.
                query: "ceph_osd_weight < 1"
                severity: warning
                for: 2m
              - name: Ceph PG down
                description: Some Ceph placement groups are down. Please ensure that all the data are available.
                query: "ceph_pg_down > 0"
                severity: critical
              - name: Ceph PG incomplete
                description: Some Ceph placement groups are incomplete. Please ensure that all the data are available.
                query: "ceph_pg_incomplete > 0"
                severity: critical
              - name: Ceph PG inconsistent
                description: Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.
                query: "ceph_pg_inconsistent > 0"
                severity: warning
              - name: Ceph PG activation long
                description: Some Ceph placement groups are taking too long to activate.
                query: "ceph_pg_activating > 0"
                severity: warning
                for: 2m
              - name: Ceph PG backfill full
                description: Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.
                query: "ceph_pg_backfill_toofull > 0"
                severity: warning
                for: 2m
              # Any gap between total and active PGs counts as unavailable.
              - name: Ceph PG unavailable
                description: Some Ceph placement groups are unavailable.
                query: "ceph_pg_total - ceph_pg_active > 0"
                severity: critical

      # ZFS alert rules, split by data source: node-exporter (zpool state) and
      # pdf/zfs_exporter (pool capacity, health, collector status).
      - name: ZFS
        exporters:
          - name: node-exporter
            slug: node-exporter
            doc_url: https://github.com/prometheus/node_exporter
            rules:
              # Fires for any state series other than "online" whose value is above 0.
              - name: ZFS offline pool
                description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}."
                query: 'node_zfs_zpool_state{state!="online"} > 0'
                severity: critical
                for: 1m
          - name: ZFS exporter
            slug: zfs_exporter
            doc_url: https://github.com/pdf/zfs_exporter
            rules:
              # Free space below 10% on writable pools. The read-only check is matched on
              # (instance, pool) so each pool is compared with its own readonly flag;
              # the final clause skips pools reporting a zero size.
              - name: ZFS pool out of space
                description: Disk is almost full (< 10% left)
                query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, pool) zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0"
                severity: warning
              # Any health value above 0 fires; the value-to-state mapping is listed in `comments`.
              - name: ZFS pool unhealthy
                description: ZFS pool state is {{ $value }}. See comments for more information.
                query: "zfs_pool_health > 0"
                severity: critical
                comments: |
                  0: ONLINE
                  1: DEGRADED
                  2: FAULTED
                  3: OFFLINE
                  4: UNAVAIL
                  5: REMOVED
                  6: SUSPENDED
              - name: ZFS collector failed
                description: ZFS collector for {{ $labels.instance }} has failed to collect information
                query: "zfs_scrape_collector_success != 1"
                severity: warning

      # OpenEBS alert rules.
      - name: OpenEBS
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              # Pool usage above 80% sustained for 2 minutes.
              - name: OpenEBS used pool capacity
                description: "OpenEBS pool is using more than 80% of its capacity"
                query: "openebs_used_pool_capacity_percent > 80"
                severity: warning
                for: 2m

      # MinIO alert rules: offline drives, offline nodes and low raw capacity.
      - name: Minio
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              - name: Minio cluster disk offline
                description: "Minio cluster disk is offline"
                query: "minio_cluster_drive_offline_total > 0"
                severity: critical
              # The query counts offline *nodes*; the rule name is kept unchanged so the
              # generated alert identifier stays stable.
              - name: Minio node disk offline
                description: "Minio cluster node is offline"
                query: "minio_cluster_nodes_offline_total > 0"
                severity: critical
              # Free raw capacity below 10%; the trailing clause skips series with zero total capacity.
              - name: Minio disk space usage
                description: "Minio available free space is low (< 10%)"
                query: "minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 and minio_cluster_capacity_raw_total_bytes > 0"
                severity: warning

  - name: Cloud providers
    services:
      # AWS CloudWatch alert rules. The first three rules watch the exporter itself
      # (scrape errors, scrape duration, API request volume); the rest alert on
      # forwarded AWS service metrics (EC2, RDS, SQS, ALB, Lambda).
      - name: AWS CloudWatch
        exporters:
          - name: prometheus/cloudwatch_exporter
            slug: prometheus-cloudwatch-exporter
            doc_url: https://github.com/prometheus/cloudwatch_exporter
            comments: |
              CloudWatch metrics are exported as aws_{namespace}_{metric_name}_{statistic} gauges.
              The rules below cover both exporter health and common AWS service alerts.
              Adjust thresholds and label filters to match your CloudWatch exporter configuration.
            rules:
              - name: CloudWatch exporter scrape error
                description: "CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API."
                query: "cloudwatch_exporter_scrape_error > 0"
                severity: warning
                for: 5m
              - name: CloudWatch exporter slow scrape
                description: "CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters."
                query: "cloudwatch_exporter_scrape_duration_seconds > 300"
                severity: warning
                for: 5m
              # rate() is per second; `* 60` converts it to requests per minute before
              # comparing against the 100/minute threshold.
              - name: CloudWatch API high request rate
                description: "CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs."
                query: "sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100"
                severity: warning
                comments: |
                  CloudWatch API calls cost money (~$0.01 per 1000 GetMetricData requests).
                  100 requests/minute ≈ $45/month. Adjust the threshold based on your budget.
              - name: AWS EC2 high CPU utilization
                description: "EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%)."
                query: "aws_ec2_cpuutilization_average > 90"
                severity: warning
                for: 15m
                comments: Requires EC2 CPUUtilization metric configured in the CloudWatch exporter.
              # Threshold is expressed in bytes (2000000000).
              - name: AWS RDS low free storage space
                description: "RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining)."
                query: "aws_rds_free_storage_space_average < 2000000000"
                severity: warning
                for: 5m
                comments: |
                  Requires RDS FreeStorageSpace metric. The threshold of 2GB is a rough default.
                  Adjust based on your database size.
              - name: AWS RDS high CPU utilization
                description: "RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%)."
                query: "aws_rds_cpuutilization_average > 90"
                severity: warning
                for: 15m
                comments: Requires RDS CPUUtilization metric configured in the CloudWatch exporter.
              - name: AWS RDS high database connections
                description: "RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections."
                query: "aws_rds_database_connections_average > 100"
                severity: warning
                for: 5m
                comments: |
                  The threshold depends on the RDS instance class. Adjust based on your
                  instance type's max_connections parameter.
              - name: AWS SQS queue messages visible
                description: "SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed."
                query: "aws_sqs_approximate_number_of_messages_visible_average > 1000"
                severity: warning
                for: 10m
                comments: |
                  Requires SQS ApproximateNumberOfMessagesVisible metric. The threshold of 1000
                  is a rough default. Adjust based on your expected queue depth.
              # Threshold is in seconds (3600); no `for` delay is configured.
              - name: AWS SQS message age too old
                description: "SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s)."
                query: "aws_sqs_approximate_age_of_oldest_message_maximum > 3600"
                severity: warning
                comments: Requires SQS ApproximateAgeOfOldestMessage metric.
              - name: AWS ALB unhealthy targets
                description: "ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}."
                query: "aws_applicationelb_unhealthy_host_count_average > 0"
                severity: critical
                for: 5m
                comments: Requires ApplicationELB UnHealthyHostCount metric.
              # 5xx count as a percentage of request count; the trailing clause requires a
              # non-zero request count so an idle load balancer does not alert.
              - name: AWS ALB high 5xx error rate
                description: "ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%)."
                query: "(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0"
                severity: critical
                for: 5m
                comments: Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics.
              - name: AWS ALB high target response time
                description: "ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s)."
                query: "aws_applicationelb_target_response_time_average > 2"
                severity: warning
                for: 5m
                comments: Requires ApplicationELB TargetResponseTime metric.
              # Errors as a percentage of invocations, guarded against zero invocations.
              - name: AWS Lambda high error rate
                description: "Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%)."
                query: "(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0"
                severity: warning
                for: 5m
                comments: Requires Lambda Errors and Invocations metrics.

      # Google Cloud Stackdriver alert rules. Every rule here watches the exporter's own
      # stackdriver_monitoring_* self-monitoring metrics, not forwarded GCP metrics.
      - name: Google Cloud Stackdriver
        exporters:
          - name: prometheus-community/stackdriver_exporter
            slug: stackdriver-exporter
            doc_url: https://github.com/prometheus-community/stackdriver_exporter
            comments: |
              Self-monitoring metrics use the stackdriver_monitoring_* prefix.
              All self-monitoring metrics include a project_id label.
            rules:
              - name: Stackdriver exporter scrape error
                description: "Stackdriver exporter failed to scrape metrics from Google Cloud Monitoring API for project {{ $labels.project_id }}."
                query: "stackdriver_monitoring_last_scrape_error > 0"
                severity: warning
                for: 5m
              - name: Stackdriver exporter slow scrape
                description: "Stackdriver exporter scrape for project {{ $labels.project_id }} is taking more than 5 minutes ({{ $value }}s)."
                query: "stackdriver_monitoring_last_scrape_duration_seconds > 300"
                severity: warning
                for: 5m
              # More than 5 additional scrape errors within a 15-minute window.
              - name: Stackdriver exporter scrape errors increasing
                description: "Stackdriver exporter has had {{ $value }} scrape errors in the last 15 minutes for project {{ $labels.project_id }}."
                query: "increase(stackdriver_monitoring_scrape_errors_total[15m]) > 5"
                severity: warning
              # rate() is per second; `* 60` converts it to calls per minute.
              - name: Stackdriver exporter high API calls
                description: "Stackdriver exporter is making {{ $value }} API calls per minute for project {{ $labels.project_id }}. This may hit Google Cloud Monitoring API quotas."
                query: "rate(stackdriver_monitoring_api_calls_total[5m]) * 60 > 100"
                severity: warning
              # Age of the last-scrape timestamp, in seconds, compared against 600s.
              - name: Stackdriver exporter scrape stale
                description: "Stackdriver exporter has not successfully scraped metrics for project {{ $labels.project_id }} in the last 10 minutes."
                query: "time() - stackdriver_monitoring_last_scrape_timestamp > 600"
                severity: warning

      # DigitalOcean alert rules: droplet, account, managed database, Kubernetes,
      # load balancer and floating-IP state, plus exporter errors and quota usage.
      - name: DigitalOcean
        exporters:
          - name: metalmatze/digitalocean_exporter
            slug: digitalocean-exporter
            doc_url: https://github.com/metalmatze/digitalocean_exporter
            rules:
              - name: DigitalOcean droplet down
                description: "DigitalOcean droplet {{ $labels.name }} ({{ $labels.id }}) in {{ $labels.region }} is not running."
                query: "digitalocean_droplet_up == 0"
                severity: critical
                for: 5m
              - name: DigitalOcean account not active
                description: "DigitalOcean account is not active. It may be suspended or locked."
                query: "digitalocean_account_active != 1"
                severity: critical
                for: 5m
              - name: DigitalOcean database down
                description: "DigitalOcean managed database {{ $labels.name }} ({{ $labels.engine }}) in {{ $labels.region }} is offline."
                query: "digitalocean_database_status == 0"
                severity: critical
                for: 2m
              - name: DigitalOcean Kubernetes cluster down
                description: "DigitalOcean Kubernetes cluster {{ $labels.name }} ({{ $labels.version }}) in {{ $labels.region }} is not running."
                query: "digitalocean_kubernetes_cluster_up == 0"
                severity: critical
                for: 5m
              - name: DigitalOcean load balancer down
                description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) is not active."
                query: "digitalocean_loadbalancer_status == 0"
                severity: critical
                for: 2m
              - name: DigitalOcean load balancer no backends
                description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) has no droplets attached."
                query: "digitalocean_loadbalancer_droplets == 0"
                severity: warning
                for: 1m
              - name: DigitalOcean floating IP not assigned
                description: "DigitalOcean floating IP {{ $labels.ipv4 }} in {{ $labels.region }} is not assigned to any droplet."
                query: "digitalocean_floating_ipv4_active == 0"
                severity: warning
              - name: DigitalOcean active incidents
                description: "DigitalOcean platform has {{ $value }} active incident(s)."
                query: "digitalocean_incidents_total > 0"
                severity: warning
              - name: DigitalOcean exporter collection errors
                description: "DigitalOcean exporter {{ $labels.collector }} collector has {{ $value }} errors."
                query: "increase(digitalocean_errors_total[5m]) > 0"
                severity: warning
                for: 5m
              # Droplets are counted per scrape target (instance, job) and matched to the
              # account limit on those same labels. A bare count() yields a label-less
              # series that can never match the labelled limit gauge, so the alert would
              # never fire. The trailing clause skips targets reporting a zero limit.
              - name: DigitalOcean droplet limit approaching
                description: "DigitalOcean account is using {{ $value }}% of its droplet quota."
                query: "(count by (instance, job) (digitalocean_droplet_up) / on (instance, job) digitalocean_account_droplet_limit) * 100 > 80 and on (instance, job) digitalocean_account_droplet_limit > 0"
                severity: warning
                comments: Fires when more than 80% of the account's droplet limit is in use.

      # Azure alert rules. All rules here watch the exporter's self-monitoring metrics
      # (azurerm_stats_* and azurerm_api_*), not forwarded Azure Monitor resource metrics.
      - name: Azure
        exporters:
          - name: webdevops/azure-metrics-exporter
            slug: azure-metrics-exporter
            doc_url: https://github.com/webdevops/azure-metrics-exporter
            comments: |
              The exporter uses azurerm_resource_metric as the default metric name for forwarded Azure Monitor metrics.
              The metric name can be customized via the name parameter in probe configuration.
              Self-monitoring metrics use the azurerm_stats_* and azurerm_api_* prefixes.
            rules:
              # Absolute count: more than 5 additional error-result requests in 15 minutes.
              - name: Azure exporter request errors
                description: "Azure metrics exporter on {{ $labels.instance }} has {{ $value }} API request errors in the last 15 minutes."
                query: 'increase(azurerm_stats_metric_requests{result="error"}[15m]) > 5'
                severity: warning
              # Relative rate: error requests as a percentage of all requests per instance,
              # guarded so instances with no requests in the window do not alert.
              - name: Azure exporter high error rate
                description: "Azure metrics exporter on {{ $labels.instance }} has an error rate above 10% ({{ $value }}%)."
                query: 'sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m])) / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10 and sum by (instance) (rate(azurerm_stats_metric_requests[5m])) > 0'
                severity: warning
                for: 5m
              # The gauge is treated as "calls remaining", so the alert fires when it drops low.
              - name: Azure API read rate limit approaching
                description: "Azure API read rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining)."
                query: 'azurerm_api_ratelimit{type="read"} < 100'
                severity: warning
                comments: |
                  Azure Resource Manager enforces rate limits per subscription.
                  The threshold of 100 remaining calls is a rough default. Adjust based on your
                  scrape interval and number of monitored resources.
              - name: Azure API write rate limit approaching
                description: "Azure API write rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining)."
                query: 'azurerm_api_ratelimit{type="write"} < 50'
                severity: warning
              - name: Azure exporter slow collection
                description: "Azure metrics exporter on {{ $labels.instance }} metric collection is taking more than 5 minutes ({{ $value }}s)."
                query: "azurerm_stats_metric_collecttime > 300"
                severity: warning
                for: 5m


  - name: Observability
    services:
      - name: Thanos
        exporters:
          # Thanos Compactor rules. Every selector restricts to jobs matching
          # ".*thanos-compact.*" and aggregates by job.
          - name: Thanos Compactor
            slug: thanos-compactor
            rules:
              # Sums `up` per job, so it counts the compactor targets currently up.
              - name: Thanos Compactor Multiple Running
                description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running."
                query: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1'
                severity: warning
                for: 5m
              - name: Thanos Compactor Halted
                description: "Thanos Compact {{$labels.job}} has failed to run and now is halted."
                query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
                severity: warning
                for: 5m
              # Failed compactions as a percentage of all compactions; the trailing clause
              # requires a non-zero compaction rate.
              - name: Thanos Compactor High Compaction Failures
                description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions."
                query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0'
                severity: warning
                for: 15m
              # Same shape as the rule above, applied to object-store bucket operations.
              - name: Thanos Compact Bucket High Operation Failures
                description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations."
                query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) > 0'
                severity: warning
                for: 15m
              # Seconds since the most recent successful upload, divided by 60 twice to get
              # hours, then compared against 24.
              - name: Thanos Compact Has Not Run
                description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours."
                query: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24'
                severity: warning

          # Thanos Query rules. Every selector restricts to jobs matching ".*thanos-query.*"
          # and aggregates by job. Ratio rules carry a trailing `and <denominator> > 0`
          # clause so they only evaluate when there is traffic.
          - name: Thanos Query
            slug: thanos-query
            rules:
              # 5xx responses as a percentage of all requests to the "query" handler.
              - name: Thanos Query Http Request Query Error Rate High
                description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.'
                query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m])) > 0'
                severity: critical
                for: 5m
              # Same as above for the "query_range" handler.
              - name: Thanos Query Http Request Query Range Error Rate High
                description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.'
                query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0'
                severity: critical
                for: 5m
              # Only the listed gRPC codes count as server-side errors.
              - name: Thanos Query Grpc Server Error Rate
                description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
                query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/  sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) > 0'
                severity: warning
                for: 5m
              # Any client call that did not finish with code "OK" counts as an error.
              - name: Thanos Query Grpc Client Error Rate
                description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests."
                query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0'
                severity: warning
                for: 5m
              - name: Thanos Query High D N S Failures
                description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints."
                query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0'
                severity: warning
                for: 15m
              # p99 latency from the histogram buckets; the traffic guard uses the `_count`
              # series, matching the range-query latency rule below.
              - name: Thanos Query Instant Latency High
                description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries."
                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
                severity: critical
                for: 10m
              - name: Thanos Query Range Latency High
                description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries."
                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
                severity: critical
                for: 10m
              # Fires when the 5m average of in-flight queries is within 1 of the gate maximum.
              - name: Thanos Query Overload
                description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support."
                query: "(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)"
                severity: warning
                for: 15m
          # Thanos Receiver rules. Every selector restricts to jobs matching
          # ".*thanos-receive.*"; most rules aggregate by job.
          - name: Thanos Receiver
            slug: thanos-receiver
            rules:
              # 5xx responses as a percentage of all requests to the "receive" handler,
              # guarded so it only evaluates when there is traffic.
              - name: Thanos Receive Http Request Error Rate High
                description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
                query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0'
                severity: critical
                for: 5m
              - name: Thanos Receive Http Request Latency High
                description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests."
                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
                severity: critical
                for: 10m
              # Replication error ratio compared with floor((replication_factor + 1) / 2) /
              # hashring_nodes, reported as a percentage. The percentage is the left operand
              # so {{$value}} matches the description, and the "replication factor > 1" gate
              # is joined with `on (job)` because the ratio only carries the job label; a
              # plain `and` against the per-target gauge would never match.
              - name: Thanos Receive High Replication Failures
                description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests."
                query: '((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100 and on (job) max by (job) (thanos_receive_replication_factor{job=~".*thanos-receive.*"}) > 1'
                severity: warning
                for: 5m
              - name: Thanos Receive High Forward Request Failures
                description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests."
                query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0'
                severity: info
                for: 5m
              # The value is a plain ratio (not multiplied by 100); any non-zero ratio fires.
              - name: Thanos Receive High Hashring File Refresh Failures
                description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed."
                query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0'
                severity: warning
                for: 15m
              # The per-job average differs from 1 as soon as any target reports an unsuccessful reload.
              - name: Thanos Receive Config Reload Failure
                description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations."
                query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
                severity: warning
                for: 5m
              # Produces a series for each (job, instance) with zero shipper uploads over 3h,
              # joined to that target's `up` series.
              - name: Thanos Receive No Upload
                description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage."
                query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
                severity: critical
                for: 3h
          # Rules for the Thanos sidecar; selectors match any job whose name contains "thanos-sidecar".
          - name: Thanos Sidecar
            slug: thanos-sidecar
            rules:
              # Per-instance rate of failed object-store bucket operations over 5m.
              - name: Thanos Sidecar Bucket Operations Failed
                description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s)."
                query: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0.05'
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: critical
                for: 5m
              # Sidecar reports prometheus_up == 0 while a non-zero prometheus_tsdb_data_replay_duration_seconds
              # exists for the same (namespace, pod).
              # NOTE(review): the join relies on `namespace` and `pod` labels being present on both metrics - confirm for non-Kubernetes setups.
              - name: Thanos Sidecar No Connection To Started Prometheus
                description: "Thanos Sidecar {{$labels.instance}} is unhealthy."
                query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
                severity: critical
                for: 5m
          # Rules for the Thanos store gateway; selectors match any job whose name contains "thanos-store".
          - name: Thanos Store
            slug: thanos-store
            rules:
              # gRPC requests handled with one of the listed error codes, as a percentage of started requests, per job over 5m.
              - name: Thanos Store Grpc Error Rate
                description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
                query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/  sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) > 0'
                severity: warning
                for: 5m
              # p99 of the series gate duration histogram over 5m, per job, above 2s; guarded by a non-zero observation rate.
              - name: Thanos Store Series Gate Latency High
                description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests."
                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
                severity: warning
                for: 10m
              # Failed bucket operations as a percentage of all bucket operations, per job over 5m.
              - name: Thanos Store Bucket High Operation Failures
                description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations."
                query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) > 0'
                severity: warning
                for: 15m
              # p99 of the bucket operation duration histogram over 5m, per job, above 2s; guarded by a non-zero observation rate.
              - name: Thanos Store Objstore Operation Latency High
                description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations."
                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and  sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
                severity: warning
                for: 10m
          - name: Thanos Ruler
            slug: thanos-ruler
            rules:
              # Any non-zero per-instance rate of alerts dropped from the alert queue over 5m.
              - name: Thanos Rule Queue Is Dropping Alerts
                description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s)."
                query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
                severity: critical
                for: 5m
              # Any non-zero per-instance rate of alerts dropped by the sender over 5m.
              - name: Thanos Rule Sender Is Failing Alerts
                description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s)."
                query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
                severity: critical
                for: 5m
              # Failed rule evaluations as a percentage of all evaluations, per instance over 5m.
              - name: Thanos Rule High Rule Evaluation Failures
                description: "Thanos Rule {{$labels.instance}} is failing to evaluate {{$value | humanize}}% of rules."
                query: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) > 0'
                severity: critical
                for: 5m
              # Per-instance rate of rule evaluations counted as "with warnings" over 5m.
              - name: Thanos Rule High Rule Evaluation Warnings
                description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s)."
                query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05'
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: info
                for: 15m
              # Compares each rule group's last evaluation duration with that group's evaluation interval.
              - name: Thanos Rule Rule Evaluation Latency High
                description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}."
                query: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
                severity: warning
                for: 5m
              # gRPC requests handled with one of the listed error codes, as a percentage of started requests, per instance over 5m.
              - name: Thanos Rule Grpc Error Rate
                description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
                query: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/  sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) > 0'
                severity: warning
                for: 5m
              # Matches whenever the per-instance average of the reload-success gauge is not exactly 1.
              - name: Thanos Rule Config Reload Failure
                description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration."
                query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
                severity: info
                for: 5m
              # Failed DNS lookups for query API endpoints as a percentage of all such lookups, per instance over 5m.
              - name: Thanos Rule Query High D N S Failures
                description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints."
                query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
                severity: warning
                for: 15m
              # Failed DNS lookups for Alertmanager endpoints as a percentage of all such lookups, per instance over 5m.
              - name: Thanos Rule Alertmanager High D N S Failures
                description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints."
                query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
                severity: warning
                for: 15m
              # Seconds since each rule group's last evaluation compared with 10x that group's interval.
              # Aggregation is by `rule_group`, the label carried by the prometheus_rule_group_* metrics (and
              # used by the evaluation-latency rule in this exporter). Grouping by a label the series do not
              # have would merge all groups of an instance, letting one healthy group hide a stalled one.
              - name: Thanos Rule No Evaluation For10 Intervals
                description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval."
                query: 'time() -  max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
                severity: info
                for: 5m
              # Zero (or negative) per-instance evaluation rate over 5m while the instance still reports loaded rules.
              - name: Thanos No Rule Evaluations
                description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes."
                query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
                severity: critical
                for: 5m
          - name: Thanos Bucket Replicate
            slug: thanos-bucket-replicate
            rules:
              # Replication runs with result="error" as a percentage of all runs, per job over 5m;
              # guarded by a non-zero run rate.
              - name: Thanos Bucket Replicate Error Rate
                description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed."
                query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0'
                severity: critical
                for: 5m
              # p99 of the replication run duration histogram over 5m, per job, above 20s.
              # `le` must be kept in the aggregation: histogram_quantile() needs the bucket boundaries and
              # yields no usable result once they are summed away.
              # The activity guard uses the histogram's _count series (one per job) rather than the buckets.
              - name: Thanos Bucket Replicate Run Latency
                description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations."
                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and  sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
                severity: critical
                for: 5m
          # One rule per Thanos component, all of the same shape: absent(up{job=~...} == 1) returns a sample
          # only when no target matching the job regex currently has up == 1, which covers both targets that
          # are not discovered at all and targets that are all failing their scrape.
          - name: Thanos Component Absent
            slug: thanos-component-absent
            rules:
              - name: Thanos Compact Is Down
                description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered."
                query: 'absent(up{job=~".*thanos-compact.*"} == 1)'
                severity: critical
                for: 5m
              - name: Thanos Query Is Down
                description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered."
                query: 'absent(up{job=~".*thanos-query.*"} == 1)'
                severity: critical
                for: 5m
              - name: Thanos Receive Is Down
                description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered."
                query: 'absent(up{job=~".*thanos-receive.*"} == 1)'
                severity: critical
                for: 5m
              - name: Thanos Rule Is Down
                description: "ThanosRule has disappeared. Prometheus target for the component cannot be discovered."
                query: 'absent(up{job=~".*thanos-rule.*"} == 1)'
                severity: critical
                for: 5m
              - name: Thanos Sidecar Is Down
                description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered."
                query: 'absent(up{job=~".*thanos-sidecar.*"} == 1)'
                severity: critical
                for: 5m
              - name: Thanos Store Is Down
                description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered."
                query: absent(up{job=~".*thanos-store.*"} == 1)
                severity: critical
                for: 5m

      - name: Loki
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              # More than 2 changes of process_start_time_seconds within 15m, for jobs whose name contains "loki".
              - name: Loki process too many restarts
                description: A loki process had too many restarts (target {{ $labels.instance }})
                query: changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2
                severity: warning
              # Requests with a 5xx status code as a percentage of all requests, per (namespace, job, route) over 1m;
              # guarded by a non-zero request rate.
              - name: Loki request errors
                description: 'The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf "%.2f" $value }}% errors.'
                query: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 and sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 0'
                severity: critical
                for: 15m
              # Any increase of loki_panic_total within 10m, per (namespace, job).
              # $value is the number of panics in that window (not a percentage), and the description says so.
              - name: Loki request panic
                description: The {{ $labels.job }} is experiencing {{ printf "%.2f" $value }} panic(s) over the last 10 minutes
                query: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
                severity: critical
                for: 5m
              # p99 request latency over 5m above 1s, excluding routes whose name contains "tail" (case-insensitive).
              # namespace, job and route are kept alongside `le` so that the labels used in the description are
              # populated and each route is evaluated on its own instead of being merged into one global quantile.
              - name: Loki request latency
                description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency
                query: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le)))  > 1
                severity: critical
                for: 5m
      - name: Promtail
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              # Requests whose status_code matches 5xx or "failed", as a percentage of all requests,
              # per (namespace, job, route, instance) over 1m; guarded by a non-zero request rate.
              - name: Promtail request errors
                description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
                query: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 and sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 0'
                severity: critical
                for: 5m
              # p99 request latency over 5m above 1s.
              # Grouped by the same labels as the Promtail request errors rule (plus `le`) so that the job and
              # route labels used in the description are populated and each instance is evaluated separately.
              - name: Promtail request latency
                description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
                query: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, instance, le)) > 1
                severity: critical
                for: 5m
      # Cortex rules based on the metrics Cortex exposes itself (no separate exporter).
      - name: Cortex
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              # Matches any series whose ruler reload-success gauge is not 1.
              - name: Cortex ruler configuration reload failure
                description: Cortex ruler configuration reload failure (instance {{ $labels.instance }})
                query: cortex_ruler_config_last_reload_successful != 1
                severity: warning
              # Fewer than one discovered Alertmanager.
              - name: Cortex not connected to Alertmanager
                description: Cortex not connected to Alertmanager (instance {{ $labels.instance }})
                query: cortex_prometheus_notifications_alertmanagers_discovered < 1
                severity: critical
              # Per-series rate of dropped notifications over 5m.
              - name: Cortex notification are being dropped
                description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s)."
                query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: critical
              # Per-series rate of notification send errors over 5m.
              - name: Cortex notification error
                description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s)."
                query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: critical
              # Any series reporting at least one ingester ring member in state "Unhealthy".
              - name: Cortex ingester unhealthy
                description: Cortex has an unhealthy ingester
                query: cortex_ring_members{state="Unhealthy", name="ingester"} > 0
                severity: critical
              # Non-zero total query-frontend queue length per job.
              - name: Cortex frontend queries stuck
                description: There are queued up queries in query-frontend.
                query: sum by (job) (cortex_query_frontend_queue_length) > 0
                severity: critical
                for: 5m

      - name: Grafana Tempo
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://grafana.com/docs/tempo/latest/operations/monitor/
            rules:
              # The three ring-health rules below share one shape: the largest "Unhealthy" member count
              # reported for the named ring, per job.
              - name: Tempo distributor unhealthy
                description: Tempo has {{ $value }} unhealthy distributor(s).
                query: max by (job) (tempo_ring_members{state="Unhealthy", name="distributor"}) > 0
                severity: warning
                for: 15m
              - name: Tempo live store unhealthy
                description: Tempo has {{ $value }} unhealthy live store(s).
                query: max by (job) (tempo_ring_members{state="Unhealthy", name="live-store"}) > 0
                severity: critical
                for: 15m
              - name: Tempo metrics generator unhealthy
                description: Tempo has {{ $value }} unhealthy metrics generator(s).
                query: max by (job) (tempo_ring_members{state="Unhealthy", name="metrics-generator"}) > 0
                severity: critical
                for: 15m
              # More than 2 compaction errors in the last hour and at least one in the last 5 minutes, per job.
              - name: Tempo compactions failing
                description: "{{ $value }} compactions have failed in the past hour."
                query: sum by (job) (increase(tempodb_compaction_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_compaction_errors_total[5m])) > 0
                severity: critical
                for: 1h
                comments: |
                  Uses a two-window approach: 1h for historical count and 5m to confirm the issue is ongoing.
              # Same two-window shape for blocklist poll errors: more than 2 in 1h and at least one in the last 5m.
              - name: Tempo polls failing
                description: "{{ $value }} blocklist polls have failed in the past hour."
                query: sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0
                severity: critical
              # Same two-window shape for tenant index errors: more than 2 in 1h and at least one in the last 5m.
              - name: Tempo tenant index failures
                description: "{{ $value }} tenant index failures in the past hour."
                query: sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0
                severity: critical
              # Per-tenant builder total is 0. `and on()` joins against a single global series, so the rule
              # only matches while the largest reported blocklist length is non-zero.
              - name: Tempo no tenant index builders
                description: No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale.
                query: sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0
                severity: critical
                for: 5m
              # Largest reported tenant index age per tenant, in seconds.
              - name: Tempo tenant index too old
                description: Tenant index for {{ $labels.tenant }} is {{ $value }}s old.
                query: max by (tenant) (tempodb_blocklist_tenant_index_age_seconds) > 600
                severity: critical
                for: 5m
                comments: |
                  Threshold of 600s (10 minutes). Adjust based on your tenant index build interval.
              # Relative change of the average blocklist length versus 7 days ago, in percent;
              # guarded so that a zero average 7 days ago produces no result.
              - name: Tempo block list rising quickly
                description: Tempo blocklist length is up {{ printf "%.0f" $value }}% over the last 7 days. Consider scaling compactors.
                query: (avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40 and avg(tempodb_blocklist_length offset 7d) > 0
                severity: critical
                for: 15m
                comments: |
                  Fires when the blocklist grows more than 40% over 7 days.
              # Number of series per job whose runtime-overrides reload-success gauge is 0.
              # count (not sum) is required: every sample that passes the `== 0` filter has value 0,
              # so their sum is always 0 and a `> 0` comparison on it could never match.
              - name: Tempo bad overrides
                description: '{{ $labels.job }} failed to reload runtime overrides.'
                query: count by (job) (tempo_runtime_config_last_reload_successful == 0) > 0
                severity: critical
                for: 15m
              # More than 5 failed reloads in the last hour and at least one in the last 5 minutes, per job.
              - name: Tempo user configurable overrides reload failing
                description: "{{ $value }} user-configurable overrides reloads have failed in the past hour."
                query: sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[5m])) > 0
                severity: critical
              # Outstanding compaction blocks summed per instance; warning tier.
              - name: Tempo compaction too many outstanding blocks warning
                description: There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.
                query: sum by (instance) (tempodb_compaction_outstanding_blocks) > 100
                severity: warning
                for: 6h
                comments: |
                  Threshold of 100 blocks per compactor instance. Adjust based on your environment.
              # Same expression with a higher threshold and a longer hold time; critical tier.
              - name: Tempo compaction too many outstanding blocks critical
                description: There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately.
                query: sum by (instance) (tempodb_compaction_outstanding_blocks) > 250
                severity: critical
                for: 24h
                comments: |
                  Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration.
              # Any non-zero usage tracker error rate over 5m, per (job, reason).
              - name: Tempo distributor usage tracker errors
                description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }})."
                query: sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0
                severity: critical
                for: 30m
              # Any failed active-processor update within 5m, per job.
              - name: Tempo metrics generator processor updates failing
                description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m)."
                query: sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0
                severity: critical
                for: 15m
              # Spans dropped by the service graphs processor as a percentage of spans received, per job over 5m;
              # guarded by a non-zero received-span rate.
              - name: Tempo metrics generator service graphs dropping spans
                description: Tempo metrics generator is dropping {{ printf "%.2f" $value }}% of spans in service graphs for {{ $labels.job }}.
                query: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
                severity: warning
                for: 15m
              # More than 2 failed registry collections within 5m, per job.
              - name: Tempo metrics generator collections failing
                description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m)."
                query: sum by (job) (increase(tempo_metrics_generator_registry_collections_failed_total[5m])) > 2
                severity: critical
                for: 5m
              # Memcached requests with status_code="500" as a percentage of all memcached requests,
              # per (name, job) over 5m; guarded by a non-zero request rate.
              - name: Tempo memcached errors elevated
                description: 'Tempo memcached error rate is {{ printf "%.2f" $value }}% for {{ $labels.name }} in {{ $labels.job }}.'
                query: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20 and sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 0'
                severity: warning
                for: 10m
                comments: |
                  Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching.

      - name: Grafana Mimir
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://grafana.com/docs/mimir/latest/manage/monitor-grafana-mimir/
            comments: |
              Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. This is intentional and expected.
            rules:
              # Core alerts
              # min by (job): matches only when every series in the job reports at least one
              # "Unhealthy" member in the ingester ring.
              - name: Mimir ingester unhealthy
                description: Mimir has {{ $value }} unhealthy ingester(s) in the ring.
                query: min by (job) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0
                severity: critical
                for: 15m
              # Requests with a 5xx status code as a percentage of all requests, per (job, route) over 5m,
              # excluding the ready and debug_pprof routes; guarded by a non-zero request rate.
              - name: Mimir request errors
                description: 'Mimir {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.'
                query: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1 and sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 0'
                severity: critical
                for: 15m
              # The inner count yields one series per (job, sha256); the outer count without(sha256) is
              # therefore the number of distinct config hashes seen per job.
              - name: Mimir inconsistent runtime config
                description: An inconsistent runtime config file is used across Mimir instances.
                query: count(count by (job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1
                severity: critical
                for: 1h
              # Number of series per job whose runtime-config reload-success gauge is 0.
              # count (not sum) is required: every sample that passes the `== 0` filter has value 0,
              # so their sum is always 0 and a `> 0` comparison on it could never match.
              - name: Mimir bad runtime config
                description: '{{ $labels.job }} failed to reload runtime config.'
                query: count by (job) (cortex_runtime_config_last_reload_successful == 0) > 0
                severity: critical
                for: 5m
              # Sums, per job, each series' minimum scheduler queue length over the last minute.
              - name: Mimir scheduler queries stuck
                description: There are {{ $value }} queued up queries in {{ $labels.job }}.
                query: sum by (job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0
                severity: critical
                for: 7m
              # Failed cache operations as a percentage of all cache operations, per (name, operation, job) over 5m;
              # guarded by a non-zero operation rate.
              - name: Mimir cache request errors
                description: 'Mimir cache {{ $labels.name }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation.'
                query: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5 and sum by (name, operation, job) (rate(thanos_cache_operations_total[5m])) > 0'
                severity: warning
                for: 5m
              # Ratio == 1: every KV request counted in the 5m window had a status code outside 2xx.
              - name: Mimir KV store failure
                description: 'Mimir {{ $labels.job }} KV store {{ $labels.kv_name }} is failing with 100% error rate.'
                query: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1 and sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m])) > 0'
                severity: critical
                for: 5m
              # process_memory_map_areas as a percentage of process_memory_map_areas_limit, for jobs whose name
              # contains one of the listed component names; series with a limit of 0 are excluded.
              - name: Mimir memory map areas too high
                description: 'Mimir {{ $labels.job }} is using {{ printf "%.0f" $value }}% of its memory map area limit.'
                query: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80 and process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} > 0'
                severity: critical
                for: 5m
              # Matches ingesters that hold no tenants now but held at least one at some point in the last day,
              # which leaves out instances that have never been assigned any tenant.
              # The look-back window has to be longer than `for`: comparing against a single sample from
              # exactly `for` ago would already read 0 by the time the hold period elapses, so the alert
              # could never leave the pending state.
              - name: Mimir ingester instance has no tenants
                description: Mimir ingester {{ $labels.instance }} has no tenants assigned.
                query: (cortex_ingester_memory_users == 0) and on (instance) (max_over_time(cortex_ingester_memory_users[1d]) > 0)
                severity: warning
                for: 1h
              # Matches rulers that manage no rule groups now but managed at least one at some point in the
              # last day, which leaves out instances that have never been assigned any rule group.
              # The look-back window has to be longer than `for`: comparing against a single sample from
              # exactly `for` ago would already read 0 by the time the hold period elapses, so the alert
              # could never leave the pending state.
              - name: Mimir ruler instance has no rule groups
                description: Mimir ruler {{ $labels.instance }} has no rule groups assigned.
                query: (cortex_ruler_managers_total == 0) and on (instance) (max_over_time(cortex_ruler_managers_total[1d]) > 0)
                severity: warning
                for: 1h
              # Head max timestamp minus the evaluation time, restricted (via `and`) to series whose head max
              # timestamp is positive; the largest difference per job must exceed 3600 seconds.
              - name: Mimir ingested data too far in the future
                description: Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future.
                query: max by (job) (cortex_ingester_tsdb_head_max_timestamp_seconds - time() and cortex_ingester_tsdb_head_max_timestamp_seconds > 0) > 3600
                severity: warning
                for: 5m
              # Per-job rate of failed object-store bucket operations over 5m.
              # NOTE(review): the selector has no job filter, so every job exposing this metric is included,
              # not only store-gateways - confirm this is intended.
              - name: Mimir store gateway too many failed operations
                description: Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s).
                query: sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0.05
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: warning
                for: 5m
              # Compares the largest and the smallest per-instance total of cortex_ring_members within each (name, job).
              - name: Mimir ring members mismatch
                description: Mimir {{ $labels.name }} ring has inconsistent member counts across instances.
                query: max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members))
                severity: warning
                for: 15m
              # Instance limits
              # In-memory series as a percentage of the instance's max_series limit; warning tier (> 80%).
              # The guard is joined with `and ignoring(limit)`: the division already strips the `limit` label
              # from its result, so a plain `and` against the limit series (which still carries
              # limit="max_series") would never find a matching label set.
              - name: Mimir ingester reaching series limit warning
                description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its series limit.'
                query: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 80) and ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} > 0'
                severity: warning
                for: 3h
              # In-memory series as a percentage of the instance's max_series limit; critical tier (> 90%).
              # The guard is joined with `and ignoring(limit)`: the division already strips the `limit` label
              # from its result, so a plain `and` against the limit series (which still carries
              # limit="max_series") would never find a matching label set.
              - name: Mimir ingester reaching series limit critical
                description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its series limit.'
                query: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 90) and ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} > 0'
                severity: critical
                for: 5m
              - name: Mimir ingester reaching tenants limit warning
                description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its tenants limit.'
                query: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 70) and cortex_ingester_instance_limits{limit="max_tenants"} > 0'
                severity: warning
                for: 5m
              - name: Mimir ingester reaching tenants limit critical
                description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its tenants limit.'
                query: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 80) and cortex_ingester_instance_limits{limit="max_tenants"} > 0'
                severity: critical
                for: 5m
              - name: Mimir reaching TCP connections limit
                description: 'Mimir instance {{ $labels.instance }} is using {{ printf "%.0f" $value }}% of its TCP connections limit.'
                query: cortex_tcp_connections / cortex_tcp_connections_limit * 100 > 80 and cortex_tcp_connections_limit > 0
                severity: critical
                for: 5m
              - name: Mimir distributor inflight requests high
                description: 'Mimir distributor {{ $labels.instance }} is using {{ printf "%.0f" $value }}% of its inflight push requests limit.'
                query: '(cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"} * 100 > 80) and cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0'
                severity: critical
                for: 5m
              # Blocks and TSDB
              - name: Mimir ingester TSDB head compaction failed
                description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s)."
                query: rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
                severity: critical
                for: 15m
              - name: Mimir ingester TSDB head truncation failed
                description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s)."
                query: rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0
                severity: critical
              - name: Mimir ingester TSDB checkpoint creation failed
                description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s)."
                query: rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0
                severity: critical
              - name: Mimir ingester TSDB checkpoint deletion failed
                description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s)."
                query: rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0
                severity: critical
              - name: Mimir ingester TSDB WAL truncation failed
                description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s)."
                query: rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0
                severity: warning
              - name: Mimir ingester TSDB WAL writes failed
                description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s)."
                query: rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0
                severity: critical
                for: 3m
              # The threshold in the query is 1800 seconds, so the description states 30 minutes.
              # The trailing `> 0` guard skips series whose last-sync timestamp is still zero.
              - name: Mimir store gateway has not synced bucket
                description: Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.
                query: (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0
                comments: |
                  Threshold aligned with official Mimir mixin (30 minutes).
                severity: critical
                for: 5m
              - name: Mimir store gateway no synced tenants
                description: Mimir store-gateway {{ $labels.instance }} has no synced tenants.
                query: (min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 1h > 0)
                severity: warning
                for: 1h
              - name: Mimir bucket index not updated
                description: 'Mimir bucket index for tenant {{ $labels.user }} has not been updated for more than 35 minutes.'
                query: min by (user, job) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100
                severity: critical
              # Compactor
              - name: Mimir compactor not cleaning up blocks
                description: Mimir compactor {{ $labels.instance }} has not cleaned up blocks in the last 6 hours.
                query: (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 21600) and cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0
                severity: critical
                for: 1h
              - name: Mimir compactor not running compaction
                description: Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.
                query: (time() - cortex_compactor_last_successful_run_timestamp_seconds > 86400) and cortex_compactor_last_successful_run_timestamp_seconds > 0
                severity: critical
                for: 15m
              - name: Mimir compactor has consecutive failures
                description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours."
                query: increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1
                severity: critical
              - name: Mimir compactor has run out of disk space
                description: Mimir compactor {{ $labels.instance }} has run out of disk space.
                query: increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1
                severity: critical
              - name: Mimir compactor has not uploaded blocks
                description: Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.
                query: (time() - thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 86400) and thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 0
                severity: critical
                for: 15m
              - name: Mimir compactor skipped blocks
                description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }})."
                query: increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0
                comments: |
                  Using 24h window per official mixin — compaction skips are rare events.
                severity: warning
                for: 5m
              # Ruler
              - name: Mimir ruler too many failed pushes
                description: 'Mimir ruler {{ $labels.instance }} is failing to push {{ printf "%.2f" $value }}% of write requests.'
                query: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0'
                severity: critical
                for: 5m
              - name: Mimir ruler too many failed queries
                description: 'Mimir ruler {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% of query evaluations.'
                query: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 0'
                severity: critical
                for: 5m
              - name: Mimir ruler missed evaluations
                description: 'Mimir ruler {{ $labels.instance }} is missing {{ printf "%.2f" $value }}% of rule group evaluations.'
                query: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1 and sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 0'
                severity: warning
                for: 5m
              - name: Mimir ruler failed ring check
                description: Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).
                query: sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: critical
                for: 5m
              # Alertmanager
              - name: Mimir alertmanager sync configs failing
                description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s)."
                query: rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0
                severity: critical
                for: 30m
              - name: Mimir alertmanager ring check failing
                description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s)."
                query: rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0
                severity: critical
                for: 10m
              - name: Mimir alertmanager state merge failing
                description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s)."
                query: rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0
                severity: critical
                for: 10m
              - name: Mimir alertmanager replication failing
                description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s)."
                query: rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0
                severity: critical
                for: 10m
              - name: Mimir alertmanager persist state failing
                description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s)."
                query: rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0
                severity: critical
                for: 1h
              - name: Mimir alertmanager initial sync failed
                description: Mimir alertmanager {{ $labels.job }} failed initial state sync.
                query: increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0
                severity: warning
              - name: Mimir alertmanager instance has no tenants
                description: Mimir alertmanager {{ $labels.instance }} has no tenants assigned.
                query: (cortex_alertmanager_tenants_owned == 0) and on (instance) (cortex_alertmanager_tenants_owned offset 1h > 0)
                severity: warning
                for: 1h
              # Gossip
              #
              # Both rules compare, per job, the average member count reported by all
              # instances against the extreme value reported by any single instance.
              # "Too high" fires when the maximum exceeds the average by more than 15% plus 10 members.
              - name: Mimir gossip members count too high
                description: Mimir gossip cluster has more members than expected.
                query: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 1.15 + 10 < max(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
                severity: warning
                for: 20m
              # "Too low" fires when the minimum reported count is below half of the average.
              - name: Mimir gossip members count too low
                description: Mimir gossip cluster has fewer members than expected.
                query: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 0.5 > min(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
                severity: warning
                for: 20m
              # Go runtime
              - name: Mimir go threads too high warning
                description: 'Mimir {{ $labels.instance }} has {{ $value }} Go threads.'
                query: 'go_threads{job=~".*(mimir|cortex).*"} > 5000'
                severity: warning
                for: 15m
                comments: |
                  A high number of Go threads may indicate a goroutine leak.
              - name: Mimir go threads too high critical
                description: 'Mimir {{ $labels.instance }} has {{ $value }} Go threads.'
                query: 'go_threads{job=~".*(mimir|cortex).*"} > 8000'
                severity: critical
                for: 15m

      - name: Grafana Alloy
        exporters:
          - slug: embedded-exporter
            rules:
              # Disappearance check: take the instances that reported alloy_build_info
              # two hours ago (`offset 2h`) and drop those that still report it now (`unless`).
              # What remains are instances whose series has vanished since then.
              - name: Grafana Alloy service down
                description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running."
                query: "count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)"
                severity: critical

      - name: OpenTelemetry Collector
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://opentelemetry.io/docs/collector/internal-telemetry/
            comments: |
              OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
              These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
              All collector internal metrics are prefixed with 'otelcol_'.
            rules:
              - name: OpenTelemetry Collector down
                description: OpenTelemetry Collector instance has disappeared or is not being scraped
                query: 'up{job=~".*otel.*collector.*"} == 0'
                severity: critical
                for: 1m
              - name: OpenTelemetry Collector receiver refused spans
                description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}."
                query: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
                severity: critical
                for: 5m
              - name: OpenTelemetry Collector receiver refused metric points
                description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}."
                query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
                severity: critical
                for: 5m
              - name: OpenTelemetry Collector receiver refused log records
                description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}."
                query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
                severity: critical
                for: 5m
              - name: OpenTelemetry Collector exporter failed spans
                description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}."
                query: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05'
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: warning
                for: 5m
              - name: OpenTelemetry Collector exporter failed metric points
                description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}."
                query: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05'
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: warning
                for: 5m
              - name: OpenTelemetry Collector exporter failed log records
                description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}."
                query: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05'
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: warning
                for: 5m
              # The ratio is computed with `on(instance, job, exporter)`, so its result carries
              # only those three labels. The capacity guard uses the same `on(...)` clause:
              # a bare `and` compares full label sets and would never match if the capacity
              # series has any additional label.
              - name: OpenTelemetry Collector exporter queue nearly full
                description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full"
                query: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) (otelcol_exporter_queue_capacity > 0)'
                severity: warning
              - name: OpenTelemetry Collector processor refused spans
                description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure."
                query: 'rate(otelcol_processor_refused_spans[5m]) > 0.05'
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: warning
                for: 5m
              - name: OpenTelemetry Collector processor refused metric points
                description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure."
                query: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05'
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: warning
                for: 5m
              - name: OpenTelemetry Collector high memory usage
                description: "OpenTelemetry Collector memory usage is above 90%"
                query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
                severity: warning
                for: 5m
              - name: OpenTelemetry Collector OTLP receiver errors
                description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused"
                query: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0'
                severity: critical
                for: 2m

      - name: Jaeger
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://www.jaegertracing.io/docs/latest/monitoring/
            rules:
              - name: Jaeger agent HTTP server errors
                description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors."
                query: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              - name: Jaeger client RPC request errors
                description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors."
                query: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              - name: Jaeger client spans dropped
                description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans."
                query: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              - name: Jaeger agent spans dropped
                description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches."
                query: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              - name: Jaeger collector dropping spans
                description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans."
                query: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              - name: Jaeger sampling update failing
                description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates."
                query: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              - name: Jaeger throttling update failing
                description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates."
                query: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              - name: Jaeger query request failures
                description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests."
                query: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m

  - name: Other
    services:
      - name: APC UPS
        exporters:
          - name: mdlayher/apcupsd_exporter
            slug: apcupsd_exporter
            doc_url: https://github.com/mdlayher/apcupsd_exporter
            rules:
              - name: APC UPS Battery nearly empty
                description: Battery is almost empty (< 10% left)
                query: "apcupsd_battery_charge_percent < 10"
                severity: critical
              - name: APC UPS Less than 15 Minutes of battery time remaining
                description: Battery is almost empty (< 15 Minutes remaining)
                query: "apcupsd_battery_time_left_seconds < 900"
                severity: critical
              - name: APC UPS AC input outage
                description: UPS now running on battery (since {{$value | humanizeDuration}})
                query: "apcupsd_battery_time_on_seconds > 0"
                severity: warning
              - name: APC UPS low battery voltage
                description: Battery voltage is lower than nominal (< 95%)
                query: "(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95"
                severity: warning
              - name: APC UPS high temperature
                description: Internal temperature is high ({{$value}}°C)
                query: "apcupsd_internal_temperature_celsius >= 40"
                severity: warning
                for: 2m
              - name: APC UPS high load
                description: UPS load is > 80%
                query: "apcupsd_ups_load_percent > 80"
                severity: warning

      - name: Graph Node
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              - name: Provider failed because net_version failed
                description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
                query: "eth_rpc_status == 1"
                severity: critical
              - name: Provider failed because get genesis failed
                description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
                query: "eth_rpc_status == 2"
                severity: critical
              - name: Provider failed because net_version timeout
                description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
                query: "eth_rpc_status == 3"
                severity: critical
              - name: Provider failed because get genesis timeout
                description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
                query: "eth_rpc_status == 4"
                severity: critical
              - name: Store connection slow
                description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
                query: "store_connection_wait_time_ms > 10"
                severity: warning
              - name: Store connection very slow
                description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
                query: "store_connection_wait_time_ms > 20"
                severity: critical


================================================
FILE: _layouts/default.html
================================================
<!DOCTYPE html>
<html lang="{{ site.lang | default: "en-US" }}">

<head>
  <meta charset="UTF-8">
  {% seo %}
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="theme-color" content="#157878">
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
  <link rel="stylesheet" href="{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}">
  <link rel="stylesheet" href="{{ '/assets/css/app.css?v=' | append: site.github.build_revision | relative_url }}">
  <link rel="icon" type="image/x-icon" href="{{ '/assets/favicon.ico' | relative_url }}">

  <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.4/clipboard.min.js"></script>
  <script src="{{ '/assets/js/app.js?v=' | append: site.github.build_revision | relative_url }}"></script>

  <!-- Global site tag (gtag.js) - Google Analytics -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=UA-118604063-2"></script>
  <script>
    window.dataLayer = window.dataLayer || [];

    function gtag() {
      dataLayer.push(arguments);
    }
    gtag('js', new Date());

    gtag('config', 'UA-118604063-2');
  </script>

</head>

<body>
  <style>
    /* Skip link: kept off-screen until it receives keyboard focus. */
    #skip-to-content {
      height: 1px;
      width: 1px;
      position: absolute;
      overflow: hidden;
      top: -10px;
    }

    /* Flat selector and literal colour on purpose: this is a plain CSS <style>
       element, so Sass-only syntax such as `$variables` is never compiled and
       the browser would discard those declarations. */
    #skip-to-content:focus {
      position: fixed;
      top: 10px;
      left: 10px;
      height: auto;
      width: auto;
      /* #e19447 is the inverse of #1e6bb8, assumed to be the theme's
         $body-link-color. TODO confirm against the theme variables. */
      background: #e19447;
      outline: thick solid #e19447;
    }

    /* Row of GitHub badges and share icons in the page header. */
    ul.github-buttons-cta li {
      display: inline-block;
      height: 20px;
      padding: 0px 15px;
    }

    ul.github-buttons-cta li a {
      /* width: 100px; */
      text-decoration: none;
    }

    /* Font Awesome share icons. */
    .fa {
      /* padding: 14px;
      width: 50px;
      height: 50px; */
      font-size: 25px;
      text-align: center;
      text-decoration: none;
      border-radius: 50%;
    }

    .fa:hover {
      opacity: 0.8;
    }

    .fa-twitter,
    .fa-linkedin {
      /* background: #55ACEE; */
      color: white;
    }
  </style>
  <a id="skip-to-content" href="#content">Skip to the content.</a>

  <header class="page-header" role="banner">
    <h1 class="project-name">
      <a href="{{ '/' | relative_url }}" style="color: white">
        {{ site.title | default: site.github.repository_name }}
      </a>
    </h1>
    <h2 class="project-tagline">{{ site.description | default: site.github.project_tagline }}</h2>
    <a href="{{ '/alertmanager' | relative_url  }}" class="btn">Global configuration</a>
    <a href="{{ '/rules' | relative_url }}" class="btn">Rules</a>
    <a href="{{ '/sleep-peacefully' | relative_url }}" class="btn">Sleep peacefully</a>
    <a href="{{ '/blackbox-exporter' | relative_url }}" class="btn">Blackbox</a>
    <a href="https://github.com/samber/awesome-prometheus-alerts/blob/master/CONTRIBUTING.md" class="btn">
      Contribute on GitHub
    </a>

    <ul class="github-buttons-cta">
      <li>
        <a href="https://github.com/samber/awesome-prometheus-alerts">
          <img alt="GitHub Repo Watchers" src="https://img.shields.io/github/watchers/samber/awesome-prometheus-alerts?style=social">
        </a>
      </li>
      <li>
        <a href="https://github.com/samber/awesome-prometheus-alerts">
          <img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/samber/awesome-prometheus-alerts?style=social">
        </a>
      </li>
      <li>
        <a href="https://github.com/samber/awesome-prometheus-alerts">
          <img alt="GitHub Repo forks" src="https://img.shields.io/github/forks/samber/awesome-prometheus-alerts?style=social">
        </a>
      </li>
      <!-- Share links: icon-only anchors, so each one needs an aria-label to have an
           accessible name. rel="noopener" stops the opened tab from reaching window.opener. -->
      <li>
        <a href="https://twitter.com/share?via=samuelberthe&related=samuelberthe&text=🚨 📊 Here is a collection of Awesome Prometheus Alerts&url=https://samber.github.io/awesome-prometheus-alerts"
          class="fa fa-twitter" target="_blank" rel="noopener" aria-label="Share on Twitter"></a>
      </li>
      <li>
        <a href="https://www.linkedin.com/shareArticle?mini=true&url=https://samber.github.io/awesome-prometheus-alerts/"
          class="fa fa-linkedin" target="_blank" rel="noopener" aria-label="Share on LinkedIn"></a>
      </li>
    </ul>


    <!-- Sponsor logos. Image paths go through `relative_url`, like the other assets in
         this layout, so they resolve from any page depth. `alt` names each sponsor
         for screen readers and for when the image fails to load. -->
    <ul id="sponsoring">
      <li>
        Kindly supported by&nbsp; 👉
      </li>
      <li>
        <a href="https://cast.ai/samuel">
          <img alt="Cast AI" src="{{ '/assets/sponsor-cast-ai.png' | relative_url }}" />
        </a>
      </li>
      <li>
        <a href="https://betterstack.com/">
          <img alt="Better Stack" src="{{ '/assets/sponsor-betterstack.png' | relative_url }}" />
        </a>
      </li>
    </ul>
  </header>

  <main id="content" class="main-content" role="main">
    {{ content }}

    <footer class="site-footer">
      {% if site.github.is_project_page %}
        <span class="site-footer-owner">
          <a href="{{ site.github.repository_url }}">{{ site.title }}</a> is maintained by
          <a href="{{ site.github.owner_url }}">{{ site.github.owner_name }}</a>.
        </span>
      {% endif %}
    </footer>
  </main>

</body>

</html>


================================================
FILE: alertmanager.md
================================================
<h1 style="text-align: center;">
  Global configuration
</h1>

If you notice a delay between an event and the first notification, read the following blog post => [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).

## Prometheus configuration

{% highlight yaml %}
# prometheus.yml

global:
  scrape_interval: 20s

  # A short evaluation_interval will check alerting rules very often.
  # It can be costly if you run Prometheus with 100+ alerts.
  evaluation_interval: 20s
  ...

rule_files:
  - 'alerts/*.yml'

scrape_configs:
  ...

{% endhighlight %}

{% highlight yaml %}
# alerts/example-redis.yml

groups:

- name: ExampleRedisGroup
  rules:
  - alert: ExampleRedisDown
    expr: redis_up{} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Redis instance down"
      description: "Whatever"

{% endhighlight %}

## AlertManager configuration

{% highlight yaml %}
{% raw %}
# alertmanager.yml

route:
  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first
  # notification.
  group_wait: 10s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 30s

  # If a notification has already been sent successfully for an alert, wait
  # 'repeat_interval' before sending it again.
  repeat_interval: 30m

  # A default receiver
  receiver: "slack"

  # All the above attributes are inherited by all child routes and can be
  # overwritten on each.
  routes:
    - receiver: "slack"
      group_wait: 10s
      match_re:
        severity: critical|warning
      continue: true

    - receiver: "pager"
      group_wait: 10s
      match_re:
        severity: critical
      continue: true

receivers:
  - name: "slack"
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXX/xxxxxxxxxxxxxxxxxxxxxxxxxxx'
        send_resolved: true
        channel: 'monitoring'
        text: "{{ range .Alerts }}<!channel> {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"

  - name: "pager"
    webhook_configs:
      - url: http://a.b.c.d:8080/send/sms
        send_resolved: true

{% endraw %}
{% endhighlight %}

## Reduce Prometheus server load

For expensive or frequently evaluated PromQL queries, Prometheus lets you precompute the result with recording rules.

{% highlight yaml %}
{% raw %}
groups:

  # first define the recording rule
  - name: ExampleRecordedGroup
    rules:
    - record: job:rabbitmq_queue_messages_delivered_total:rate:5m
      expr: rate(rabbitmq_queue_messages_delivered_total[5m])

  # then use it in alerts
  - name: ExampleAlertingGroup
    rules:
    - alert: ExampleRabbitmqLowMessageDelivery
      expr: sum(job:rabbitmq_queue_messages_delivered_total:rate:5m) < 10
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "Low delivery rate in Rabbitmq queues"
{% endraw %}
{% endhighlight %}

## Troubleshooting

If the notification takes too much time to be triggered, check the following delays:
- `scrape_interval = 20s` (prometheus.yml)
- `evaluation_interval = 20s` (prometheus.yml)
- `increase(mysql_global_status_slow_queries[1m]) > 0` (alerts/example-mysql.yml)
- `for: 5m` (alerts/example-mysql.yml)
- `group_wait = 10s` (alertmanager.yml)

Also read:
- [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
- [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/)
- [https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/](https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/)


================================================
FILE: assets/css/app.css
================================================
a.anchor {
    font-size: 15px;
    vertical-align: middle;
    color: darkblue;
    display: inline-block;
    padding-bottom: 5px;
    margin-right: 5px;
    opacity: 0;
    transition: opacity 0.4s;
}

h2:hover a.anchor,
h3:hover a.anchor,
h4:hover a.anchor {
    opacity: 1;
}

summary {
    position: relative;
    padding-left: 60px;
    padding-right: 50px;
    margin-bottom: 15px;
    font-size: 15px;
}

h2 {
    position: relative;
}

.clipboard-single,
.clipboard-multiple {
    right: 0;
    position: absolute;
    cursor: pointer;
    font-size: 14px;
    color: #606c71;
}

/* NAVBAR */
#rules-navbar.affix {
    /* shown by JS */
    display: none;

    position: fixed;
    overflow: auto;
    top: 0;
    right: 0;
    max-width: 250px;
    max-height: 100%;
    padding-top: 20px;
    padding-bottom: 20px;
    padding-left: 20px;
    padding-right: 10px;

    background-color: #f3f6fa;
}

/* hide menu on small screens */
@media screen and (max-width: 1350px) {
    #rules-navbar.affix {
        display: none !important;
    }
}

/* hide menu scrollbar */
#rules-navbar.affix::-webkit-scrollbar {
    display: none;
}

#rules-navbar.affix {
    -ms-overflow-style: none;
    /* IE and Edge */
    scrollbar-width: none;
    /* Firefox */
}

#rules-navbar.affix h3 {
    margin-bottom: 10px;
}

#rules-navbar.affix h4 {
    margin: 0;
    font-weight: bold;
    font-size: 14px;
    line-height: 14px;
}

#rules-navbar.affix ul,
#rules-navbar.affix ul li {
    margin: 0;
    padding-top: 0;
    padding-bottom: 0;
    line-height: normal;
}

#rules-navbar.affix>ul {
    padding-left: 0;
    padding-right: 0;
}

#rules-navbar.affix>ul>li {
    margin-bottom: 10px;
    padding-left: 0;
    padding-right: 0;
}

#rules-navbar.affix a {
    font-size: 14px;
    line-height: 14px;
}

/* https://github.com/samber/awesome-prometheus-alerts/issues/356 */
@media screen and (min-width: 64em) {
    .main-content {
        max-width: 85rem;
    }
}

ul#sponsoring {
    display: flex;
    align-items: center;
    justify-content: center;
    margin-top: 50px;
}

ul#sponsoring li {
    display: flex;
    padding: 0px 15px;
    font-size: 16px;
}

ul#sponsoring li a {
    display: flex;
}

ul#sponsoring li a img {
    max-width: 180px;
    max-height: 40px;
}

.page-header {
    padding-bottom: 30px;
}

@media (prefers-color-scheme: dark) {

    #rules-navbar.affix {
        background-color: #2b2b2b;
    }

    /*********************** style.css overrides ******************************/
    /* This should *probably* be its own theme instead. */

    /* Page-wide text and background colors. */
    body {
        background-color: #242424;
        color: #a3b0b6;
    }

    /* Header banner: solid background color plus a 120deg gradient image. */
    .page-header {
        text-align: center;
        color: #fff;
        background-color: #006128;
        background-image: linear-gradient(120deg, #002968, #003c04);
    }

    /* Separator line above the footer. */
    .site-footer {
        border-top: 1px solid #525354;
    }

    /* Horizontal rules; !important is kept from the original declaration. */
    hr {
        background-color: #525354 !important;
    }

    /* Link color. */
    a {
        color: #3d86d6;
    }

    /* All heading levels inside the main content share one color. */
    .main-content h1,
    .main-content h2,
    .main-content h3,
    .main-content h4,
    .main-content h5,
    .main-content h6 {
        color: #55c883;
    }

    /* Syntax Highlighting from dark-plus of pygments-styles */
    /* See: https://github.com/lepture/pygments-styles */
    .main-content pre {
        background: #1E1E1E;
	border: solid 1px #272f36;
    }
    code, .highlight {
        background: #1E1E1E;
        color: #D4D4D4
    }
    .highlight .hll {
        background-color: #ADD6FF26
    }
    .highlight .c   { color: #6A9955 }
    .highlight .err { color: #F44747 }
    .highlight .k   { color: #C586C0 }
    .highlight .l   { color: #CE9178 }
    .highlight .ch  { color: #6A9955 }
    .highlight .cm  { color: #6A9955 }
    .highlight .cp  { color: #C586C0 }
    .highlight .cpf { color: #CE9178 }
    .highlight .c1  { color: #6A9955 }
    .highlight .cs  { color: #6A9955 }
    .highlight .gd  { color: #CE9178 }
    .highlight .ge  { font-style: italic }
    .highlight .gr  { color: #F44747 }
    .highlight .gh  { color: #569CD6 }
    .highlight .gi  { color: #B5CEA8 }
    .highlight .go  { color: #CE9178 }
    .highlight .gp  { color: #C8C8C8 }
    .highlight .gs  { color: #569CD6; font-weight: bold }
    .highlight .gu  { color: #569CD6 }
    .highlight .gt  { color: #F44747 }
    .highlight .kc  { color: #CE9178 }
    .highlight .kd  { color: #C586C0 }
    .highlight .kn  { color: #C586C0 }
    .highlight .kp  { color: #D7BA7D }
    .highlight .kr  { color: #C586C0 }
    .highlight .kt  { color: #569CD6 }
    .highlight .ld  { color: #CE9178 }
    .highlight .m   { color: #B5CEA8 }
    .highlight .s   { color: #CE9178 }
    .highlight .na  { color: #9CDCFE }
    .highlight .nb  { color: #DCDCAA }
    .highlight .nc  { color: #4EC9B0 }
    .highlight .no  { color: #B5CEA8 }
    .highlight .nd  { color: #DCDCAA }
    .highlight .ne  { color: #4EC9B0 }
    .highlight .nf  { color: #DCDCAA }
    .highlight .nl  { color: #C8C8C8 }
    .highlight .nx  { color: #D4D4D4 }
    .highlight .nt  { color: #569CD6 }
    .highlight .w   { color: #D4D4D4 }
    .highlight .mb  { color: #B5CEA8 }
    .highlight .mf  { color: #B5CEA8 }
    .highlight .mh  { color: #B5CEA8 }
    .highlight .mi  { color: #B5CEA8 }
    .highlight .mo  { color: #B5CEA8 }
    .highlight .sa  { color: #CE9178 }
    .highlight .sb  { color: #CE9178 }
    .highlight .sc  { color: #CE9178 }
    .highlight .dl  { color: #CE9178 }
    .highlight .sd  { color: #CE9178 }
    .highlight .s2  { color: #CE9178 }
    .highlight .se  { color: #CE9178 }
    .highlight .sh  { color: #CE9178 }
    .highlight .si  { color: #569CD6 }
    .highlight .sx  { color: #CE9178 }
    .highlight .sr  { color: #D16969 }
    .highlight .s1  { color: #CE9178 }
    .highlight .ss  { color: #CE9178 }
    .highlight .bp  { color: #D7BA7D }
    .highlight .fm  { color: #DCDCAA }
    .highlight .il  { color: #B5CEA8 }
}


================================================
FILE: assets/js/app.js
================================================
// Wire up the copy-to-clipboard buttons once the DOM is ready.
$(function () {
    // Name of the attribute on each trigger that carries the id (or id prefix) to copy from.
    const TARGET_ATTR = 'data-clipboard-target-id';

    // Single-rule button: copy the text of the `.highlight` content under the
    // element whose id equals the trigger's target id, plus a trailing newline.
    new ClipboardJS('.clipboard-single', {
        text: (trigger) => {
            const targetId = trigger.getAttribute(TARGET_ATTR);
            const snippet = $("#" + targetId + " .highlight");
            return snippet.text() + '\n';
        },
    });

    // Multi-rule button: collect every `.highlight` under elements whose id
    // starts with the trigger's target id and join their texts with a blank line.
    new ClipboardJS('.clipboard-multiple', {
        text: (trigger) => {
            const idPrefix = trigger.getAttribute(TARGET_ATTR);
            const snippets = $("[id^=" + idPrefix + "] .highlight").toArray();
            return snippets.map((element) => $(element).text()).join('\n\n');
        },
    });
});


================================================
FILE: blackbox-exporter.md
================================================

<h1 style="text-align: center;">
  Blackbox exporter
</h1>

## Worldwide probes

<a href="https://github.com/prometheus/blackbox_exporter" target="_blank">Blackbox Exporter</a> gives you the ability to probe endpoints over HTTP, HTTPS, DNS, TCP and ICMP.

You should deploy blackbox exporters in multiple Points of Presence around the globe to monitor latency. Feel free to use the following endpoints for your own projects:

- https://probe-<b>montreal</b>.cleverapps.io
- https://probe-<b>paris</b>.cleverapps.io
- https://probe-<b>jeddah</b>.cleverapps.io
- https://probe-<b>singapore</b>.cleverapps.io
- https://probe-<b>sydney</b>.cleverapps.io
- https://probe-<b>warsaw</b>.cleverapps.io

☝️ Logs have been disabled. More probes from the community would be appreciated, please contribute <a href="https://github.com/samber/awesome-prometheus-alerts/" target="_blank">here</a>! These blackbox exporters use the following <a href="https://github.com/samber/blackbox_exporter/blob/master/samber.yml" target="_blank">configuration</a>.

## Prometheus Configuration

Blackbox exporters and endpoints must be declared in Prometheus. Here is a simple configuration, inspired by [Hayk Davtyan's Medium post](https://medium.com/geekculture/single-prometheus-job-for-dozens-of-blackbox-exporters-2a7ba492d6c8):

```yml
# sd/blackbox.yml

- targets:
  #
  # Montreal
  #
  # http
  - probe-montreal.cleverapps.io:_:http_2xx:_:Montreal:_:f229cy:_:https://api.screeb.app
  - probe-montreal.cleverapps.io:_:http_2xx:_:Montreal:_:f229cy:_:https://t.screeb.app/tag.js
  # icmp
  - probe-montreal.cleverapps.io:_:icmp_ipv4:_:Montreal:_:f229cy:_:api.screeb.app
  - probe-montreal.cleverapps.io:_:icmp_ipv4:_:Montreal:_:f229cy:_:t.screeb.app


  #
  # Paris
  #
  # http
  - probe-paris.cleverapps.io:_:http_2xx:_:Paris:_:u09tgy:_:https://api.screeb.app
  - probe-paris.cleverapps.io:_:http_2xx:_:Paris:_:u09tgy:_:https://t.screeb.app/tag.js
  # icmp
  - probe-paris.cleverapps.io:_:icmp_ipv4:_:Paris:_:u09tgy:_:api.screeb.app
  - probe-paris.cleverapps.io:_:icmp_ipv4:_:Paris:_:u09tgy:_:t.screeb.app


  #
  # Sydney
  #
  # http
  - probe-sydney.cleverapps.io:_:http_2xx:_:Sydney:_:r3gpkn:_:https://api.screeb.app
  - probe-sydney.cleverapps.io:_:http_2xx:_:Sydney:_:r3gpkn:_:https://t.screeb.app/tag.js
  # icmp
  - probe-sydney.cleverapps.io:_:icmp_ipv4:_:Sydney:_:r3gpkn:_:api.screeb.app
  - probe-sydney.cleverapps.io:_:icmp_ipv4:_:Sydney:_:r3gpkn:_:t.screeb.app

  # ...
```

```yml
# prometheus.yml

global:
  # ...

scrape_configs:

  - job_name: 'blackbox'
    metrics_path: /probe
    scrape_interval: 30s
    scheme: https
    file_sd_configs:
      - files:
        - /etc/prometheus/sd/blackbox.yml
    relabel_configs:
      # adds "module" label in the final labelset
      - source_labels: [__address__]
        regex: '.*:_:(.*):_:.*:_:.*:_:.*'
        target_label: module
      # adds "geohash" label in the final labelset
      - source_labels: [__address__]
        regex: '.*:_:.*:_:.*:_:(.*):_:.*'
        target_label: geohash
      # rewrites "instance" label with corresponding URL
      - source_labels: [__address__]
        regex: '.*:_:.*:_:.*:_:.*:_:(.*)'
        target_label: instance
      # rewrites "pop" label with corresponding location name
      - source_labels: [__address__]
        regex: '.*:_:.*:_:(.*):_:.*:_:.*'
        target_label: pop
      # passes "module" parameter to Blackbox exporter
      - source_labels: [module]
        target_label: __param_module
      # passes "target" parameter to Blackbox exporter
      - source_labels: [instance]
        target_label: __param_target
      # the Blackbox exporter's real hostname:port
      - source_labels: [__address__]
        regex: '(.*):_:.*:_:.*:_:.*:_:.*'
        target_label: __address__

  # ...

```

## Geohash

![](assets/grafana-map-panel.png)

To display nice maps in Grafana, you need to instruct blackbox exporters about the location. Grafana map panel speaks the "geohash" format:

- go to Google Maps
- extract the lat/long from the url
- convert lat/long to geohash here: http://geohash.co

## Grafana

Some great dashboards have been created by the community: https://grafana.com/grafana/dashboards/?search=blackbox

Since Grafana v5.0.0, a map panel is available: https://grafana.com/docs/grafana/latest/panels-visualizations/visualizations/geomap/


================================================
FILE: dist/rules/apache/lusitaniae-apache-exporter.yml
================================================
groups:

- name: LusitaniaeApacheExporter

  
  rules:

    - alert: ApacheDown
      expr: 'apache_up == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Apache down (instance {{ $labels.instance }})
        description: "Apache down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ApacheWorkersLoad
      expr: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Apache workers load (instance {{ $labels.instance }})
        description: "Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ApacheRestart
      expr: 'apache_uptime_seconds_total / 60 < 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Apache restart (instance {{ $labels.instance }})
        description: "Apache has just been restarted.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/apache-flink/flink-prometheus-reporter.yml
================================================
groups:

- name: FlinkPrometheusReporter

  
  rules:

    - alert: FlinkJobIsNotRunning
      expr: 'flink_jobmanager_numRunningJobs == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Flink job is not running (instance {{ $labels.instance }})
        description: "No Flink jobs are currently running. All jobs may have failed or been cancelled.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: FlinkNoTaskmanagersRegistered
      expr: 'flink_jobmanager_numRegisteredTaskManagers == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Flink no TaskManagers registered (instance {{ $labels.instance }})
        description: "No TaskManagers are registered with the JobManager. The cluster has no processing capacity.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This alert fires when there are no available task slots. Adjust the threshold if your cluster is expected to run at full capacity.
    - alert: FlinkAllTaskSlotsUsed
      expr: 'flink_jobmanager_taskSlotsAvailable == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink all task slots used (instance {{ $labels.instance }})
        description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # A single restart may be normal during deployments. Adjust threshold based on restart tolerance.
    - alert: FlinkJobRestartIncreasing
      expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink job restart increasing (instance {{ $labels.instance }})
        description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: FlinkCheckpointFailures
      expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink checkpoint failures (instance {{ $labels.instance }})
        description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Value is converted from milliseconds to seconds for correct humanizeDuration display.
    # Threshold is 60 seconds. Adjust based on your checkpoint interval and state size.
    - alert: FlinkCheckpointDurationHigh
      expr: 'flink_jobmanager_job_lastCheckpointDuration / 1000 > 60'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink checkpoint duration high (instance {{ $labels.instance }})
        description: "Flink job {{ $labels.job_name }} last checkpoint took {{ $value | humanizeDuration }} to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: FlinkTaskBackpressured
      expr: 'flink_taskmanager_job_task_isBackPressured == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink task backpressured (instance {{ $labels.instance }})
        description: "Flink task {{ $labels.task_name }} in job {{ $labels.job_name }} is backpressured.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a task spends more than 500ms/sec backpressured. This indicates the task cannot keep up with upstream data rate.
    - alert: FlinkTaskHighBackpressureTime
      expr: 'flink_taskmanager_job_task_backPressuredTimeMsPerSecond > 500'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink task high backpressure time (instance {{ $labels.instance }})
        description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: FlinkTaskmanagerHeapMemoryHigh
      expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink TaskManager heap memory high (instance {{ $labels.instance }})
        description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: FlinkJobmanagerHeapMemoryHigh
      expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink JobManager heap memory high (instance {{ $labels.instance }})
        description: "Flink JobManager {{ $labels.instance }} heap memory usage is above 90%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload.
    - alert: FlinkTaskmanagerGcTimeHigh
      expr: 'rate(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink TaskManager GC time high (instance {{ $labels.instance }})
        description: "Flink TaskManager {{ $labels.instance }} is spending more than 10% of time in garbage collection.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Only fires for tasks that have previously received records, to avoid false positives during startup.
    - alert: FlinkNoRecordsProcessed
      expr: 'rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink no records processed (instance {{ $labels.instance }})
        description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/apache-spark/spark-prometheus.yml
================================================
groups:

- name: SparkPrometheus

  # Spark exposes metrics via two built-in endpoints:
  # - PrometheusServlet: master/worker/driver metrics at /metrics/prometheus/ (ports 8080, 8081, 4040)
  # - PrometheusResource: executor metrics at /metrics/executors/prometheus/ (port 4040, requires spark.ui.prometheus.enabled=true in Spark 3.x)
  # Metric names from PrometheusServlet include a dynamic namespace (application ID), making static PromQL queries challenging.
  # Configuration: spark.metrics.conf.*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet
  
  rules:

    - alert: SparkNoAliveWorkers
      expr: 'metrics_master_aliveWorkers_Value == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Spark no alive workers (instance {{ $labels.instance }})
        description: "No Spark workers are alive. The cluster has no processing capacity.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Adjust the threshold based on your cluster's typical queuing behavior.
    - alert: SparkTooManyWaitingApps
      expr: 'metrics_master_waitingApps_Value > 10'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spark too many waiting apps (instance {{ $labels.instance }})
        description: "Spark has {{ $value }} applications waiting for resources.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SparkWorkerMemoryExhausted
      expr: 'metrics_worker_memFree_MB_Value == 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Spark worker memory exhausted (instance {{ $labels.instance }})
        description: "Spark worker {{ $labels.instance }} has no free memory ({{ $value }}MB free).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a worker has no free cores. This may be normal under high load but can indicate capacity issues.
    - alert: SparkWorkerCoresExhausted
      expr: 'metrics_worker_coresFree_Value == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spark worker cores exhausted (instance {{ $labels.instance }})
        description: "Spark worker {{ $labels.instance }} has no free cores.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when more than 10% of executor time is spent in garbage collection.
    # This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/).
    # NOTE(review): that endpoint is understood to expose both GC time and total duration
    # with the `_seconds_total` suffix (same unit, so the ratio is unitless) — confirm against your Spark version.
    # The `and ... > 0` clause guards against division by zero for executors that have not run yet.
    - alert: SparkExecutorHighGcTime
      expr: 'metrics_executor_totalGCTime_seconds_total / metrics_executor_totalDuration_seconds_total > 0.1 and metrics_executor_totalDuration_seconds_total > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spark executor high GC time (instance {{ $labels.instance }})
        description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SparkExecutorAllTasksFailing
      expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Spark executor all tasks failing (instance {{ $labels.instance }})
        description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SparkExecutorHighTaskFailureRate
      expr: 'metrics_executor_failedTasks_total / metrics_executor_totalTasks_total > 0.1 and metrics_executor_totalTasks_total > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spark executor high task failure rate (instance {{ $labels.instance }})
        description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default.
    # Disk spilling indicates insufficient memory for the workload.
    - alert: SparkExecutorHighDiskSpill
      expr: 'metrics_executor_diskUsed_bytes > 1e9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spark executor high disk spill (instance {{ $labels.instance }})
        description: "Spark executor {{ $labels.executor_id }} is spilling data to disk. Consider increasing executor memory.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/apc-ups/apcupsd_exporter.yml
================================================
groups:

- name: Apcupsd_exporter

  
  rules:

    - alert: ApcUpsBatteryNearlyEmpty
      expr: 'apcupsd_battery_charge_percent < 10'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: APC UPS Battery nearly empty (instance {{ $labels.instance }})
        description: "Battery is almost empty (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ApcUpsLessThan15MinutesOfBatteryTimeRemaining
      expr: 'apcupsd_battery_time_left_seconds < 900'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: APC UPS Less than 15 Minutes of battery time remaining (instance {{ $labels.instance }})
        description: "Battery is almost empty (< 15 Minutes remaining)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ApcUpsAcInputOutage
      expr: 'apcupsd_battery_time_on_seconds > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: APC UPS AC input outage (instance {{ $labels.instance }})
        description: "UPS now running on battery (since {{$value | humanizeDuration}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ApcUpsLowBatteryVoltage
      expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: APC UPS low battery voltage (instance {{ $labels.instance }})
        description: "Battery voltage is lower than nominal (< 95%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ApcUpsHighTemperature
      expr: 'apcupsd_internal_temperature_celsius >= 40'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: APC UPS high temperature (instance {{ $labels.instance }})
        description: "Internal temperature is high ({{$value}}°C)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ApcUpsHighLoad
      expr: 'apcupsd_ups_load_percent > 80'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: APC UPS high load (instance {{ $labels.instance }})
        description: "UPS load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/argocd/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    - alert: ArgocdServiceNotSynced
      expr: 'argocd_app_info{sync_status!="Synced"} != 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: ArgoCD service not synced (instance {{ $labels.instance }})
        description: "Service {{ $labels.name }} run by argo is currently not in sync.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ArgocdServiceUnhealthy
      expr: 'argocd_app_info{health_status!="Healthy"} != 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: ArgoCD service unhealthy (instance {{ $labels.instance }})
        description: "Service {{ $labels.name }} run by argo is currently not healthy.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
================================================
groups:

- name: PrometheusCloudwatchExporter

  # CloudWatch metrics are exported as aws_{namespace}_{metric_name}_{statistic} gauges.
  # The rules below cover both exporter health and common AWS service alerts.
  # Adjust thresholds and label filters to match your CloudWatch exporter configuration.
  
  rules:

    # Exporter health: warns when the exporter's scrape-error gauge stays
    # above 0 for 5 minutes.
    - alert: CloudwatchExporterScrapeError
      expr: >-
        cloudwatch_exporter_scrape_error > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "CloudWatch exporter scrape error (instance {{ $labels.instance }})"
        description: |-
          CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Exporter health: warns when a scrape takes longer than 300 seconds,
    # sustained for 5 minutes.
    - alert: CloudwatchExporterSlowScrape
      expr: >-
        cloudwatch_exporter_scrape_duration_seconds > 300
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "CloudWatch exporter slow scrape (instance {{ $labels.instance }})"
        description: |-
          CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Cost guard: CloudWatch API calls are billed (~$0.01 per 1000
    # GetMetricData requests), so 100 requests/minute is roughly $45/month.
    # The per-second rate is multiplied by 60 to get requests per minute;
    # tune the threshold of 100 to your budget.
    - alert: CloudwatchApiHighRequestRate
      expr: >-
        sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "CloudWatch API high request rate (instance {{ $labels.instance }})"
        description: |-
          CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Needs the EC2 CPUUtilization metric to be configured in the CloudWatch
    # exporter. Warns when the average stays above 90 for 15 minutes.
    - alert: AwsEc2HighCpuUtilization
      expr: >-
        aws_ec2_cpuutilization_average > 90
      for: 15m
      labels: { severity: warning }
      annotations:
        summary: "AWS EC2 high CPU utilization (instance {{ $labels.instance }})"
        description: |-
          EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Needs the RDS FreeStorageSpace metric. The 2GB (2000000000 bytes)
    # threshold is only a rough default; size it to your database.
    - alert: AwsRdsLowFreeStorageSpace
      expr: >-
        aws_rds_free_storage_space_average < 2000000000
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "AWS RDS low free storage space (instance {{ $labels.instance }})"
        description: |-
          RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Needs the RDS CPUUtilization metric to be configured in the CloudWatch
    # exporter. Warns when the average stays above 90 for 15 minutes.
    - alert: AwsRdsHighCpuUtilization
      expr: >-
        aws_rds_cpuutilization_average > 90
      for: 15m
      labels: { severity: warning }
      annotations:
        summary: "AWS RDS high CPU utilization (instance {{ $labels.instance }})"
        description: |-
          RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # The right threshold depends on the RDS instance class: align the value
    # of 100 with the max_connections parameter of your instance type.
    - alert: AwsRdsHighDatabaseConnections
      expr: >-
        aws_rds_database_connections_average > 100
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "AWS RDS high database connections (instance {{ $labels.instance }})"
        description: |-
          RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Needs the SQS ApproximateNumberOfMessagesVisible metric. 1000 is a rough
    # default; set it according to the queue depth you normally expect.
    - alert: AwsSqsQueueMessagesVisible
      expr: >-
        aws_sqs_approximate_number_of_messages_visible_average > 1000
      for: 10m
      labels: { severity: warning }
      annotations:
        summary: "AWS SQS queue messages visible (instance {{ $labels.instance }})"
        description: |-
          SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Needs the SQS ApproximateAgeOfOldestMessage metric. Warns immediately
    # when the oldest message is older than 3600 seconds.
    - alert: AwsSqsMessageAgeTooOld
      expr: >-
        aws_sqs_approximate_age_of_oldest_message_maximum > 3600
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "AWS SQS message age too old (instance {{ $labels.instance }})"
        description: |-
          SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Needs the ApplicationELB UnHealthyHostCount metric. Critical when any
    # unhealthy target is reported for 5 minutes.
    - alert: AwsAlbUnhealthyTargets
      expr: >-
        aws_applicationelb_unhealthy_host_count_average > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "AWS ALB unhealthy targets (instance {{ $labels.instance }})"
        description: |-
          ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Needs the ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics.
    # The trailing `and ... > 0` clause suppresses the alert when there is no
    # traffic, so a zero request count cannot produce a result.
    - alert: AwsAlbHigh5xxErrorRate
      expr: >-
        (aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5
        and aws_applicationelb_request_count_sum > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "AWS ALB high 5xx error rate (instance {{ $labels.instance }})"
        description: |-
          ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Needs the ApplicationELB TargetResponseTime metric. Warns when the
    # average stays above 2 for 5 minutes.
    - alert: AwsAlbHighTargetResponseTime
      expr: >-
        aws_applicationelb_target_response_time_average > 2
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "AWS ALB high target response time (instance {{ $labels.instance }})"
        description: |-
          ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Needs the Lambda Errors and Invocations metrics. The trailing
    # `and ... > 0` clause skips functions with no invocations.
    - alert: AwsLambdaHighErrorRate
      expr: >-
        (aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5
        and aws_lambda_invocations_sum > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "AWS Lambda high error rate (instance {{ $labels.instance }})"
        description: |-
          Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/azure/azure-metrics-exporter.yml
================================================
groups:

- name: AzureMetricsExporter

  # The exporter uses azurerm_resource_metric as the default metric name for forwarded Azure Monitor metrics.
  # The metric name can be customized via the name parameter in probe configuration.
  # Self-monitoring metrics use the azurerm_stats_* and azurerm_api_* prefixes.
  
  rules:

    # Warns immediately when more than 5 error-result requests were counted
    # over the last 15 minutes.
    - alert: AzureExporterRequestErrors
      expr: >-
        increase(azurerm_stats_metric_requests{result="error"}[15m]) > 5
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Azure exporter request errors (instance {{ $labels.instance }})"
        description: |-
          Azure metrics exporter on {{ $labels.instance }} has {{ $value }} API request errors in the last 15 minutes.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Per-instance share of error-result requests over 5 minutes, as a
    # percentage. The trailing `and ... > 0` clause skips instances with no
    # request traffic.
    - alert: AzureExporterHighErrorRate
      expr: >-
        sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m]))
        / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10
        and sum by (instance) (rate(azurerm_stats_metric_requests[5m])) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Azure exporter high error rate (instance {{ $labels.instance }})"
        description: |-
          Azure metrics exporter on {{ $labels.instance }} has an error rate above 10% ({{ $value }}%).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Azure Resource Manager applies rate limits per subscription. The floor of
    # 100 remaining calls is a rough default; tune it to your scrape interval
    # and the number of resources you monitor.
    - alert: AzureApiReadRateLimitApproaching
      expr: >-
        azurerm_api_ratelimit{type="read"} < 100
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Azure API read rate limit approaching (instance {{ $labels.instance }})"
        description: |-
          Azure API read rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately when fewer than 50 write calls remain in the
    # reported rate-limit budget.
    - alert: AzureApiWriteRateLimitApproaching
      expr: >-
        azurerm_api_ratelimit{type="write"} < 50
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Azure API write rate limit approaching (instance {{ $labels.instance }})"
        description: |-
          Azure API write rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the exporter's collect time stays above 300 for 5 minutes.
    - alert: AzureExporterSlowCollection
      expr: >-
        azurerm_stats_metric_collecttime > 300
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Azure exporter slow collection (instance {{ $labels.instance }})"
        description: |-
          Azure metrics exporter on {{ $labels.instance }} metric collection is taking more than 5 minutes ({{ $value }}s).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/blackbox/blackbox-exporter.yml
================================================
groups:

- name: BlackboxExporter

  
  rules:

    # Critical as soon as a probe reports probe_success == 0.
    - alert: BlackboxProbeFailed
      expr: >-
        probe_success == 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Blackbox probe failed (instance {{ $labels.instance }})"
        description: |-
          Probe failed
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns as soon as the exporter's last-reload-successful gauge is not 1.
    - alert: BlackboxConfigurationReloadFailure
      expr: >-
        blackbox_exporter_config_last_reload_successful != 1
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Blackbox configuration reload failure (instance {{ $labels.instance }})"
        description: |-
          Blackbox configuration reload failure
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the total probe duration stays above 1 second for 1 minute.
    - alert: BlackboxSlowProbe
      expr: >-
        probe_duration_seconds > 1
      for: 1m
      labels: { severity: warning }
      annotations:
        summary: "Blackbox slow probe (instance {{ $labels.instance }})"
        description: |-
          Blackbox probe took more than 1s to complete
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the probed HTTP status code falls outside 200-399:
    # either 199 or lower, or 400 or higher.
    - alert: BlackboxProbeHttpFailure
      expr: >-
        probe_http_status_code <= 199
        OR probe_http_status_code >= 400
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
        description: |-
          HTTP status code is not 200-399
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Remaining certificate lifetime in days (seconds / 86400, rounded to 0.1),
    # using the most recent expiry sample from the last 10 minutes. Warns in
    # the window from 3 days (inclusive) up to 20 days (exclusive).
    - alert: BlackboxSslCertificateWillExpireSoon
      expr: >-
        3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
        description: |-
          SSL certificate expires in less than 20 days
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Remaining certificate lifetime in days (seconds / 86400, rounded to 0.1),
    # using the most recent expiry sample from the last 10 minutes. Critical in
    # the window from 0 days (inclusive) up to 3 days (exclusive).
    - alert: BlackboxSslCertificateWillExpireVerySoon
      expr: >-
        0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Blackbox SSL certificate will expire very soon (instance {{ $labels.instance }})"
        description: |-
          SSL certificate expires in less than 3 days
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # probe_ssl_earliest_cert_expiry is only exposed after the certificate has
    # expired if insecure_skip_verify is enabled. Be aware that enabling it
    # turns off certificate validation.
    # See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
    - alert: BlackboxSslCertificateExpired
      expr: >-
        round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Blackbox SSL certificate expired (instance {{ $labels.instance }})"
        description: |-
          SSL certificate has expired already
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when probe_http_duration_seconds stays above 1 for 1 minute.
    - alert: BlackboxProbeSlowHttp
      expr: >-
        probe_http_duration_seconds > 1
      for: 1m
      labels: { severity: warning }
      annotations:
        summary: "Blackbox probe slow HTTP (instance {{ $labels.instance }})"
        description: |-
          HTTP request took more than 1s
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when probe_icmp_duration_seconds stays above 1 for 1 minute.
    - alert: BlackboxProbeSlowPing
      expr: >-
        probe_icmp_duration_seconds > 1
      for: 1m
      labels: { severity: warning }
      annotations:
        summary: "Blackbox probe slow ping (instance {{ $labels.instance }})"
        description: |-
          Blackbox ping took more than 1s
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/caddy/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Critical when no healthy backend is reported for an upstream.
    # The healthy gauge is summed per upstream and compared with 0. The
    # previous `count(...) == 0` could never match: count() returns the number
    # of series in each group (always >= 1) and yields no result at all for an
    # empty group, so the alert never fired.
    # NOTE(review): assumes caddy_reverse_proxy_upstreams_healthy is 1 for a
    # healthy upstream and 0 otherwise - confirm against the Caddy metrics docs.
    # NOTE(review): aggregating `by (upstream)` drops the `instance` label, so
    # the summary's instance placeholder renders empty; the affected upstream
    # is available in LABELS.
    - alert: CaddyReverseProxyDown
      expr: 'sum(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
        description: "All Caddy reverse proxies are down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per-instance share of 4xx responses over a 3-minute rate window, as a
    # percentage. The trailing `and ... > 0` clause skips instances with no
    # request traffic.
    - alert: CaddyHighHttp4xxErrorRateService
      expr: >-
        sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance)
        / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5
        and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "Caddy high HTTP 4xx error rate service (instance {{ $labels.instance }})"
        description: |-
          Caddy service 4xx error rate is above 5%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Per-instance share of 5xx responses over a 3-minute rate window, as a
    # percentage. The trailing `and ... > 0` clause skips instances with no
    # request traffic.
    - alert: CaddyHighHttp5xxErrorRateService
      expr: >-
        sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance)
        / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5
        and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "Caddy high HTTP 5xx error rate service (instance {{ $labels.instance }})"
        description: |-
          Caddy service 5xx error rate is above 5%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/cassandra/criteo-cassandra-exporter.yml
================================================
groups:

- name: CriteoCassandraExporter

  
  rules:

    # Critical when the total-hints counter changed value more than 3 times
    # within the last minute.
    - alert: CassandraHintsCount
      expr: >-
        changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra hints count (instance {{ $labels.instance }})"
        description: |-
          Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when more than 100 compaction tasks stay pending for 2 minutes.
    - alert: CassandraCompactionTaskPending
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra compaction task pending (instance {{ $labels.instance }})"
        description: |-
          Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the 99th-percentile view-write latency stat stays above
    # 100000 for 2 minutes.
    - alert: CassandraViewwriteLatency
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra viewwrite latency (instance {{ $labels.instance }})"
        description: |-
          High viewwrite latency on {{ $labels.instance }} cassandra node
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the auth-failure counter grows faster than 5 per second
    # (1-minute rate window), sustained for 2 minutes.
    - alert: CassandraAuthenticationFailures
      expr: >-
        rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra authentication failures (instance {{ $labels.instance }})"
        description: |-
          Increase of Cassandra authentication failures
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # The 1m pending period lets a node restart without raising an alert.
    # NOTE(review): the result is aggregated by (service,group,cluster,env), so
    # the `instance` label referenced in the summary is not present on the
    # firing alert.
    - alert: CassandraNodeDown
      expr: >-
        sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra node down (instance {{ $labels.instance }})"
        description: |-
          Cassandra node down
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when more than 15 commitlog tasks stay pending for 2 minutes.
    - alert: CassandraCommitlogPendingTasks(criteo)
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra commitlog pending tasks (Criteo) (instance {{ $labels.instance }})"
        description: |-
          Unexpected number of Cassandra commitlog pending tasks
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the compaction executor reports any currently blocked task
    # for 2 minutes.
    - alert: CassandraCompactionExecutorBlockedTasks(criteo)
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra compaction executor blocked tasks (Criteo) (instance {{ $labels.instance }})"
        description: |-
          Some Cassandra compaction executor tasks are blocked
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the memtable flush writer reports any currently blocked task
    # for 2 minutes.
    - alert: CassandraFlushWriterBlockedTasks(criteo)
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra flush writer blocked tasks (Criteo) (instance {{ $labels.instance }})"
        description: |-
          Some Cassandra flush writer tasks are blocked
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when more than 2 anti-entropy stage tasks stay pending for
    # 2 minutes.
    - alert: CassandraRepairPendingTasks
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra repair pending tasks (instance {{ $labels.instance }})"
        description: |-
          Some Cassandra repair tasks are pending
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the anti-entropy stage reports any currently blocked task
    # for 2 minutes.
    - alert: CassandraRepairBlockedTasks
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra repair blocked tasks (instance {{ $labels.instance }})"
        description: |-
          Some Cassandra repair tasks are blocked
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the total-timeouts counter grows faster than 5 per second
    # (1-minute rate window), sustained for 2 minutes.
    - alert: CassandraConnectionTimeoutsTotal(criteo)
      expr: >-
        rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra connection timeouts total (Criteo) (instance {{ $labels.instance }})"
        description: |-
          Some connection between nodes are ending in timeout
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the storage-exceptions counter changed value more than
    # once within the last minute.
    - alert: CassandraStorageExceptions(criteo)
      expr: >-
        changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra storage exceptions (Criteo) (instance {{ $labels.instance }})"
        description: |-
          Something is going wrong with cassandra storage
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical as soon as the 99th-percentile tombstones-scanned stat
    # exceeds 1000.
    - alert: CassandraTombstoneDump(criteo)
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra tombstone dump (Criteo) (instance {{ $labels.instance }})"
        description: |-
          Too much tombstones scanned in queries
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical as soon as the write-unavailables counter changed within the
    # last minute.
    - alert: CassandraClientRequestUnavailableWrite(criteo)
      expr: >-
        changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra client request unavailable write (Criteo) (instance {{ $labels.instance }})"
        description: |-
          Write failures have occurred because too many nodes are unavailable
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical as soon as the read-unavailables counter changed within the
    # last minute.
    - alert: CassandraClientRequestUnavailableRead(criteo)
      expr: >-
        changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra client request unavailable read (Criteo) (instance {{ $labels.instance }})"
        description: |-
          Read failures have occurred because too many nodes are unavailable
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical as soon as the one-minute write-failure rate stat is above 0.
    - alert: CassandraClientRequestWriteFailure(criteo)
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra client request write failure (Criteo) (instance {{ $labels.instance }})"
        description: |-
          A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical as soon as the one-minute read-failure rate stat is above 0.
    - alert: CassandraClientRequestReadFailure(criteo)
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra client request read failure (Criteo) (instance {{ $labels.instance }})"
        description: |-
          A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the key-cache hit rate stays below 0.85 for 2 minutes.
    - alert: CassandraCacheHitRateKeyCache
      expr: >-
        cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra cache hit rate key cache (instance {{ $labels.instance }})"
        description: |-
          Key cache hit rate is below 85%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/cassandra/instaclustr-cassandra-exporter.yml
================================================
groups:

- name: InstaclustrCassandraExporter

  
  rules:

    # The 1m pending period lets a node restart without raising an alert.
    - alert: CassandraNodeIsUnavailable
      expr: >-
        cassandra_endpoint_active < 1
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra Node is unavailable (instance {{ $labels.instance }})"
        description: |-
          Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately when a table's estimated pending compactions
    # exceed 100.
    - alert: CassandraManyCompactionTasksArePending
      expr: >-
        cassandra_table_estimated_pending_compactions > 100
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra many compaction tasks are pending (instance {{ $labels.instance }})"
        description: |-
          Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when more than 15 commitlog tasks stay pending for 2 minutes.
    - alert: CassandraCommitlogPendingTasks(instaclustr)
      expr: >-
        cassandra_commit_log_pending_tasks > 15
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra commitlog pending tasks (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the CompactionExecutor pool reports more than 15 blocked
    # tasks for 2 minutes.
    - alert: CassandraCompactionExecutorBlockedTasks(instaclustr)
      expr: >-
        cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra compaction executor blocked tasks (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the MemtableFlushWriter pool reports more than 15 blocked
    # tasks for 2 minutes.
    - alert: CassandraFlushWriterBlockedTasks(instaclustr)
      expr: >-
        cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Cassandra flush writer blocked tasks (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when client request timeouts, summed per cluster and instance,
    # exceed 5 per second (5-minute rate window) for 2 minutes.
    - alert: CassandraConnectionTimeoutsTotal(instaclustr)
      expr: >-
        sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra connection timeouts total (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the storage-exceptions counter changed value more than
    # once within the last minute.
    - alert: CassandraStorageExceptions(instaclustr)
      expr: >-
        changes(cassandra_storage_exceptions_total[1m]) > 1
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra storage exceptions (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the 0.99-quantile tombstones-scanned value, averaged per
    # instance, cluster and keyspace, stays above 100 for 2 minutes.
    - alert: CassandraTombstoneDump(instaclustr)
      expr: >-
        avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra tombstone dump (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Cassandra tombstone dump - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the write unavailable-exceptions counter keeps changing
    # (any change within each 1-minute window) for 2 minutes.
    - alert: CassandraClientRequestUnavailableWrite(instaclustr)
      expr: >-
        changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra client request unavailable write (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the read unavailable-exceptions counter keeps changing
    # (any change within each 1-minute window) for 2 minutes.
    - alert: CassandraClientRequestUnavailableRead(instaclustr)
      expr: >-
        changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra client request unavailable read (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the write-failures counter keeps increasing (any increase
    # within each 1-minute window) for 2 minutes.
    - alert: CassandraClientRequestWriteFailure(instaclustr)
      expr: >-
        increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra client request write failure (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the read-failures counter keeps increasing (any increase
    # within each 1-minute window) for 2 minutes.
    - alert: CassandraClientRequestReadFailure(instaclustr)
      expr: >-
        increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "Cassandra client request read failure (Instaclustr) (instance {{ $labels.instance }})"
        description: |-
          Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/ceph/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Critical as soon as ceph_health_status is anything other than 0.
    - alert: CephState
      expr: >-
        ceph_health_status != 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Ceph State (instance {{ $labels.instance }})"
        description: |-
          Ceph instance unhealthy
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the absolute monitor clock skew stays above 0.2 seconds
    # for 2 minutes.
    - alert: CephMonitorClockSkew
      expr: >-
        abs(ceph_monitor_clock_skew_seconds) > 0.2
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Ceph monitor clock skew (instance {{ $labels.instance }})"
        description: |-
          Ceph monitor clock skew detected. Please check ntp and hardware clock settings
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when a monitor's available-space percentage stays below 10
    # for 2 minutes.
    - alert: CephMonitorLowSpace
      expr: >-
        ceph_monitor_avail_percent < 10
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Ceph monitor low space (instance {{ $labels.instance }})"
        description: |-
          Ceph monitor storage is low.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical as soon as an OSD reports ceph_osd_up == 0.
    - alert: CephOsdDown
      expr: >-
        ceph_osd_up == 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Ceph OSD Down (instance {{ $labels.instance }})"
        description: |-
          Ceph Object Storage Daemon Down
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when an OSD's apply latency stays above 5 seconds for 1 minute.
    - alert: CephHighOsdLatency
      expr: >-
        ceph_osd_perf_apply_latency_seconds > 5
      for: 1m
      labels: { severity: warning }
      annotations:
        summary: "Ceph high OSD latency (instance {{ $labels.instance }})"
        description: |-
          Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when ceph_osd_utilization stays above 90 for 2 minutes.
    - alert: CephOsdLowSpace
      expr: |-
        ceph_osd_utilization > 90
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Ceph OSD low space (instance {{ $labels.instance }})"
        description: |-
          Ceph Object Storage Daemon is going out of space. Please add more disks.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when ceph_osd_weight stays below 1 for 2 minutes.
    - alert: CephOsdReweighted
      expr: |-
        ceph_osd_weight < 1
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Ceph OSD reweighted (instance {{ $labels.instance }})"
        description: |-
          Ceph Object Storage Daemon takes too much time to resize.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires immediately when ceph_pg_down is greater than 0.
    - alert: CephPgDown
      expr: |-
        ceph_pg_down > 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Ceph PG down (instance {{ $labels.instance }})"
        description: |-
          Some Ceph placement groups are down. Please ensure that all the data are available.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires immediately when ceph_pg_incomplete is greater than 0.
    - alert: CephPgIncomplete
      expr: |-
        ceph_pg_incomplete > 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Ceph PG incomplete (instance {{ $labels.instance }})"
        description: |-
          Some Ceph placement groups are incomplete. Please ensure that all the data are available.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires immediately when ceph_pg_inconsistent is greater than 0.
    - alert: CephPgInconsistent
      expr: |-
        ceph_pg_inconsistent > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Ceph PG inconsistent (instance {{ $labels.instance }})"
        description: |-
          Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when ceph_pg_activating stays greater than 0 for 2 minutes.
    - alert: CephPgActivationLong
      expr: |-
        ceph_pg_activating > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Ceph PG activation long (instance {{ $labels.instance }})"
        description: |-
          Some Ceph placement groups are too long to activate.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when ceph_pg_backfill_toofull stays greater than 0 for 2 minutes.
    - alert: CephPgBackfillFull
      expr: |-
        ceph_pg_backfill_toofull > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Ceph PG backfill full (instance {{ $labels.instance }})"
        description: |-
          Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires immediately when ceph_pg_total exceeds ceph_pg_active.
    - alert: CephPgUnavailable
      expr: |-
        ceph_pg_total - ceph_pg_active > 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Ceph PG unavailable (instance {{ $labels.instance }})"
        description: |-
          Some Ceph placement groups are unavailable.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/cert-manager/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires when no up{job="cert-manager"} series has existed for 10 minutes.
    - alert: Cert-managerAbsent
      expr: |-
        absent(up{job="cert-manager"})
      for: 10m
      labels: { severity: critical }
      annotations:
        summary: "Cert-Manager absent (instance {{ $labels.instance }})"
        description: |-
          Cert-Manager has disappeared from Prometheus service discovery. New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when a certificate's remaining lifetime (expiration timestamp minus now) stays under 21 days for 1 hour.
    # Threshold of 21 days is a rough default. ACME certificates are typically renewed 30 days before expiry, so expiring within 21 days may indicate issuer misconfiguration.
    - alert: Cert-managerCertificateExpiringSoon
      expr: |-
        avg by (exported_namespace, namespace, name) (certmanager_certificate_expiration_timestamp_seconds - time()) < (21 * 24 * 3600)
      for: 1h
      labels: { severity: warning }
      annotations:
        summary: "Cert-Manager certificate expiring soon (instance {{ $labels.instance }})"
        description: |-
          The certificate {{ $labels.name }} is expiring in less than 21 days.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when a ready-status series whose condition is not "True" has value 1 for 10 minutes.
    - alert: Cert-managerCertificateNotReady
      expr: |-
        max by (name, exported_namespace, namespace, condition) (certmanager_certificate_ready_status{condition!="True"} == 1)
      for: 10m
      labels: { severity: critical }
      annotations:
        summary: "Cert-Manager certificate not ready (instance {{ $labels.instance }})"
        description: |-
          The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-host rate of ACME client requests answered with status 429 stays above 0 for 5 minutes.
    # In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version.
    - alert: Cert-managerHittingAcmeRateLimits
      expr: |-
        sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "Cert-Manager hitting ACME rate limits (instance {{ $labels.instance }})"
        description: |-
          Cert-Manager is being rate-limited by the ACME provider. Certificate issuance and renewal may be blocked for up to a week.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/cilium/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires when the per-pod sum of cilium_unreachable_nodes stays above 0 for 15 minutes.
    # Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+).
    - alert: CiliumAgentUnreachableNodes
      expr: |-
        sum(cilium_unreachable_nodes{}) by (pod) > 0
      for: 15m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent unreachable nodes (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod sum of cilium_unreachable_health_endpoints stays above 0 for 15 minutes.
    # Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+).
    - alert: CiliumAgentUnreachableHealthEndpoints
      expr: |-
        sum(cilium_unreachable_health_endpoints{}) by (pod) > 0
      for: 15m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent unreachable health endpoints (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod sum of cilium_controllers_failing stays above 0 for 5 minutes.
    # Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+).
    - alert: CiliumAgentFailingControllers
      expr: |-
        sum(cilium_controllers_failing{}) by (pod) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent failing controllers (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} has {{ $value }} failing controller(s). Check cilium-agent logs for details.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod count of endpoints in state "invalid" stays above 0 for 5 minutes.
    - alert: CiliumAgentEndpointFailures
      expr: |-
        sum(cilium_endpoint_state{endpoint_state="invalid"}) by (pod) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent endpoint failures (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod 5m rate of failed endpoint regenerations stays above 0 for 5 minutes.
    - alert: CiliumAgentEndpointRegenerationFailures
      expr: |-
        sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent endpoint regeneration failures (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when PUT/POST/PATCH calls to the "endpoint" K8s API return a non-2xx code at a rate above 0 for 5 minutes.
    - alert: CiliumAgentEndpointUpdateFailure
      expr: |-
        sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent endpoint update failure (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }}).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 1m rate of failed "endpoint-create" API-limiter requests stays above 0 for 5 minutes.
    - alert: CiliumAgentEndpointCreateFailure
      expr: |-
        sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0
      for: 5m
      labels: { severity: info }
      annotations:
        summary: "Cilium agent endpoint create failure (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 5m rate of failed BPF map operations, grouped by map and pod, stays above 0 for 5 minutes.
    - alert: CiliumAgentMapOperationFailures
      expr: |-
        sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent map operation failures (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} has eBPF map operation failures on {{ $labels.map_name }}. Datapath may be degraded.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when any cilium_bpf_map_pressure series stays above 0.9 for 5 minutes.
    # Map pressure is a ratio from 0 to 1. At 1.0, the map is full and new entries will be dropped.
    - alert: CiliumAgentBpfMapPressure
      expr: |-
        cilium_bpf_map_pressure{} > 0.9
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent BPF map pressure (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} eBPF map {{ $labels.map_name }} is above 90% utilization. Map may become full.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod 5m rate of drops with reason "CT: Map insertion failed" stays above 0 for 5 minutes.
    - alert: CiliumAgentConntrackTableFull
      expr: |-
        sum(rate(cilium_drop_count_total{reason="CT: Map insertion failed"}[5m])) by (pod) > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "Cilium agent conntrack table full (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod 5m rate of conntrack GC runs with status "uncompleted" stays above 0 for 5 minutes.
    - alert: CiliumAgentConntrackFailedGarbageCollection
      expr: |-
        sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent conntrack failed garbage collection (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} conntrack garbage collection is failing. Stale entries may accumulate.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod 1m rate of drops with reason "No mapping for NAT masquerade" stays above 0 for 5 minutes.
    - alert: CiliumAgentNatTableFull
      expr: |-
        sum(rate(cilium_drop_count_total{reason="No mapping for NAT masquerade"}[1m])) by (pod) > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "Cilium agent NAT table full (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} NAT table is full, causing masquerade failures. Increase NAT map size or investigate.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod 1m rate of drops with reason "Policy denied" stays above 0 for 10 minutes.
    # Policy denials may be expected behavior. Investigate only if unexpected traffic is being blocked.
    - alert: CiliumAgentHighDeniedRate
      expr: |-
        sum(rate(cilium_drop_count_total{reason="Policy denied"}[1m])) by (pod) > 0
      for: 10m
      labels: { severity: info }
      annotations:
        summary: "Cilium agent high denied rate (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 5m drop rate for any reason other than "Policy denied", grouped by pod and reason, stays above 0 for 5 minutes.
    - alert: CiliumAgentHighDropRate
      expr: |-
        sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent high drop rate (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} is dropping packets for reason {{ $labels.reason }}. This indicates infrastructure issues.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the fullest policy BPF map of an agent stays above 0.9 pressure for 5 minutes.
    # The selector can match several cilium_policy_* maps per pod, and pressure is a per-map ratio,
    # so the worst map is taken with max(); summing ratios would exceed 0.9 with no single map near full.
    - alert: CiliumAgentPolicyMapPressure
      expr: 'max(cilium_bpf_map_pressure{map_name=~"cilium_policy_.*"}) by (pod) > 0.9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Cilium agent policy map pressure (instance {{ $labels.instance }})
        description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the per-pod 5m rate of failed policy changes stays above 0 for 5 minutes.
    - alert: CiliumAgentPolicyImportErrors
      expr: |-
        sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent policy import errors (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} is failing to import network policies. Policy enforcement may be incomplete.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod P99 of the policy implementation delay histogram stays above 60 for 5 minutes.
    # histogram_quantile() needs the `_bucket` series (the ones carrying the `le` label); the bare
    # histogram name selects nothing, so the alert could never fire.
    # Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity.
    - alert: CiliumAgentPolicyImplementationDelay
      expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Cilium agent policy implementation delay (instance {{ $labels.instance }})
        description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a pod's node_local identity count divided by 2^16-1 stays above 0.8 for 5 minutes.
    - alert: CiliumNode-localHighIdentityAllocation
      expr: |-
        (sum(cilium_identity{type="node_local"}) by (pod) / (2^16-1)) > 0.8
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium node-local high identity allocation (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} node-local identity allocation is above 80%. Approaching the 65535 identity limit.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the total cluster_local identity count divided by 2^16-256 stays above 0.8 for 5 minutes.
    - alert: CiliumClusterHighIdentityAllocation
      expr: |-
        (sum(cilium_identity{type="cluster_local"}) by () / (2^16-256)) > 0.8
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium cluster high identity allocation (instance {{ $labels.instance }})"
        description: |-
          Cilium cluster-wide identity allocation is above 80%. Approaching the maximum identity limit.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the total of cilium_operator_ipam_ips with type "available" stays at or below 0 for 5 minutes.
    - alert: CiliumOperatorExhaustedIpamIps
      expr: |-
        sum(cilium_operator_ipam_ips{type="available"}) by () <= 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "Cilium operator exhausted IPAM IPs (instance {{ $labels.instance }})"
        description: |-
          Cilium operator has no available IPAM IPs. New pods will fail to schedule networking.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when non-"available" IPAM IPs make up more than 0.9 of all IPAM IPs for 5 minutes; the trailing `and` guards against an empty pool.
    # Threshold of 90% is a rough default. Adjust based on your pod churn rate and IP pool size.
    - alert: CiliumOperatorLowAvailableIpamIps
      expr: |-
        sum(cilium_operator_ipam_ips{type!="available"}) by () / sum(cilium_operator_ipam_ips) by () > 0.9 and sum(cilium_operator_ipam_ips) by () > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium operator low available IPAM IPs (instance {{ $labels.instance }})"
        description: |-
          Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 5m rate of interface creation ops whose status is not "success" stays above 0 for 10 minutes.
    # Some Cilium versions may not have a status label on this metric. Verify against your Cilium version.
    - alert: CiliumOperatorIpamInterfaceCreationFailures
      expr: |-
        sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0
      for: 10m
      labels: { severity: warning }
      annotations:
        summary: "Cilium operator IPAM interface creation failures (instance {{ $labels.instance }})"
        description: |-
          Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 5m rate of agent API calls with a 5xx return code, grouped by pod and code, stays above 0 for 5 minutes.
    - alert: CiliumAgentApiErrors
      expr: |-
        sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium agent API errors (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when K8s client calls (excluding the "metrics" endpoint) return a non-2xx code at a rate above 0 for 5 minutes.
    - alert: CiliumAgentKubernetesClientErrors
      expr: |-
        sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0
      for: 5m
      labels: { severity: info }
      annotations:
        summary: "Cilium agent Kubernetes client errors (instance {{ $labels.instance }})"
        description: |-
          Cilium agent {{ $labels.pod }} is receiving errors from K8s API for endpoint {{ $labels.endpoint }} ({{ $labels.return_code }}).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when at least one readiness series below 1 exists for a source/target cluster pair for 5 minutes.
    - alert: CiliumClustermeshRemoteClusterNotReady
      expr: |-
        count(cilium_clustermesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "Cilium ClusterMesh remote cluster not ready (instance {{ $labels.instance }})"
        description: |-
          Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 5m rate of remote cluster failures per source/target cluster pair stays above 0 for 5 minutes.
    - alert: CiliumClustermeshRemoteClusterFailing
      expr: |-
        sum(rate(cilium_clustermesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "Cilium ClusterMesh remote cluster failing (instance {{ $labels.instance }})"
        description: |-
          Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when at least one KVStoreMesh readiness series below 1 exists for a source/target cluster pair for 5 minutes.
    - alert: CiliumKvstoremeshRemoteClusterNotReady
      expr: |-
        count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "Cilium KVStoreMesh remote cluster not ready (instance {{ $labels.instance }})"
        description: |-
          Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 5m rate of KVStoreMesh remote cluster failures per source/target cluster pair stays above 0 for 5 minutes.
    - alert: CiliumKvstoremeshRemoteClusterFailing
      expr: |-
        sum(rate(cilium_kvstoremesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "Cilium KVStoreMesh remote cluster failing (instance {{ $labels.instance }})"
        description: |-
          Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 5m rate of kvstore sync errors per source cluster stays above 0 for 5 minutes.
    - alert: CiliumKvstoremeshSyncErrors
      expr: |-
        sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "Cilium KVStoreMesh sync errors (instance {{ $labels.instance }})"
        description: |-
          Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the per-pod 5m rate of hubble_lost_events_total stays above 0 for 5 minutes.
    - alert: CiliumHubbleLostEvents
      expr: |-
        sum(rate(hubble_lost_events_total[5m])) by (pod) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium Hubble lost events (instance {{ $labels.instance }})"
        description: |-
          Cilium Hubble on {{ $labels.pod }} is losing flow events. Observability data may be incomplete.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when DNS responses whose rcode is not "No Error" exceed 0.1 of all DNS responses per pod for 5 minutes; the trailing `and` requires some DNS traffic.
    # Threshold of 10% is a rough default. Some DNS errors may be normal depending on your workload.
    - alert: CiliumHubbleHighDnsErrorRate
      expr: |-
        sum(rate(hubble_dns_responses_total{rcode!="No Error"}[5m])) by (pod) / sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0.1 and sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Cilium Hubble high DNS error rate (instance {{ $labels.instance }})"
        description: |-
          Cilium Hubble on {{ $labels.pod }} is observing more than 10% DNS error responses.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/clickhouse/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires when up{job="clickhouse"} has been 0 for 2 minutes.
    # Adjust the job label to match your Prometheus configuration.
    - alert: ClickhouseNodeDown
      expr: |-
        up{job="clickhouse"} == 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "ClickHouse node down (instance {{ $labels.instance }})"
        description: |-
          No metrics received from ClickHouse exporter for over 2 minutes.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when cgroup memory used is above 90% of cgroup memory total for 5 minutes; the `and` clause skips a zero total.
    - alert: ClickhouseMemoryUsageCritical
      expr: |-
        ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0
      for: 5m
      labels: { severity: critical }
      annotations:
        summary: "ClickHouse Memory Usage Critical (instance {{ $labels.instance }})"
        description: |-
          Memory usage is critically high, over 90%.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when cgroup memory used is above 80% of cgroup memory total for 5 minutes; the `and` clause skips a zero total.
    - alert: ClickhouseMemoryUsageWarning
      expr: |-
        ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "ClickHouse Memory Usage Warning (instance {{ $labels.instance }})"
        description: |-
          Memory usage is over 80%.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when available space on the "default" disk is under 20% of available+used for 2 minutes; the `and` clause skips a zero-sized disk.
    - alert: ClickhouseDiskSpaceLowOnDefault
      expr: |-
        ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})"
        description: |-
          Disk space on default is below 20%.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when available space on the "default" disk is under 10% of available+used for 2 minutes; the `and` clause skips a zero-sized disk.
    - alert: ClickhouseDiskSpaceCriticalOnDefault
      expr: |-
        ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})"
        description: |-
          Disk space on default disk is critically low, below 10%.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when available space on the "backups" disk is under 20% of available+used for 2 minutes; the `and` clause skips a zero-sized disk.
    - alert: ClickhouseDiskSpaceLowOnBackups
      expr: |-
        ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})"
        description: |-
          Disk space on backups is below 20%.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires as soon as ALL_REPLICAS_ARE_STALE or ALL_REPLICAS_LOST errors were recorded within the last 5 minutes.
    # ClickHouseErrorMetric_* values are cumulative error counts, so the growth is checked with increase()
    # (as the AUTHENTICATION_FAILED rule in this file does); `== 1` only matched while exactly one error had ever been counted.
    - alert: ClickhouseReplicaErrors
      expr: 'increase(ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE[5m]) > 0 or increase(ClickHouseErrorMetric_ALL_REPLICAS_LOST[5m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
        description: "Critical replica errors detected, either all replicas are stale or lost.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires as soon as NO_AVAILABLE_REPLICA errors were recorded within the last 5 minutes.
    # ClickHouseErrorMetric_* values are cumulative error counts, so the growth is checked with increase()
    # (as the AUTHENTICATION_FAILED rule in this file does); `== 1` only matched while exactly one error had ever been counted.
    - alert: ClickhouseNoAvailableReplicas
      expr: 'increase(ClickHouseErrorMetric_NO_AVAILABLE_REPLICA[5m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
        description: "No available replicas in ClickHouse.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires as soon as TOO_FEW_LIVE_REPLICAS errors were recorded within the last 5 minutes.
    # ClickHouseErrorMetric_* values are cumulative error counts, so the growth is checked with increase()
    # (as the AUTHENTICATION_FAILED rule in this file does); `== 1` only matched while exactly one error had ever been counted.
    - alert: ClickhouseNoLiveReplicas
      expr: 'increase(ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS[5m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
        description: "There are too few live replicas available, risking data loss and service disruption.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when ClickHouseMetrics_TCPConnection stays above 400 for 5 minutes.
    # Please replace the threshold with an appropriate value
    - alert: ClickhouseHighTcpConnections
      expr: |-
        ClickHouseMetrics_TCPConnection > 400
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "ClickHouse High TCP Connections (instance {{ $labels.instance }})"
        description: |-
          High number of TCP connections, indicating heavy client or inter-cluster communication.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when ClickHouseMetrics_InterserverConnection stays above 50 for 5 minutes.
    # Adjust the threshold based on your cluster size and expected replication traffic.
    - alert: ClickhouseInterserverConnectionIssues
      expr: |-
        ClickHouseMetrics_InterserverConnection > 50
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})"
        description: |-
          High number of interserver connections may indicate replication or distributed query handling issues.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when ClickHouseMetrics_ZooKeeperSession differs from 1 for 3 minutes.
    - alert: ClickhouseZookeeperConnectionIssues
      expr: |-
        ClickHouseMetrics_ZooKeeperSession != 1
      for: 3m
      labels: { severity: warning }
      annotations:
        summary: "ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})"
        description: |-
          ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires immediately when AUTHENTICATION_FAILED errors grew by more than 3 over the last 5 minutes.
    - alert: ClickhouseAuthenticationFailures
      expr: |-
        increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3
      for: 0m
      labels: { severity: info }
      annotations:
        summary: "ClickHouse Authentication Failures (instance {{ $labels.instance }})"
        description: |-
          Authentication failures detected, indicating potential security issues or misconfiguration.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires immediately when RESOURCE_ACCESS_DENIED errors grew by more than 3 over the last 5 minutes.
    - alert: ClickhouseAccessDeniedErrors
      expr: |-
        increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3
      for: 0m
      labels: { severity: info }
      annotations:
        summary: "ClickHouse Access Denied Errors (instance {{ $labels.instance }})"
        description: |-
          Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when RejectedInserts keeps growing over a 1m window for 1 minute.
    - alert: ClickhouseRejectedInsertQueries
      expr: |-
        increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0
      for: 1m
      labels: { severity: warning }
      annotations:
        summary: "ClickHouse rejected insert queries (instance {{ $labels.instance }})"
        description: |-
          INSERTs rejected due to too many active data parts. Reduce insert frequency.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when DelayedInserts keeps growing over a 5m window for 2 minutes.
    - alert: ClickhouseDelayedInsertQueries
      expr: |-
        increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "ClickHouse delayed insert queries (instance {{ $labels.instance }})"
        description: |-
          INSERTs delayed due to high number of active parts.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when ZooKeeperHardwareExceptions keeps growing over a 1m window for 1 minute.
    - alert: ClickhouseZookeeperHardwareException
      expr: |-
        increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "ClickHouse zookeeper hardware exception (instance {{ $labels.instance }})"
        description: |-
          Zookeeper hardware exception: network issues communicating with ZooKeeper
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 1m send rate or receive rate stays above 100*1024*1024 bytes per second for 2 minutes.
    # Please replace the threshold with an appropriate value
    - alert: ClickhouseHighNetworkUsage
      expr: |-
        rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "ClickHouse high network usage (instance {{ $labels.instance }})"
        description: |-
          High network usage. ClickHouse network usage exceeds 100MB/s.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when DistributedRejectedInserts keeps growing over a 5m window for 2 minutes.
    - alert: ClickhouseDistributedRejectedInserts
      expr: |-
        increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "ClickHouse distributed rejected inserts (instance {{ $labels.instance }})"
        description: |-
          INSERTs into Distributed tables rejected due to pending bytes limit.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/cloudflare/lablabs-cloudflare-exporter.yml
================================================
groups:

- name: LablabsCloudflareExporter

  
  rules:

    # Fires immediately when 4xx responses exceed 5% of a zone's requests over 15 minutes; the `and` clause requires some traffic.
    - alert: CloudflareHttp4xxErrorRate
      expr: |-
        (sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[15m])) > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Cloudflare http 4xx error rate (instance {{ $labels.instance }})"
        description: |-
          Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Share of 5xx responses per Cloudflare zone over the last 5 minutes.
    # The expression aggregates by zone, so `zone` is the only label left on the
    # resulting series; the summary identifies the zone instead of an instance
    # label that would always render empty.
    # The trailing `and ... > 0` guard skips zones without any traffic.
    - alert: CloudflareHttp5xxErrorRate
      expr: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[5m])) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Cloudflare http 5xx error rate (zone {{ $labels.zone }})
        description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/consul/consul-exporter.yml
================================================
groups:

- name: ConsulExporter

  
  rules:

    - alert: ConsulServiceHealthcheckFailed
      expr: 'consul_catalog_service_node_healthy == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Consul service healthcheck failed (instance {{ $labels.instance }})
        description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires as soon as fewer than 3 raft peers are reported.
    # The threshold of 3 is hard-coded; adjust it to the number of servers in your cluster.
    - alert: ConsulMissingMasterNode
      expr: 'consul_raft_peers < 3'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Consul missing master node (instance {{ $labels.instance }})
        description: "Number of consul raft peers should be 3, in order to preserve quorum.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ConsulAgentUnhealthy
      expr: 'consul_health_node_status{status="critical"} == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Consul agent unhealthy (instance {{ $labels.instance }})
        description: "A Consul agent is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/coredns/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    - alert: CorednsPanicCount
      expr: 'increase(coredns_panics_total[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: CoreDNS Panic Count (instance {{ $labels.instance }})
        description: "Number of CoreDNS panics encountered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/cortex/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    - alert: CortexRulerConfigurationReloadFailure
      expr: 'cortex_ruler_config_last_reload_successful != 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Cortex ruler configuration reload failure (instance {{ $labels.instance }})
        description: "Cortex ruler configuration reload failure (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CortexNotConnectedToAlertmanager
      expr: 'cortex_prometheus_notifications_alertmanagers_discovered < 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Cortex not connected to Alertmanager (instance {{ $labels.instance }})
        description: "Cortex not connected to Alertmanager (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    # The alert name is kept as-is for compatibility with existing routing/silences;
    # only the human-readable annotation text is corrected.
    - alert: CortexNotificationAreBeingDropped
      expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Cortex notifications are being dropped (instance {{ $labels.instance }})
        description: "Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: CortexNotificationError
      expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Cortex notification error (instance {{ $labels.instance }})
        description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CortexIngesterUnhealthy
      expr: 'cortex_ring_members{state="Unhealthy", name="ingester"} > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Cortex ingester unhealthy (instance {{ $labels.instance }})
        description: "Cortex has an unhealthy ingester\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the query-frontend queue length, summed per job, stays above zero for 5 minutes.
    # The expression aggregates by job, so `job` is the only label left on the
    # resulting series; the summary identifies the job instead of an instance
    # label that would always render empty.
    - alert: CortexFrontendQueriesStuck
      expr: 'sum by (job) (cortex_query_frontend_queue_length) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Cortex frontend queries stuck (job {{ $labels.job }})
        description: "There are queued up queries in query-frontend.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml
================================================
groups:

- name: GesellixCouchdbPrometheusExporter

  
  rules:

    - alert: CouchdbNodeDown
      expr: 'couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: CouchDB node down (instance {{ $labels.instance }})
        description: "CouchDB node is not responding (node_up metric is 0) for more than 2 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbAtomMemoryUsageCritical
      expr: 'couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: CouchDB atom memory usage critical (instance {{ $labels.instance }})
        description: "Atom memory usage is above 90% of limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the number of open databases stays above 900 (0.9 * 1000) for 5 minutes.
    # NOTE(review): 1000 is a hard-coded capacity, not read from a metric; presumably it
    # stands for the node's configured open-database limit. Replace it with your own value.
    - alert: CouchdbOpenDatabasesCritical
      expr: 'couchdb_httpd_open_databases > 0.9 * 1000'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: CouchDB open databases critical (instance {{ $labels.instance }})
        description: "Number of open databases exceeds 90% of node capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the number of open OS files stays above 90% of 65535 for 5 minutes.
    # NOTE(review): 65535 is a hard-coded limit, not read from a metric; presumably it
    # stands for the file-descriptor limit of the CouchDB process. Replace it with your own value.
    - alert: CouchdbOpenOsFilesCritical
      expr: 'couchdb_httpd_open_os_files > 0.9 * 65535'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: CouchDB open OS files critical (instance {{ $labels.instance }})
        description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Ratio of 5xx responses to all HTTP requests over the last 5 minutes.
    # The status-code series carry a `code` label that the request counter does not have,
    # so the 5xx rates are first summed with `without (code)`; otherwise the one-to-one
    # vector matching in the division finds no pairs and the alert can never fire.
    # The trailing `and ... > 0` guard skips nodes without any traffic.
    - alert: Couchdb5xxErrorRatioHigh
      expr: 'sum without (code) (rate(couchdb_httpd_status_codes{code=~"5.."}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: CouchDB 5xx error ratio high (instance {{ $labels.instance }})
        description: "More than 5% of HTTP requests are returning 5xx errors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbTemporaryViewReadRateCritical
      expr: 'rate(couchdb_httpd_temporary_view_reads[5m]) > 100'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: CouchDB temporary view read rate critical (instance {{ $labels.instance }})
        description: "Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbMangoQueriesScanningTooManyDocs
      expr: 'rate(couchdb_mango_too_many_docs_scanned[5m]) > 50'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: CouchDB Mango queries scanning too many docs (instance {{ $labels.instance }})
        description: "Some Mango queries are scanning too many documents, consider adding indexes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbMangoQueriesFailedDueToInvalidIndex
      expr: 'rate(couchdb_mango_query_invalid_index[5m]) > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: CouchDB Mango queries failed due to invalid index (instance {{ $labels.instance }})
        description: "Some Mango queries failed to execute because the index was missing or invalid\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbMangoDocsExaminedHigh
      expr: 'rate(couchdb_mango_docs_examined[5m]) > 1000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: CouchDB Mango docs examined high (instance {{ $labels.instance }})
        description: "High number of documents examined per Mango queries, consider indexing\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbReplicatorManagerDied
      expr: 'increase(couchdb_replicator_changes_manager_deaths[5m]) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: CouchDB Replicator manager died (instance {{ $labels.instance }})
        description: "Replication manager process has crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbReplicatorQueueProcessDied
      expr: 'increase(couchdb_replicator_changes_queue_deaths[5m]) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: CouchDB Replicator queue process died (instance {{ $labels.instance }})
        description: "Replication queue process has crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbReplicatorReaderProcessDied
      expr: 'increase(couchdb_replicator_changes_reader_deaths[5m]) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: CouchDB Replicator reader process died (instance {{ $labels.instance }})
        description: "Replication reader process has crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbReplicatorFailedToStart
      expr: 'increase(couchdb_replicator_failed_starts[5m]) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: CouchDB Replicator failed to start (instance {{ $labels.instance }})
        description: "One or more replication tasks failed to start\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbReplicationClusterUnstable
      expr: 'couchdb_replicator_cluster_is_stable == 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: CouchDB replication cluster unstable (instance {{ $labels.instance }})
        description: "The replication cluster is unstable, replication may be interrupted\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbReplicationReadFailures
      expr: 'increase(couchdb_replicator_changes_read_failures[5m]) > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: CouchDB replication read failures (instance {{ $labels.instance }})
        description: "Replication changes feed has failed reads more than 5 times in 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when open file descriptors stay above 85% of the process limit for 5 minutes.
    # The trailing `and process_max_fds > 0` guard skips series without a positive limit.
    # NOTE(review): the selectors carry no label matcher, so this rule evaluates every
    # series named process_open_fds / process_max_fds, not only CouchDB-related targets.
    # Consider adding a job matcher that fits your scrape config.
    - alert: CouchdbFileDescriptorsHigh
      expr: 'process_open_fds / process_max_fds > 0.85 and process_max_fds > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: CouchDB file descriptors high (instance {{ $labels.instance }})
        description: "Process is using more than 85% of allowed file descriptors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the process start time changed at least once within the last hour.
    # NOTE(review): the selector carries no label matcher, so this rule evaluates every
    # series named process_start_time_seconds, not only CouchDB-related targets.
    # Consider adding a job matcher that fits your scrape config.
    - alert: CouchdbProcessRestarted
      expr: 'changes(process_start_time_seconds[1h]) > 0'
      for: 1m
      labels:
        severity: info
      annotations:
        summary: CouchDB process restarted (instance {{ $labels.instance }})
        description: "CouchDB process has restarted recently\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbCriticalLogEntries
      expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: CouchDB critical log entries (instance {{ $labels.instance }})
        description: "Critical or error log entries detected in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/digitalocean/digitalocean-exporter.yml
================================================
groups:

- name: DigitaloceanExporter

  
  rules:

    - alert: DigitaloceanDropletDown
      expr: 'digitalocean_droplet_up == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: DigitalOcean droplet down (instance {{ $labels.instance }})
        description: "DigitalOcean droplet {{ $labels.name }} ({{ $labels.id }}) in {{ $labels.region }} is not running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: DigitaloceanAccountNotActive
      expr: 'digitalocean_account_active != 1'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: DigitalOcean account not active (instance {{ $labels.instance }})
        description: "DigitalOcean account is not active. It may be suspended or locked.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: DigitaloceanDatabaseDown
      expr: 'digitalocean_database_status == 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: DigitalOcean database down (instance {{ $labels.instance }})
        description: "DigitalOcean managed database {{ $labels.name }} ({{ $labels.engine }}) in {{ $labels.region }} is offline.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: DigitaloceanKubernetesClusterDown
      expr: 'digitalocean_kubernetes_cluster_up == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: DigitalOcean Kubernetes cluster down (instance {{ $labels.instance }})
        description: "DigitalOcean Kubernetes cluster {{ $labels.name }} ({{ $labels.version }}) in {{ $labels.region }} is not running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: DigitaloceanLoadBalancerDown
      expr: 'digitalocean_loadbalancer_status == 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: DigitalOcean load balancer down (instance {{ $labels.instance }})
        description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) is not active.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: DigitaloceanLoadBalancerNoBackends
      expr: 'digitalocean_loadbalancer_droplets == 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: DigitalOcean load balancer no backends (instance {{ $labels.instance }})
        description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) has no droplets attached.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: DigitaloceanFloatingIpNotAssigned
      expr: 'digitalocean_floating_ipv4_active == 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: DigitalOcean floating IP not assigned (instance {{ $labels.instance }})
        description: "DigitalOcean floating IP {{ $labels.ipv4 }} in {{ $labels.region }} is not assigned to any droplet.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: DigitaloceanActiveIncidents
      expr: 'digitalocean_incidents_total > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: DigitalOcean active incidents (instance {{ $labels.instance }})
        description: "DigitalOcean platform has {{ $value }} active incident(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: DigitaloceanExporterCollectionErrors
      expr: 'increase(digitalocean_errors_total[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: DigitalOcean exporter collection errors (instance {{ $labels.instance }})
        description: "DigitalOcean exporter {{ $labels.collector }} collector has {{ $value }} errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when more than 80% of the account's droplet limit is in use.
    # count() is grouped by (instance, job) and the division/guard match on those labels:
    # an ungrouped count() returns a series without labels, which never pairs with the
    # labelled digitalocean_account_droplet_limit series, so the alert could not fire.
    # The trailing `and ... > 0` guard skips accounts that report no positive limit.
    - alert: DigitaloceanDropletLimitApproaching
      expr: '(count by (instance, job) (digitalocean_droplet_up) / on (instance, job) digitalocean_account_droplet_limit) * 100 > 80 and on (instance, job) digitalocean_account_droplet_limit > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: DigitalOcean droplet limit approaching (instance {{ $labels.instance }})
        description: "DigitalOcean account is using {{ $value }}% of its droplet quota.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/docker-containers/google-cadvisor.yml
================================================
groups:

- name: GoogleCadvisor

  
  rules:

    # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
    - alert: ContainerKilled
      expr: 'time() - container_last_seen > 60'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Container killed (instance {{ $labels.instance }})
        description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
    # Fires only when no container_last_seen series exists at all for 5 minutes.
    # absent() returns a single series without an instance label (the selector has no
    # matchers), so {{ $labels.instance }} in the summary renders empty.
    - alert: ContainerAbsent
      expr: 'absent(container_last_seen)'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Container absent (instance {{ $labels.instance }})
        description: "A container is absent for 5 min\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard.
    - alert: ContainerHighCpuUtilization
      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Container High CPU utilization (instance {{ $labels.instance }})
        description: "Container CPU utilization is above 80% (current: {{ $value | printf \"%.2f\" }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
    - alert: ContainerHighMemoryUsage
      expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Container High Memory usage (instance {{ $labels.instance }})
        description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of used inodes per instance, computed from the container_fs_inodes_* metrics.
    # Free and total inodes are summed over the same set of series (name!="") so the ratio
    # is consistent; filtering only the numerator would overstate the usage.
    # The trailing `and ... > 0` guard skips instances that report no inodes.
    - alert: ContainerVolumeUsage
      expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total{name!=""}) BY (instance) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Container Volume usage (instance {{ $labels.instance }})
        description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ContainerHighThrottleRate
      expr: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Container high throttle rate (instance {{ $labels.instance }})
        description: "Container is being throttled ({{ $value | humanizePercentage }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Absolute change of the per-(instance, name) CPU usage rate (multiplied by 100).
    # First branch: current 1m rate versus the 1m rate one minute earlier.
    # Second branch: current 1m rate versus the 5m rate ending one minute earlier.
    # The branches are joined with `or`, so the second one only contributes
    # (instance, name) pairs that are missing from the first.
    - alert: ContainerHighLowChangeCpuUsage
      expr: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Container high low change CPU usage (instance {{ $labels.instance }})
        description: "This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ContainerLowCpuUtilization
      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
      for: 7d
      labels:
        severity: info
      annotations:
        summary: Container Low CPU utilization (instance {{ $labels.instance }})
        description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf \"%.2f\" }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ContainerLowMemoryUsage
      expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
      for: 7d
      labels:
        severity: info
      annotations:
        summary: Container Low Memory usage (instance {{ $labels.instance }})
        description: "Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/ebpf/ebpf-exporter.yml
================================================
groups:

- name: EbpfExporter

  
  rules:

    # The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running.
    - alert: EbpfExporterProgramNotAttached
      expr: 'ebpf_exporter_ebpf_program_attached == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: eBPF exporter program not attached (instance {{ $labels.instance }})
        description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EbpfExporterDecoderErrors
      expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: eBPF exporter decoder errors (instance {{ $labels.instance }})
        description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EbpfExporterNoEnabledConfigs
      expr: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: eBPF exporter no enabled configs (instance {{ $labels.instance }})
        description: "eBPF exporter has no enabled configurations. No eBPF programs are being run. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
================================================
groups:

- name: PrometheusCommunityElasticsearchExporter

  
  rules:

    - alert: ElasticsearchHeapUsageTooHigh
      expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})
        description: "The heap usage is over 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchHeapUsageWarning
      expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch Heap Usage warning (instance {{ $labels.instance }})
        description: "The heap usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchDiskOutOfSpace
      expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch disk out of space (instance {{ $labels.instance }})
        description: "The disk usage is over 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchDiskSpaceLow
      expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch disk space low (instance {{ $labels.instance }})
        description: "The disk usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchClusterRed
      expr: 'elasticsearch_cluster_health_status{color="red"} == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch Cluster Red (instance {{ $labels.instance }})
        description: "Elastic Cluster Red status\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchClusterYellow
      expr: 'elasticsearch_cluster_health_status{color="yellow"} == 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
        description: "Elastic Cluster Yellow status\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 1m delay allows a restart without triggering an alert.
    # The threshold of 3 nodes is hard-coded; adjust it to the expected size of your cluster.
    - alert: ElasticsearchHealthyNodes
      expr: 'elasticsearch_cluster_health_number_of_nodes < 3'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch Healthy Nodes (instance {{ $labels.instance }})
        description: "Missing node in Elasticsearch cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 1m delay allows a restart without triggering an alert.
    # The threshold of 3 data nodes is hard-coded; adjust it to the expected size of your cluster.
    - alert: ElasticsearchHealthyDataNodes
      expr: 'elasticsearch_cluster_health_number_of_data_nodes < 3'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch Healthy Data Nodes (instance {{ $labels.instance }})
        description: "Missing data node in Elasticsearch cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchRelocatingShards
      expr: 'elasticsearch_cluster_health_relocating_shards > 0'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Elasticsearch relocating shards (instance {{ $labels.instance }})
        description: "Elasticsearch is relocating shards\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchRelocatingShardsTooLong
      expr: 'elasticsearch_cluster_health_relocating_shards > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch relocating shards too long (instance {{ $labels.instance }})
        description: "Elasticsearch has been relocating shards for 15min\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchInitializingShards
      expr: 'elasticsearch_cluster_health_initializing_shards > 0'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Elasticsearch initializing shards (instance {{ $labels.instance }})
        description: "Elasticsearch is initializing shards\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchInitializingShardsTooLong
      expr: 'elasticsearch_cluster_health_initializing_shards > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch initializing shards too long (instance {{ $labels.instance }})
        description: "Elasticsearch has been initializing shards for 15 min\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchUnassignedShards
      expr: 'elasticsearch_cluster_health_unassigned_shards > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch unassigned shards (instance {{ $labels.instance }})
        description: "Elasticsearch has unassigned shards\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchPendingTasks
      expr: 'elasticsearch_cluster_health_number_of_pending_tasks > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch pending tasks (instance {{ $labels.instance }})
        description: "Elasticsearch has pending tasks. Cluster works slowly.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchNoNewDocuments
      expr: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch no new documents (instance {{ $labels.instance }})
        description: "No new documents for 10 min!\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchHighIndexingLatency
      expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
        description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Total indexing rate summed over all matching series, compared with a hard-coded
    # threshold of 10000 operations per second; replace it with a value that fits your workload.
    # sum() without a `by` clause drops every label, so {{ $labels.instance }} in the
    # summary renders empty.
    - alert: ElasticsearchHighIndexingRate
      expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
        description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Total search queries per second across every matching series, compared to 100.
    # NOTE: sum() has no grouping clause, so the result carries no labels and the
    # instance label referenced in the summary renders empty.
    - alert: ElasticsearchHighQueryRate
      expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
        description: "The query rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Divides the 1m rate of search query time by the 1m rate of search queries, i.e. the
    # average seconds per query, and compares it to 1s.
    # The trailing `and` clause drops series that served no queries (0/0 would be NaN).
    - alert: ElasticsearchHighQueryLatency
      expr: 'rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
        description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/envoy/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires when envoy_server_live has been any value other than 1 for 1 minute.
    - alert: EnvoyServerNotLive
      expr: 'envoy_server_live != 1'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Envoy server not live (instance {{ $labels.instance }})
        description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Allocated memory as a percentage of heap size, compared to 90%.
    # The trailing `and` clause skips series reporting a zero heap size (division by zero).
    - alert: EnvoyHighMemoryUsage
      expr: 'envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high memory usage (instance {{ $labels.instance }})
        description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per-instance percentage of completed downstream requests whose response class is 5xx,
    # measured over 5m. The trailing `and` clause requires a non-zero completed-request rate
    # so idle instances do not produce a 0/0 result.
    - alert: EnvoyHighDownstreamHttp5xxErrorRate
      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Envoy high downstream HTTP 5xx error rate (instance {{ $labels.instance }})
        description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per-instance percentage of completed downstream requests whose response class is 4xx,
    # measured over 5m. The trailing `and` clause requires a non-zero completed-request rate
    # so idle instances do not produce a 0/0 result.
    - alert: EnvoyHighDownstreamHttp4xxErrorRate
      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high downstream HTTP 4xx error rate (instance {{ $labels.instance }})
        description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (for: 0m) when the listener overflow counter grew by more than 5
    # within the last 5 minutes.
    - alert: EnvoyDownstreamConnectionsOverflowing
      expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Envoy downstream connections overflowing (instance {{ $labels.instance }})
        description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a cluster's healthy-member gauge has been exactly 0 for 1 minute.
    - alert: EnvoyClusterMembershipEmpty
      expr: 'envoy_cluster_membership_healthy == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Envoy cluster membership empty (instance {{ $labels.instance }})
        description: "Envoy cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} has no healthy members\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Healthy members as a percentage of total members, compared to 75%.
    # The trailing `and` clause excludes clusters whose total membership is zero.
    - alert: EnvoyClusterMembershipDegraded
      expr: 'envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy cluster membership degraded (instance {{ $labels.instance }})
        description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the upstream connect-failure counter keeps growing by more than 10 per
    # 5-minute window for 5 consecutive minutes.
    - alert: EnvoyHighClusterUpstreamConnectionFailures
      expr: 'increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
        description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of completed upstream requests that timed out, per series, over 5m.
    # The trailing `and` clause drops series with no completed requests (0/0 would be NaN).
    - alert: EnvoyHighClusterUpstreamRequestTimeoutRate
      expr: 'rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high cluster upstream request timeout rate (instance {{ $labels.instance }})
        description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of completed upstream requests answered with a 5xx class response, over 5m.
    # The trailing `and` clause drops series with no completed requests (0/0 would be NaN).
    - alert: EnvoyHighClusterUpstream5xxErrorRate
      expr: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Envoy high cluster upstream 5xx error rate (instance {{ $labels.instance }})
        description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the health-check failure counter keeps growing by more than 5 per
    # 5-minute window for 5 consecutive minutes.
    - alert: EnvoyClusterHealthCheckFailures
      expr: 'increase(envoy_cluster_health_check_failure[5m]) > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy cluster health check failures (instance {{ $labels.instance }})
        description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Informational: the active-ejections gauge has been above zero for 5 minutes.
    - alert: EnvoyClusterOutlierDetectionEjectionsActive
      expr: 'envoy_cluster_outlier_detection_ejections_active > 0'
      for: 5m
      labels:
        severity: info
      annotations:
        summary: Envoy cluster outlier detection ejections active (instance {{ $labels.instance }})
        description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (for: 0m) when the listener SSL connection-error counter grew by
    # more than 5 within the last 5 minutes.
    - alert: EnvoyListenerSslConnectionErrors
      expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Envoy listener SSL connection errors (instance {{ $labels.instance }})
        description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (for: 0m) when the global connection-overflow counter grew by
    # more than 5 within the last 5 minutes.
    - alert: EnvoyGlobalDownstreamConnectionsOverflowing
      expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Envoy global downstream connections overflowing (instance {{ $labels.instance }})
        description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when the days-until-expiry gauge drops below 7.
    # The comparison has no lower bound, so it keeps firing once the value reaches zero.
    - alert: EnvoySslCertificateExpiringSoon
      expr: 'envoy_server_days_until_first_cert_expiring < 7'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Envoy SSL certificate expiring soon (instance {{ $labels.instance }})
        description: "SSL certificate loaded by Envoy on {{ $labels.instance }} expires in less than 7 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Envoy gauges are unsigned and the server reports 0 once a certificate has expired
    # (or has less than one full day left), so the gauge never goes negative and a
    # `< 0` comparison can never match. Compare against `<= 0` instead.
    - alert: EnvoySslCertificateExpired
      expr: 'envoy_server_days_until_first_cert_expiring <= 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Envoy SSL certificate expired (instance {{ $labels.instance }})
        description: "SSL certificate loaded by Envoy on {{ $labels.instance }} has expired or expires in less than a day\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when either the `cx_open` or the `rq_open` circuit-breaker gauge
    # of a cluster reads 1.
    - alert: EnvoyClusterCircuitBreakerTripped
      expr: 'envoy_cluster_circuit_breakers_default_cx_open == 1 or envoy_cluster_circuit_breakers_default_rq_open == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Envoy cluster circuit breaker tripped (instance {{ $labels.instance }})
        description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when the `upstream_cx_none_healthy` counter grew at all within
    # the last 5 minutes.
    - alert: EnvoyNoHealthyUpstream
      expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Envoy no healthy upstream (instance {{ $labels.instance }})
        description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Despite the name, this compares an absolute count rather than a ratio: more than 5
    # downstream request timeouts per 5-minute window, sustained for 5 minutes.
    - alert: EnvoyHighDownstreamRequestTimeoutRate
      expr: 'increase(envoy_http_downstream_rq_timeout[5m]) > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Envoy high downstream request timeout rate (instance {{ $labels.instance }})
        description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/etcd/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires when the number of etcd_server_id series is even.
    # NOTE: count() drops all labels, so the instance label in the summary renders empty,
    # and the reported value is the modulo result (0), not the member count.
    - alert: EtcdInsufficientMembers
      expr: 'count(etcd_server_id) % 2 == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Etcd insufficient Members (instance {{ $labels.instance }})
        description: "Etcd cluster should have an odd number of members\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately for every member whose etcd_server_has_leader gauge reads 0.
    - alert: EtcdNoLeader
      expr: 'etcd_server_has_leader == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Etcd no Leader (instance {{ $labels.instance }})
        description: "Etcd cluster have no leader\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when a member has observed more than 2 leader changes within
    # the last 10 minutes.
    - alert: EtcdHighNumberOfLeaderChanges
      expr: 'increase(etcd_server_leader_changes_seen_total[10m]) > 2'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Etcd high number of leader changes (instance {{ $labels.instance }})
        description: "Etcd leader changed {{ $value }} times during 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
    # The failure ratio (threshold 1%) is computed per (grpc_service, grpc_method); every other
    # label, including instance, is aggregated away, so the instance in the summary is empty.
    # The trailing `and` clause skips methods that handled no requests (0/0 would be NaN).
    - alert: EtcdHighNumberOfFailedGrpcRequestsWarning
      expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Etcd high number of failed GRPC requests warning (instance {{ $labels.instance }})
        description: "More than 1% GRPC request failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
    # The failure ratio (threshold 5%) is computed per (grpc_service, grpc_method); every other
    # label, including instance, is aggregated away, so the instance in the summary is empty.
    # The trailing `and` clause skips methods that handled no requests (0/0 would be NaN).
    - alert: EtcdHighNumberOfFailedGrpcRequestsCritical
      expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Etcd high number of failed GRPC requests critical (instance {{ $labels.instance }})
        description: "More than 5% GRPC request failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 99th-percentile handling time of unary gRPC calls, computed per
    # (grpc_service, grpc_method) from 1m bucket rates, compared to 0.15s.
    - alert: EtcdGrpcRequestsSlow
      expr: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
        description: "GRPC requests slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Failed / received HTTP request ratio per HTTP method over 1m, compared to 1%.
    # The trailing `and` clause skips methods that received no requests (0/0 would be NaN).
    - alert: EtcdHighNumberOfFailedHttpRequestsWarning
      expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }})
        description: "More than 1% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Failed / received HTTP request ratio per HTTP method over 1m, compared to 5%.
    # The trailing `and` clause skips methods that received no requests (0/0 would be NaN).
    - alert: EtcdHighNumberOfFailedHttpRequestsCritical
      expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }})
        description: "More than 5% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 99th-percentile duration of successful HTTP requests, computed from 1m bucket rates
    # without aggregation (one result per label set), compared to 0.15s.
    - alert: EtcdHttpRequestsSlow
      expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Etcd HTTP requests slow (instance {{ $labels.instance }})
        description: "HTTP requests slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 99th-percentile peer round-trip time, computed from 1m bucket rates without
    # aggregation (one result per label set), compared to 0.15s.
    - alert: EtcdMemberCommunicationSlow
      expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Etcd member communication slow (instance {{ $labels.instance }})
        description: "Etcd member communication slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the failed-proposal counter grew by more than 5 over the last hour and
    # the condition has held for 2 minutes.
    - alert: EtcdHighNumberOfFailedProposals
      expr: 'increase(etcd_server_proposals_failed_total[1h]) > 5'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
        description: "Etcd server got {{ $value }} failed proposals in the past hour\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 99th-percentile WAL fsync duration from 1m bucket rates, compared to 0.5s.
    - alert: EtcdHighFsyncDurations
      expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Etcd high fsync durations (instance {{ $labels.instance }})
        description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 99th-percentile backend commit duration from 1m bucket rates, compared to 0.25s.
    - alert: EtcdHighCommitDurations
      expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Etcd high commit durations (instance {{ $labels.instance }})
        description: "Etcd commit duration increasing, 99th percentile is over 0.25s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/fluxcd/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires when a Kustomization resource has been exported with ready="False" for 15 minutes.
    - alert: FluxKustomizationFailure
      expr: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Flux Kustomization Failure (instance {{ $labels.instance }})
        description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a HelmRelease resource has been exported with ready="False" for 15 minutes.
    - alert: FluxHelmreleaseFailure
      expr: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Flux HelmRelease Failure (instance {{ $labels.instance }})
        description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a source resource (GitRepository, HelmRepository, Bucket or OCIRepository)
    # has been exported with ready="False" for 15 minutes.
    - alert: FluxSourceIssue
      expr: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Flux Source Issue (instance {{ $labels.instance }})
        description: "Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when an image-automation resource (ImagePolicy, ImageRepository or
    # ImageUpdateAutomation) has been exported with ready="False" for 15 minutes.
    - alert: FluxImageIssue
      expr: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Flux Image Issue (instance {{ $labels.instance }})
        description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/freeswitch/znerol-freeswitch-exporter.yml
================================================
groups:

- name: ZnerolFreeswitchExporter

  
  rules:

    # Fires immediately when the freeswitch_up gauge reads 0.
    - alert: FreeswitchDown
      expr: 'freeswitch_up == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Freeswitch down (instance {{ $labels.instance }})
        description: "Freeswitch is unresponsive\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Active sessions as a percentage of the session limit, compared to 80%.
    # The trailing `and` clause skips instances reporting a zero limit (division by zero).
    - alert: FreeswitchSessionsWarning
      expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Freeswitch Sessions Warning (instance {{ $labels.instance }})
        description: "High sessions usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Active sessions as a percentage of the session limit, compared to 90%.
    # The trailing `and` clause skips instances reporting a zero limit (division by zero).
    - alert: FreeswitchSessionsCritical
      expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90 and freeswitch_session_limit > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Freeswitch Sessions Critical (instance {{ $labels.instance }})
        description: "High sessions usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/gitlab-ci/gitaly.yml
================================================
groups:

- name: Gitaly

  
  rules:

    # grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise.
    # Aggregated per instance so the instance label used in the annotations is populated
    # (a bare sum() would drop every label). The trailing `and` clause skips instances
    # that handled no gRPC calls in the window, where the ratio would be 0/0.
    - alert: GitlabGitalyHighGrpcErrorRate
      expr: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Gitaly high gRPC error rate (instance {{ $labels.instance }})
        description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
    # concurrency limits. This directly impacts users trying to push, pull, or clone.
    # This alert is derived from the GitLab Omnibus default rules.
    # Aggregated per instance so the instance label used in the annotations is populated
    # (a bare sum() would drop every label). The trailing `and` clause skips instances
    # that handled no gRPC calls in the window, where the ratio would be 0/0.
    - alert: GitlabGitalyResourceExhausted
      expr: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: GitLab Gitaly resource exhausted (instance {{ $labels.instance }})
        description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # p95 handling time of unary Gitaly RPCs from 5m bucket rates, compared to 1s.
    # `instance` is kept in the grouping so the instance label used in the annotations
    # is populated; grouping by `le` alone would drop it.
    - alert: GitlabGitalyHighRpcLatency
      expr: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le, instance)) > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Gitaly high RPC latency (instance {{ $labels.instance }})
        description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the cgroup CFS throttled-seconds counter has been increasing
    # (5m rate above zero) for 5 minutes.
    - alert: GitlabGitalyCpuThrottled
      expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Gitaly CPU throttled (instance {{ $labels.instance }})
        description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when the counter of authentications with status="failed" grew
    # at all within the last 5 minutes.
    - alert: GitlabGitalyAuthenticationFailures
      expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: GitLab Gitaly authentication failures (instance {{ $labels.instance }})
        description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail.
    # Check Gitaly service health and logs.
    # The rule fires immediately on any transition to the "open" state counted within
    # the last 5 minutes.
    - alert: GitlabGitalyCircuitBreakerTripped
      expr: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: GitLab Gitaly circuit breaker tripped (instance {{ $labels.instance }})
        description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
================================================
groups:

- name: GitlabBuiltInExporter

  
  rules:

    # Queued connections indicate Puma workers are saturated.
    # Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb.
    # Fires when the queued-connection gauge has stayed above 5 for 5 minutes.
    - alert: GitlabPumaHighQueuedConnections
      expr: 'puma_queued_connections > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Puma high queued connections (instance {{ $labels.instance }})
        description: "GitLab Puma has {{ $value }} queued connections on {{ $labels.instance }}. Requests are waiting for an available worker thread.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the pool-capacity gauge has been exactly 0 for 5 minutes.
    - alert: GitlabPumaNoAvailablePoolCapacity
      expr: 'puma_pool_capacity == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: GitLab Puma no available pool capacity (instance {{ $labels.instance }})
        description: "GitLab Puma pool capacity on {{ $labels.instance }} has been at 0 for 5 minutes. All threads are busy.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Compares the running-worker gauge against the configured-worker gauge on matching
    # label sets; the reported value is the left-hand side (running workers).
    - alert: GitlabPumaWorkersNotRunning
      expr: 'puma_running_workers < puma_workers'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Puma workers not running (instance {{ $labels.instance }})
        description: "GitLab Puma on {{ $labels.instance }} has {{ $value }} running workers out of expected total.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is 5% of all requests returning server errors.
    # Check GitLab logs at /var/log/gitlab/ for root cause.
    # Aggregated per instance so the instance label used in the annotations is populated
    # (a bare sum() would drop every label). The trailing `and` clause skips instances
    # that served no requests in the window, where the ratio would be 0/0.
    - alert: GitlabHighHttpErrorRate
      expr: 'sum by (instance) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (instance) (rate(http_requests_total[5m])) * 100 > 5 and sum by (instance) (rate(http_requests_total[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: GitLab high HTTP error rate (instance {{ $labels.instance }})
        description: "GitLab is returning more than 5% HTTP 5xx errors on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 10s may need adjustment based on your instance size and workload.
    # p95 is computed per instance: `instance` is kept in the grouping so the instance
    # label used in the annotations is populated (grouping by `le` alone would drop it).
    - alert: GitlabHighHttpRequestLatency
      expr: 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab high HTTP request latency (instance {{ $labels.instance }})
        description: "GitLab p95 HTTP request latency on {{ $labels.instance }} is above 10 seconds.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
    # A sustained failure rate indicates background processing issues.
    # Threshold: more than 0.1 failed jobs per second (5m rate), per series, for 10 minutes.
    - alert: GitlabSidekiqJobsFailing
      expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0.1'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: GitLab Sidekiq jobs failing (instance {{ $labels.instance }})
        description: "GitLab Sidekiq jobs are failing at a rate of {{ $value }} per second on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # When running jobs approach the concurrency limit, new jobs will queue up.
    # Consider scaling Sidekiq workers or increasing concurrency.
    # Both sides are aggregated per instance so the instance label used in the annotations
    # is populated (a bare sum() would drop every label). Fires when running jobs reach
    # 90% of the summed concurrency; the reported value is the running-job count.
    - alert: GitlabSidekiqQueueTooLarge
      expr: 'sum by (instance) (sidekiq_running_jobs) >= sum by (instance) (sidekiq_concurrency) * 0.9'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: GitLab Sidekiq queue too large (instance {{ $labels.instance }})
        description: "GitLab Sidekiq has {{ $value }} running jobs, approaching concurrency limit on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
    # p95 completion time is computed per worker and instance: `instance` is kept in the
    # grouping so the instance label used in the annotations is populated.
    - alert: GitlabSidekiqHighJobCompletionTime
      expr: 'histogram_quantile(0.95, sum(rate(sidekiq_jobs_completion_seconds_bucket[5m])) by (le, worker, instance)) > 300'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: GitLab Sidekiq high job completion time (instance {{ $labels.instance }})
        description: "GitLab Sidekiq job p95 completion time on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
    # High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes.
    # p95 is computed per instance: `instance` is kept in the grouping so the instance
    # label used in the annotations is populated (grouping by `le` alone would drop it).
    - alert: GitlabSidekiqHighQueueLatency
      expr: 'histogram_quantile(0.95, sum(rate(sidekiq_jobs_queue_duration_seconds_bucket[5m])) by (le, instance)) > 60'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Sidekiq high queue latency (instance {{ $labels.instance }})
        description: "GitLab Sidekiq jobs on {{ $labels.instance }} are waiting more than 60 seconds before being processed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # When the pool is near saturation, requests may block waiting for a connection.
    # Increase db_pool_size in gitlab.rb or investigate slow queries.
    # Busy connections as a percentage of pool size, compared to 90%; the trailing `and`
    # clause skips pools reporting a zero size (division by zero).
    - alert: GitlabDatabaseConnectionPoolSaturation
      expr: 'gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90 and gitlab_database_connection_pool_size > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab database connection pool saturation (instance {{ $labels.instance }})
        description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) is {{ $value }}% busy.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the dead-connection gauge of a pool has stayed above zero for 5 minutes.
    - alert: GitlabDatabaseConnectionPoolDeadConnections
      expr: 'gitlab_database_connection_pool_dead > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab database connection pool dead connections (instance {{ $labels.instance }})
        description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) has {{ $value }} dead connections.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the waiting-thread gauge of a pool has stayed above zero for 5 minutes.
    - alert: GitlabDatabaseConnectionPoolWaiting
      expr: 'gitlab_database_connection_pool_waiting > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab database connection pool waiting (instance {{ $labels.instance }})
        description: "GitLab on {{ $labels.instance }} has {{ $value }} threads waiting for a database connection.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # p95 pipeline creation time from 5m bucket rates, compared to 30s.
    # `instance` is kept in the grouping so the instance label used in the annotations
    # is populated; grouping by `le` alone would drop it.
    - alert: GitlabCiPipelineCreationSlow
      expr: 'histogram_quantile(0.95, sum(rate(gitlab_ci_pipeline_creation_duration_seconds_bucket[5m])) by (le, instance)) > 30'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab CI pipeline creation slow (instance {{ $labels.instance }})
        description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This metric may not exist in all GitLab versions. Verify against your GitLab installation.
    # Fires for each series whose 5m rate has been above zero for 10 minutes.
    - alert: GitlabCiPipelineFailuresIncreasing
      expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: GitLab CI pipeline failures increasing (instance {{ $labels.instance }})
        description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Frequent runner auth failures may indicate expired tokens or misconfigured runners.
    # Fires when more than 5 failures were counted in the trailing 5-minute window, and that
    # condition holds for 5 minutes.
    - alert: GitlabCiRunnerAuthenticationFailures
      expr: 'increase(gitlab_ci_runner_authentication_failure_total[5m]) > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab CI runner authentication failures (instance {{ $labels.instance }})
        description: "GitLab CI runners are experiencing authentication failures on {{ $labels.instance }} ({{ $value }} failures).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 2GB may need adjustment based on your instance size.
    # High memory usage can lead to OOM kills and service disruptions.
    # The selector only matches scrape jobs whose name contains "gitlab"; the threshold is
    # 2e+9 bytes of resident memory per process, sustained for 10 minutes.
    - alert: GitlabHighMemoryUsage
      expr: 'process_resident_memory_bytes{job=~".*gitlab.*"} > 2e+9'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: GitLab high memory usage (instance {{ $labels.instance }})
        description: "GitLab process on {{ $labels.instance }} is using {{ $value | humanize1024 }}B of RSS memory.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Heap fragmentation above 50% means a significant amount of memory is wasted.
    # A Puma worker restart may help reclaim memory.
    # The metric is compared as a 0-1 ratio (0.5 == 50%) and must stay above the threshold
    # for 15 minutes; only jobs whose name contains "gitlab" are considered.
    - alert: GitlabRubyHeapFragmentation
      expr: 'ruby_gc_stat_ext_heap_fragmentation{job=~".*gitlab.*"} > 0.5'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: GitLab Ruby heap fragmentation (instance {{ $labels.instance }})
        description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the 5m per-second rate of uncaught Rack errors stays above zero for 5 minutes.
    # NOTE(review): the selector has no job filter, so any job exposing
    # rack_uncaught_errors_total is matched — confirm this is intended.
    - alert: GitlabRackUncaughtErrors
      expr: 'rate(rack_uncaught_errors_total[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab rack uncaught errors (instance {{ $labels.instance }})
        description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This may happen during a rolling deployment. If it persists, investigate incomplete upgrades.
    # The inner count groups gitlab_build_info by `version`; the outer count yields the number
    # of distinct versions. The outer count() drops every label, so {{ $labels.instance }} in
    # the summary renders empty.
    # NOTE(review): with `for: 0m` the alert fires as soon as two versions coexist, including
    # during a rolling deployment — consider a non-zero `for` if that is too noisy.
    - alert: GitlabVersionMismatch
      expr: 'count(count by (version) (gitlab_build_info)) > 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: GitLab version mismatch (instance {{ $labels.instance }})
        description: "Multiple GitLab versions are running across the fleet.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a process in a job whose name contains "gitlab" uses more than 80% of its
    # file descriptor limit for 5 minutes. process_max_fds carries no job filter of its own,
    # but the division matches series one-to-one on identical labels, so only the filtered
    # processes are evaluated. The trailing `> 0` clause guards against a zero limit.
    - alert: GitlabHighFileDescriptorUsage
      expr: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80 and process_max_fds > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab high file descriptor usage (instance {{ $labels.instance }})
        description: "GitLab on {{ $labels.instance }} is using {{ $value }}% of available file descriptors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Compares the per-instance sum of running threads with 1.5x the expected maximum.
    # `*` binds tighter than `>`, so this reads: sum(running) > on(instance) (max_expected * 1.5).
    # {{ $value }} is the left operand, i.e. the running-thread sum.
    # NOTE(review): one-to-one matching requires a single max_expected series per instance;
    # confirm the metric is not exported once per worker process.
    - alert: GitlabRubyThreadsSaturated
      expr: 'sum by (instance) (gitlab_ruby_threads_running_threads) > on(instance) gitlab_ruby_threads_max_expected_threads * 1.5'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: GitLab Ruby threads saturated (instance {{ $labels.instance }})
        description: "GitLab running threads on {{ $labels.instance }} have exceeded the expected maximum ({{ $value }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/gitlab-ci/workhorse.yml
================================================
groups:

- name: Workhorse

  
  rules:

    # Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying.
    # Threshold from GitLab Omnibus default rules: 10% for high-traffic instances.
    # The ratio is computed per instance: aggregating with `by (instance)` keeps the label
    # that the annotations reference (a label-less sum() would render it empty). The final
    # `and ... > 0` clause requires non-zero traffic so idle instances do not alert.
    - alert: GitlabWorkhorseHighErrorRate
      expr: 'sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: GitLab Workhorse high error rate (instance {{ $labels.instance }})
        description: "GitLab Workhorse on {{ $labels.instance }} is returning more than 10% HTTP 5xx errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # p95 of Workhorse HTTP request duration, computed per instance from the histogram buckets.
    # `instance` is kept in the aggregation so that {{ $labels.instance }} in the annotations
    # is populated; aggregating by `le` alone would drop it and render an empty instance.
    - alert: GitlabWorkhorseHighLatency
      expr: 'histogram_quantile(0.95, sum(rate(gitlab_workhorse_http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Workhorse high latency (instance {{ $labels.instance }})
        description: "GitLab Workhorse on {{ $labels.instance }} p95 request latency is above 10 seconds.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 100 may need adjustment based on instance size.
    # Fires when the in-flight request gauge of a series stays above 100 for 5 minutes.
    # NOTE(review): the alert name contains a hyphen, unlike the other CamelCase alert names
    # in this file; renaming would change the `alertname` label seen by Alertmanager routes.
    - alert: GitlabWorkhorseHighIn-flightRequests
      expr: 'gitlab_workhorse_http_in_flight_requests > 100'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: GitLab Workhorse high in-flight requests (instance {{ $labels.instance }})
        description: "GitLab Workhorse on {{ $labels.instance }} has {{ $value }} in-flight requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/golang/golang-exporter.yml
================================================
groups:

- name: GolangExporter

  
  rules:

    # Threshold is a rough default. High-concurrency servers may legitimately run thousands of goroutines. Adjust to match your baseline.
    # Fires when go_goroutines stays above 1000 for 5 minutes. The selector has no job filter,
    # so every scraped target exposing the metric is evaluated.
    - alert: GoGoroutineCountHigh
      expr: 'go_goroutines > 1000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go goroutine count high (instance {{ $labels.instance }})
        description: "Go application has too many goroutines (> 1000), potential goroutine leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # quantile="1" is the maximum observed GC pause in the current summary window, not p99.
    # A single outlier pause can push this above 1s. The for: 5m ensures the max stays elevated.
    # The threshold is expressed in seconds (> 1).
    - alert: GoGcDurationHigh
      expr: 'go_gc_duration_seconds{quantile="1"} > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go GC duration high (instance {{ $labels.instance }})
        description: "Go GC pause duration is too high (max > 1s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # go_memstats_sys_bytes is the total memory obtained from the OS by the Go runtime, not total host memory.
    # This ratio measures Go-internal memory utilization, not system-level memory pressure.
    # Fires when heap_alloc / sys exceeds 90% for 5 minutes.
    - alert: GoMemoryUsageHigh
      expr: '(go_memstats_heap_alloc_bytes / go_memstats_sys_bytes) * 100 > 90'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go memory usage high (instance {{ $labels.instance }})
        description: "Go heap allocation is using most of the runtime's reserved memory (> 90%), indicating the process may need more memory or has a leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline.
    # Fires when go_threads stays above 500 for 5 minutes.
    - alert: GoThreadCountHigh
      expr: 'go_threads > 500'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go thread count high (instance {{ $labels.instance }})
        description: "Go OS thread count is high (> 500), potential blocking syscall or CGo leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is a rough default. Adjust based on your application's normal object count.
    # Fires when go_memstats_heap_objects stays above 10,000,000 for 5 minutes.
    - alert: GoHeapObjectsCountHigh
      expr: 'go_memstats_heap_objects > 10000000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go heap objects count high (instance {{ $labels.instance }})
        description: "Go heap has too many live objects (> 10M), high GC pressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # go_memstats_gc_cpu_fraction is deprecated since Go 1.20 and may return 0 in newer versions.
    # Consider using runtime/metrics-based alternatives if running Go >= 1.20.
    # The metric is compared as a 0-1 fraction (0.05 == 5%) and must stay above the threshold
    # for 5 minutes. If the metric is absent or constantly 0, this rule never fires.
    - alert: GoGcCpuFractionHigh
      expr: 'go_memstats_gc_cpu_fraction > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go GC CPU fraction high (instance {{ $labels.instance }})
        description: "Go GC is consuming too much CPU (> 5%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # deriv() estimates the per-second slope of the gauge over the 5m window using linear
    # regression, so the threshold means "growing by more than 100 goroutines per second",
    # sustained for 5 minutes.
    # NOTE(review): that is a very steep growth rate; confirm the threshold was not meant
    # as growth per window rather than per second.
    - alert: GoGoroutineSpike
      expr: 'deriv(go_goroutines[5m]) > 100'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go goroutine spike (instance {{ $labels.instance }})
        description: "Go goroutine count is growing rapidly\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when idle heap bytes make up more than 90% of the heap memory obtained from the
    # OS (heap_idle / heap_sys, compared as a 0-1 ratio) for 5 minutes.
    # NOTE(review): a high idle ratio can also follow a released allocation burst; treat it
    # as a hint of fragmentation rather than proof.
    - alert: GoHeapFragmentation
      expr: 'go_memstats_heap_idle_bytes / go_memstats_heap_sys_bytes > 0.9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go heap fragmentation (instance {{ $labels.instance }})
        description: "Go heap has high idle ratio (> 90%), indicating memory fragmentation\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the 5m rate of go_memstats_alloc_bytes_total exceeds 1e9 bytes per second
    # for 5 minutes.
    # NOTE(review): this measures allocation throughput; a high allocation rate alone does
    # not show that memory is retained — cross-check with heap-in-use metrics before
    # concluding there is a leak.
    - alert: GoMemoryLeak
      expr: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go memory leak (instance {{ $labels.instance }})
        description: "Go application has sustained high allocation rate (> 1GB/s), potential memory leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when in-use stack memory stays above 1e9 bytes for 5 minutes.
    - alert: GoStackMemoryHigh
      expr: 'go_memstats_stack_inuse_bytes > 1e9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go stack memory high (instance {{ $labels.instance }})
        description: "Go stack memory usage is high (> 1GB), likely excessive goroutines or deep recursion\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/google-cloud-stackdriver/stackdriver-exporter.yml
================================================
groups:

- name: StackdriverExporter

  # Self-monitoring metrics use the stackdriver_monitoring_* prefix.
  # All self-monitoring metrics include a project_id label.
  
  rules:

    # Fires when the exporter's last-scrape error gauge stays above zero for 5 minutes,
    # i.e. consecutive scrapes of the upstream API keep failing.
    - alert: StackdriverExporterScrapeError
      expr: 'stackdriver_monitoring_last_scrape_error > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Stackdriver exporter scrape error (instance {{ $labels.instance }})
        description: "Stackdriver exporter failed to scrape metrics from Google Cloud Monitoring API for project {{ $labels.project_id }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the duration of the last scrape stays above 300 seconds (5 minutes)
    # for 5 minutes.
    - alert: StackdriverExporterSlowScrape
      expr: 'stackdriver_monitoring_last_scrape_duration_seconds > 300'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Stackdriver exporter slow scrape (instance {{ $labels.instance }})
        description: "Stackdriver exporter scrape for project {{ $labels.project_id }} is taking more than 5 minutes ({{ $value }}s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (`for: 0m`) once more than 5 scrape errors have been counted in the
    # trailing 15-minute window.
    - alert: StackdriverExporterScrapeErrorsIncreasing
      expr: 'increase(stackdriver_monitoring_scrape_errors_total[15m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Stackdriver exporter scrape errors increasing (instance {{ $labels.instance }})
        description: "Stackdriver exporter has had {{ $value }} scrape errors in the last 15 minutes for project {{ $labels.project_id }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # rate() is per second; multiplying by 60 converts it to calls per minute, which is what
    # {{ $value }} reports. Fires immediately (`for: 0m`) above 100 calls per minute.
    - alert: StackdriverExporterHighApiCalls
      expr: 'rate(stackdriver_monitoring_api_calls_total[5m]) * 60 > 100'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Stackdriver exporter high API calls (instance {{ $labels.instance }})
        description: "Stackdriver exporter is making {{ $value }} API calls per minute for project {{ $labels.project_id }}. This may hit Google Cloud Monitoring API quotas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (`for: 0m`) when the last-scrape timestamp is more than 600 seconds
    # (10 minutes) older than the evaluation time. {{ $value }} is that age in seconds.
    # NOTE(review): there is no `> 0` guard, so a timestamp of 0 would also fire — confirm
    # whether the exporter can report 0 before its first scrape.
    - alert: StackdriverExporterScrapeStale
      expr: 'time() - stackdriver_monitoring_last_scrape_timestamp > 600'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Stackdriver exporter scrape stale (instance {{ $labels.instance }})
        description: "Stackdriver exporter has not successfully scraped metrics for project {{ $labels.project_id }} in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/grafana-alloy/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # `unless` keeps the left-hand series that have no match on the right: instances that
    # reported alloy_build_info two hours ago but report none now. Fires immediately.
    # Because the comparison point is a fixed 2h offset, the alert stops firing on its own
    # about two hours after the instance disappeared, even if it never came back.
    - alert: GrafanaAlloyServiceDown
      expr: 'count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Grafana Alloy service down (instance {{ $labels.instance }})
        description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/grafana-mimir/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  # Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. This is intentional and expected.
  
  rules:

    # Takes, per job, the minimum count of ingester ring members in state "Unhealthy" across
    # all reporting series, and fires when it stays above zero for 15 minutes. Using min
    # means every reporter must see at least one unhealthy member.
    # Aggregating by `job` drops `instance`, so the summary's instance placeholder is empty.
    - alert: MimirIngesterUnhealthy
      expr: 'min by (job) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester unhealthy (instance {{ $labels.instance }})
        description: "Mimir has {{ $value }} unhealthy ingester(s) in the ring.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of requests answered with a 5xx status code, per (job, route), over 5m.
    # The `ready` and `debug_pprof` routes are excluded from both numerator and denominator.
    # The final `and ... > 0` clause requires non-zero traffic on the route.
    - alert: MimirRequestErrors
      expr: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1 and sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Mimir request errors (instance {{ $labels.instance }})
        description: "Mimir {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The inner count collapses series to one per (job, sha256); the outer
    # `count(...) without(sha256)` then counts the distinct hashes per job.
    # Fires when a job has served more than one runtime-config hash for a full hour.
    - alert: MimirInconsistentRuntimeConfig
      expr: 'count(count by (job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: Mimir inconsistent runtime config (instance {{ $labels.instance }})
        description: "An inconsistent runtime config file is used across Mimir instances.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Counts, per job, the instances whose last runtime-config reload failed (gauge == 0) and
    # fires when at least one has been failing for 5 minutes.
    # count() is required here: the `== 0` filter keeps only samples whose value is 0, so
    # summing them always yields 0 and a `sum(...) > 0` comparison could never be true.
    # {{ $value }} is the number of failing instances in the job.
    - alert: MimirBadRuntimeConfig
      expr: 'count by (job) (cortex_runtime_config_last_reload_successful == 0) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir bad runtime config (instance {{ $labels.instance }})
        description: "{{ $labels.job }} failed to reload runtime config.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # min_over_time(...[1m]) is the smallest queue length each series saw in the last minute;
    # the per-job sum being positive for 7 minutes means the queue never fully drained.
    - alert: MimirSchedulerQueriesStuck
      expr: 'sum by (job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0'
      for: 7m
      labels:
        severity: critical
      annotations:
        summary: Mimir scheduler queries stuck (instance {{ $labels.instance }})
        description: "There are {{ $value }} queued up queries in {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of failed cache operations per (name, operation, job) over 5m; fires above
    # 5% for 5 minutes. The final `and ... > 0` clause requires a non-zero operation rate.
    - alert: MimirCacheRequestErrors
      expr: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5 and sum by (name, operation, job) (rate(thanos_cache_operations_total[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Mimir cache request errors (instance {{ $labels.instance }})
        description: "Mimir cache {{ $labels.name }} is experiencing {{ printf \"%.2f\" $value }}% errors for {{ $labels.operation }} operation.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Ratio of KV-store requests with a non-2xx status code to all KV-store requests, per
    # (job, kv_name). Fires only when the ratio is exactly 1 (every request failing) for
    # 5 minutes, and the final `and ... > 0` clause requires that requests are being made.
    - alert: MimirKvStoreFailure
      expr: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1 and sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir KV store failure (instance {{ $labels.instance }})
        description: "Mimir {{ $labels.job }} KV store {{ $labels.kv_name }} is failing with 100% error rate.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of the memory-map-area limit in use, for jobs whose name contains
    # ingester, cortex, mimir or store-gateway. Fires above 80% for 5 minutes.
    # The final `and ... > 0` clause drops series reporting a zero limit.
    - alert: MimirMemoryMapAreasTooHigh
      expr: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80 and process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir memory map areas too high (instance {{ $labels.instance }})
        description: "Mimir {{ $labels.job }} is using {{ printf \"%.0f\" $value }}% of its memory map area limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Matches ingesters that currently hold zero tenants AND, matched on `instance`, held at
    # least one tenant an hour earlier (`offset 1h`). This keeps ingesters that never had
    # tenants from alerting. The condition must hold for 1 hour.
    - alert: MimirIngesterInstanceHasNoTenants
      expr: '(cortex_ingester_memory_users == 0) and on (instance) (cortex_ingester_memory_users offset 1h > 0)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Mimir ingester instance has no tenants (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} has no tenants assigned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Matches rulers that currently run zero rule-group managers AND, matched on `instance`,
    # ran at least one an hour earlier (`offset 1h`). This keeps rulers that never had rule
    # groups from alerting. The condition must hold for 1 hour.
    - alert: MimirRulerInstanceHasNoRuleGroups
      expr: '(cortex_ruler_managers_total == 0) and on (instance) (cortex_ruler_managers_total offset 1h > 0)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Mimir ruler instance has no rule groups (instance {{ $labels.instance }})
        description: "Mimir ruler {{ $labels.instance }} has no rule groups assigned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # `and` has the lowest precedence inside the parentheses, so the inner expression is
    # (head_max_timestamp - time()) and (head_max_timestamp > 0): the lead of the newest
    # sample over "now", restricted to ingesters with a non-zero head timestamp.
    # Fires when the largest lead per job exceeds 3600 seconds (1 hour) for 5 minutes.
    - alert: MimirIngestedDataTooFarInTheFuture
      expr: 'max by (job) (cortex_ingester_tsdb_head_max_timestamp_seconds - time() and cortex_ingester_tsdb_head_max_timestamp_seconds > 0) > 3600'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Mimir ingested data too far in the future (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    # Sums the 5m failure rate of object-store bucket operations per job.
    # NOTE(review): the selector has no `component` filter, so every job exposing this metric
    # is evaluated, not only store-gateways — confirm this is intended.
    - alert: MimirStoreGatewayTooManyFailedOperations
      expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Mimir store gateway too many failed operations (instance {{ $labels.instance }})
        description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Each instance's view of the ring size is sum by (name, job, instance); the alert fires
    # when the largest and smallest views within a (name, job) differ for 15 minutes.
    # {{ $value }} is the left operand, i.e. the largest member count seen.
    - alert: MimirRingMembersMismatch
      expr: 'max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members))'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Mimir ring members mismatch (instance {{ $labels.instance }})
        description: "Mimir {{ $labels.name }} ring has inconsistent member counts across instances.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of the per-instance `max_series` limit in use; fires above 80% for 3 hours.
    # The division uses `ignoring(limit)`, which removes the `limit` label from its result.
    # The guard on the right of `and` still carries limit="max_series", so the `and` must
    # also use `ignoring(limit)`; without it the label sets never match and the alert can
    # never fire. The guard itself skips ingesters whose limit is 0.
    - alert: MimirIngesterReachingSeriesLimitWarning
      expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 80) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_series"} > 0)'
      for: 3h
      labels:
        severity: warning
      annotations:
        summary: Mimir ingester reaching series limit warning (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of the per-instance `max_series` limit in use; fires above 90% for 5 minutes.
    # The division uses `ignoring(limit)`, which removes the `limit` label from its result.
    # The guard on the right of `and` still carries limit="max_series", so the `and` must
    # also use `ignoring(limit)`; without it the label sets never match and the alert can
    # never fire. The guard itself skips ingesters whose limit is 0.
    - alert: MimirIngesterReachingSeriesLimitCritical
      expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 90) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_series"} > 0)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester reaching series limit critical (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of the per-instance `max_tenants` limit in use; fires above 70% for 5 minutes.
    # The division uses `ignoring(limit)`, which removes the `limit` label from its result.
    # The guard on the right of `and` still carries limit="max_tenants", so the `and` must
    # also use `ignoring(limit)`; without it the label sets never match and the alert can
    # never fire. The guard itself skips ingesters whose limit is 0.
    - alert: MimirIngesterReachingTenantsLimitWarning
      expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 70) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_tenants"} > 0)'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Mimir ingester reaching tenants limit warning (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of the per-instance `max_tenants` limit in use; fires above 80% for 5 minutes.
    # The division uses `ignoring(limit)`, which removes the `limit` label from its result.
    # The guard on the right of `and` still carries limit="max_tenants", so the `and` must
    # also use `ignoring(limit)`; without it the label sets never match and the alert can
    # never fire. The guard itself skips ingesters whose limit is 0.
    - alert: MimirIngesterReachingTenantsLimitCritical
      expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 80) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_tenants"} > 0)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester reaching tenants limit critical (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of the TCP connection limit in use; fires above 80% for 5 minutes.
    # The final `and cortex_tcp_connections_limit > 0` clause skips series with a zero limit.
    - alert: MimirReachingTcpConnectionsLimit
      expr: 'cortex_tcp_connections / cortex_tcp_connections_limit * 100 > 80 and cortex_tcp_connections_limit > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir reaching TCP connections limit (instance {{ $labels.instance }})
        description: "Mimir instance {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its TCP connections limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of the per-instance `max_inflight_push_requests` limit in use; fires above
    # 80% for 5 minutes.
    # The division uses `ignoring(limit)`, which removes the `limit` label from its result.
    # The guard on the right of `and` still carries the `limit` label, so the `and` must
    # also use `ignoring(limit)`; without it the label sets never match and the alert can
    # never fire. The guard itself skips distributors whose limit is 0.
    - alert: MimirDistributorInflightRequestsHigh
      expr: '(cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"} * 100 > 80) and ignoring(limit) (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir distributor inflight requests high (instance {{ $labels.instance }})
        description: "Mimir distributor {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its inflight push requests limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the 5m rate of failed TSDB compactions stays above zero for 15 minutes.
    - alert: MimirIngesterTsdbHeadCompactionFailed
      expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (`for: 0m`) on any failed TSDB head truncation counted in the last
    # 5 minutes.
    - alert: MimirIngesterTsdbHeadTruncationFailed
      expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (`for: 0m`) on any failed TSDB checkpoint creation counted in the
    # last 5 minutes.
    - alert: MimirIngesterTsdbCheckpointCreationFailed
      expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (`for: 0m`) on any failed TSDB checkpoint deletion counted in the
    # last 5 minutes.
    - alert: MimirIngesterTsdbCheckpointDeletionFailed
      expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (`for: 0m`) on any failed TSDB WAL truncation counted in the last
    # 5 minutes. Unlike the sibling TSDB failure alerts, this one is only a warning.
    - alert: MimirIngesterTsdbWalTruncationFailed
      expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Uses a shorter 1m rate window than the sibling TSDB alerts and fires when failed WAL
    # writes are seen continuously for 3 minutes.
    - alert: MimirIngesterTsdbWalWritesFailed
      expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0'
      for: 3m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold aligned with official Mimir mixin (30 minutes).
    # Fires when the last successful bucket sync of a store-gateway is older than 1800 seconds
    # for 5 minutes. The `and ... > 0` clause skips store-gateways that have not recorded a
    # successful sync yet (timestamp still 0). {{ $value }} is the sync age in seconds.
    - alert: MimirStoreGatewayHasNotSyncedBucket
      expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir store gateway has not synced bucket (instance {{ $labels.instance }})
        description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Matches store-gateways whose minimum synced-tenant count per (instance, job) is zero
    # AND, matched on `instance`, that had synced tenants an hour earlier (`offset 1h`).
    # This keeps store-gateways that never synced a tenant from alerting.
    # The condition must hold for 1 hour.
    - alert: MimirStoreGatewayNoSyncedTenants
      expr: '(min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 1h > 0)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Mimir store gateway no synced tenants (instance {{ $labels.instance }})
        description: "Mimir store-gateway {{ $labels.instance }} has no synced tenants.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Age of the most recent successful bucket-index update per (user, job): `min` picks the
    # freshest update reported by any instance. Fires immediately (`for: 0m`) when even the
    # freshest one is older than 2100 seconds (35 minutes).
    - alert: MimirBucketIndexNotUpdated
      expr: 'min by (user, job) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Mimir bucket index not updated (instance {{ $labels.instance }})
        description: "Mimir bucket index for tenant {{ $labels.user }} has not been updated for more than 35 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the last successful block cleanup is older than 21600 seconds (6 hours)
    # for 1 hour. The `and ... > 0` clause skips compactors that have not recorded a
    # successful cleanup yet (timestamp still 0).
    - alert: MimirCompactorNotCleaningUpBlocks
      expr: '(time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 21600) and cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: Mimir compactor not cleaning up blocks (instance {{ $labels.instance }})
        description: "Mimir compactor {{ $labels.instance }} has not cleaned up blocks in the last 6 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the last successful compaction run is older than 86400 seconds (24 hours)
    # for 15 minutes. The `and ... > 0` clause skips compactors that have not recorded a
    # successful run yet (timestamp still 0).
    - alert: MimirCompactorNotRunningCompaction
      expr: '(time() - cortex_compactor_last_successful_run_timestamp_seconds > 86400) and cortex_compactor_last_successful_run_timestamp_seconds > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Mimir compactor not running compaction (instance {{ $labels.instance }})
        description: "Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (`for: 0m`) when more than one compaction run failed within the last
    # 2 hours. Failures with reason="shutdown" are excluded by the selector.
    # increase() extrapolates, so {{ $value }} may be fractional.
    - alert: MimirCompactorHasConsecutiveFailures
      expr: 'increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }})
        description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (`for: 0m`) when at least one out-of-disk-space error was counted in
    # the last 24 hours. Because of the 24h window the alert keeps firing for up to a day
    # after the last such error.
    - alert: MimirCompactorHasRunOutOfDiskSpace
      expr: 'increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Mimir compactor has run out of disk space (instance {{ $labels.instance }})
        description: "Mimir compactor {{ $labels.instance }} has run out of disk space.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the compactor's last successful object-store upload is older than 86400
    # seconds (24 hours) for 15 minutes. The `and ... > 0` clause skips compactors that have
    # not recorded an upload yet (timestamp still 0).
    - alert: MimirCompactorHasNotUploadedBlocks
      expr: '(time() - thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 86400) and thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }})
        description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Using 24h window per official mixin — compaction skips are rare events.
    # Fires when any block was marked for no-compaction in the last 24 hours and the
    # condition has held for 5 minutes. Because of the 24h window the alert keeps firing
    # for up to a day after the last skipped block.
    - alert: MimirCompactorSkippedBlocks
      expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Mimir compactor skipped blocks (instance {{ $labels.instance }})
        description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of failed ruler write requests per (instance, job) over 5m; fires above 1% for 5m.
    # The trailing `> 0` clause requires a non-zero total request rate.
    - alert: MimirRulerTooManyFailedPushes
      expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir ruler too many failed pushes (instance {{ $labels.instance }})
        description: "Mimir ruler {{ $labels.instance }} is failing to push {{ printf \"%.2f\" $value }}% of write requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of failed ruler queries per (instance, job) over 5m; fires above 1% for 5m.
    # The trailing `> 0` clause requires a non-zero total query rate.
    - alert: MimirRulerTooManyFailedQueries
      expr: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir ruler too many failed queries (instance {{ $labels.instance }})
        description: "Mimir ruler {{ $labels.instance }} is failing {{ printf \"%.2f\" $value }}% of query evaluations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of missed rule-group iterations per (instance, job) over 5m; fires above 1% for 5m.
    # The trailing `> 0` clause requires a non-zero iteration rate.
    - alert: MimirRulerMissedEvaluations
      expr: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1 and sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Mimir ruler missed evaluations (instance {{ $labels.instance }})
        description: "Mimir ruler {{ $labels.instance }} is missing {{ printf \"%.2f\" $value }}% of rule group evaluations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    # The error rate is summed per job and must stay above the threshold for 5m.
    # NOTE(review): the expression aggregates by `job` only, so `$labels.instance` in the
    # summary renders empty for this alert.
    - alert: MimirRulerFailedRingCheck
      expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir ruler failed ring check (instance {{ $labels.instance }})
        description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the config-sync failure counter shows a non-zero 5m rate continuously for 30m.
    - alert: MimirAlertmanagerSyncConfigsFailing
      expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the ring-check error counter shows a non-zero 5m rate continuously for 10m.
    - alert: MimirAlertmanagerRingCheckFailing
      expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the failed partial-state-merge counter shows a non-zero 5m rate continuously for 10m.
    - alert: MimirAlertmanagerStateMergeFailing
      expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the failed state-replication counter shows a non-zero 5m rate continuously for 10m.
    - alert: MimirAlertmanagerReplicationFailing
      expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Mimir alertmanager replication failing (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the failed state-persist counter shows a non-zero rate over a 15m window
    # continuously for 1h.
    - alert: MimirAlertmanagerPersistStateFailing
      expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: Mimir alertmanager persist state failing (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when an initial state sync with outcome="failed" was counted within the last minute.
    # NOTE(review): a 1m range needs at least two samples inside the window, so this assumes a
    # scrape interval comfortably below 60s — confirm for your setup.
    - alert: MimirAlertmanagerInitialSyncFailed
      expr: 'increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Mimir alertmanager initial sync failed (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} failed initial state sync.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when an alertmanager instance has owned no tenants for 1h while at least one
    # instance of the same job does own tenants.
    # The comparison is made against peers rather than against the instance's own value one
    # hour earlier: a self-comparison with `offset 1h` turns false exactly when the 1h `for`
    # hold time elapses (the offset sample is then already 0), so it could never fire.
    - alert: MimirAlertmanagerInstanceHasNoTenants
      expr: '(cortex_alertmanager_tenants_owned == 0) and on (job) (max by (job) (cortex_alertmanager_tenants_owned) > 0)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Mimir alertmanager instance has no tenants (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.instance }} has no tenants assigned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per job (job name matching mimir or cortex): fires when the largest member count reported
    # by any instance exceeds the average count * 1.15 + 10 for 20m.
    # NOTE(review): the expression aggregates by `job`, so `$labels.instance` in the summary renders empty.
    - alert: MimirGossipMembersCountTooHigh
      expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 1.15 + 10 < max(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
      for: 20m
      labels:
        severity: warning
      annotations:
        summary: Mimir gossip members count too high (instance {{ $labels.instance }})
        description: "Mimir gossip cluster has more members than expected.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per job (job name matching mimir or cortex): fires when the smallest member count reported
    # by any instance is below half of the average count for 20m.
    # NOTE(review): the expression aggregates by `job`, so `$labels.instance` in the summary renders empty.
    - alert: MimirGossipMembersCountTooLow
      expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 0.5 > min(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
      for: 20m
      labels:
        severity: warning
      annotations:
        summary: Mimir gossip members count too low (instance {{ $labels.instance }})
        description: "Mimir gossip cluster has fewer members than expected.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # A high number of Go threads may indicate a goroutine leak.
    # Warning tier: more than 5000 threads for 15m on targets whose job name matches mimir or cortex.
    - alert: MimirGoThreadsTooHighWarning
      expr: 'go_threads{job=~".*(mimir|cortex).*"} > 5000'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Mimir go threads too high warning (instance {{ $labels.instance }})
        description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Critical tier: more than 8000 Go threads for 15m on targets whose job name matches mimir or cortex.
    - alert: MimirGoThreadsTooHighCritical
      expr: 'go_threads{job=~".*(mimir|cortex).*"} > 8000'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Mimir go threads too high critical (instance {{ $labels.instance }})
        description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/grafana-tempo/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires when, per job, the ring reports at least one member named "distributor" in state
    # "Unhealthy" for 15m (the maximum across series is used).
    # NOTE(review): the expression aggregates by `job`, so `$labels.instance` in the summary renders empty.
    - alert: TempoDistributorUnhealthy
      expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="distributor"}) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Tempo distributor unhealthy (instance {{ $labels.instance }})
        description: "Tempo has {{ $value }} unhealthy distributor(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, the ring reports at least one member named "live-store" in state
    # "Unhealthy" for 15m (the maximum across series is used).
    - alert: TempoLiveStoreUnhealthy
      expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="live-store"}) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Tempo live store unhealthy (instance {{ $labels.instance }})
        description: "Tempo has {{ $value }} unhealthy live store(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, the ring reports at least one member named "metrics-generator" in
    # state "Unhealthy" for 15m (the maximum across series is used).
    - alert: TempoMetricsGeneratorUnhealthy
      expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="metrics-generator"}) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Tempo metrics generator unhealthy (instance {{ $labels.instance }})
        description: "Tempo has {{ $value }} unhealthy metrics generator(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Uses a two-window approach: 1h for historical count and 5m to confirm the issue is ongoing.
    # Per job: more than 2 compaction errors in the last hour AND some error in the last 5m,
    # with both conditions held for 1h. $value is the 1h count (left-hand side of `and`).
    - alert: TempoCompactionsFailing
      expr: 'sum by (job) (increase(tempodb_compaction_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_compaction_errors_total[5m])) > 0'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: Tempo compactions failing (instance {{ $labels.instance }})
        description: "{{ $value }} compactions have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when, per job, more than 2 blocklist poll errors were counted in the
    # last hour and errors are still being counted in the last 5m.
    - alert: TempoPollsFailing
      expr: 'sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Tempo polls failing (instance {{ $labels.instance }})
        description: "{{ $value }} blocklist polls have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when, per job, more than 2 tenant index errors were counted in the
    # last hour and errors are still being counted in the last 5m.
    - alert: TempoTenantIndexFailures
      expr: 'sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Tempo tenant index failures (instance {{ $labels.instance }})
        description: "{{ $value }} tenant index failures in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a tenant's index-builder gauge sums to 0 for 5m, but only while the largest
    # blocklist length across all series is above 0 (`and on()` matches regardless of labels).
    - alert: TempoNoTenantIndexBuilders
      expr: 'sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Tempo no tenant index builders (instance {{ $labels.instance }})
        description: "No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 600s (10 minutes). Adjust based on your tenant index build interval.
    # Evaluated on the maximum index age per tenant; the condition must hold for 5m.
    - alert: TempoTenantIndexTooOld
      expr: 'max by (tenant) (tempodb_blocklist_tenant_index_age_seconds) > 600'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Tempo tenant index too old (instance {{ $labels.instance }})
        description: "Tenant index for {{ $labels.tenant }} is {{ $value }}s old.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the blocklist grows more than 40% over 7 days.
    # Compares the current average blocklist length with the average from 7 days ago; the
    # `offset 7d ... > 0` clause guards against a zero baseline. Condition must hold for 15m.
    - alert: TempoBlockListRisingQuickly
      expr: '(avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40 and avg(tempodb_blocklist_length offset 7d) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Tempo block list rising quickly (instance {{ $labels.instance }})
        description: "Tempo blocklist length is up {{ printf \"%.0f\" $value }}% over the last 7 days. Consider scaling compactors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when at least one instance of a job has reported a failed runtime-config reload
    # (gauge == 0) for 15m.
    # `count` is used instead of `sum`: the inner filter keeps only series whose value is 0,
    # so their sum is always 0 and a `sum(...) > 0` comparison could never be true.
    # $value is the number of instances whose last reload failed.
    - alert: TempoBadOverrides
      expr: 'count by (job) (tempo_runtime_config_last_reload_successful == 0) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Tempo bad overrides (instance {{ $labels.instance }})
        description: "{{ $labels.job }} failed to reload runtime overrides.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when, per job, more than 5 user-configurable overrides reloads failed
    # in the last hour and failures are still being counted in the last 5m.
    - alert: TempoUserConfigurableOverridesReloadFailing
      expr: 'sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[5m])) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Tempo user configurable overrides reload failing (instance {{ $labels.instance }})
        description: "{{ $value }} user-configurable overrides reloads have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 100 blocks per compactor instance. Adjust based on your environment.
    # Warning tier: the per-instance sum must stay above the threshold for 6h.
    - alert: TempoCompactionTooManyOutstandingBlocksWarning
      expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 100'
      for: 6h
      labels:
        severity: warning
      annotations:
        summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }})
        description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration.
    # Critical tier: the per-instance sum must stay above 250 for 24h.
    - alert: TempoCompactionTooManyOutstandingBlocksCritical
      expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250'
      for: 24h
      labels:
        severity: critical
      annotations:
        summary: Tempo compaction too many outstanding blocks critical (instance {{ $labels.instance }})
        description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the usage-tracker error rate, grouped by (job, reason), stays above 0 for 30m.
    - alert: TempoDistributorUsageTrackerErrors
      expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Tempo distributor usage tracker errors (instance {{ $labels.instance }})
        description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, active-processor update failures keep being counted in the trailing
    # 5m window continuously for 15m.
    - alert: TempoMetricsGeneratorProcessorUpdatesFailing
      expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Tempo metrics generator processor updates failing (instance {{ $labels.instance }})
        description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of received spans dropped by the service-graphs processor, per job over 5m.
    # Fires above 0.5% for 15m; the trailing `> 0` clause requires non-zero span traffic.
    - alert: TempoMetricsGeneratorServiceGraphsDroppingSpans
      expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Tempo metrics generator service graphs dropping spans (instance {{ $labels.instance }})
        description: "Tempo metrics generator is dropping {{ printf \"%.2f\" $value }}% of spans in service graphs for {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, more than 2 registry collection failures are counted in the trailing
    # 5m window, with the condition held for 5m.
    - alert: TempoMetricsGeneratorCollectionsFailing
      expr: 'sum by (job) (increase(tempo_metrics_generator_registry_collections_failed_total[5m])) > 2'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Tempo metrics generator collections failing (instance {{ $labels.instance }})
        description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching.
    # Errors are requests with status_code="500"; the ratio is computed per (name, job) over 5m,
    # requires non-zero request traffic, and must hold for 10m.
    - alert: TempoMemcachedErrorsElevated
      expr: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20 and sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Tempo memcached errors elevated (instance {{ $labels.instance }})
        description: "Tempo memcached error rate is {{ printf \"%.2f\" $value }}% for {{ $labels.name }} in {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/graph-node/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires immediately when eth_rpc_status equals 1, which this rule reports as a failed net_version call.
    - alert: ProviderFailedBecauseNet_versionFailed
      expr: 'eth_rpc_status == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Provider failed because net_version failed (instance {{ $labels.instance }})
        description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when eth_rpc_status equals 2, which this rule reports as a failed genesis lookup.
    - alert: ProviderFailedBecauseGetGenesisFailed
      expr: 'eth_rpc_status == 2'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Provider failed because get genesis failed (instance {{ $labels.instance }})
        description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when eth_rpc_status equals 3, which this rule reports as a net_version timeout.
    - alert: ProviderFailedBecauseNet_versionTimeout
      expr: 'eth_rpc_status == 3'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Provider failed because net_version timeout (instance {{ $labels.instance }})
        description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when eth_rpc_status equals 4, which this rule reports as a genesis lookup timeout.
    - alert: ProviderFailedBecauseGetGenesisTimeout
      expr: 'eth_rpc_status == 4'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
        description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Warning tier: fires immediately when store_connection_wait_time_ms exceeds 10.
    - alert: StoreConnectionSlow
      expr: 'store_connection_wait_time_ms > 10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Store connection slow (instance {{ $labels.instance }})
        description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Critical tier: fires immediately when store_connection_wait_time_ms exceeds 20.
    - alert: StoreConnectionVerySlow
      expr: 'store_connection_wait_time_ms > 20'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Store connection very slow (instance {{ $labels.instance }})
        description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/hadoop/jmx_exporter.yml
================================================
groups:

- name: Jmx_exporter

  
  rules:

    # Fires when a scrape target of job "hadoop-namenode" has been down (up == 0) for 5m.
    # The job name is hard-coded and must match your scrape configuration.
    - alert: HadoopNameNodeDown
      expr: 'up{job="hadoop-namenode"} == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Hadoop Name Node Down (instance {{ $labels.instance }})
        description: "The Hadoop NameNode service is unavailable.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a scrape target of job "hadoop-resourcemanager" has been down (up == 0) for 5m.
    # The job name is hard-coded and must match your scrape configuration.
    - alert: HadoopResourceManagerDown
      expr: 'up{job="hadoop-resourcemanager"} == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Hadoop Resource Manager Down (instance {{ $labels.instance }})
        description: "The Hadoop ResourceManager service is unavailable.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when hadoop_datanode_last_heartbeat reads 0 for 10m.
    - alert: HadoopDataNodeOutOfService
      expr: 'hadoop_datanode_last_heartbeat == 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }})
        description: "The Hadoop DataNode is not sending heartbeats.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the free fraction of HDFS capacity, (total - used) / total, stays below 0.1 for 15m.
    # The `> 0` clause skips series reporting zero total capacity (avoids dividing by zero).
    - alert: HadoopHdfsDiskSpaceLow
      expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 and hadoop_hdfs_bytes_total > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})
        description: "Available HDFS disk space is running low.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the task-failure counter has increased by more than 100 over the last hour,
    # with the condition held for 10m.
    - alert: HadoopMapReduceTaskFailures
      expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})
        description: "There is an unusually high number of MapReduce task failures.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when ResourceManager memory stays above 0.8 (80%) of its reported maximum for 15m.
    - alert: HadoopResourceManagerMemoryHigh
      expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }})
        description: "The Hadoop ResourceManager is approaching its memory limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the container-allocation failure counter has increased by more than 10 over
    # the last hour, with the condition held for 10m.
    - alert: HadoopYarnContainerAllocationFailures
      expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})
        description: "There is a significant number of YARN container allocation failures.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the HBase region count stays above 5000 for 15m.
    - alert: HadoopHbaseRegionCountHigh
      expr: 'hadoop_hbase_region_count > 5000'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Hadoop HBase Region Count High (instance {{ $labels.instance }})
        description: "The HBase cluster has an unusually high number of regions.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when region server heap usage stays above 0.8 (80%) of the maximum heap for 10m,
    # i.e. when less than 20% of the heap remains.
    - alert: HadoopHbaseRegionServerHeapLow
      expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
        description: "HBase Region Servers are running low on heap space.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the HBase write request latency metric stays above 0.5 (seconds, per the
    # metric name) for 10m.
    - alert: HadoopHbaseWriteRequestsLatencyHigh
      expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})
        description: "HBase Write Requests are experiencing high latency.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/haproxy/embedded-exporter-v2.yml
================================================
groups:

- name: EmbeddedExporterV2

  
  rules:

    # Share of 4xx responses per proxy over 1m; fires above 5% for 1m when the proxy has
    # non-zero response traffic.
    # The expression aggregates `by (proxy)`, so `proxy` is the only label left on the result;
    # the description therefore names the backend via `$labels.proxy`.
    - alert: HaproxyHighHttp4xxErrorRateBackend
      expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Share of 5xx responses per proxy over 1m; fires above 5% for 1m when the proxy has
    # non-zero response traffic.
    # The expression aggregates `by (proxy)`, so `proxy` is the only label left on the result;
    # the description therefore names the backend via `$labels.proxy`.
    - alert: HaproxyHighHttp5xxErrorRateBackend
      expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Share of 4xx responses per server over 1m; fires above 5% for 1m when the server has
    # non-zero response traffic.
    - alert: HaproxyHighHttp4xxErrorRateServer
      expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Share of 5xx responses per server over 1m; fires above 5% for 1m when the server has
    # non-zero response traffic.
    - alert: HaproxyHighHttp5xxErrorRateServer
      expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Response errors as a percentage of HTTP responses, per server over 1m; fires above 5%
    # for 1m when the server has non-zero response traffic.
    - alert: HaproxyServerResponseErrors
      expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy server response errors (instance {{ $labels.instance }})
        description: "Too many response errors to {{ $labels.server }} server (> 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the per-proxy rate of backend connection errors stays above 100/s for 1m.
    # The expression aggregates `by (proxy)`, so `proxy` is the only label left on the result;
    # the description therefore names the backend via `$labels.proxy`.
    - alert: HaproxyBackendConnectionErrors
      expr: '(sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy backend connection errors (instance {{ $labels.instance }})
        description: "Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when the rate of connection errors towards a single server exceeds 100/s.
    # Grouping keeps both `proxy` and `server` so that `$labels.server` in the description is
    # populated; grouping by `proxy` alone would drop the server label and merge all servers
    # of a proxy into one series.
    - alert: HaproxyServerConnectionErrors
      expr: '(sum by (proxy, server) (rate(haproxy_server_connection_errors_total[1m]))) > 100'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: HAProxy server connection errors (instance {{ $labels.instance }})
        description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Current sessions as a percentage of the backend session limit; fires above 80% for 2m.
    # The two `> 0` filters drop backends with no current sessions or with a limit of 0,
    # which also prevents division by zero.
    - alert: HaproxyBackendMaxActiveSession>80%
      expr: '((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }})
        description: "Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # haproxy_backend_current_queue is a gauge (current queue depth), not a counter.
    # Fires when the queue depth summed per proxy stays above 0 for 2m.
    - alert: HaproxyPendingRequests
      expr: 'sum by (proxy) (haproxy_backend_current_queue) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: HAProxy pending requests (instance {{ $labels.instance }})
        description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when haproxy_backend_max_total_time_seconds, averaged per (instance, proxy),
    # stays above 1 for 1m.
    - alert: HaproxyHttpSlowingDown
      expr: 'avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
        description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the per-proxy rate of retry warnings (1m window) stays above 10/s for 2m.
    - alert: HaproxyRetryHigh
      expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: HAProxy retry high (instance {{ $labels.instance }})
        description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when the sum of active and backup servers of a backend is 0.
    - alert: HaproxyHasNoAliveBackends
      expr: 'haproxy_backend_active_servers + haproxy_backend_backup_servers == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: HAproxy has no alive backends (instance {{ $labels.instance }})
        description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the per-proxy rate of denied frontend connections (2m window) stays above
    # 10/s for 2m.
    - alert: HaproxyFrontendSecurityBlockedRequests
      expr: 'sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
        description: "HAProxy is blocking requests for security reason\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when server health-check failures keep being counted in the trailing 1m window
    # continuously for 1m.
    - alert: HaproxyServerHealthcheckFailure
      expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
        description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/haproxy/haproxy-exporter-v1.yml
================================================
groups:

- name: HaproxyExporterV1

  
  rules:

    - alert: HaproxyDown
      expr: 'haproxy_up == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: HAProxy down (instance {{ $labels.instance }})
        description: "HAProxy down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Share of 4xx responses per backend, expressed as a percentage.
    # The numerator is scaled by 100 so the `> 5` threshold means 5%, matching
    # the description and the per-server variant of this rule. The trailing
    # `and ... > 0` clause skips backends that received no traffic.
    - alert: HaproxyHighHttp4xxErrorRateBackend(v1)
      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Share of 5xx responses per backend, expressed as a percentage.
    # The numerator is scaled by 100 so the `> 5` threshold means 5%, matching
    # the description and the per-server variant of this rule. The trailing
    # `and ... > 0` clause skips backends that received no traffic.
    - alert: HaproxyHighHttp5xxErrorRateBackend(v1)
      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyHighHttp4xxErrorRateServer(v1)
      expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 4xx error rate server (v1) (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyHighHttp5xxErrorRateServer(v1)
      expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 5xx error rate server (v1) (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyServerResponseErrors(v1)
      expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy server response errors (v1) (instance {{ $labels.instance }})
        description: "Too many response errors to {{ $labels.server }} server (> 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyBackendConnectionErrors(v1)
      expr: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }})
        description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyServerConnectionErrors(v1)
      expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: HAProxy server connection errors (v1) (instance {{ $labels.instance }})
        description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Current sessions as a percentage of the session limit, both summed per
    # backend. The trailing `and ... > 0` clause keeps only backends whose
    # summed limit is positive, so a zero limit cannot produce a result.
    # NOTE(review): `sum by (backend)` keeps only the `backend` label, so
    # `fqdn` and `instance` in the annotations will render empty — confirm
    # whether they should be added to the grouping.
    - alert: HaproxyBackendMaxActiveSession
      expr: '((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: HAProxy backend max active session (instance {{ $labels.instance }})
        description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyPendingRequests(v1)
      expr: 'sum by (backend) (haproxy_backend_current_queue) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: HAProxy pending requests (v1) (instance {{ $labels.instance }})
        description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyHttpSlowingDown(v1)
      expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: HAProxy HTTP slowing down (v1) (instance {{ $labels.instance }})
        description: "Average request time is increasing\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyRetryHigh(v1)
      expr: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: HAProxy retry high (v1) (instance {{ $labels.instance }})
        description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyBackendDown
      expr: 'haproxy_backend_up == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: HAProxy backend down (instance {{ $labels.instance }})
        description: "HAProxy backend is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyServerDown
      expr: 'haproxy_server_up == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: HAProxy server down (instance {{ $labels.instance }})
        description: "HAProxy server is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyFrontendSecurityBlockedRequests(v1)
      expr: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: HAProxy frontend security blocked requests (v1) (instance {{ $labels.instance }})
        description: "HAProxy is blocking requests for security reason\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyServerHealthcheckFailure(v1)
      expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: HAProxy server healthcheck failure (v1) (instance {{ $labels.instance }})
        description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/hashicorp-vault/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    - alert: VaultSealed
      expr: 'vault_core_unsealed == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Vault sealed (instance {{ $labels.instance }})
        description: "Vault instance is sealed on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Average difference between token-create and token-store counts.
    # The value is a plain count difference, so it is printed without a
    # percent sign.
    # NOTE(review): `avg(...)` drops every label, so `instance` renders empty
    # in the annotations — confirm whether `avg by (instance)` was intended.
    - alert: VaultTooManyPendingTokens
      expr: 'avg(vault_token_create_count - vault_token_store_count) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Vault too many pending tokens (instance {{ $labels.instance }})
        description: "Too many pending tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when more than 3 tokens exist in the creation_ttl="+Inf" bucket
    # for 5 minutes. The value is a token count, so it is printed as a whole
    # number without a percent sign.
    - alert: VaultTooManyInfinityTokens
      expr: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Vault too many infinity tokens (instance {{ $labels.instance }})
        description: "Too many infinity tokens {{ $labels.instance }}: {{ $value | printf \"%.0f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fraction of scraped Vault nodes reporting vault_core_active, alerting at
    # 50% or less. The value is a 0..1 ratio, so humanizePercentage is used to
    # render it as a real percentage (0.33 -> 33.33%) instead of "0.33%".
    # NOTE(review): `sum(...) / count(...)` drops every label, so `instance`
    # renders empty in the annotations.
    - alert: VaultClusterHealth
      expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Vault cluster health (instance {{ $labels.instance }})
        description: "Vault cluster is not healthy {{ $labels.instance }}: {{ $value | humanizePercentage }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/host-and-hardware/node-exporter.yml
================================================
groups:

- name: NodeExporter

  
  rules:

    - alert: HostOutOfMemory
      expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host out of memory (instance {{ $labels.instance }})
        description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostMemoryUnderMemoryPressure
      expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host memory under memory pressure (instance {{ $labels.instance }})
        description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
    - alert: HostMemoryIsUnderutilized
      expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Host Memory is underutilized (instance {{ $labels.instance }})
        description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostUnusualNetworkThroughputIn
      expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host unusual network throughput in (instance {{ $labels.instance }})
        description: "Host receive bandwidth is high (>80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostUnusualNetworkThroughputOut
      expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host unusual network throughput out (instance {{ $labels.instance }})
        description: "Host transmit bandwidth is high (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostDiskIoUtilizationHigh
      expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host disk IO utilization high (instance {{ $labels.instance }})
        description: "Disk utilization is high (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Please add ignored mountpoints in node_exporter parameters like
    # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
    # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
    - alert: HostOutOfDiskSpace
      expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Host out of disk space (instance {{ $labels.instance }})
        description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Please add ignored mountpoints in node_exporter parameters like
    # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
    # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
    - alert: HostDiskMayFillIn24Hours
      expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
        description: "Filesystem will likely run out of space within the next 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostOutOfInodes
      expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Host out of inodes (instance {{ $labels.instance }})
        description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostFilesystemDeviceError
      expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Host filesystem device error (instance {{ $labels.instance }})
        description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostInodesMayFillIn24Hours
      expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
        description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostUnusualDiskReadLatency
      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host unusual disk read latency (instance {{ $labels.instance }})
        description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostUnusualDiskWriteLatency
      expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host unusual disk write latency (instance {{ $labels.instance }})
        description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Busy fraction = 1 - idle rate averaged over all CPUs (the `cpu` label is
    # aggregated away, every other label is kept). Must stay above 80% for
    # 10 minutes before the alert fires.
    - alert: HostHighCpuLoad
      expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Host high CPU load (instance {{ $labels.instance }})
        description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
    - alert: HostCpuIsUnderutilized
      expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
      for: 1w
      labels:
        severity: info
      annotations:
        summary: Host CPU is underutilized (instance {{ $labels.instance }})
        description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostCpuStealNoisyNeighbor
      expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
        description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostCpuHighIowait
      expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host CPU high iowait (instance {{ $labels.instance }})
        description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fraction of wall-clock time the device spent doing I/O over the last
    # 5 minutes; alerts when it stays above 80% for 5 minutes.
    - alert: HostUnusualDiskIo
      expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host unusual disk IO (instance {{ $labels.instance }})
        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # x2 context switches is an arbitrary number.
    # The alert threshold depends on the nature of the application.
    # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
    - alert: HostContextSwitchingHigh
      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host context switching high (instance {{ $labels.instance }})
        description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostSwapIsFillingUp
      expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host swap is filling up (instance {{ $labels.instance }})
        description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostSystemdServiceCrashed
      expr: '(node_systemd_unit_state{state="failed"} == 1)'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host systemd service crashed (instance {{ $labels.instance }})
        description: "systemd service {{ $labels.name }} crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostPhysicalComponentTooHot
      expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host physical component too hot (instance {{ $labels.instance }})
        description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNodeOvertemperatureAlarm
      expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Host node overtemperature alarm (instance {{ $labels.instance }})
        description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin.
    - alert: HostSoftwareRaidInsufficientDrives
      expr: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostSoftwareRaidDiskFailure
      expr: '(node_md_disks{state="failed"} > 0)'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host software RAID disk failure (instance {{ $labels.instance }})
        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostKernelVersionDeviations
      expr: 'changes(node_uname_info[1h]) > 0'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Host kernel version deviations (instance {{ $labels.instance }})
        description: "Kernel version for {{ $labels.instance }} has changed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger.
    - alert: HostOomKillDetected
      expr: '(increase(node_vmstat_oom_kill[30m]) > 0)'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host OOM kill detected (instance {{ $labels.instance }})
        description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the correctable-error counter grew during the last minute.
    # The description states the same 1m window that the expression uses.
    - alert: HostEdacCorrectableErrorsDetected
      expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Compares the raw cumulative counter against zero, so the alert keeps
    # firing for as long as the counter is non-zero and $value is the running
    # total rather than a per-window increase.
    - alert: HostEdacUncorrectableErrorsDetected
      expr: '(node_edac_uncorrectable_errors_total > 0)'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in total (cumulative counter value).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Ratio of receive errors to received packets over 2 minutes; alerts above
    # 1%. $value is that ratio (0..1), so it is rendered as a percentage
    # rather than as an error count. The trailing `and ... > 0` clause skips
    # interfaces that received no packets.
    - alert: HostNetworkReceiveErrors
      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Receive Errors (instance {{ $labels.instance }})
        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Ratio of transmit errors to transmitted packets over 2 minutes; alerts
    # above 1%. $value is that ratio (0..1), so it is rendered as a percentage
    # rather than as an error count. The trailing `and ... > 0` clause skips
    # interfaces that transmitted no packets.
    - alert: HostNetworkTransmitErrors
      expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Transmit Errors (instance {{ $labels.instance }})
        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the number of active slaves differs from the number of
    # configured slaves for 2 minutes.
    # node_exporter's bonding collector identifies the bond through the
    # `master` label (there is no `device` label on these series), so the
    # description reads the bond name from `master`.
    - alert: HostNetworkBondDegraded
      expr: '((node_bonding_active - node_bonding_slaves) != 0)'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Bond Degraded (instance {{ $labels.instance }})
        description: "Bond \"{{ $labels.master }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostConntrackLimit
      expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host conntrack limit (instance {{ $labels.instance }})
        description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the clock offset is beyond +/-0.05s AND its 5m derivative
    # shows it is not moving back toward zero: a positive offset with a
    # non-negative slope, or a negative offset with a non-positive slope.
    # An offset that is already being corrected therefore does not alert.
    - alert: HostClockSkew
      expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Host clock skew (instance {{ $labels.instance }})
        description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the sync status dropped to 0 at least once during the last
    # minute AND the reported maximum error is 16 seconds or more, sustained
    # for 2 minutes.
    - alert: HostClockNotSynchronising
      expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host clock not synchronising (instance {{ $labels.instance }})
        description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/ipmi/ipmi-exporter.yml
================================================
groups:

- name: IpmiExporter

  
  rules:

    # The ipmi_up metric is per-collector. A value of 0 means the collector could not retrieve data from the BMC.
    - alert: IpmiCollectorDown
      expr: 'ipmi_up == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: IPMI collector down (instance {{ $labels.instance }})
        description: "IPMI collector {{ $labels.collector }} on {{ $labels.instance }} failed to scrape sensor data. Check FreeIPMI tools and BMC connectivity.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # State values: 0=nominal, 1=warning, 2=critical. Thresholds are defined in the BMC firmware.
    - alert: IpmiTemperatureSensorWarning
      expr: 'ipmi_temperature_state == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: IPMI temperature sensor warning (instance {{ $labels.instance }})
        description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiTemperatureSensorCritical
      expr: 'ipmi_temperature_state == 2'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: IPMI temperature sensor critical (instance {{ $labels.instance }})
        description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Immediate attention required to prevent hardware damage.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiFanSpeedSensorWarning
      expr: 'ipmi_fan_speed_state == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: IPMI fan speed sensor warning (instance {{ $labels.instance }})
        description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiFanSpeedSensorCritical
      expr: 'ipmi_fan_speed_state == 2'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: IPMI fan speed sensor critical (instance {{ $labels.instance }})
        description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. A fan may have failed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiFanSpeedZero
      expr: 'ipmi_fan_speed_rpm == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: IPMI fan speed zero (instance {{ $labels.instance }})
        description: "IPMI fan {{ $labels.name }} on {{ $labels.instance }} reports 0 RPM. The fan may have failed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiVoltageSensorWarning
      expr: 'ipmi_voltage_state == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: IPMI voltage sensor warning (instance {{ $labels.instance }})
        description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiVoltageSensorCritical
      expr: 'ipmi_voltage_state == 2'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: IPMI voltage sensor critical (instance {{ $labels.instance }})
        description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Power supply or motherboard issue possible.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiCurrentSensorWarning
      expr: 'ipmi_current_state == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: IPMI current sensor warning (instance {{ $labels.instance }})
        description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiCurrentSensorCritical
      expr: 'ipmi_current_state == 2'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: IPMI current sensor critical (instance {{ $labels.instance }})
        description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiPowerSensorWarning
      expr: 'ipmi_power_state == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: IPMI power sensor warning (instance {{ $labels.instance }})
        description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IpmiPowerSensorCritical
      expr: 'ipmi_power_state == 2'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: IPMI power sensor critical (instance {{ $labels.instance }})
        description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts.
    - alert: IpmiGenericSensorCritical
      expr: 'ipmi_sensor_state == 2'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: IPMI generic sensor critical (instance {{ $labels.instance }})
        description: "IPMI sensor {{ $labels.name }} (type={{ $labels.type }}) on {{ $labels.instance }} is in critical state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires without delay (for: 0m) as soon as ipmi_chassis_power_state is 0.
    - alert: IpmiChassisPowerOff
      expr: "ipmi_chassis_power_state == 0"
      for: 0m
      labels: {severity: critical}
      annotations:
        summary: "IPMI chassis power off (instance {{ $labels.instance }})"
        description: |-
          IPMI reports chassis power is off on {{ $labels.instance }}. The server may have shut down unexpectedly.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # The metric uses inverted logic: 1=no fault, 0=fault detected.
    - alert: IpmiChassisDriveFault
      expr: 'ipmi_chassis_drive_fault_state == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: IPMI chassis drive fault (instance {{ $labels.instance }})
        description: "IPMI reports a drive fault on {{ $labels.instance }}. Check disk health.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The metric uses inverted logic: 1=no fault, 0=fault detected.
    - alert: IpmiChassisCoolingFault
      expr: 'ipmi_chassis_cooling_fault_state == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: IPMI chassis cooling fault (instance {{ $labels.instance }})
        description: "IPMI reports a cooling/fan fault on {{ $labels.instance }}. Check fans and airflow.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # SEL storage is typically very limited (e.g., 16KB). When full, new events may be dropped.
    - alert: IpmiSelAlmostFull
      expr: 'ipmi_sel_free_space_bytes < 512'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: IPMI SEL almost full (instance {{ $labels.instance }})
        description: "IPMI System Event Log on {{ $labels.instance }} has only {{ printf \"%.0f\" $value }} bytes free. Clear the SEL to prevent loss of new events.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/istio/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    - alert: IstioKubernetesGatewayAvailabilityDrop
      expr: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
        description: "Gateway pods have dropped. Inbound traffic will likely be affected.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Measures the percentage of Pilot xDS pushes that fail (push errors / pushes) over 1 minute.
    # The trailing `and ... > 0` clause makes the alert evaluate only while pushes are actually happening.
    # The alert name is historical and is kept unchanged so existing Alertmanager routes and silences keep matching;
    # the summary below describes what the expression really checks.
    - alert: IstioPilotHighTotalRequestRate
      expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Istio Pilot high push error rate (instance {{ $labels.instance }})
        description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Alerts when the per-second rate of Mixer dispatches to the "prometheus" adapter, summed over all series, stays below 180 for 1 minute.
    # If the metric has no series at all, sum() returns no result and this alert stays silent rather than firing.
    # NOTE(review): Mixer is believed to have been removed from recent Istio releases — confirm this metric still exists in your mesh.
    - alert: IstioMixerPrometheusDispatchesLow
      expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
        description: "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioHighTotalRequestRate
      expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Istio high total request rate (instance {{ $labels.instance }})
        description: "Global request rate in the service mesh is unusually high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioLowTotalRequestRate
      expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Istio low total request rate (instance {{ $labels.instance }})
        description: "Global request rate in the service mesh is unusually low.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioHigh4xxErrorRate
      expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Istio high 4xx error rate (instance {{ $labels.instance }})
        description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioHigh5xxErrorRate
      expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Istio high 5xx error rate (instance {{ $labels.instance }})
        description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioHighRequestLatency
      expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Istio high request latency (instance {{ $labels.instance }})
        description: "Istio average requests execution is longer than 100ms.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioLatency99Percentile
      expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Istio latency 99 percentile (instance {{ $labels.instance }})
        description: "Istio 1% slowest requests are longer than 1000ms.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # pilot_duplicate_envoy_clusters is exposed as a gauge (current number of duplicate clusters), not a counter.
    # Wrapping a gauge in rate() only reacts while the value is rising and treats every decrease as a counter reset,
    # so the alert compares the summed gauge value directly: it fires for as long as any duplicate exists.
    - alert: IstioPilotDuplicateEntry
      expr: 'sum(pilot_duplicate_envoy_clusters) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
        description: "Istio pilot duplicate entry error.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/jaeger/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    - alert: JaegerAgentHttpServerErrors
      expr: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Jaeger agent HTTP server errors (instance {{ $labels.instance }})
        description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JaegerClientRpcRequestErrors
      expr: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Jaeger client RPC request errors (instance {{ $labels.instance }})
        description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JaegerClientSpansDropped
      expr: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Jaeger client spans dropped (instance {{ $labels.instance }})
        description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JaegerAgentSpansDropped
      expr: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Jaeger agent spans dropped (instance {{ $labels.instance }})
        description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JaegerCollectorDroppingSpans
      expr: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Jaeger collector dropping spans (instance {{ $labels.instance }})
        description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JaegerSamplingUpdateFailing
      expr: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Jaeger sampling update failing (instance {{ $labels.instance }})
        description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JaegerThrottlingUpdateFailing
      expr: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Jaeger throttling update failing (instance {{ $labels.instance }})
        description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JaegerQueryRequestFailures
      expr: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Jaeger query request failures (instance {{ $labels.instance }})
        description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/jenkins/metric-plugin.yml
================================================
groups:

- name: MetricPlugin

  
  rules:

    - alert: JenkinsNodeOffline
      expr: 'jenkins_node_offline_value > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Jenkins node offline (instance {{ $labels.instance }})
        description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JenkinsNoNodeOnline
      expr: 'jenkins_node_online_value == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Jenkins no node online (instance {{ $labels.instance }})
        description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JenkinsHealthcheck
      expr: 'jenkins_health_check_score < 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Jenkins healthcheck (instance {{ $labels.instance }})
        description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JenkinsOutdatedPlugins
      expr: 'sum(jenkins_plugins_withUpdate) by (instance) > 3'
      for: 1d
      labels:
        severity: warning
      annotations:
        summary: Jenkins outdated plugins (instance {{ $labels.instance }})
        description: "{{ $value }} plugins need update\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JenkinsBuildsHealthScore
      expr: 'default_jenkins_builds_health_score < 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Jenkins builds health score (instance {{ $labels.instance }})
        description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when more than 100 job runs failed during the last hour.
    # jenkins_runs_failure_total is a counter, so increase() is used instead of delta():
    # increase() compensates for counter resets (e.g. a Jenkins restart), whereas delta() is meant for gauges
    # and would report a large negative value after a reset.
    - alert: JenkinsRunFailureTotal
      expr: 'increase(jenkins_runs_failure_total[1h]) > 100'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Jenkins run failure total (instance {{ $labels.instance }})
        description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JenkinsBuildTestsFailing
      expr: 'default_jenkins_builds_last_build_tests_failing > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Jenkins build tests failing (instance {{ $labels.instance }})
        description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Build result names and their ordinals; this alert matches ordinal 2 (FAILURE) only:
    # * RUNNING  -1 true  - The build is still in progress (presumably no final result yet).
    # * SUCCESS   0 true  - The build had no errors.
    # * UNSTABLE  1 true  - The build had some errors but they were not fatal. For example, some tests failed.
    # * FAILURE   2 false - The build had a fatal error.
    # * NOT_BUILT 3 false - The module was not built.
    # * ABORTED   4 false - The build was manually aborted.
    - alert: JenkinsLastBuildFailed
      expr: 'default_jenkins_builds_last_build_result_ordinal == 2'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Jenkins last build failed (instance {{ $labels.instance }})
        description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/juniper/czerwonk-junos-exporter.yml
================================================
groups:

- name: CzerwonkJunosExporter

  
  rules:

    # Fires without delay (for: 0m) as soon as junos_up is 0 for a target.
    - alert: JuniperSwitchDown
      expr: "junos_up == 0"
      for: 0m
      labels: {severity: critical}
      annotations:
        summary: "Juniper switch down (instance {{ $labels.instance }})"
        description: |-
          The switch appears to be down
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # rate() of the transmit byte counter gives bytes/s; multiplying by 8 converts it to bits/s.
    # The threshold is 90% of 1e9 bit/s, i.e. it assumes a 1 Gbit/s link; adjust 1e+9 for other link speeds.
    # Only the transmit direction is evaluated by this expression.
    # The alert name is kept as-is for compatibility; the annotations use the correct unit (Gbit/s, not GiB).
    - alert: JuniperCriticalBandwidthUsage1gib
      expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Juniper critical Bandwidth Usage 1Gbit/s (instance {{ $labels.instance }})
        description: "Interface is highly saturated. (> 0.90Gbit/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # rate() of the transmit byte counter gives bytes/s; multiplying by 8 converts it to bits/s.
    # The threshold is 80% of 1e9 bit/s, i.e. it assumes a 1 Gbit/s link; adjust 1e+9 for other link speeds.
    # Only the transmit direction is evaluated by this expression.
    # The alert name is kept as-is for compatibility; the annotations use the correct unit (Gbit/s, not GiB).
    - alert: JuniperWarningBandwidthUsage1gib
      expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Juniper warning Bandwidth Usage 1Gbit/s (instance {{ $labels.instance }})
        description: "Interface is getting saturated. (> 0.80Gbit/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/jvm/jvm-exporter.yml
================================================
groups:

- name: JvmExporter

  
  rules:

    - alert: JvmMemoryFillingUp
      expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80 and sum by (instance)(jvm_memory_max_bytes{area="heap"}) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: JVM memory filling up (instance {{ $labels.instance }})
        description: "JVM memory is filling up (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Many JVM configurations leave metaspace unbounded, in which case jvm_memory_max_bytes{area="nonheap"} is -1 and this alert will not fire.
    # The `> 0` filter on the denominator drops instances whose summed max_bytes is <= 0, so no ratio is computed for them
    # (this avoids meaningless negative or +Inf percentages rather than making the alert fire for unbounded pools).
    - alert: JvmNon-heapMemoryFillingUp
      expr: '(sum by (instance)(jvm_memory_used_bytes{area="nonheap"}) / (sum by (instance)(jvm_memory_max_bytes{area="nonheap"}) > 0)) * 100 > 80'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: JVM non-heap memory filling up (instance {{ $labels.instance }})
        description: "JVM non-heap memory (metaspace/code cache) is filling up (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JvmGcTimeTooHigh
      expr: 'sum by (instance)(rate(jvm_gc_collection_seconds_sum[5m])) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: JVM GC time too high (instance {{ $labels.instance }})
        description: "JVM is spending too much time in garbage collection (> 5% of wall clock time)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires once jvm_threads_deadlocked has been above zero for a full minute.
    - alert: JvmThreadsDeadlocked
      expr: "jvm_threads_deadlocked > 0"
      for: 1m
      labels: {severity: critical}
      annotations:
        summary: "JVM threads deadlocked (instance {{ $labels.instance }})"
        description: |-
          JVM has deadlocked threads
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    - alert: JvmThreadCountHigh
      expr: 'jvm_threads_current > 300'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: JVM thread count high (instance {{ $labels.instance }})
        description: "JVM thread count is high (> 300), potential thread leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JvmThreadsBlocked
      expr: 'jvm_threads_state{state="BLOCKED"} > 50'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: JVM threads BLOCKED (instance {{ $labels.instance }})
        description: "JVM has high number of BLOCKED threads, indicating lock contention\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This regex matches CMS, G1, and Parallel collector names. It will not match ZGC or Shenandoah cycle names.
    # Adjust the gc label filter if you use a different collector.
    - alert: JvmOldGenGcFrequency
      expr: 'rate(jvm_gc_collection_seconds_count{gc=~".*old.*|.*major.*"}[5m]) > 0.3'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: JVM old gen GC frequency (instance {{ $labels.instance }})
        description: "Frequent old/major GC cycles, indicating memory pressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Compares used bytes to capacity bytes for every jvm_buffer_pool_* series; the `and ... > 0` clause skips pools with zero capacity.
    # The expression has no label selector, so it is not restricted to the direct pool despite the alert name.
    # NOTE(review): if only the direct pool matters, a pool selector may be needed — confirm the label name exposed by your exporter.
    - alert: JvmDirectBufferPoolFillingUp
      expr: '(jvm_buffer_pool_used_bytes / jvm_buffer_pool_capacity_bytes) * 100 > 90 and jvm_buffer_pool_capacity_bytes > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: JVM direct buffer pool filling up (instance {{ $labels.instance }})
        description: "JVM direct buffer pool is filling up (> 90%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JvmObjectsPendingFinalization
      expr: 'jvm_memory_objects_pending_finalization > 1000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: JVM objects pending finalization (instance {{ $labels.instance }})
        description: "JVM has objects pending finalization, potential memory leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not JVM-specific.
    # This alert will also fire for Go, Python, or any process exposing these metrics.
    - alert: JvmFileDescriptorsExhaustion
      expr: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: JVM file descriptors exhaustion (instance {{ $labels.instance }})
        description: "JVM process is running out of file descriptors (> 90% used)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JvmClassLoadingAnomaly
      expr: 'rate(jvm_classes_loaded_total[5m]) > 100'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: JVM class loading anomaly (instance {{ $labels.instance }})
        description: "Rapid class loading detected, potential classloader leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JvmCompilationTimeSpike
      expr: 'rate(jvm_compilation_time_seconds_total[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: JVM compilation time spike (instance {{ $labels.instance }})
        description: "Excessive JIT compilation time consuming CPU\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/kafka/danielqsj-kafka-exporter.yml
================================================
groups:

- name: DanielqsjKafkaExporter

  
  rules:

    # Takes the smallest in-sync replica count across all partitions of each topic and fires when it drops below 3.
    # The threshold of 3 assumes topics are created with a replication factor of at least 3; adjust it to match your cluster.
    # Because the expression aggregates by topic, only the `topic` label is available in the annotations.
    - alert: KafkaTopicsReplicas
      expr: 'min(kafka_topic_partition_in_sync_replica) by (topic) < 3'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Kafka topics replicas (instance {{ $labels.instance }})
        description: "Kafka topic {{ $labels.topic }} has at least one partition with fewer than 3 in-sync replicas\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KafkaConsumerGroupLag
      expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Kafka consumer group lag (instance {{ $labels.instance }})
        description: "Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/kafka/linkedin-kafka-exporter.yml
================================================
groups:

- name: LinkedinKafkaExporter

  
  rules:

    - alert: KafkaTopicOffsetDecreased
      expr: 'delta(kafka_burrow_partition_current_offset[1m]) < 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Kafka topic offset decreased (instance {{ $labels.instance }})
        description: "Kafka topic offset has decreased\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Lag is computed per (partition, cluster, topic) as kafka_burrow_topic_partition_offset minus
    # kafka_burrow_partition_current_offset (presumably topic head offset minus the consumer's committed offset — verify against the exporter).
    # The alert requires the current lag to be > 0 and at least as large as the lag 15 minutes earlier,
    # and that condition must hold for a further 15 minutes (`for: 15m`) — hence the "30 minutes" in the description.
    - alert: KafkaConsumerLag
      expr: 'kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Kafka consumer lag (instance {{ $labels.instance }})
        description: "Kafka consumer has a 30 minutes and increasing lag\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml
================================================
groups:

- name: AerogearKeycloakMetricsSpi

  
  rules:

    # Threshold of 5% is a rough default. Adjust based on your user base and expected error rates.
    # A spike in failed logins may indicate a brute-force attack or misconfigured client.
    - alert: KeycloakHighLoginFailureRate
      expr: '(sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])) / (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])))) * 100 > 5 and (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m]))) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Keycloak high login failure rate (instance {{ $labels.instance }})
        description: "More than 5% of login attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Only fires when login attempts exist but none succeed — may indicate an authentication outage.
    # The left-hand side checks that the successful-login rate per realm is exactly 0 over 15 minutes;
    # the right-hand side checks that successful + failed login rates together are > 0 over the same window.
    # NOTE(review): both sides need a keycloak_logins_total series for the realm; if a realm has never exposed that series
    # the expression returns nothing and the alert cannot fire — confirm the exporter initializes the counter.
    - alert: KeycloakNoSuccessfulLogins
      expr: 'sum by (realm) (rate(keycloak_logins_total[15m])) == 0 and (sum by (realm) (rate(keycloak_logins_total[15m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[15m]))) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Keycloak no successful logins (instance {{ $labels.instance }})
        description: "No successful logins in realm {{ $labels.realm }} for the last 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 10% is a rough default. High refresh token errors may indicate expired sessions or token store issues.
    - alert: KeycloakHighTokenRefreshErrorRate
      expr: '(sum by (realm) (rate(keycloak_refresh_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_refresh_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_refresh_tokens_total[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Keycloak high token refresh error rate (instance {{ $labels.instance }})
        description: "More than 10% of token refresh attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 10% is a rough default. Code-to-token failures may indicate misconfigured OAuth clients or replay attacks.
    - alert: KeycloakHighCode-to-tokenExchangeErrorRate
      expr: '(sum by (realm) (rate(keycloak_code_to_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_code_to_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_code_to_tokens_total[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Keycloak high code-to-token exchange error rate (instance {{ $labels.instance }})
        description: "More than 10% of code-to-token exchanges are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 10% is a rough default.
    - alert: KeycloakHighRegistrationFailureRate
      expr: '(sum by (realm) (rate(keycloak_registrations_errors_total[5m])) / sum by (realm) (rate(keycloak_registrations_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_registrations_total[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Keycloak high registration failure rate (instance {{ $labels.instance }})
        description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default.
    - alert: KeycloakSlowRequestResponseTime
      expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Keycloak slow request response time (instance {{ $labels.instance }})
        description: "Keycloak {{ $labels.method }} requests are taking more than 2 seconds on average.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/kubernetes/kubestate-exporter.yml
================================================
groups:

- name: KubestateExporter

  
  rules:

    - alert: KubernetesNodeNotReady
      expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes Node not ready (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Kubernetes Node with disabled schedules are fine.
    # This alarm can be useful to get warned if there are nodes which are longer unscheduled.
    - alert: KubernetesNodeSchedulingDisabled
      expr: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1'
      for: 30m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes Node scheduling disabled (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesNodeMemoryPressure
      expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a node has reported the DiskPressure condition as true for 2 minutes.
    - alert: KubernetesNodeDiskPressure
      expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a node has reported the NetworkUnavailable condition as true for 2 minutes.
    - alert: KubernetesNodeNetworkUnavailable
      expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} has NetworkUnavailable condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Running pods per node divided by the node's allocatable pod count, as a percentage; warns above 90%.
    # The `0 * kube_pod_info` term adds nothing to the value: it is joined in (group_left) only to
    # attach the `node` label to each Running-pod series so they can be summed per node.
    - alert: KubernetesNodeOutOfPodCapacity
      expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} is out of pod capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Matches containers whose restart counter grew by at least 1 compared to 10 minutes ago and whose
    # last-terminated reason was OOMKilled for the whole 10m window (min_over_time == 1).
    # $value is the left-hand side, i.e. the restart-count difference over 10 minutes.
    - alert: KubernetesContainerOomKiller
      expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
        description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (for: 0m) for every Job series where kube_job_status_failed is above 0.
    - alert: KubernetesJobFailed
      expr: 'kube_job_status_failed > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes Job failed (instance {{ $labels.instance }})
        description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Job reports zero active, failed and succeeded pods although its start time is more than 600s in the past.
    - alert: KubernetesJobNotStarting
      expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes Job not starting (instance {{ $labels.instance }})
        description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The last schedule time is newer than the last successful time, while the CronJob has no
    # active jobs and is not suspended. $value is the last schedule timestamp (left-hand side).
    - alert: KubernetesCronjobFailing
      expr: '(kube_cronjob_status_last_schedule_time > kube_cronjob_status_last_successful_time) AND (kube_cronjob_status_active == 0) AND (kube_cronjob_spec_suspend == 0)'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes CronJob failing (instance {{ $labels.instance }})
        description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately for every CronJob whose spec.suspend gauge is non-zero.
    - alert: KubernetesCronjobSuspended
      expr: 'kube_cronjob_spec_suspend != 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
        description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a PersistentVolumeClaim has been in the Pending phase for 2 minutes.
    - alert: KubernetesPersistentvolumeclaimPending
      expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
        description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Available bytes as a percentage of capacity is below 10% for 2 minutes.
    # The trailing `capacity > 0` clause excludes volumes that report a zero capacity.
    - alert: KubernetesVolumeOutOfDiskSpace
      expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 and kubelet_volume_stats_capacity_bytes > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
        description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Linear extrapolation of the available bytes (6h of history, sampled every 5m via subquery)
    # four days ahead; fires when the predicted value drops below zero.
    # $value is that predicted byte count (negative when firing), not a percentage, so the
    # description renders it as a byte quantity.
    - alert: KubernetesVolumeFullInFourDays
      expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
        description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Predicted available space in four days: {{ $value | humanize1024 }}B.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when a PersistentVolume is in the Failed or Pending phase.
    # The selector is pinned to job="kube-state-metrics"; adjust it if the scrape job has another name.
    - alert: KubernetesPersistentvolumeError
      expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
        description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Comparison operators are left-associative, so this reads (replicas != replicas_ready) > 0:
    # the desired replica count differs from the ready count and the desired count is above zero.
    - alert: KubernetesStatefulsetDown
      expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # (max - desired) is multiplied by the ScalingLimited=true condition (value 1), matched on
    # HPA name and namespace; the product being 0 means the HPA is ScalingLimited while its
    # desired replica count equals the configured maximum.
    - alert: KubernetesHpaScaleInability
      expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when an HPA reports its ScalingActive condition as false.
    - alert: KubernetesHpaMetricsUnavailability
      expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Desired replicas have reached (or exceed) the configured maximum for 2 minutes.
    # HPAs with max_replicas <= 1 or with min_replicas == max_replicas are excluded.
    - alert: KubernetesHpaScaleMaximum
      expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
      for: 2m
      labels:
        severity: info
      annotations:
        summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Reports HPAs whose median (0.5 quantile) desired replica count over the last day equals the
    # configured minimum, limited to HPAs whose minimum is above 3 replicas.
    # The aggregation keeps `namespace` next to `horizontalpodautoscaler` so that same-named HPAs
    # in different namespaces are not merged and the namespace label used in the description is set.
    - alert: KubernetesHpaUnderutilized
      expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (namespace, horizontalpodautoscaler) > 3'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per namespace/pod sum of the Pending, Unknown and Failed phase series is above 0 for 15 minutes.
    - alert: KubernetesPodNotHealthy
      expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
        description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Container restart counter increased by more than 3 within one minute, sustained for 2 minutes.
    # NOTE(review): increase() needs at least two samples inside the 1m window, so this presumably
    # only yields results with scrape intervals well below 1m - confirm, or widen the range.
    - alert: KubernetesPodCrashLooping
      expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
        description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # ReplicaSet spec replicas differ from ready replicas for 10 minutes.
    - alert: KubernetesReplicasetReplicasMismatch
      expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes ReplicaSet replicas mismatch (instance {{ $labels.instance }})
        description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Deployment spec replicas differ from available replicas for 10 minutes.
    - alert: KubernetesDeploymentReplicasMismatch
      expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
        description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # StatefulSet ready replicas differ from the status replica count for 10 minutes.
    # Note this compares two status metrics (ready vs. current), not the spec'd replica count.
    - alert: KubernetesStatefulsetReplicasMismatch
      expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
        description: "StatefulSet does not match the expected number of replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Deployment's observed generation differs from its metadata generation for 10 minutes.
    - alert: KubernetesDeploymentGenerationMismatch
      expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
        description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # StatefulSet's observed generation differs from its metadata generation for 10 minutes.
    - alert: KubernetesStatefulsetGenerationMismatch
      expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Left factor: current-revision series that have no matching update-revision series (`unless`),
    # i.e. current and update revision differ; the `revision` label is then aggregated away.
    # Right factor: StatefulSets whose desired replicas differ from the updated replica count.
    # The product only exists for StatefulSets present on both sides.
    - alert: KubernetesStatefulsetUpdateNotRolledOut
      expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Either fewer than 100% of the desired DaemonSet pods are ready (only evaluated when the
    # desired count is above zero), or fewer pods are currently scheduled than desired.
    - alert: KubernetesDaemonsetRolloutStuck
      expr: '(kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 and kube_daemonset_status_desired_number_scheduled > 0) or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
        description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a DaemonSet reports a misscheduled pod count above 0 for 1 minute.
    - alert: KubernetesDaemonsetMisscheduled
      expr: 'kube_daemonset_status_number_misscheduled > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
        description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold should be customized for each cronjob name.
    # Fires for Jobs that started more than 1h (3600s) ago, still have active pods and have no
    # completion-time series. `unless` is used instead of absent(): absent() returns a single
    # label-less series, and only when the metric does not exist at all, so combining it with
    # `and` can never match a labelled per-Job series.
    # $value is the number of seconds since the Job started.
    # The description uses the `job_name` label, as the other kube_job_* based alerts in this file do.
    - alert: KubernetesCronjobTooLong
      expr: '(time() - kube_job_status_start_time > 3600) and (kube_job_status_active > 0) unless kube_job_status_completion_time'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
        description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Requested completions minus succeeded minus failed is still above 0 after 12 hours.
    - alert: KubernetesJobSlowCompletion
      expr: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0'
      for: 12h
      labels:
        severity: critical
      annotations:
        summary: Kubernetes Job slow completion (instance {{ $labels.instance }})
        description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of apiserver requests answered with a 5xx code (1m rate, per instance/job) above 3%.
    # The trailing clause requires a non-zero total request rate. Selectors are pinned to job="apiserver".
    - alert: KubernetesApiServerErrors
      expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3 and sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes API server errors (instance {{ $labels.instance }})
        description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of REST client requests answered with a 4xx or 5xx code (1m rate, per instance/job)
    # above 1%. The trailing clause requires a non-zero total request rate.
    - alert: KubernetesApiClientErrors
      expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes API client errors (instance {{ $labels.instance }})
        description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # At least one observation exists in the apiserver_client_certificate_expiration_seconds
    # histogram and its 1st percentile (5m rate, per job) is below 7 days (7*24*60*60 seconds).
    # $value is the observation count (left-hand side of `and`), not the remaining time.
    - alert: KubernetesClientCertificateExpiresNextWeek
      expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
        description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Same check as the "next week" variant with a 24h threshold (24*60*60 seconds) and critical severity.
    # $value is the observation count (left-hand side of `and`), not the remaining time.
    - alert: KubernetesClientCertificateExpiresSoon
      expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
        description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 99th percentile of apiserver request duration (10m rate) above 1 second for 2 minutes.
    # The CONNECT, WATCHLIST, WATCH and PROXY verbs are excluded; only the `subresource` label is
    # aggregated away, so verb and resource remain available for the description.
    - alert: KubernetesApiServerLatency
      expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes API server latency (instance {{ $labels.instance }})
        description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/linkerd/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}.
    # Share of failure-classified responses (1m rate) per deployment/statefulset/daemonset above 10%,
    # evaluated only when the total response rate is non-zero.
    - alert: LinkerdHighErrorRate
      expr: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Linkerd high error rate (instance {{ $labels.instance }})
        description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/loki/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # process_start_time_seconds changed more than twice within 15 minutes for jobs matching .*loki.*.
    - alert: LokiProcessTooManyRestarts
      expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Loki process too many restarts (instance {{ $labels.instance }})
        description: "A loki process had too many restarts (target {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of requests with a 5xx status code (1m rate) per namespace/job/route above 10%
    # for 15 minutes, evaluated only when the total request rate is non-zero.
    - alert: LokiRequestErrors
      expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 and sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 0'
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Loki request errors (instance {{ $labels.instance }})
        description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Number of panics counted over the last 10 minutes per namespace/job is above 0 for 5 minutes.
    # $value is an absolute count (increase of loki_panic_total), not a percentage, so the
    # description reports it as a number of panics.
    - alert: LokiRequestPanic
      expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Loki request panic (instance {{ $labels.instance }})
        description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }} panics in the last 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 99th percentile request duration (5m rate) above 1 second for 5 minutes, excluding routes
    # whose name contains "tail" (case-insensitive).
    # The buckets are aggregated by (le, namespace, job, route) - the same grouping as
    # LokiRequestErrors - so the quantile is computed per route and the job/route labels used in
    # the description are populated. Aggregating by `le` alone would drop them.
    - alert: LokiRequestLatency
      expr: 'histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le, namespace, job, route)) > 1'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Loki request latency (instance {{ $labels.instance }})
        description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/meilisearch/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires immediately when an index reports a document count of exactly 0.
    - alert: MeilisearchIndexIsEmpty
      expr: 'meilisearch_index_docs_count == 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Meilisearch index is empty (instance {{ $labels.instance }})
        description: "Meilisearch index {{ $labels.index }} has zero documents\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when meilisearch_http_response_time_seconds exceeds 0.5.
    # NOTE(review): if Meilisearch exports this metric as a histogram (only _bucket/_sum/_count
    # series), the bare metric name matches nothing and this rule never fires - verify against
    # the /metrics output and consider a histogram_quantile() or _sum/_count based expression.
    - alert: MeilisearchHttpResponseTime
      expr: 'meilisearch_http_response_time_seconds > 0.5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Meilisearch http response time (instance {{ $labels.instance }})
        description: "Meilisearch http response time is too high\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/memcached/memcached-exporter.yml
================================================
groups:

- name: MemcachedExporter

  
  rules:

    # Fires when memcached_up has been 0 for 1 minute.
    # 1m delay allows a restart without triggering an alert.
    - alert: MemcachedDown
      expr: 'memcached_up == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Memcached down (instance {{ $labels.instance }})
        description: "Memcached instance is down on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Current connections as a percentage of max connections above 80% for 2 minutes (warning tier).
    # The `memcached_max_connections > 0` clause excludes instances reporting a zero limit.
    - alert: MemcachedConnectionLimitApproaching(>80%)
      expr: '(memcached_current_connections / memcached_max_connections * 100) > 80 and memcached_max_connections > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Memcached connection limit approaching (> 80%) (instance {{ $labels.instance }})
        description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Critical tier of the connection-limit alert: same expression with a 95% threshold.
    - alert: MemcachedConnectionLimitApproaching(>95%)
      expr: '(memcached_current_connections / memcached_max_connections * 100) > 95 and memcached_max_connections > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Memcached connection limit approaching (> 95%) (instance {{ $labels.instance }})
        description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per-slab out-of-memory counters (5m rate) are summed across slabs; fires when the total
    # rate stays above 0 for 5 minutes.
    - alert: MemcachedOutOfMemoryErrors
      expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Memcached out of memory errors (instance {{ $labels.instance }})
        description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
    # Expression: current bytes as a percentage of the byte limit above 90% for 5 minutes,
    # skipping instances that report a zero limit.
    - alert: MemcachedMemoryUsageHigh(>90%)
      expr: '(memcached_current_bytes / memcached_limit_bytes * 100) > 90 and memcached_limit_bytes > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Memcached memory usage high (> 90%) (instance {{ $labels.instance }})
        description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage. Threshold of 10 evictions/s is a rough default — adjust based on your workload.
    # Expression: per-second eviction rate over 5m above 10, sustained for 5 minutes.
    - alert: MemcachedHighEvictionRate
      expr: 'rate(memcached_items_evicted_total[5m]) > 10'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Memcached high eviction rate (instance {{ $labels.instance }})
        description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. Threshold of 80% is a rough default — adjust based on your workload and access patterns.
    # Expression: get hits / (get hits + get misses) as a percentage (5m rates) below 80% for
    # 10 minutes, evaluated only when the combined get rate is non-zero.
    - alert: MemcachedLowCacheHitRate(<80%)
      expr: '(rate(memcached_commands_total{command="get", status="hit"}[5m]) / (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Memcached low cache hit rate (< 80%) (instance {{ $labels.instance }})
        description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Rejected-connection counter increased within the last 5 minutes, sustained for 5 minutes.
    - alert: MemcachedConnectionsRejected
      expr: 'increase(memcached_connections_rejected_total[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Memcached connections rejected (instance {{ $labels.instance }})
        description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Item-too-large counter increased within the last 5 minutes, sustained for 5 minutes (info only).
    - alert: MemcachedItemsTooLarge
      expr: 'increase(memcached_item_too_large_total[5m]) > 0'
      for: 5m
      labels:
        severity: info
      annotations:
        summary: Memcached items too large (instance {{ $labels.instance }})
        description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/minio/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires immediately when the cluster-wide offline drive count is above 0.
    - alert: MinioClusterDiskOffline
      expr: 'minio_cluster_drive_offline_total > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Minio cluster disk offline (instance {{ $labels.instance }})
        description: "Minio cluster disk is offline\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when minio_cluster_nodes_offline_total is above 0.
    # NOTE(review): the alert name and description speak of a "node disk", but the metric name
    # suggests it counts offline nodes rather than disks - confirm and align the wording.
    - alert: MinioNodeDiskOffline
      expr: 'minio_cluster_nodes_offline_total > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Minio node disk offline (instance {{ $labels.instance }})
        description: "Minio cluster node disk is offline\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Raw free capacity as a percentage of raw total capacity is below 10%.
    # The `raw_total_bytes > 0` clause excludes clusters reporting a zero total capacity.
    - alert: MinioDiskSpaceUsage
      expr: 'minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 and minio_cluster_capacity_raw_total_bytes > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Minio disk space usage (instance {{ $labels.instance }})
        description: "Minio available free space is low (< 10%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/mongodb/dcu-mongodb-exporter.yml
================================================
groups:

- name: DcuMongodbExporter

  
  rules:

    # Average PRIMARY optime_date minus average SECONDARY optime_date is above 10.
    # Both sides use avg() without grouping, so the result is a single label-less series
    # (the instance label in the summary will be empty).
    - alert: MongodbReplicationLag(dcu)
      expr: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: MongoDB replication lag (DCU) (instance {{ $labels.instance }})
        description: "Mongodb replication lag is more than 10s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when a replica set member reports state code 3.
    # State 3 corresponds to RECOVERING in MongoDB's documented member states (verify for your version).
    - alert: MongodbReplicationStatus3
      expr: 'mongodb_replset_member_state == 3'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: MongoDB replication Status 3 (instance {{ $labels.instance }})
        description: "MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when a replica set member reports state code 6.
    # State 6 corresponds to UNKNOWN in MongoDB's documented member states (verify for your version).
    - alert: MongodbReplicationStatus6
      expr: 'mongodb_replset_member_state == 6'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: MongoDB replication Status 6 (instance {{ $labels.instance }})
        description: "MongoDB Replication set member as seen from another member of the set, is not yet known\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when a replica set member reports state code 8.
    # State 8 corresponds to DOWN in MongoDB's documented member states (verify for your version).
    - alert: MongodbReplicationStatus8
      expr: 'mongodb_replset_member_state == 8'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: MongoDB replication Status 8 (instance {{ $labels.instance }})
        description: "MongoDB Replication set member as seen from another member of the set, is unreachable\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when a replica set member reports state code 9.
    # State 9 corresponds to ROLLBACK in MongoDB's documented member states (verify for your version).
    - alert: MongodbReplicationStatus9
      expr: 'mongodb_replset_member_state == 9'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: MongoDB replication Status 9 (instance {{ $labels.instance }})
        description: "MongoDB Replication set member is actively performing a rollback. Data is not available for reads\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when a replica set member reports state code 10.
    # State 10 corresponds to REMOVED in MongoDB's documented member states (verify for your version).
    - alert: MongodbReplicationStatus10
      expr: 'mongodb_replset_member_state == 10'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: MongoDB replication Status 10 (instance {{ $labels.instance }})
        description: "MongoDB Replication set member was once in a replica set but was subsequently removed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The total_open cursor gauge stays above 10000 for 2 minutes.
    - alert: MongodbNumberCursorsOpen(dcu)
      expr: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MongoDB number cursors open (DCU) (instance {{ $labels.instance }})
        description: "Too many cursors opened by MongoDB for clients (> 10k)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # More than 100 cursor timeouts counted per 1m window, sustained for 2 minutes.
    - alert: MongodbCursorsTimeouts(dcu)
      expr: 'increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MongoDB cursors timeouts (DCU) (instance {{ $labels.instance }})
        description: "Too many cursors are timing out ({{ $value }} in the last minute)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Current connections as a percentage of (current + available) above 80% for 2 minutes,
    # evaluated only when that total is non-zero.
    - alert: MongodbTooManyConnections(dcu)
      expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MongoDB too many connections (DCU) (instance {{ $labels.instance }})
        description: "Too many connections (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/mongodb/percona-mongodb-exporter.yml
================================================
groups:

- name: PerconaMongodbExporter

  
  rules:

    # Fires when mongodb_up has been 0 for 1 minute.
    # 1m delay allows a restart without triggering an alert.
    - alert: MongodbDown
      expr: 'mongodb_up == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: MongoDB Down (instance {{ $labels.instance }})
        description: "MongoDB instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Critical when a replica set member's health series reports 0.
    # The 1m hold-off lets a member restart without raising the alert.
    - alert: MongodbReplicaMemberUnhealthy
      expr: >-
        mongodb_rs_members_health == 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "Mongodb replica member unhealthy (instance {{ $labels.instance }})"
        description: |-
          MongoDB replica member is not healthy
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical as soon as the PRIMARY optimeDate leads a SECONDARY's by more than 10
    # (after dividing the difference by 1000). Series are joined per replica set via
    # on (set) group_right, so one result is produced for each SECONDARY member.
    - alert: MongodbReplicationLag(percona)
      expr: >-
        (mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "MongoDB replication lag (Percona) (instance {{ $labels.instance }})"
        description: |-
          Mongodb replication lag is more than 10s
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
    # Headroom = oplog window (head - tail timestamp) minus the current replication lag.
    # The optimeDate difference is divided by 1000, exactly as in MongodbReplicationLag(percona),
    # so that the lag is subtracted in the same unit as the oplog window instead of being
    # 1000x too large. NOTE(review): assumes the oplog timestamps are in seconds — confirm against your exporter version.
    # Both sides are aggregated with sum(avg(...)), so the result carries no labels
    # and {{ $labels.instance }} renders empty in the summary.
    - alert: MongodbReplicationHeadroom
      expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg((mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000)) <= 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: MongoDB replication headroom (instance {{ $labels.instance }})
        description: "MongoDB replication headroom is <= 0\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Warns when the csr_type="total" open-cursor series stays above 10 * 1000 for 2 minutes.
    - alert: MongodbNumberCursorsOpen(percona)
      expr: >-
        mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "MongoDB number cursors open (Percona) (instance {{ $labels.instance }})"
        description: |-
          Too many cursors opened by MongoDB for clients (> 10k)
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when more than 100 cursor timeouts are counted over a 1m window, sustained for 2 minutes.
    - alert: MongodbCursorsTimeouts(percona)
      expr: >-
        increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "MongoDB cursors timeouts (Percona) (instance {{ $labels.instance }})"
        description: |-
          Too many cursors are timing out ({{ $value }} in the last minute)
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when current connections exceed 80% of (current + available) for 2 minutes.
    # The "current" and "available" series differ only in their `conn_type` label, so
    # every binary operation between them needs ignoring(conn_type); without it the
    # operands never match one-to-one and the expression returns no data at all.
    # The trailing `and ... > 0` guard skips instances reporting a zero total.
    - alert: MongodbTooManyConnections(percona)
      expr: 'mongodb_ss_connections{conn_type="current"} / ignoring(conn_type) (mongodb_ss_connections{conn_type="current"} + ignoring(conn_type) mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + ignoring(conn_type) mongodb_ss_connections{conn_type="available"}) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MongoDB too many connections (Percona) (instance {{ $labels.instance }})
        description: "Too many connections (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/mongodb/stefanprodan-mgob-exporter.yml
================================================
groups:

- name: StefanprodanMgobExporter

  
  rules:

    # Critical as soon as the status="500" backup series changed value within the last hour.
    - alert: MgobBackupFailed
      expr: >-
        changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0
      for: 0m
      labels: { severity: critical }
      annotations:
        summary: "Mgob backup failed (instance {{ $labels.instance }})"
        description: |-
          MongoDB backup has failed
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/mysql/mysqld-exporter.yml
================================================
groups:

- name: MysqldExporter

  
  rules:

    # Critical when mysql_up reports 0.
    # The 1m hold-off lets an instance restart without raising the alert.
    - alert: MysqlDown
      expr: >-
        mysql_up == 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "MySQL down (instance {{ $labels.instance }})"
        description: |-
          MySQL instance is down on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the 1m peak of connected threads exceeds 80% of max_connections for 2 minutes.
    # The `and ... > 0` guard skips instances whose max_connections is not positive.
    - alert: MysqlTooManyConnections(>80%)
      expr: >-
        max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "MySQL too many connections (> 80%) (instance {{ $labels.instance }})"
        description: |-
          More than 80% of MySQL connections are in use on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the 1m peak of prepared statements exceeds 80% of max_prepared_stmt_count for 2 minutes.
    # The `and ... > 0` guard skips instances whose limit is not positive.
    - alert: MysqlHighPreparedStatementsUtilization(>80%)
      expr: >-
        max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})"
        description: |-
          High utilization of prepared statements (>80%) on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the 1m peak of running threads exceeds 60% of max_connections for 2 minutes.
    # The `and ... > 0` guard skips instances whose max_connections is not positive.
    - alert: MysqlHighThreadsRunning
      expr: >-
        max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "MySQL high threads running (instance {{ $labels.instance }})"
        description: |-
          More than 60% of MySQL connections are in running state on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the slave IO-running series is 0; only instances that also report
    # a master_server_id above 0 are considered (joined on the instance label).
    # The 1m hold-off lets a restart pass without raising the alert.
    - alert: MysqlSlaveIoThreadNotRunning
      expr: >-
        ( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "MySQL Slave IO thread not running (instance {{ $labels.instance }})"
        description: |-
          MySQL Slave IO thread not running on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the slave SQL-running series is 0; only instances that also report
    # a master_server_id above 0 are considered (joined on the instance label).
    # The 1m hold-off lets a restart pass without raising the alert.
    - alert: MysqlSlaveSqlThreadNotRunning
      expr: >-
        ( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "MySQL Slave SQL thread not running (instance {{ $labels.instance }})"
        description: |-
          MySQL Slave SQL thread not running on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when seconds_behind_master minus the configured sql_delay exceeds 30 for 1 minute.
    # Only instances that also report a master_server_id above 0 are considered.
    - alert: MysqlSlaveReplicationLag
      expr: >-
        ( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "MySQL Slave replication lag (instance {{ $labels.instance }})"
        description: |-
          MySQL replication lag on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the slow-query counter keeps increasing over 1m windows for 2 minutes.
    - alert: MysqlSlowQueries
      expr: >-
        increase(mysql_global_status_slow_queries[1m]) > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "MySQL slow queries (instance {{ $labels.instance }})"
        description: |-
          MySQL server mysql has some new slow query ({{ $value }} in the last minute).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately when the per-second rate of InnoDB log waits, taken over 15m, exceeds 10.
    - alert: MysqlInnodbLogWaits
      expr: >-
        rate(mysql_global_status_innodb_log_waits[15m]) > 10
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "MySQL InnoDB log waits (instance {{ $labels.instance }})"
        description: |-
          MySQL innodb log writes stalling ({{ $value }} waits/s)
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Informational: fires immediately while the reported uptime is below 60.
    - alert: MysqlRestarted
      expr: >-
        mysql_global_status_uptime < 60
      for: 0m
      labels: { severity: info }
      annotations:
        summary: "MySQL restarted (instance {{ $labels.instance }})"
        description: |-
          MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Informational: the instantaneous rate of the questions counter stays above 10000 for 2 minutes.
    - alert: MysqlHighQps
      expr: >-
        irate(mysql_global_status_questions[1m]) > 10000
      for: 2m
      labels: { severity: info }
      annotations:
        summary: "MySQL High QPS (instance {{ $labels.instance }})"
        description: |-
          MySQL is being overload with unusual QPS (> 10k QPS).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when innodb_num_open_files exceeds 75% of open_files_limit for 2 minutes.
    # The `and ... > 0` guard skips instances whose limit is not positive.
    - alert: MysqlTooManyOpenFiles
      expr: >-
        mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "MySQL too many open files (instance {{ $labels.instance }})"
        description: |-
          MySQL has too many open files, consider increase variables open_files_limit on {{ $labels.instance }}.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the innodb_force_recovery variable is anything other than 0 for 2 minutes.
    - alert: MysqlInnodbForceRecoveryIsEnabled
      expr: >-
        mysql_global_variables_innodb_force_recovery != 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})"
        description: |-
          MySQL InnoDB force recovery is enabled on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the trx_rseg_history_len series stays above 50000 for 2 minutes.
    - alert: MysqlInnodbHistory_lenTooLong
      expr: >-
        mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "MySQL InnoDB history_len too long (instance {{ $labels.instance }})"
        description: |-
          MySQL history_len (undo log) too long on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/nats/nats-exporter.yml
================================================
groups:

- name: NatsExporter

  
  rules:

    # Warns when the reported route count stays above 10 for 3 minutes.
    - alert: NatsHighRoutesCount
      expr: >-
        gnatsd_varz_routes > 10
      for: 3m
      labels: { severity: warning }
      annotations:
        summary: "Nats high routes count (instance {{ $labels.instance }})"
        description: |-
          High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when gnatsd_varz_mem stays above 200 * 1024 * 1024 for 5 minutes.
    - alert: NatsHighMemoryUsage
      expr: >-
        gnatsd_varz_mem > 200 * 1024 * 1024
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats high memory usage (instance {{ $labels.instance }})"
        description: |-
          NATS server memory usage is above 200MB for {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when the slow-consumer count stays above 0 for 3 minutes.
    - alert: NatsSlowConsumers
      expr: >-
        gnatsd_varz_slow_consumers > 0
      for: 3m
      labels: { severity: critical }
      annotations:
        summary: "Nats slow consumers (instance {{ $labels.instance }})"
        description: |-
          There are slow consumers in NATS for {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when a NATS target has been unreachable for 5 minutes.
    # Two cases are covered:
    #   * up == 0   - the target is still configured but its scrape fails;
    #   * absent()  - no `up` series exists for the job at all.
    # absent() alone misses the first case, because a failing scrape still
    # produces an `up` series with value 0.
    # NOTE(review): the job label "nats" must match your scrape config — confirm.
    - alert: NatsServerDown
      expr: 'up{job="nats"} == 0 or absent(up{job="nats"})'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Nats server down (instance {{ $labels.instance }})
        description: "NATS server has been down for more than 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
    # Warns when it stays above 80 for 5 minutes.
    - alert: NatsHighCpuUsage
      expr: >-
        gnatsd_varz_cpu > 80
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats high CPU usage (instance {{ $labels.instance }})"
        description: |-
          NATS server is using more than 80% CPU for the last 5 minutes
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the connection count stays above 1000 for 5 minutes.
    - alert: NatsHighNumberOfConnections
      expr: >-
        gnatsd_connz_num_connections > 1000
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats high number of connections (instance {{ $labels.instance }})"
        description: |-
          NATS server has more than 1000 active connections
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when JetStream storage in use exceeds 0.8 of the configured max_storage for 5 minutes.
    # The `and ... > 0` guard skips servers whose max_storage is not positive.
    - alert: NatsHighJetstreamStoreUsage
      expr: >-
        gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats high JetStream store usage (instance {{ $labels.instance }})"
        description: |-
          JetStream store usage is over 80%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when JetStream memory in use exceeds 0.8 of the configured max_memory for 5 minutes.
    # The `and ... > 0` guard skips servers whose max_memory is not positive.
    - alert: NatsHighJetstreamMemoryUsage
      expr: >-
        gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats high JetStream memory usage (instance {{ $labels.instance }})"
        description: |-
          JetStream memory usage is over 80%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the subscription count stays above 1000 for 5 minutes.
    - alert: NatsHighNumberOfSubscriptions
      expr: >-
        gnatsd_connz_subscriptions > 1000
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats high number of subscriptions (instance {{ $labels.instance }})"
        description: |-
          NATS server has more than 1000 active subscriptions
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when pending bytes stay above 100000 for 5 minutes.
    - alert: NatsHighPendingBytes
      expr: >-
        gnatsd_connz_pending_bytes > 100000
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats high pending bytes (instance {{ $labels.instance }})"
        description: |-
          NATS server has more than 100,000 pending bytes
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the JetStream API error counter keeps increasing over 5m windows for 5 minutes.
    - alert: NatsTooManyErrors
      expr: >-
        increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats too many errors (instance {{ $labels.instance }})"
        description: |-
          NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the account count summed over all series stays above 100 for 5 minutes.
    # sum() removes every label, so {{ $labels.instance }} renders empty here.
    - alert: NatsJetstreamAccountsExceeded
      expr: >-
        sum(gnatsd_varz_jetstream_stats_accounts) > 100
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats JetStream accounts exceeded (instance {{ $labels.instance }})"
        description: |-
          JetStream has more than 100 active accounts
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when a server reports 0 leaf nodes for 5 minutes.
    - alert: NatsLeafNodeConnectionIssue
      expr: >-
        gnatsd_varz_leafnodes == 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Nats leaf node connection issue (instance {{ $labels.instance }})"
        description: |-
          No leaf node connections on {{ $labels.instance }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/netdata/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%.
    # The condition must hold for 5 minutes.
    - alert: NetdataHighCpuUsage
      expr: >-
        netdata_cpu_cpu_percentage_average{dimension="idle"} < 20
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Netdata high cpu usage (instance {{ $labels.instance }})"
        description: |-
          Netdata high CPU usage (> 80%)
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the "steal" CPU dimension stays above 10 for 5 minutes.
    - alert: NetdataCpuStealNoisyNeighbor
      expr: >-
        netdata_cpu_cpu_percentage_average{dimension="steal"} > 10
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "Netdata CPU steal noisy neighbor (instance {{ $labels.instance }})"
        description: |-
          CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when free + cached memory is below 20% of total RAM for 5 minutes.
    # Each RAM component is a separate series distinguished by the `dimension`
    # label, so numerator and denominator are both summed `without (dimension)`:
    #   numerator   = free + cached
    #   denominator = all dimensions of the metric
    # Dividing the raw series directly would pair every series with itself
    # (identical label sets), always yield 100 and never satisfy `< 20`.
    # The `and ... > 0` guard skips hosts reporting a zero total.
    - alert: NetdataHighMemoryUsage
      expr: '100 * sum without (dimension) (netdata_system_ram_MiB_average{dimension=~"free|cached"}) / sum without (dimension) (netdata_system_ram_MiB_average) < 20 and sum without (dimension) (netdata_system_ram_MiB_average) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Netdata high memory usage (instance {{ $labels.instance }})
        description: "Netdata high memory usage (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Warns when the available share of a disk is below 20% of its total for 5 minutes.
    # Each space component is a separate series distinguished by the `dimension`
    # label, so numerator and denominator are both summed `without (dimension)`;
    # the remaining labels keep the result per disk chart.
    # Dividing the raw series directly would pair every series with itself
    # (identical label sets), always yield 100 and never satisfy `< 20`.
    # The `and ... > 0` guard skips disks reporting a zero total.
    - alert: NetdataLowDiskSpace
      expr: '100 * sum without (dimension) (netdata_disk_space_GB_average{dimension=~"avail|cached"}) / sum without (dimension) (netdata_disk_space_GB_average) < 20 and sum without (dimension) (netdata_disk_space_GB_average) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Netdata low disk space (instance {{ $labels.instance }})
        description: "Netdata low disk space (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Warns immediately when a linear fit over the last 3h of available space
    # predicts a negative value 24 * 3600 seconds ahead.
    - alert: NetdataPredictedDiskFull
      expr: >-
        predict_linear(netdata_disk_space_GB_average{dimension=~"avail|cached"}[3h], 24 * 3600) < 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Netdata predicted disk full (instance {{ $labels.instance }})"
        description: |-
          Netdata predicted disk full in 24 hours
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the MD mismatch count stays above 1024 for 2 minutes.
    - alert: NetdataMdMismatchCntUnsynchronizedBlocks
      expr: >-
        netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Netdata MD mismatch cnt unsynchronized blocks (instance {{ $labels.instance }})"
        description: |-
          RAID Array have unsynchronized blocks
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Informational: fires immediately when the reallocated-sector series grew within the last minute.
    - alert: NetdataDiskReallocatedSectors
      expr: >-
        increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0
      for: 0m
      labels: { severity: info }
      annotations:
        summary: "Netdata disk reallocated sectors (instance {{ $labels.instance }})"
        description: |-
          Disk reallocated sectors detected ({{ $value }} sectors)
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately when the current-pending-sector count is above 0.
    - alert: NetdataDiskCurrentPendingSector
      expr: >-
        netdata_smartd_log_current_pending_sector_count_sectors_average > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Netdata disk current pending sector (instance {{ $labels.instance }})"
        description: |-
          Disk current pending sector
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately when the offline-uncorrectable sector series grew within the last 2 minutes.
    - alert: NetdataReportedUncorrectableDiskSectors
      expr: >-
        increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Netdata reported uncorrectable disk sectors (instance {{ $labels.instance }})"
        description: |-
          Reported uncorrectable disk sectors ({{ $value }} sectors)
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/nginx/knyar-nginx-exporter.yml
================================================
groups:

- name: KnyarNginxExporter

  
  rules:

    # Critical when 4xx responses exceed 5% of the overall request rate for 1 minute.
    # Both rates are summed over all series, so the result carries no labels;
    # the `and ... > 0` guard requires a non-zero overall request rate.
    - alert: NginxHighHttp4xxErrorRate
      expr: >-
        sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
        description: |-
          Too many HTTP requests with status 4xx (> 5%)
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when 5xx responses exceed 5% of the overall request rate for 1 minute.
    # Both rates are summed over all series, so the result carries no labels;
    # the `and ... > 0` guard requires a non-zero overall request rate.
    - alert: NginxHighHttp5xxErrorRate
      expr: >-
        sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0
      for: 1m
      labels: { severity: critical }
      annotations:
        summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
        description: |-
          Too many HTTP requests with status 5xx (> 5%)
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the 0.99 quantile of request duration, computed from 2m bucket
    # rates grouped by (host, node, le), stays above 3 for 2 minutes.
    - alert: NginxLatencyHigh
      expr: >-
        histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Nginx latency high (instance {{ $labels.instance }})"
        description: |-
          Nginx p99 latency is higher than 3 seconds
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/nomad/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Warns immediately when the job summary reports a failed count above 0.
    - alert: NomadJobFailed
      expr: >-
        nomad_nomad_job_summary_failed > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Nomad job failed (instance {{ $labels.instance }})"
        description: |-
          Nomad job failed
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately when the job summary reports a lost count above 0.
    - alert: NomadJobLost
      expr: >-
        nomad_nomad_job_summary_lost > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Nomad job lost (instance {{ $labels.instance }})"
        description: |-
          Nomad job lost
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the job summary reports a queued count above 0 for 2 minutes.
    - alert: NomadJobQueued
      expr: >-
        nomad_nomad_job_summary_queued > 0
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "Nomad job queued (instance {{ $labels.instance }})"
        description: |-
          Nomad job queued
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately when the blocked-evaluations total is above 0.
    - alert: NomadBlockedEvaluation
      expr: >-
        nomad_nomad_blocked_evals_total_blocked > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "Nomad blocked evaluation (instance {{ $labels.instance }})"
        description: |-
          Nomad blocked evaluation
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/openebs/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Warns when the used pool capacity percentage stays above 80 for 2 minutes.
    - alert: OpenebsUsedPoolCapacity
      expr: >-
        openebs_used_pool_capacity_percent > 80
      for: 2m
      labels: { severity: warning }
      annotations:
        summary: "OpenEBS used pool capacity (instance {{ $labels.instance }})"
        description: |-
          OpenEBS Pool use more than 80% of his capacity
            VALUE = {{ $value }}
            LABELS = {{ $labels }}


================================================
FILE: dist/rules/openstack/openstack-exporter.yml
================================================
groups:

- name: OpenstackExporter

  
  rules:

    # Critical when a target whose job name contains "openstack" reports up == 0 for 2 minutes.
    - alert: OpenstackExporterDown
      expr: >-
        up{job=~".*openstack.*"} == 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "OpenStack exporter down (instance {{ $labels.instance }})"
        description: |-
          The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when a Nova agent with adminState="enabled" reports state 0 for 2 minutes.
    - alert: OpenstackNovaAgentDown
      expr: >-
        openstack_nova_agent_state{adminState="enabled"} == 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "OpenStack Nova agent down (instance {{ $labels.instance }})"
        description: |-
          Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when a Neutron agent with adminState="up" reports state 0 for 2 minutes.
    - alert: OpenstackNeutronAgentDown
      expr: >-
        openstack_neutron_agent_state{adminState="up"} == 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "OpenStack Neutron agent down (instance {{ $labels.instance }})"
        description: |-
          Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical when a Cinder agent with adminState="enabled" reports state 0 for 2 minutes.
    - alert: OpenstackCinderAgentDown
      expr: >-
        openstack_cinder_agent_state{adminState="enabled"} == 0
      for: 2m
      labels: { severity: critical }
      annotations:
        summary: "OpenStack Cinder agent down (instance {{ $labels.instance }})"
        description: |-
          Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
    # Warns when used vCPUs exceed 0.9 of the available vCPUs for 5 minutes;
    # the `and ... > 0` guard skips hypervisors reporting no available vCPUs.
    - alert: OpenstackHypervisorHighVcpuUsage
      expr: >-
        openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack hypervisor high vCPU usage (instance {{ $labels.instance }})"
        description: |-
          Hypervisor {{ $labels.hostname }} vCPU usage is above 90%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
    # Warns when used memory exceeds 0.9 of the available memory for 5 minutes;
    # the `and ... > 0` guard skips hypervisors reporting no available memory.
    - alert: OpenstackHypervisorHighMemoryUsage
      expr: >-
        openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack hypervisor high memory usage (instance {{ $labels.instance }})"
        description: |-
          Hypervisor {{ $labels.hostname }} memory usage is above 90%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when used local storage exceeds 0.9 of the available local storage for 5 minutes;
    # the `and ... > 0` guard skips hypervisors reporting no available storage.
    - alert: OpenstackHypervisorHighDiskUsage
      expr: >-
        openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack hypervisor high disk usage (instance {{ $labels.instance }})"
        description: |-
          Hypervisor {{ $labels.hostname }} local disk usage is above 90%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
    # The `and ... > 0` guard therefore excludes unlimited tenants; the alert fires
    # immediately once used vCPUs exceed 0.9 of a positive quota.
    - alert: OpenstackNovaTenantVcpuQuotaNearlyExhausted
      expr: >-
        openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Nova tenant vCPU quota nearly exhausted (instance {{ $labels.instance }})"
        description: |-
          Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately once used memory exceeds 0.9 of a positive tenant memory quota.
    # The `and ... > 0` guard skips tenants whose quota value is not positive.
    - alert: OpenstackNovaTenantMemoryQuotaNearlyExhausted
      expr: >-
        openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Nova tenant memory quota nearly exhausted (instance {{ $labels.instance }})"
        description: |-
          Tenant {{ $labels.tenant }} has used over 90% of its memory quota
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately once used instances exceed 0.9 of a positive tenant instance quota.
    # The `and ... > 0` guard skips tenants whose quota value is not positive.
    - alert: OpenstackNovaTenantInstanceQuotaNearlyExhausted
      expr: >-
        openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Nova tenant instance quota nearly exhausted (instance {{ $labels.instance }})"
        description: |-
          Tenant {{ $labels.tenant }} has used over 90% of its instance quota
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately once used volume GB exceed 0.9 of a positive tenant volume quota.
    # The `and ... > 0` guard skips tenants whose quota value is not positive.
    - alert: OpenstackCinderTenantVolumeQuotaNearlyExhausted
      expr: >-
        openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Cinder tenant volume quota nearly exhausted (instance {{ $labels.instance }})"
        description: |-
          Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when a pool's free capacity is below 0.1 of its total capacity for 5 minutes.
    # The `and ... > 0` guard skips pools reporting a zero total.
    - alert: OpenstackCinderPoolLowFreeCapacity
      expr: >-
        openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Cinder pool low free capacity (instance {{ $labels.instance }})"
        description: |-
          Cinder storage pool {{ $labels.name }} has less than 10% free capacity
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the associated-but-not-active floating IP count stays above 0 for 5 minutes.
    - alert: OpenstackNeutronFloatingIpsAssociatedButNotActive
      expr: >-
        openstack_neutron_floating_ips_associated_not_active > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Neutron floating IPs associated but not active (instance {{ $labels.instance }})"
        description: |-
          {{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the not-active router count stays above 0 for 5 minutes.
    - alert: OpenstackNeutronRoutersNotActive
      expr: >-
        openstack_neutron_routers_not_active > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Neutron routers not active (instance {{ $labels.instance }})"
        description: |-
          {{ $value }} Neutron routers are not in ACTIVE state
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns immediately once used IPs exceed 0.9 of a subnet's total IPs.
    # The `and ... > 0` guard skips subnets reporting a zero total.
    - alert: OpenstackNeutronSubnetIpPoolExhaustion
      expr: >-
        openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0
      for: 0m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Neutron subnet IP pool exhaustion (instance {{ $labels.instance }})"
        description: |-
          Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the ports-without-IPs count stays above 0 for 5 minutes.
    - alert: OpenstackNeutronPortsWithoutIps
      expr: >-
        openstack_neutron_ports_no_ips > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Neutron ports without IPs (instance {{ $labels.instance }})"
        description: |-
          {{ $value }} active ports have no IP addresses assigned
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when a load balancer status series whose operating_status label is not
    # "ONLINE" stays above 0 for 5 minutes.
    - alert: OpenstackLoadBalancerNotOnline
      expr: >-
        openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack load balancer not online (instance {{ $labels.instance }})"
        description: |-
          Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the sum of server status series labelled status="ERROR" stays above 0 for 5 minutes.
    # sum() removes every label, so {{ $labels.instance }} renders empty here.
    - alert: OpenstackNovaInstancesInErrorState
      expr: >-
        sum(openstack_nova_server_status{status="ERROR"}) > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Nova instances in ERROR state (instance {{ $labels.instance }})"
        description: |-
          {{ $value }} Nova instances are in ERROR state
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when a volume status counter whose status label starts with "error"
    # stays above 0 for 5 minutes.
    - alert: OpenstackCinderVolumesInErrorState
      expr: >-
        openstack_cinder_volume_status_counter{status=~"error.*"} > 0
      for: 5m
      labels: { severity: warning }
      annotations:
        summary: "OpenStack Cinder volumes in error state (instance {{ $labels.instance }})"
        description: |-
          {{ $value }} Cinder volumes are in an error state
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # This alert factors in the allocation ratio to compute effective capacity.
    # The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.
    - alert: OpenstackPlacementResourceHighUsage
      expr: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack placement resource high usage (instance {{ $labels.instance }})
        description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/opentelemetry-collector/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  # OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
  # These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
  # All collector internal metrics are prefixed with 'otelcol_'.
  
  rules:

    # Fires when a scrape target whose job label matches the regex .*otel.*collector.* has been
    # reporting up == 0 for 1 minute.
    # The rule only applies to jobs whose name matches that regex; a collector scraped under a
    # differently named job is not covered and the selector must be adapted.
    # up == 0 only catches failed scrapes: a target removed from the scrape configuration or
    # service discovery has no `up` series at all and will not trigger this alert.
    - alert: OpentelemetryCollectorDown
      expr: 'up{job=~".*otel.*collector.*"} == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector down (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorReceiverRefusedSpans
      expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorReceiverRefusedMetricPoints
      expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorReceiverRefusedLogRecords
      expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorExporterFailedSpans
      expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorExporterFailedMetricPoints
      expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorExporterFailedLogRecords
      expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires as soon as an exporter queue is more than 80% full (queue_size / queue_capacity).
    # The division is matched on (instance, job, exporter), so its result carries only those
    # three labels. The zero-capacity guard must therefore be matched on the same labels:
    # a bare `and` compares full label sets and would never match capacity series that carry
    # any additional label, silently disabling the alert.
    # NOTE(review): if several queue series exist per (instance, job, exporter), e.g. one per
    # signal type, the one-to-one division errors out - extend the on(...) list in that case.
    - alert: OpentelemetryCollectorExporterQueueNearlyFull
      expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) otelcol_exporter_queue_capacity > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorProcessorRefusedSpans
      expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorProcessorRefusedMetricPoints
      expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when heap_alloc_bytes / total_sys_memory_bytes stays above 0.9 for 5 minutes.
    # Both selectors are limited to jobs matching .*otel.*collector.* and the series are paired
    # on (instance, job), so the resulting alert carries only those two labels.
    # NOTE(review): the denominator looks like memory obtained from the OS by the collector
    # runtime rather than host or container memory - confirm this ratio reflects the limit you
    # care about before relying on the 90% threshold.
    - alert: OpentelemetryCollectorHighMemoryUsage
      expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector high memory usage (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector memory usage is above 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, for 2 minutes, the "otlp" receiver accepts no spans at all (accepted rate is
    # exactly 0) while it is refusing spans (refused rate above 0).
    # The two sides are joined with a plain `and`, so the accepted and refused series must have
    # identical label sets; $value is the accepted rate from the left-hand side, i.e. always 0.
    # Only spans are checked; metric points and log records are not covered by this rule.
    - alert: OpentelemetryCollectorOtlpReceiverErrors
      expr: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector OTLP receiver errors (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/oracle-database/iamseth-oracledb-exporter.yml
================================================
groups:

- name: IamsethOracledbExporter

  
  rules:

    # 1m delay allows a restart without triggering an alert.
    - alert: OracleDbDown
      expr: 'oracledb_up == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Oracle DB down (instance {{ $labels.instance }})
        description: "Oracle Database instance is down on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is workload-dependent. Adjust 85% to suit your environment.
    # Computes current session utilization as a percentage of the configured session limit.
    # The trailing `and ... > 0` guard skips series whose limit is zero or negative, which avoids
    # a division by zero (presumably the exporter also reports unlimited resources with a
    # non-positive limit - verify against the exporter's default metrics).
    - alert: OracleDbSessionsReachingLimit(>85%)
      expr: 'oracledb_resource_current_utilization{resource_name="sessions"} / oracledb_resource_limit_value{resource_name="sessions"} * 100 > 85 and oracledb_resource_limit_value{resource_name="sessions"} > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Oracle DB sessions reaching limit (> 85%) (instance {{ $labels.instance }})
        description: "Oracle Database session utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is workload-dependent. Adjust 85% to suit your environment.
    - alert: OracleDbProcessesReachingLimit(>85%)
      expr: 'oracledb_resource_current_utilization{resource_name="processes"} / oracledb_resource_limit_value{resource_name="processes"} * 100 > 85 and oracledb_resource_limit_value{resource_name="processes"} > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Oracle DB processes reaching limit (> 85%) (instance {{ $labels.instance }})
        description: "Oracle Database process utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OracleDbTablespaceReachingCapacity(>85%)
      expr: 'oracledb_tablespace_used_percent > 85'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Oracle DB tablespace reaching capacity (> 85%) (instance {{ $labels.instance }})
        description: "Oracle Database tablespace {{ $labels.tablespace }} is above 85% usage on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OracleDbTablespaceFull(>95%)
      expr: 'oracledb_tablespace_used_percent > 95'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Oracle DB tablespace full (> 95%) (instance {{ $labels.instance }})
        description: "Oracle Database tablespace {{ $labels.tablespace }} is critically full on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # A high rollback rate (>20%) often indicates application-level issues such as deadlocks, constraint violations, or poorly designed transactions.
    - alert: OracleDbHighUserRollbacks
      expr: 'rate(oracledb_activity_user_rollbacks[5m]) / (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) * 100 > 20 and (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Oracle DB high user rollbacks (instance {{ $labels.instance }})
        description: "Oracle Database on {{ $labels.instance }} has a high rollback rate ({{ $value }}% of transactions are rolled back)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is highly workload-dependent. Adjust 200 to suit your environment.
    - alert: OracleDbTooManyActiveSessions
      expr: 'oracledb_sessions_value{status="ACTIVE", type="USER"} > 200'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Oracle DB too many active sessions (instance {{ $labels.instance }})
        description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time.
    - alert: OracleDbHighWaitTime(userI/o)
      expr: 'oracledb_wait_time_user_io > 300'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Oracle DB high wait time (user I/O) (instance {{ $labels.instance }})
        description: "Oracle Database on {{ $labels.instance }} is experiencing high user I/O wait time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/patroni/embedded-exporter-patroni.yml
================================================
groups:

- name: EmbeddedExporterPatroni

  
  rules:

    # 1m delay allows a restart without triggering an alert.
    # Fires for a cluster (scope) when no member reports patroni_primary >= 1 and no member
    # reports patroni_standby_leader >= 1.
    # Both sides are aggregated with `max by (scope)`, so the alert carries only the `scope`
    # label; {{ $labels.instance }} in the summary renders empty.
    # If either metric is missing entirely for a scope, the `and` yields nothing and the alert
    # does not fire for that scope.
    - alert: PatroniHasNoLeader
      expr: '(max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Patroni has no Leader (instance {{ $labels.instance }})
        description: "A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml
================================================
groups:

- name: SpreakerPgbouncerExporter

  
  rules:

    - alert: PgbouncerActiveConnections
      expr: 'pgbouncer_pools_server_active_connections > 200'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: PGBouncer active connections (instance {{ $labels.instance }})
        description: "PGBouncer pools are filling up\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when any single error series grew by more than 10 within the last minute.
    # Series with errmsg "server conn crashed?" are excluded from this rule.
    # The threshold is evaluated per series (per errmsg), not on the total across all messages.
    # The max_client_conn message is not excluded here, so this rule can fire together with
    # PgbouncerMaxConnections.
    - alert: PgbouncerErrors
      expr: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: PGBouncer errors (instance {{ $labels.instance }})
        description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PgbouncerMaxConnections
      expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: PGBouncer max connections (instance {{ $labels.instance }})
        description: "The number of PGBouncer client connections has reached max_client_conn.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/php-fpm/bakins-fpm-exporter.yml
================================================
groups:

- name: BakinsFpmExporter

  
  rules:

    # Fires immediately when the max-children counter, summed over all series of an instance,
    # increased by more than 3 during the last 5 minutes.
    # The result is aggregated by instance, so `instance` is the only label on the alert.
    - alert: Php-fpmMax-childrenReached
      expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 3'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: PHP-FPM max-children reached (instance {{ $labels.instance }})
        description: "PHP-FPM reached max children on {{ $labels.instance }} ({{ $value }} times in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/postgresql/postgres-exporter.yml
================================================
groups:

- name: PostgresExporter

  
  rules:

    # 1m delay allows a restart without triggering an alert.
    - alert: PostgresqlDown
      expr: 'pg_up == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Postgresql down (instance {{ $labels.instance }})
        description: "Postgresql instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately while the postmaster start time is less than 60 seconds in the past.
    # $value is the current uptime in seconds. Because the window is only 60s, the restart is
    # detected only if a rule evaluation (and a successful scrape) happens within that minute.
    - alert: PostgresqlRestarted
      expr: 'time() - pg_postmaster_start_time_seconds < 60'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Postgresql restarted (instance {{ $labels.instance }})
        description: "Postgresql restarted\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlExporterError
      expr: 'pg_exporter_last_scrape_error > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Postgresql exporter error (instance {{ $labels.instance }})
        description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires for a table whose deleted + updated + HOT-updated tuple counters exceed
    # autovacuum_vacuum_threshold and whose last autovacuum is more than 10 days old.
    # The per-table series carry table-level labels (e.g. relname) that the instance-level
    # pg_settings_* series do not have, so the comparison needs an explicit many-to-one match:
    # `on (instance, job) group_left ()`. With default one-to-one matching the label sets never
    # line up and the rule cannot fire.
    # NOTE(review): assumes a single pg_settings_autovacuum_vacuum_threshold series per
    # (instance, job); add further labels to on(...) if one exporter reports several servers.
    - alert: PostgresqlTableNotAutoVacuumed
      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > on (instance, job) group_left () pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
        description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires for a table whose deleted + updated + HOT-updated tuple counters exceed
    # autovacuum_analyze_threshold and whose last autoanalyze is more than 10 days old.
    # The per-table series carry table-level labels (e.g. relname) that the instance-level
    # pg_settings_* series do not have, so the comparison needs an explicit many-to-one match:
    # `on (instance, job) group_left ()`. With default one-to-one matching the label sets never
    # line up and the rule cannot fire.
    # NOTE(review): assumes a single pg_settings_autovacuum_analyze_threshold series per
    # (instance, job); add further labels to on(...) if one exporter reports several servers.
    - alert: PostgresqlTableNotAutoAnalyzed
      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > on (instance, job) group_left () pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
        description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the total of pg_stat_activity_count per (instance, job, server) has been above
    # 80% of max_connections for 2 minutes.
    # Both sides are aggregated to the same (instance, job, server) label set so that the
    # comparison can match one-to-one; $value is the summed connection count.
    - alert: PostgresqlTooManyConnections
      expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Postgresql too many connections (instance {{ $labels.instance }})
        description: "PostgreSQL instance has too many connections (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlNotEnoughConnections
      expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Postgresql not enough connections (instance {{ $labels.instance }})
        description: "PostgreSQL instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlDeadLocks
      expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Postgresql dead locks (instance {{ $labels.instance }})
        description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Computes rollbacks / (rollbacks + commits) over 3 minutes per series, then sums those
    # ratios by (namespace, datname) and fires when the sum exceeds 0.02.
    # Databases matching template.*|postgres and series with datid "0" are excluded.
    # After the aggregation only `namespace` and `datname` remain, so {{ $labels.instance }}
    # renders empty in the summary.
    # NOTE(review): when several instances share the same namespace/datname, their ratios are
    # added together rather than averaged - confirm that this is intended.
    - alert: PostgresqlHighRollbackRate
      expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Postgresql high rollback rate (instance {{ $labels.instance }})
        description: "Ratio of transactions being aborted compared to committed is > 2 %\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlCommitRateLow
      expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Postgresql commit rate low (instance {{ $labels.instance }})
        description: "Postgresql seems to be processing very few transactions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlLowXidConsumption
      expr: 'rate(pg_txid_current[1m]) < 5'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Postgresql low XID consumption (instance {{ $labels.instance }})
        description: "Postgresql seems to be consuming transaction IDs very slowly\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlUnusedReplicationSlot
      expr: '(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Postgresql unused replication slot (instance {{ $labels.instance }})
        description: "Unused Replication Slots\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Two-stage filter: only tables with more than 10000 dead tuples are kept, and of those only
    # the ones where dead tuples make up at least 10% of (live + dead) tuples.
    # The trailing `and ... > 0` guard requires a non-zero tuple total.
    # $value is the dead-tuple ratio (0.1 to 1), not the dead-tuple count.
    - alert: PostgresqlTooManyDeadTuples
      expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 and (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
        description: "PostgreSQL dead tuples is too large\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Compares every pg_settings_* series (except pg_settings_transaction_read_only) with its own
    # value 5 minutes earlier, pairing series on metric name and instance, and fires for each
    # setting whose value differs.
    # With `for: 0m` the alert starts as soon as a change is scraped and clears by itself once
    # the old value falls out of the 5-minute offset.
    # The selector matches on __name__ by regex, so it touches every pg_settings_* series on
    # each evaluation.
    - alert: PostgresqlConfigurationChanged
      expr: '{__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} != ON(__name__, instance) {__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} OFFSET 5m'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Postgresql configuration changed (instance {{ $labels.instance }})
        description: "Postgres Database configuration change has occurred\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlSslCompressionActive
      expr: 'sum by (instance) (pg_stat_ssl_compression) > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Postgresql SSL compression active (instance {{ $labels.instance }})
        description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the locks held on an instance exceed 20% of the theoretical lock table size
    # (max_locks_per_transaction * max_connections) for 2 minutes.
    # `sum by (instance)` leaves only the `instance` label, whereas the pg_settings_* series
    # carry further labels (e.g. job). Both the division and the zero-capacity guard are
    # therefore matched with `on (instance)`; without it no pairs match and the rule cannot fire.
    # NOTE(review): assumes one pg_settings_* series per instance; if one exporter instance
    # reports several servers, aggregate and match on the additional label as well.
    - alert: PostgresqlTooManyLocksAcquired
      expr: '((sum by (instance) (pg_locks_count)) / on (instance) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and on (instance) (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
        description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
    # Fires when an index has been more than 80% bloated for 1 hour and a series with the same
    # idxname reports a real size above 100000000 bytes, which filters out small indexes.
    # The size filter is matched on `idxname` only, so identically named indexes in different
    # schemas, databases or instances are not told apart by it.
    # NOTE(review): the pg_bloat_* metrics presumably come from custom exporter queries (see the
    # linked issue) - confirm they are configured before enabling this rule.
    - alert: PostgresqlBloatIndexHigh(>80%)
      expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
        description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
    # Fires when a table has been more than 80% bloated for 1 hour and a series with the same
    # relname reports a real size above 200000000 bytes, which filters out small tables.
    # The size filter is matched on `relname` only, so identically named tables in different
    # schemas, databases or instances are not told apart by it.
    # NOTE(review): the pg_bloat_* metrics presumably come from custom exporter queries (see the
    # linked issue) - confirm they are configured before enabling this rule.
    - alert: PostgresqlBloatTableHigh(>80%)
      expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
        description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
    # The expression has no comparison: the alert fires for any index-size series whose
    # indexrelname contains "ccnew", whatever its value, once it has existed for 6 hours.
    # NOTE(review): presumably targets leftovers of an interrupted REINDEX ... CONCURRENTLY;
    # the long `for` avoids alerting while such a rebuild is still in progress - confirm.
    # An unrelated index whose name merely contains "ccnew" would match as well.
    - alert: PostgresqlInvalidIndex
      expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
      for: 6h
      labels:
        severity: warning
      annotations:
        summary: Postgresql invalid index (instance {{ $labels.instance }})
        description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlReplicationLag
      expr: 'pg_replication_lag_seconds > 5'
      for: 30s
      labels:
        severity: warning
      annotations:
        summary: Postgresql replication lag (instance {{ $labels.instance }})
        description: "The PostgreSQL replication lag is high (> 5s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/process-exporter/process-exporter.yml
================================================
groups:

- name: ProcessExporter

  
  rules:

    - alert: ProcessExporterGroupDown
      expr: 'namedprocess_namegroup_num_procs == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Process exporter group down (instance {{ $labels.instance }})
        description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group.
    - alert: ProcessExporterHighMemoryUsage
      expr: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Process exporter high memory usage (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. Threshold of 80% is per-core. Adjust based on expected workload.
    # The rate is evaluated per series without aggregation.
    # NOTE(review): if the exporter splits this counter by a `mode` label (user/system), the
    # threshold applies to each mode separately rather than to the group's total CPU time;
    # wrap the rate in `sum without (mode) (...)` if the total is what should be compared.
    - alert: ProcessExporterHighCpuUsage
      expr: 'rate(namedprocess_namegroup_cpu_seconds_total[5m]) * 100 > 80'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Process exporter high CPU usage (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ProcessExporterHighFileDescriptorUsage
      expr: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Process exporter high file descriptor usage (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ProcessExporterFileDescriptorsExhausted
      expr: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Process exporter file descriptors exhausted (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 512MB is arbitrary. Adjust per group and environment.
    - alert: ProcessExporterHighSwapUsage
      expr: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Process exporter high swap usage (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ProcessExporterZombieProcesses
      expr: 'namedprocess_namegroup_states{state="Zombie"} > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Process exporter zombie processes (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Filters to voluntary switches only — involuntary switches are normal under CPU contention. Threshold of 50000/s is a rough default. Adjust based on workload.
    - alert: ProcessExporterHighContextSwitching
      expr: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Process exporter high context switching (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 100MB/s is arbitrary. Adjust per group.
    - alert: ProcessExporterHighDiskWriteIo
      expr: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Process exporter high disk write IO (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Detects restarts by watching for changes in the oldest process start time within the group.
    - alert: ProcessExporterProcessRestarting
      expr: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Process exporter process restarting (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/prometheus-self-monitoring/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Fires immediately when no `up` series exists at all for the job named "prometheus".
    # The job name is hard-coded; adjust it if the self-scrape job is named differently.
    # absent() only derives labels from the equality matchers in the selector, so the alert
    # carries job="prometheus" but no `instance` label: {{ $labels.instance }} renders empty.
    - alert: PrometheusJobMissing
      expr: 'absent(up{job="prometheus"})'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus job missing (instance {{ $labels.instance }})
        description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Only fire if at least one target in the job is still up.
    # If all targets are down, PrometheusJobMissing or PrometheusAllTargetsMissing will fire instead.
    - alert: PrometheusTargetMissing
      expr: 'up == 0 unless on(job) (sum by (job) (up) == 0)'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Prometheus target missing (instance {{ $labels.instance }})
        description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires per job when the sum of `up` across its targets has been 0 for 1 minute.
    # The aggregation keeps only the `job` label, so `instance` is empty in the summary.
    - alert: PrometheusAllTargetsMissing
      expr: 'sum by (job) (up) == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Prometheus all targets missing (instance {{ $labels.instance }})
        description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Joins down targets (up == 0) with hosts whose node_time_seconds - node_boot_time_seconds
    # exceeds 600 seconds, matching on the `instance` label. The multiplication acts only as a
    # join: the product is always 0, and the alert fires on the presence of a result.
    # Targets without matching node_* series on the same `instance` never match this rule.
    - alert: PrometheusTargetMissingWithWarmupTime
      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
        description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (for: 0m) whenever prometheus_config_last_reload_successful is not 1.
    - alert: PrometheusConfigurationReloadFailure
      expr: 'prometheus_config_last_reload_successful != 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
        description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Counts how often process_start_time_seconds changed in the last 15m; more than 2 changes fires.
    # The job matcher also selects pushgateway and alertmanager, not only prometheus.
    - alert: PrometheusTooManyRestarts
      expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus too many restarts (instance {{ $labels.instance }})
        description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires as soon as no `up` series exists for job="alertmanager".
    # absent() only carries over labels from equality matchers (here `job`), so the
    # `instance` label referenced in the summary is empty for this alert.
    - alert: PrometheusAlertmanagerJobMissing
      expr: 'absent(up{job="alertmanager"})'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
        description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately (for: 0m) whenever alertmanager_config_last_reload_successful is not 1.
    - alert: PrometheusAlertmanagerConfigurationReloadFailure
      expr: 'alertmanager_config_last_reload_successful != 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
        description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # count_values buckets the series by the value of alertmanager_config_hash; the outer
    # count() is the number of distinct hashes. More than one distinct hash fires the alert.
    # The result has no labels, so `instance` is empty in the summary.
    - alert: PrometheusAlertmanagerConfigNotSynced
      expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
        description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # vector(1) always returns a sample, so this alert is permanently firing by design.
    # It is meant to be consumed by something that reacts when the alert STOPS arriving.
    - alert: PrometheusAlertmanagerE2eDeadManSwitch
      expr: 'vector(1)'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
        description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when Prometheus reports fewer than one discovered Alertmanager.
    - alert: PrometheusNotConnectedToAlertmanager
      expr: 'prometheus_notifications_alertmanagers_discovered < 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
        description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any increase of the rule evaluation failure counter within the last 3 minutes.
    - alert: PrometheusRuleEvaluationFailures
      expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
        description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any increase of the template text expansion failure counter within the last 3 minutes.
    - alert: PrometheusTemplateTextExpansionFailures
      expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
        description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Compares each rule group's last evaluation duration with its configured interval.
    # The condition must hold for 5 minutes before the alert fires.
    - alert: PrometheusRuleEvaluationSlow
      expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
        description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # min_over_time(...[10m]) > 0 means no sample in the last 10 minutes showed an empty queue.
    # The 10m window already provides the delay, hence `for: 0m`.
    - alert: PrometheusNotificationsBacklog
      expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus notifications backlog (instance {{ $labels.instance }})
        description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any non-zero per-second rate of failed Alertmanager notifications.
    # NOTE(review): rate() needs at least two samples inside the 1m range, so this presumably
    # yields no data when the scrape interval is 1m or longer — confirm against the scrape config.
    - alert: PrometheusAlertmanagerNotificationFailing
      expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
        description: "Alertmanager is failing sending notifications ({{ $value }} notifications/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately for every prometheus_sd_discovered_targets series that reports 0 targets.
    - alert: PrometheusTargetEmpty
      expr: 'prometheus_sd_discovered_targets == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus target empty (instance {{ $labels.instance }})
        description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Ratio of the 0.9 quantile to the 0.5 quantile of the observed scrape interval length,
    # matched per (interval, instance, job). Fires when the ratio stays above 1.05 for 5 minutes.
    - alert: PrometheusTargetScrapingSlow
      expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Prometheus target scraping slow (instance {{ $labels.instance }})
        description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when more than 10 scrapes exceeded the sample limit within 10 minutes,
    # and that condition has held for 5 minutes.
    - alert: PrometheusLargeScrape
      expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Prometheus large scrape (instance {{ $labels.instance }})
        description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when the duplicate-timestamp counter grew by more than 3 within 5 minutes.
    - alert: PrometheusTargetScrapeDuplicate
      expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
        description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any increase of the TSDB checkpoint creation failure counter within 1 minute.
    # NOTE(review): increase() needs at least two samples inside the 1m range; with a scrape
    # interval of 1m or more this presumably never matches — confirm against the scrape config.
    - alert: PrometheusTsdbCheckpointCreationFailures
      expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
        description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any increase of the TSDB checkpoint deletion failure counter within 1 minute.
    # NOTE(review): increase() needs at least two samples inside the 1m range; with a scrape
    # interval of 1m or more this presumably never matches — confirm against the scrape config.
    - alert: PrometheusTsdbCheckpointDeletionFailures
      expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
        description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any increase of the TSDB compaction failure counter within 1 minute.
    # NOTE(review): increase() needs at least two samples inside the 1m range; with a scrape
    # interval of 1m or more this presumably never matches — confirm against the scrape config.
    - alert: PrometheusTsdbCompactionsFailed
      expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
        description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any increase of the TSDB head truncation failure counter within 1 minute.
    # NOTE(review): increase() needs at least two samples inside the 1m range; with a scrape
    # interval of 1m or more this presumably never matches — confirm against the scrape config.
    - alert: PrometheusTsdbHeadTruncationsFailed
      expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
        description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any increase of the TSDB reload failure counter within 1 minute.
    # NOTE(review): increase() needs at least two samples inside the 1m range; with a scrape
    # interval of 1m or more this presumably never matches — confirm against the scrape config.
    - alert: PrometheusTsdbReloadFailures
      expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
        description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any increase of the TSDB WAL corruption counter within 1 minute.
    # NOTE(review): increase() needs at least two samples inside the 1m range; with a scrape
    # interval of 1m or more this presumably never matches — confirm against the scrape config.
    - alert: PrometheusTsdbWalCorruptions
      expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
        description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires on any increase of the TSDB WAL truncation failure counter within 1 minute.
    # NOTE(review): increase() needs at least two samples inside the 1m range; with a scrape
    # interval of 1m or more this presumably never matches — confirm against the scrape config.
    - alert: PrometheusTsdbWalTruncationsFailed
      expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
        description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Counts series per metric name, then label_replace copies `__name__` into a `name`
    # label so the description can print which metric exceeded 10000 series.
    # NOTE(review): the `{__name__=~".+"}` selector matches every series and is likely
    # expensive to evaluate on large servers — confirm the cost is acceptable.
    - alert: PrometheusTimeseriesCardinality
      expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
        description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/promtail/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Percentage of requests whose status_code matches 5xx or "failed", computed per
    # (namespace, job, route, instance). The trailing `and ... > 0` clause drops groups
    # with no traffic so the division cannot produce a result for idle routes.
    - alert: PromtailRequestErrors
      expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 and sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Promtail request errors (instance {{ $labels.instance }})
        description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 99th percentile request latency, computed from the histogram buckets over 5 minutes.
    # Buckets are aggregated by (namespace, job, route, instance, le) — the same grouping used
    # by PromtailRequestErrors — so the `job` and `route` labels printed in the description are
    # populated and each stream is evaluated on its own instead of as one global quantile.
    # `le` must stay in the grouping: histogram_quantile() needs it to read the buckets.
    - alert: PromtailRequestLatency
      expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, instance, le)) > 1'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Promtail request latency (instance {{ $labels.instance }})
        description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/proxmox-ve/prometheus-pve-exporter.yml
================================================
groups:

- name: PrometheusPveExporter

  
  rules:

    # Fires when pve_up is 0 for 2 minutes on any series whose `id` starts with "node/".
    - alert: PveNodeDown
      expr: 'pve_up{id=~"node/.*"} == 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: PVE node down (instance {{ $labels.instance }})
        description: "Proxmox VE node {{ $labels.id }} is down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This alert triggers for all VMs and containers that are not running.
    # You may want to filter by specific guests using the `id` label, or exclude
    # intentionally stopped guests with additional label matchers.
    # NOTE(review): the alert name contains a "/" and breaks the CamelCase pattern used by
    # every other rule here; it looks like an artefact of generating the name from
    # "PVE VM/CT down". Renaming would affect anything matching on `alertname` — confirm
    # before changing.
    - alert: PveVm/ctDown
      expr: 'pve_up{id=~"(qemu|lxc)/.*"} == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: PVE VM/CT down (instance {{ $labels.instance }})
        description: "Proxmox VE guest {{ $labels.id }} is not running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # pve_cpu_usage_ratio is scaled to a percentage; fires when it stays above 90 for 5 minutes.
    # There is no `id` matcher, so every series exposing this metric is evaluated.
    - alert: PveHighCpuUsage
      expr: 'pve_cpu_usage_ratio * 100 > 90'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: PVE high CPU usage (instance {{ $labels.instance }})
        description: "Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf \"%.2f\" }}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Memory usage as a percentage of pve_memory_size_bytes; fires above 90% for 5 minutes.
    # The `and pve_memory_size_bytes > 0` clause excludes series with a zero size (division by zero).
    - alert: PveHighMemoryUsage
      expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: PVE high memory usage (instance {{ $labels.instance }})
        description: "Proxmox VE memory usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf \"%.2f\" }}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Disk usage percentage for series whose `id` starts with "storage/"; warning above 80% for 5 minutes.
    # The `and pve_disk_size_bytes{...} > 0` clause excludes storages reporting a zero size.
    - alert: PveStorageFillingUp
      expr: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 80 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: PVE storage filling up (instance {{ $labels.instance }})
        description: "Proxmox VE storage {{ $labels.id }} is above 80% used. Current value: {{ $value | printf \"%.2f\" }}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Same ratio as PveStorageFillingUp with a 95% threshold, a shorter 2m delay and critical severity.
    # A storage above 95% also satisfies the 80% rule, so both alerts can fire together.
    - alert: PveStorageAlmostFull
      expr: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 95 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: PVE storage almost full (instance {{ $labels.instance }})
        description: "Proxmox VE storage {{ $labels.id }} is above 95% used. Current value: {{ $value | printf \"%.2f\" }}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when pve_not_backed_up_total is above 0; the value is printed as the
    # number of guests in the description.
    - alert: PveGuestNotBackedUp
      expr: 'pve_not_backed_up_total > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: PVE guest not backed up (instance {{ $labels.instance }})
        description: "{{ $value }} Proxmox VE guest(s) are not covered by any backup job.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when pve_replication_failed_syncs is above 0 for any series.
    - alert: PveReplicationFailed
      expr: 'pve_replication_failed_syncs > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: PVE replication failed (instance {{ $labels.instance }})
        description: "Proxmox VE replication for {{ $labels.id }} has {{ $value }} failed sync(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Loss of quorum means the cluster cannot make decisions about VM placement
    # and fencing. This requires immediate attention.
    # Quorum state is read from the `quorate` label of the pve_cluster_info series,
    # not from the sample value: the rule matches the series labelled quorate="0".
    - alert: PveClusterNotQuorate
      expr: 'pve_cluster_info{quorate="0"} == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: PVE cluster not quorate (instance {{ $labels.instance }})
        description: "Proxmox VE cluster has lost quorum.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/pulsar/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # Sums the backlog per subscription; warning when it stays above 5000 entries for 1 hour.
    # Only the `subscription` label survives the aggregation, so `instance` is empty in the summary.
    - alert: PulsarSubscriptionHighNumberOfBacklogEntries
      expr: 'sum(pulsar_subscription_back_log) by (subscription) > 5000'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Pulsar subscription high number of backlog entries (instance {{ $labels.instance }})
        description: "The number of subscription backlog entries is over 5k\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Critical tier of the subscription backlog alert: above 100000 entries for 1 hour.
    # Only the `subscription` label survives the aggregation, so `instance` is empty in the summary.
    - alert: PulsarSubscriptionVeryHighNumberOfBacklogEntries
      expr: 'sum(pulsar_subscription_back_log) by (subscription) > 100000'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: Pulsar subscription very high number of backlog entries (instance {{ $labels.instance }})
        description: "The number of subscription backlog entries is over 100k\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Sums pulsar_storage_size per topic; warning above 5 * 1024^3 (5 GiB) for 1 hour.
    # Only the `topic` label survives the aggregation, so `instance` is empty in the summary.
    - alert: PulsarTopicLargeBacklogStorageSize
      expr: 'sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Pulsar topic large backlog storage size (instance {{ $labels.instance }})
        description: "The topic backlog storage size is over 5 GB\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Critical tier of the topic storage alert: above 20 * 1024^3 (20 GiB) for 1 hour.
    # Only the `topic` label survives the aggregation, so `instance` is empty in the summary.
    - alert: PulsarTopicVeryLargeBacklogStorageSize
      expr: 'sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: Pulsar topic very large backlog storage size (instance {{ $labels.instance }})
        description: "The topic backlog storage size is over 20 GB\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The `> 0` filter is applied inside the sum, so there is no threshold on the aggregate:
    # a topic produces a result as soon as any of its overflow series is above 0.
    # The condition must hold for 1 hour.
    - alert: PulsarHighWriteLatency
      expr: 'sum(pulsar_storage_write_latency_overflow > 0) by (topic)'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: Pulsar high write latency (instance {{ $labels.instance }})
        description: "Messages cannot be written in a timely fashion\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The `> 0` filter is applied inside the sum, so there is no threshold on the aggregate:
    # a topic produces a result as soon as any of its pulsar_entry_size_overflow series is above 0.
    # The condition must hold for 1 hour.
    - alert: PulsarLargeMessagePayload
      expr: 'sum(pulsar_entry_size_overflow > 0) by (topic)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Pulsar large message payload (instance {{ $labels.instance }})
        description: "Observing large message payload (> 1MB)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Sums the ledger directory usage per `kubernetes_pod_name`; fires above 75 for 1 hour.
    # NOTE(review): the metric name appears to embed the ledger directory path
    # (pulsar/data/bookkeeper/ledgers) — presumably it must be adapted when bookies use a
    # different directory; confirm against the exported metric names.
    - alert: PulsarHighLedgerDiskUsage
      expr: 'sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: Pulsar high ledger disk usage (instance {{ $labels.instance }})
        description: "Observing Ledger Disk Usage (> 75%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Counts, per pod, the bookie_SERVER_STATUS series whose value is 0. There is no threshold
    # on the count: any pod with at least one such series for 5 minutes fires the alert.
    - alert: PulsarReadOnlyBookies
      expr: 'count(bookie_SERVER_STATUS{} == 0) by (pod)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Pulsar read only bookies (instance {{ $labels.instance }})
        description: "Observing Readonly Bookies\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # User and system exception rates are added and summed per function `name`.
    # rate() yields a per-second value, so the threshold is 10 exceptions per SECOND; the
    # description states that unit accordingly.
    # `+` only returns label sets present on both sides, so a function must expose both
    # counters to be evaluated.
    - alert: PulsarHighNumberOfFunctionErrors
      expr: 'sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Pulsar high number of function errors (instance {{ $labels.instance }})
        description: "Observing more than 10 Function errors per second\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Sink exception rate summed per sink `name`.
    # rate() yields a per-second value, so the threshold is 10 exceptions per SECOND; the
    # description states that unit accordingly.
    - alert: PulsarHighNumberOfSinkErrors
      expr: 'sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Pulsar high number of sink errors (instance {{ $labels.instance }})
        description: "Observing more than 10 Sink errors per second\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/python/python-exporter.yml
================================================
groups:

- name: PythonExporter

  
  rules:

    # Fires when the uncollectable-objects counter keeps increasing (over a 5m window)
    # for 5 consecutive minutes.
    - alert: PythonGcObjectsUncollectable
      expr: 'increase(python_gc_objects_uncollectable_total[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Python GC objects uncollectable (instance {{ $labels.instance }})
        description: "Python has uncollectable objects, potential memory leak via reference cycles\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per-second rate of collected objects over 5m; fires above 10000 objects/s for 5 minutes.
    # The rate is evaluated per series, with no aggregation across label values.
    - alert: PythonGcCollectionsHigh
      expr: 'rate(python_gc_objects_collected_total[5m]) > 10000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Python GC collections high (instance {{ $labels.instance }})
        description: "Python GC is collecting too many objects (> 10k/s), high allocation pressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not Python-specific.
    # The expression has no job/label matcher, so it evaluates every process exposing these
    # metrics; add a matcher to restrict it to Python services.
    # The `and process_max_fds > 0` clause excludes series with a zero limit (division by zero).
    - alert: PythonFileDescriptorsExhaustion
      expr: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Python file descriptors exhaustion (instance {{ $labels.instance }})
        description: "Python process is running out of file descriptors (> 90% used)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Gen2 collection rate > 1/s is very high. In most applications, gen2 runs are infrequent. Adjust threshold based on your workload.
    # Only series labelled generation="2" are considered; the rate is averaged over 5 minutes.
    - alert: PythonGcGeneration2CollectionsHigh
      expr: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Python GC generation 2 collections high (instance {{ $labels.instance }})
        description: "Python full GC (generation 2) is running too frequently, indicating memory pressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is a rough default. Adjust based on your application's expected memory footprint.
    # 4e9 is 4 * 10^9 bytes. The expression has no job/label matcher, so it evaluates every
    # process exposing process_virtual_memory_bytes, not only Python ones.
    - alert: PythonVirtualMemoryHigh
      expr: 'process_virtual_memory_bytes > 4e9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Python virtual memory high (instance {{ $labels.instance }})
        description: "Python process virtual memory is high (> 4GB)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
================================================
groups:

- name: KbuddeRabbitmqExporter

  
  rules:

    # 1m delay allows a restart without triggering an alert.
    # Fires when rabbitmq_up has been 0 for the whole minute.
    - alert: RabbitmqDown
      expr: 'rabbitmq_up == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: RabbitMQ down (instance {{ $labels.instance }})
        description: "RabbitMQ node down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 1m delay allows a restart without triggering an alert.
    # The threshold of 3 is a hard-coded cluster size; adjust it to the number of nodes
    # you run. The sum spans every scraped rabbitmq_running series and drops all labels,
    # so `instance` is empty in the summary.
    - alert: RabbitmqClusterDown
      expr: 'sum(rabbitmq_running) < 3'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: RabbitMQ cluster down (instance {{ $labels.instance }})
        description: "Less than 3 nodes running in RabbitMQ cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when rabbitmq_partitions is above 0 on any series.
    - alert: RabbitmqClusterPartition
      expr: 'rabbitmq_partitions > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: RabbitMQ cluster partition (instance {{ $labels.instance }})
        description: "Cluster partition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Used memory as a percentage of the node memory limit; fires above 90% for 2 minutes.
    # The `and rabbitmq_node_mem_limit > 0` clause excludes nodes reporting a zero limit.
    - alert: RabbitmqOutOfMemory
      expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ out of memory (instance {{ $labels.instance }})
        description: "Memory available for RabbitMQ is low (< 10%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when rabbitmq_connectionsTotal stays above 1000 for 2 minutes.
    - alert: RabbitmqInstanceTooManyConnections
      expr: 'rabbitmq_connectionsTotal > 1000'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ instance too many connections (instance {{ $labels.instance }})
        description: "RabbitMQ instance has too many connections (> 1000)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Indicate the queue name in dedicated label.
    # The rule only matches the series labelled queue="my-dead-letter-queue"; replace that
    # value with the name of your dead letter queue, otherwise the alert never fires.
    - alert: RabbitmqDeadLetterQueueFillingUp
      expr: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
        description: "Dead letter queue is filling up (> 10 msgs)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Indicate the queue name in dedicated label.
    # The rule only matches the series labelled queue="my-queue"; replace that value with
    # the queue to watch. Fires when more than 1000 messages are ready for 2 minutes.
    - alert: RabbitmqTooManyMessagesInQueue
      expr: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
        description: "Queue is filling up (> 1000 msgs)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Indicate the queue name in dedicated label.
    # Computes the age of the message at the head of the queue (now minus its timestamp)
    # and fires when it stays above 60 seconds for 2 minutes. Only queue="my-queue" is matched.
    - alert: RabbitmqSlowQueueConsuming
      expr: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
        description: "Queue messages are consumed slowly (> 60s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Allows a short service restart.
    # There is no `queue` matcher, so every queue with 0 consumers for 5 minutes fires the alert.
    - alert: RabbitmqNoConsumer
      expr: 'rabbitmq_queue_consumers == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: RabbitMQ no consumer (instance {{ $labels.instance }})
        description: "Queue has no consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Indicate the queue name in dedicated label.
    # Intended for queues that must have exactly one consumer: fires immediately when
    # queue="my-queue" reports more than 1 consumer.
    - alert: RabbitmqTooManyConsumers
      expr: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
        description: "Queue should have only 1 consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Indicate the exchange name in dedicated label.
    # Per-second publish rate into exchange="my-exchange" over 1m; fires when it stays
    # below 5 msgs/s for 2 minutes.
    # NOTE(review): rate() needs at least two samples inside the 1m range — with a scrape
    # interval of 1m or more this presumably returns no data; confirm against the scrape config.
    - alert: RabbitmqInactiveExchange
      expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ inactive exchange (instance {{ $labels.instance }})
        description: "Exchange receive less than 5 msgs per second\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/rabbitmq/rabbitmq-exporter.yml
================================================
groups:

- name: RabbitmqExporter

  
  rules:

    # 1m delay allows a restart without triggering an alert.
    # The threshold of 3 is a hard-coded cluster size; adjust it to the number of nodes
    # you run. The sum spans every scraped rabbitmq_build_info series and drops all labels,
    # so `instance` is empty in the summary.
    - alert: RabbitmqNodeDown
      expr: 'sum(rabbitmq_build_info) < 3'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: RabbitMQ node down (instance {{ $labels.instance }})
        description: "Less than 3 nodes running in RabbitMQ cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 1m delay allows a restart without triggering an alert.
    # Fires when erlang_vm_dist_node_state is below 3; per the description, any such value
    # means the distribution link is not 'up' (presumably 3 encodes 'up' — confirm against
    # the exporter documentation).
    - alert: RabbitmqNodeNotDistributed
      expr: 'erlang_vm_dist_node_state < 3'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: RabbitMQ node not distributed (instance {{ $labels.instance }})
        description: "Distribution link state is not 'up'\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The inner count groups nodes by `rabbitmq_version`; the outer count is the number of
    # distinct versions. More than one distinct version for 1 hour fires the alert
    # (the long delay tolerates rolling upgrades that finish within the hour).
    - alert: RabbitmqInstancesDifferentVersions
      expr: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ instances different versions (instance {{ $labels.instance }})
        description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Resident memory as a percentage of the resident memory limit; fires above 90% for 2 minutes.
    # The `and rabbitmq_resident_memory_limit_bytes > 0` clause excludes nodes reporting a zero limit.
    - alert: RabbitmqMemoryHigh
      expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ memory high (instance {{ $labels.instance }})
        description: "A node use more than 90% of allocated RAM\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Open file descriptors as a percentage of the process limit; fires above 90% for 2 minutes.
    # The `and rabbitmq_process_max_fds > 0` clause excludes nodes reporting a zero limit.
    - alert: RabbitmqFileDescriptorsUsage
      expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }})
        description: "A node use more than 90% of file descriptors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Sums ready messages per queue across all nodes; fires above 1000 for 1 minute.
    # Only the `queue` label survives the aggregation, so the description identifies the
    # queue (an `instance` label is not available on the result).
    - alert: RabbitmqTooManyReadyMessages
      expr: 'sum(rabbitmq_queue_messages_ready) BY (queue) > 1000'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
        description: "RabbitMQ too many ready messages in queue {{ $labels.queue }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Sums unacknowledged messages per queue across all nodes; fires above 1000 for 1 minute.
    # Only the `queue` label survives the aggregation, so `instance` is empty in the summary.
    - alert: RabbitmqTooManyUnackMessages
      expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ too many unack messages (instance {{ $labels.instance }})
        description: "Too many unacknowledged messages\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when rabbitmq_connections stays above 1000 for 2 minutes on any series.
    - alert: RabbitmqTooManyConnections
      expr: 'rabbitmq_connections > 1000'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ too many connections (instance {{ $labels.instance }})
        description: "The total connections of a node is too high\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # There is no `queue` matcher, so every rabbitmq_queue_consumers series reporting
    # fewer than 1 consumer for 1 minute fires the alert.
    - alert: RabbitmqNoQueueConsumer
      expr: 'rabbitmq_queue_consumers < 1'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ no queue consumer (instance {{ $labels.instance }})
        description: "A queue has less than 1 consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when either the "returned" or the "dropped" unroutable-message counter
    # increased within the last 1m, sustained for 2m.
    - alert: RabbitmqUnroutableMessages
      expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
        description: "A queue has unroutable messages ({{ $value }} in the last 1m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/redis/oliver006-redis-exporter.yml
================================================
groups:

- name: Oliver006RedisExporter

  
  rules:

    # 1m delay allows a restart without triggering an alert.
    # Condition: redis_up equals 0.
    - alert: RedisDown
      expr: 'redis_up == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Redis down (instance {{ $labels.instance }})
        description: "Redis instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Counts redis_instance_info series with role="master" across all targets.
    # `or vector(0)` supplies a 0 sample when no such series exists, so `< 1` can still match.
    # The result carries no labels (ungrouped count / vector(0)), so the instance label is absent.
    - alert: RedisMissingMaster
      expr: '(count(redis_instance_info{role="master"}) or vector(0)) < 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Redis missing master (instance {{ $labels.instance }})
        description: "Redis cluster has no node marked as master.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 1m delay allows a restart without triggering an alert.
    # Ungrouped count of redis_instance_info series with role="master" greater than 1;
    # the count spans every series matched by the selector and carries no labels.
    - alert: RedisTooManyMasters
      expr: 'count(redis_instance_info{role="master"}) > 1'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Redis too many masters (instance {{ $labels.instance }})
        description: "Redis cluster has too many nodes marked as master.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # (number of redis_connected_slaves series) - (sum of their values) - 1 > 0.
    # Both aggregations remove the instance and job labels, so series are grouped by
    # whatever other labels they share and the result has no instance label.
    - alert: RedisDisconnectedSlaves
      expr: 'count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Redis disconnected slaves (instance {{ $labels.instance }})
        description: "Redis not replicating for all slaves. Consider reviewing the redis replication status.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # delta() of redis_connected_slaves over 1m is negative, i.e. the gauge went down
    # within the window. Fires immediately (for: 0m).
    - alert: RedisReplicationBroken
      expr: 'delta(redis_connected_slaves[1m]) < 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Redis replication broken (instance {{ $labels.instance }})
        description: "Redis instance lost a slave\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # redis_connected_slaves changed value more than once within a 1m window,
    # and that condition held for 2m.
    - alert: RedisClusterFlapping
      expr: 'changes(redis_connected_slaves[1m]) > 1'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Redis cluster flapping (instance {{ $labels.instance }})
        description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Seconds elapsed since redis_rdb_last_save_timestamp_seconds exceed 60 * 60 * 48 (48 hours).
    - alert: RedisMissingBackup
      expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Redis missing backup (instance {{ $labels.instance }})
        description: "Redis has not been backed up for 48 hours\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
    # Used memory as a percentage of redis_total_system_memory_bytes, above 90 for 2m.
    # The trailing `> 0` clause drops series whose total system memory is reported as zero.
    - alert: RedisOutOfSystemMemory
      expr: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 and redis_total_system_memory_bytes > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Redis out of system memory (instance {{ $labels.instance }})
        description: "Redis is running out of system memory (> 90%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Used memory as a percentage of redis_memory_max_bytes, above 90 for 2m.
    # `and on(instance) redis_memory_max_bytes > 0` keeps only instances whose
    # redis_memory_max_bytes is non-zero, matching on the instance label alone.
    - alert: RedisOutOfConfiguredMaxmemory
      expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Redis out of configured maxmemory (instance {{ $labels.instance }})
        description: "Redis is running out of configured maxmemory (> 90%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Connected clients as a percentage of redis_config_maxclients, above 90 for 2m.
    # The trailing `> 0` clause drops series whose maxclients value is zero.
    - alert: RedisTooManyConnections
      expr: 'redis_connected_clients / redis_config_maxclients * 100 > 90 and redis_config_maxclients > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Redis too many connections (instance {{ $labels.instance }})
        description: "Redis is running out of connections (> 90% used)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fewer than 5 connected clients for 2m. The "(> 5)" in the description states the
    # expected level, not the firing condition.
    - alert: RedisNotEnoughConnections
      expr: 'redis_connected_clients < 5'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Redis not enough connections (instance {{ $labels.instance }})
        description: "Redis instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # redis_rejected_connections_total increased by more than 5 within 1m; fires immediately (for: 0m).
    - alert: RedisRejectedConnections
      expr: 'increase(redis_rejected_connections_total[1m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Redis rejected connections (instance {{ $labels.instance }})
        description: "Some connections to Redis has been rejected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/ruby/ruby-exporter.yml
================================================
groups:

- name: RubyExporter

  
  rules:

    # Threshold is a rough default. Adjust based on your application's normal heap size.
    # Condition: ruby_heap_live_slots above 500000 for 5m.
    - alert: RubyHeapLiveSlotsHigh
      expr: 'ruby_heap_live_slots > 500000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Ruby heap live slots high (instance {{ $labels.instance }})
        description: "Ruby heap has too many live slots (> 500k), heap bloat\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Condition: ruby_heap_free_slots above 500000 for 5m (fixed threshold in the expression).
    - alert: RubyHeapFreeSlotsHigh
      expr: 'ruby_heap_free_slots > 500000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Ruby heap free slots high (instance {{ $labels.instance }})
        description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection.
    # Condition: per-second rate of ruby_major_gc_ops_total over 5m above 5, for 5m.
    - alert: RubyMajorGcRateHigh
      expr: 'rate(ruby_major_gc_ops_total[5m]) > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Ruby major GC rate high (instance {{ $labels.instance }})
        description: "Ruby is performing too many major GC cycles, indicating memory pressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Condition: ruby_rss above 1e9 for 5m; the description presents this threshold as 1GB.
    - alert: RubyRssHigh
      expr: 'ruby_rss > 1e9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Ruby RSS high (instance {{ $labels.instance }})
        description: "Ruby process RSS is high (> 1GB)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Condition: per-second rate of ruby_allocated_objects_total over 5m above 100000, for 5m.
    - alert: RubyAllocatedObjectsSpike
      expr: 'rate(ruby_allocated_objects_total[5m]) > 100000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Ruby allocated objects spike (instance {{ $labels.instance }})
        description: "Ruby is allocating objects at a high rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
================================================
groups:

- name: SmartctlExporter

  
  rules:

    # 5m average of the "current" temperature above 60. The `unless on (instance, device)`
    # clause excludes devices that expose a "drive_trip" temperature series, so this
    # fixed threshold only applies to devices without one.
    - alert: SmartDeviceTemperatureWarning
      expr: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: SMART device temperature warning (instance {{ $labels.instance }})
        description: "Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 5m maximum of the "current" temperature above 70. The `unless on (instance, device)`
    # clause excludes devices that expose a "drive_trip" temperature series, so this
    # fixed threshold only applies to devices without one.
    - alert: SmartDeviceTemperatureCritical
      expr: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART device temperature critical (instance {{ $labels.instance }})
        description: "Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 10m maximum of the "current" temperature is at or above the device's own
    # "drive_trip" temperature, matched per (device, instance).
    - alert: SmartDeviceTemperatureOverTripValue
      expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART device temperature over trip value (instance {{ $labels.instance }})
        description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 10m maximum of the "current" temperature is at or above 80% of the device's own
    # "drive_trip" temperature, matched per (device, instance).
    - alert: SmartDeviceTemperatureNearingTripValue
      expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
        description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately for any device whose smartctl_device_smart_status is not 1.
    - alert: SmartStatus
      expr: 'smartctl_device_smart_status != 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART status (instance {{ $labels.instance }})
        description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately for any device whose smartctl_device_critical_warning is above 0.
    - alert: SmartCriticalWarning
      expr: 'smartctl_device_critical_warning > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART critical warning (instance {{ $labels.instance }})
        description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately for any device whose smartctl_device_media_errors is above 0.
    - alert: SmartMediaErrors
      expr: 'smartctl_device_media_errors > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART media errors (instance {{ $labels.instance }})
        description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires immediately when a device's available spare falls below the spare
    # threshold reported for the same series labels.
    - alert: SmartWearoutIndicator
      expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART Wearout Indicator (instance {{ $labels.instance }})
        description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/sidekiq/strech-sidekiq-exporter.yml
================================================
groups:

- name: StrechSidekiqExporter

  
  rules:

    # Any sidekiq_queue_size series above 100 for 1m; the description reads the queue
    # name from the `name` label.
    - alert: SidekiqQueueSize
      expr: 'sidekiq_queue_size > 100'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Sidekiq queue size (instance {{ $labels.instance }})
        description: "Sidekiq queue {{ $labels.name }} is growing\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Largest sidekiq_queue_latency across all series above 60; fires immediately (for: 0m).
    # The ungrouped max() yields a single label-less value, so the instance label is
    # not present on the result.
    - alert: SidekiqSchedulingLatencyTooHigh
      expr: 'max(sidekiq_queue_latency) > 60'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Sidekiq scheduling latency too high (instance {{ $labels.instance }})
        description: "Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/snmp/snmp-exporter.yml
================================================
groups:

- name: SnmpExporter

  # These rules use standard IF-MIB and SNMPv2-MIB metrics. Metric names depend on your snmp.yml module configuration.
  # Thresholds for bandwidth and error rates are rough defaults - adjust to your environment.
  
  rules:

    # From the official snmp-mixin.
    # Condition: `up` is 0 for 5m on any job whose name matches the regex snmp.*
    - alert: SnmpTargetDown
      expr: 'up{job=~"snmp.*"} == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: SNMP target down (instance {{ $labels.instance }})
        description: "SNMP device {{ $labels.instance }} is unreachable.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # ifOperStatus equals 2 while ifAdminStatus equals 1 for the same
    # (instance, job, ifIndex), sustained for 2m. The description interprets these
    # values as "operationally down" and "administratively up".
    - alert: SnmpInterfaceDown
      expr: '(ifOperStatus{job=~"snmp.*"} == 2) and on(instance, job, ifIndex) (ifAdminStatus{job=~"snmp.*"} == 1)'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: SNMP interface down (instance {{ $labels.instance }})
        description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is a rough default. Adjust based on your network environment.
    # Ratio of the ifInErrors rate to the summed unicast + broadcast + multicast inbound
    # packet rates (all over 5m) above 0.05. The `and ... > 0` clause repeats the
    # denominator to skip interfaces with no inbound packets.
    - alert: SnmpInterfaceHighInboundErrorRate
      expr: 'rate(ifInErrors{job=~"snmp.*"}[5m]) / (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: SNMP interface high inbound error rate (instance {{ $labels.instance }})
        description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an inbound error rate above 5%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is a rough default. Adjust based on your network environment.
    # Ratio of the ifOutErrors rate to the summed unicast + broadcast + multicast outbound
    # packet rates (all over 5m) above 0.05. The `and ... > 0` clause repeats the
    # denominator to skip interfaces with no outbound packets.
    - alert: SnmpInterfaceHighOutboundErrorRate
      expr: 'rate(ifOutErrors{job=~"snmp.*"}[5m]) / (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: SNMP interface high outbound error rate (instance {{ $labels.instance }})
        description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
    # Inbound octet rate over 5m, multiplied by 8 and divided by ifSpeed, above 0.80 for 15m.
    # The `ifSpeed > 0` clause skips interfaces reporting a zero speed. Note the ifSpeed
    # selectors carry no job filter, unlike the octet counter.
    - alert: SnmpInterfaceHighBandwidthUsageInbound
      expr: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: SNMP interface high bandwidth usage inbound (instance {{ $labels.instance }})
        description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
    # Outbound octet rate over 5m, multiplied by 8 and divided by ifSpeed, above 0.80 for 15m.
    # The `ifSpeed > 0` clause skips interfaces reporting a zero speed. Note the ifSpeed
    # selectors carry no job filter, unlike the octet counter.
    - alert: SnmpInterfaceHighBandwidthUsageOutbound
      expr: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: SNMP interface high bandwidth usage outbound (instance {{ $labels.instance }})
        description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} outbound utilization is above 80%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # sysUpTime is in centiseconds (hundredths of a second).
    # Dividing by 100 gives seconds; fires immediately while that value is below 300 (5 minutes).
    # The sysUpTime selector carries no job filter.
    - alert: SnmpDeviceRestarted
      expr: 'sysUpTime / 100 < 300'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: SNMP device restarted (instance {{ $labels.instance }})
        description: "SNMP device {{ $labels.instance }} has restarted (uptime < 5 minutes).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/solr/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # solr_metrics_core_update_handler_errors_total increased by more than 1 within 1m;
    # fires immediately (for: 0m).
    - alert: SolrUpdateErrors
      expr: 'increase(solr_metrics_core_update_handler_errors_total[1m]) > 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Solr update errors (instance {{ $labels.instance }})
        description: "Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # solr_metrics_core_errors_total with category="QUERY" increased by more than 1
    # within 1m, sustained for 5m.
    - alert: SolrQueryErrors
      expr: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Solr query errors (instance {{ $labels.instance }})
        description: "Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # solr_metrics_core_errors_total with category="REPLICATION" increased by more than 1
    # within 1m; fires immediately (for: 0m).
    - alert: SolrReplicationErrors
      expr: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Solr replication errors (instance {{ $labels.instance }})
        description: "Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # solr_collections_live_nodes below 2; fires immediately (for: 0m).
    - alert: SolrLowLiveNodeCount
      expr: 'solr_collections_live_nodes < 2'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Solr low live node count (instance {{ $labels.instance }})
        description: "Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/speedtest/nlamirault-speedtest-exporter.yml
================================================
groups:

- name: NlamiraultSpeedtestExporter

  
  rules:

    # 10m average of speedtest_download below 100; the description presents the value as Mbps.
    - alert: SpeedtestSlowInternetDownload
      expr: 'avg_over_time(speedtest_download[10m]) < 100'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: SpeedTest Slow Internet Download (instance {{ $labels.instance }})
        description: "Internet download speed is currently {{humanize $value}} Mbps.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 10m average of speedtest_upload below 20; the description presents the value as Mbps.
    - alert: SpeedtestSlowInternetUpload
      expr: 'avg_over_time(speedtest_upload[10m]) < 20'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: SpeedTest Slow Internet Upload (instance {{ $labels.instance }})
        description: "Internet upload speed is currently {{humanize $value}} Mbps.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/spinnaker/embedded-exporter.yml
================================================
groups:

- name: EmbeddedExporter

  
  rules:

    # resilience4j_circuitbreaker_state series with state="open" equal to 1 for 5m;
    # the description reads the breaker name from the `name` label.
    - alert: SpinnakerCircuitBreakerOpen
      expr: 'resilience4j_circuitbreaker_state{state="open"} == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker circuit breaker open (instance {{ $labels.instance }})
        description: "Circuit breaker {{ $labels.name }} is open on {{ $labels.instance }}, indicating repeated downstream failures.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # In a healthy Spinnaker, queue_ready_depth should stay at or near 0.
    # Sustained non-zero values indicate Orca cannot keep up with incoming work.
    # Condition: queue_ready_depth above 0 continuously for 5m.
    - alert: SpinnakerOrcaQueueBackingUp
      expr: 'queue_ready_depth > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker Orca queue backing up (instance {{ $labels.instance }})
        description: "Orca work queue has {{ $value }} messages ready for delivery but not yet picked up. Pipeline executions may be delayed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The 30s threshold is a rough default. Adjust based on your pipeline SLOs.
    # Mean lag over 5m, computed as rate(_sum) / rate(_count), above 30. The trailing
    # `rate(..._count[5m]) > 0` clause skips windows in which no messages were observed.
    - alert: SpinnakerOrcaQueueMessageLagHigh
      expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30 and rate(queue_message_lag_seconds_count[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker Orca queue message lag high (instance {{ $labels.instance }})
        description: "Orca queue message lag is {{ $value }}s. Pipeline stages are waiting too long before being processed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per-second rate of queue_dead_messages_total over 5m above 0, sustained for 2m.
    - alert: SpinnakerDeadMessages
      expr: 'rate(queue_dead_messages_total[5m]) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Spinnaker dead messages (instance {{ $labels.instance }})
        description: "Orca is producing dead-lettered messages ({{ $value }} per second). These are tasks that exhausted all retries and will not be executed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Zombies are pipeline executions that are running but have lost their queue entry.
    # See https://spinnaker.io/docs/guides/runbooks/orca-zombie-executions/
    # Condition: per-second rate of queue_zombies_total over 5m above 0, for 5m.
    # The value interpolated in the description is therefore a rate, not a count.
    - alert: SpinnakerZombieExecutions
      expr: 'rate(queue_zombies_total[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker zombie executions (instance {{ $labels.instance }})
        description: "{{ $value }} zombie pipeline executions detected. These are executions with no corresponding queue messages.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # threadpool_blockingQueueSize above 0 continuously for 5m.
    - alert: SpinnakerThreadPoolExhaustion
      expr: 'threadpool_blockingQueueSize > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker thread pool exhaustion (instance {{ $labels.instance }})
        description: "Orca message handler thread pool has {{ $value }} blocked threads on {{ $labels.instance }}. Pipeline execution throughput is degraded.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # When this threshold is exceeded, Igor stops triggering pipelines for the affected monitor.
    # See https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds
    # Sums pollingMonitor_itemsOverThreshold per (monitor, partition); fires when the sum is
    # above 0 for 5m. Only those two labels survive the aggregation, so the instance label
    # is not present on the result.
    - alert: SpinnakerPollingMonitorItemsOverThreshold
      expr: 'sum by (monitor, partition) (pollingMonitor_itemsOverThreshold) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Spinnaker polling monitor items over threshold (instance {{ $labels.instance }})
        description: "Igor polling monitor {{ $labels.monitor }} for {{ $labels.partition }} has exceeded its item threshold, preventing pipeline triggers.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per-second rate of pollingMonitor_failed_total over 5m above 0, sustained for 5m.
    - alert: SpinnakerPollingMonitorFailures
      expr: 'rate(pollingMonitor_failed_total[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker polling monitor failures (instance {{ $labels.instance }})
        description: "Igor polling monitor is experiencing failures ({{ $value }} per second). CI/SCM integrations may not trigger pipelines.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The 5% threshold is a rough default. Adjust based on your traffic patterns.
    # Per-instance ratio of the status="5xx" request rate to the total request rate
    # (controller_invocations_total, 5m window) above 0.05. The trailing `> 0` clause
    # skips instances with no traffic in the window.
    - alert: SpinnakerHighApiErrorRate
      expr: 'sum by (instance) (rate(controller_invocations_total{status="5xx"}[5m])) / sum by (instance) (rate(controller_invocations_total[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker high API error rate (instance {{ $labels.instance }})
        description: "Spinnaker API 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per-second rate of rateLimitThrottling_total over 5m above 0, sustained for 2m.
    - alert: SpinnakerApiRateLimitThrottling
      expr: 'rate(rateLimitThrottling_total[5m]) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker API rate limit throttling (instance {{ $labels.instance }})
        description: "Gate is actively throttling API requests on {{ $labels.instance }} ({{ $value }} throttled requests per second).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Same 5xx-ratio computation as a per-instance error rate, restricted to jobs whose
    # name matches the regex .*clouddriver.*; fires above 0.05 for 5m. The trailing
    # `> 0` clause skips instances with no traffic in the window.
    - alert: SpinnakerClouddriverHighErrorRate
      expr: 'sum by (instance) (rate(controller_invocations_total{status="5xx", job=~".*clouddriver.*"}[5m])) / sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker Clouddriver high error rate (instance {{ $labels.instance }})
        description: "Clouddriver 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}. Cloud operations may be failing.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This metric is specific to AWS cloud providers in Clouddriver.
    # The 1000ms threshold is a rough default. Adjust based on your AWS usage patterns.
    # Condition: amazonClientProvider_rateLimitDelayMil above 1000 continuously for 5m.
    - alert: SpinnakerAwsRateLimiting
      expr: 'amazonClientProvider_rateLimitDelayMil > 1000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker AWS rate limiting (instance {{ $labels.instance }})
        description: "Clouddriver is being rate-limited by AWS on {{ $labels.instance }} ({{ $value }}ms delay). Cloud operations will be slower.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/sql-server/ozarklake-mssql-exporter.yml
================================================
groups:

- name: OzarklakeMssqlExporter

  
  rules:

    # 1m delay allows a restart without triggering an alert.
    # Condition: mssql_up equals 0.
    - alert: SqlServerDown
      expr: 'mssql_up == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: SQL Server down (instance {{ $labels.instance }})
        description: "SQL server instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # mssql_deadlocks above 5 for 1m. The raw metric value is compared directly (no rate()).
    # NOTE(review): the description appends "/s" to the value - confirm the exporter
    # reports this metric as a per-second figure.
    - alert: SqlServerDeadlock
      expr: 'mssql_deadlocks > 5'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: SQL Server deadlock (instance {{ $labels.instance }})
        description: "SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml
================================================
groups:

- name: RibbybibbySslExporter

  
  rules:

    # ssl_probe_success equals 0; fires immediately (for: 0m).
    - alert: SslCertificateProbeFailed
      expr: 'ssl_probe_success == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SSL certificate probe failed (instance {{ $labels.instance }})
        description: "Failed to fetch SSL information {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # ssl_ocsp_response_status equals 2, which this rule reports as an unknown OCSP
    # status; fires immediately (for: 0m).
    # The alert name keeps its historical "Oscp" spelling so existing routes and silences
    # keep matching; only the human-readable annotations use the correct "OCSP".
    - alert: SslCertificateOscpStatusUnknown
      expr: 'ssl_ocsp_response_status == 2'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: SSL certificate OCSP status unknown (instance {{ $labels.instance }})
        description: "Failed to get the OCSP status {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # ssl_ocsp_response_status equals 1, which this rule reports as a revoked certificate;
    # fires immediately (for: 0m).
    - alert: SslCertificateRevoked
      expr: 'ssl_ocsp_response_status == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SSL certificate revoked (instance {{ $labels.instance }})
        description: "SSL certificate revoked {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Remaining lifetime (ssl_verified_cert_not_after minus now) of the chain_no="0"
    # certificate is below 86400 * 7 seconds (7 days). The comparison has no lower bound,
    # so the alert also fires for certificates that have already expired.
    - alert: SslCertificateExpiry(<7Days)
      expr: 'ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: SSL certificate expiry (< 7 days) (instance {{ $labels.instance }})
        description: "{{ $labels.instance }} Certificate is expiring in less than 7 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/systemd/systemd-exporter.yml
================================================
groups:

- name: SystemdExporter

  
  rules:

    # systemd_unit_state series with state="failed" equal to 1 for 5m; the description
    # reads the unit name from the `name` label.
    - alert: SystemdUnitFailed
      expr: 'systemd_unit_state{state="failed"} == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Systemd unit failed (instance {{ $labels.instance }})
        description: "Systemd unit {{ $labels.name }} has entered failed state. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Many units are legitimately inactive. You must adjust the name=~ filter to match your critical services.
    # As written, the name regex "your-critical-service.+" is a placeholder; the rule only
    # considers series with type="service" and state="inactive" equal to 1 for 5m.
    - alert: SystemdUnitInactive
      expr: 'systemd_unit_state{state="inactive", type="service", name=~"your-critical-service.+"} == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Systemd unit inactive (instance {{ $labels.instance }})
        description: "Systemd unit {{ $labels.name }} is inactive. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # systemd_service_restart_total increased by more than 5 within 1h, sustained for 5m.
    - alert: SystemdServiceCrashLooping
      expr: 'increase(systemd_service_restart_total[1h]) > 5'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Systemd service crash looping (instance {{ $labels.instance }})
        description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Ratio of systemd_unit_tasks_current to systemd_unit_tasks_max above 0.9 for 5m.
    # Both binary operations use ignoring(type), so the two metrics are matched on all
    # labels except `type`. The trailing `> 0` clause skips units whose max is zero.
    - alert: SystemdUnitTasksNearLimit
      expr: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Systemd unit tasks near limit (instance {{ $labels.instance }})
        description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # systemd_socket_refused_connections_total increased within the last 5m, sustained for 2m.
    - alert: SystemdSocketRefusedConnections
      expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Systemd socket refused connections (instance {{ $labels.instance }})
        description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 100 connections is arbitrary. Adjust to your workload.
    # Compares systemd_socket_current_connections against the fixed threshold, per series,
    # and fires after the threshold has been exceeded for 2 minutes.
    - alert: SystemdSocketHighConnections
      expr: 'systemd_socket_current_connections > 100'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Systemd socket high connections (instance {{ $labels.instance }})
        description: "Systemd socket {{ $labels.name }} has {{ $value }} active connections. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Triggers if timer hasn't fired in 24 hours. Adjust threshold per timer schedule.
    # The age is computed in hours (seconds / 3600), so $value is "hours since last trigger".
    # The `> 0` clause excludes timers whose last-trigger timestamp is 0
    # (presumably timers that have never triggered; confirm against the exporter).
    - alert: SystemdTimerMissedTrigger
      expr: '(time() - systemd_timer_last_trigger_seconds) / 3600 > 24 and systemd_timer_last_trigger_seconds > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Systemd timer missed trigger (instance {{ $labels.instance }})
        description: "Systemd timer {{ $labels.name }} has not triggered for over 24 hours. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/thanos/thanos-bucket-replicate.yml
================================================
groups:

- name: ThanosBucketReplicate

  
  rules:

    # Percentage of replication runs with result="error" per job over 5 minutes; fires at >= 10%.
    # The trailing `and ... > 0` clause restricts the alert to jobs that actually performed runs
    # in the window, so an idle replicator does not produce a 0/0 comparison.
    - alert: ThanosBucketReplicateErrorRate
      expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Bucket Replicate Error Rate (instance {{ $labels.instance }})
        description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # p99 duration of replication runs per job, alerting above 20 seconds.
    # histogram_quantile() needs the `le` bucket label, so the aggregation keeps it (by (job, le)).
    # The guard uses the histogram's _count series so the alert only evaluates when runs
    # were actually observed in the window.
    - alert: ThanosBucketReplicateRunLatency
      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Bucket Replicate Run Latency (instance {{ $labels.instance }})
        description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/thanos/thanos-compactor.yml
================================================
groups:

- name: ThanosCompactor

  
  rules:

    # Sums the `up` series per job, i.e. counts compactor targets currently scraped successfully,
    # and fires when more than one is up for 5 minutes.
    # The result is aggregated by job, so it carries no instance label.
    - alert: ThanosCompactorMultipleRunning
      expr: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Compactor Multiple Running (instance {{ $labels.instance }})
        description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when thanos_compact_halted has equalled 1 on a compactor series for 5 minutes.
    - alert: ThanosCompactorHalted
      expr: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Compactor Halted (instance {{ $labels.instance }})
        description: "Thanos Compact {{$labels.job}} has failed to run and now is halted.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Failed group compactions as a percentage of all group compactions, per job, over 5 minutes.
    # Fires above 5% sustained for 15 minutes; the trailing `> 0` clause requires that
    # compactions actually ran in the window.
    - alert: ThanosCompactorHighCompactionFailures
      expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Compactor High Compaction Failures (instance {{ $labels.instance }})
        description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Failed object-store bucket operations as a percentage of all bucket operations, per job.
    # Fires above 5% sustained for 15 minutes; the trailing `> 0` clause requires that
    # operations actually happened in the window.
    - alert: ThanosCompactBucketHighOperationFailures
      expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Compact Bucket High Operation Failures (instance {{ $labels.instance }})
        description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Hours elapsed since the newest last-successful-upload timestamp seen per job
    # (seconds / 60 / 60); fires immediately (for: 0m) once that exceeds 24.
    # NOTE(review): max_over_time(...[24h]) only yields a value while samples exist in the
    # last 24h, so a compactor whose series has vanished entirely is not covered here.
    - alert: ThanosCompactHasNotRun
      expr: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Thanos Compact Has Not Run (instance {{ $labels.instance }})
        description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/thanos/thanos-component-absent.yml
================================================
groups:

- name: ThanosComponentAbsent

  
  rules:

    # absent() yields a single series (value 1) when no `up` series for a matching job equals 1,
    # i.e. no compactor target is up or none is discovered at all.
    # The job matcher is a regex, so the absent() result carries no labels and the
    # instance placeholder in the summary renders empty.
    - alert: ThanosCompactIsDown
      expr: 'absent(up{job=~".*thanos-compact.*"} == 1)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Compact Is Down (instance {{ $labels.instance }})
        description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # absent() yields a single series (value 1) when no `up` series for a matching job equals 1,
    # i.e. no query target is up or none is discovered at all.
    # The job matcher is a regex, so the absent() result carries no labels and the
    # instance placeholder in the summary renders empty.
    - alert: ThanosQueryIsDown
      expr: 'absent(up{job=~".*thanos-query.*"} == 1)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Query Is Down (instance {{ $labels.instance }})
        description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # absent() yields a single series (value 1) when no `up` series for a matching job equals 1,
    # i.e. no receive target is up or none is discovered at all.
    # The job matcher is a regex, so the absent() result carries no labels and the
    # instance placeholder in the summary renders empty.
    - alert: ThanosReceiveIsDown
      expr: 'absent(up{job=~".*thanos-receive.*"} == 1)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Receive Is Down (instance {{ $labels.instance }})
        description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # absent() yields a single series (value 1) when no `up` series for a matching job equals 1,
    # i.e. no rule target is up or none is discovered at all.
    # The job matcher is a regex, so the absent() result carries no labels and the
    # instance placeholder in the summary renders empty.
    - alert: ThanosRuleIsDown
      expr: 'absent(up{job=~".*thanos-rule.*"} == 1)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Rule Is Down (instance {{ $labels.instance }})
        description: "ThanosRule has disappeared. Prometheus target for the component cannot be discovered.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # absent() yields a single series (value 1) when no `up` series for a matching job equals 1,
    # i.e. no sidecar target is up or none is discovered at all.
    # The job matcher is a regex, so the absent() result carries no labels and the
    # instance placeholder in the summary renders empty.
    - alert: ThanosSidecarIsDown
      expr: 'absent(up{job=~".*thanos-sidecar.*"} == 1)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Sidecar Is Down (instance {{ $labels.instance }})
        description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # absent() yields a single series (value 1) when no `up` series for a matching job equals 1,
    # i.e. no store target is up or none is discovered at all.
    # The job matcher is a regex, so the absent() result carries no labels and the
    # instance placeholder in the summary renders empty.
    - alert: ThanosStoreIsDown
      expr: 'absent(up{job=~".*thanos-store.*"} == 1)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Store Is Down (instance {{ $labels.instance }})
        description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/thanos/thanos-query.yml
================================================
groups:

- name: ThanosQuery

  
  rules:

    # Share of handler="query" HTTP requests answered with a 5xx code, per job, as a percentage.
    # Fires above 5% for 5 minutes; the trailing `> 0` clause requires request traffic in the window.
    - alert: ThanosQueryHttpRequestQueryErrorRateHigh
      expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Query Http Request Query Error Rate High (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query\" requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Share of handler="query_range" HTTP requests answered with a 5xx code, per job, as a percentage.
    # Fires above 5% for 5 minutes; the trailing `> 0` clause requires request traffic in the window.
    - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
      expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Query Http Request Query Range Error Rate High (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query_range\" requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # gRPC calls handled with one of the listed server-side error codes, divided by gRPC calls
    # started, per job, as a percentage. Fires above 5% for 5 minutes; the trailing `> 0`
    # clause requires that calls were started in the window.
    - alert: ThanosQueryGrpcServerErrorRate
      expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/  sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Query Grpc Server Error Rate (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Outgoing gRPC calls that finished with any code other than "OK", divided by outgoing calls
    # started, per job, as a percentage. Fires above 5% for 5 minutes; the trailing `> 0`
    # clause requires that calls were started in the window.
    - alert: ThanosQueryGrpcClientErrorRate
      expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Query Grpc Client Error Rate (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Failed store-API DNS lookups as a percentage of all such lookups, per job.
    # Fires above 1% sustained for 15 minutes; the trailing `> 0` clause requires that
    # lookups actually happened in the window.
    - alert: ThanosQueryHighDNSFailures
      expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Query High DNS Failures (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # p99 latency of handler="query" (instant query) HTTP requests per job, alerting above 40 seconds
    # for 10 minutes. The guard uses the histogram's _count series (one series per target instead of
    # one per bucket) so the alert only evaluates when requests were observed in the window.
    - alert: ThanosQueryInstantLatencyHigh
      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Thanos Query Instant Latency High (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # p99 latency of handler="query_range" HTTP requests per job, alerting above 90 seconds
    # for 10 minutes. The `_count`-based clause requires that requests were observed in the window.
    - alert: ThanosQueryRangeLatencyHigh
      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Thanos Query Range Latency High (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Headroom of the query concurrency gate: the maximum allowed concurrent queries (max over 5m)
    # minus the average number in flight (avg over 5m). Fires when fewer than 1 slot is free
    # on average for 15 minutes. Unlike most rules in this group, no job matcher is applied.
    - alert: ThanosQueryOverload
      expr: '(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Query Overload (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/thanos/thanos-receiver.yml
================================================
groups:

- name: ThanosReceiver

  
  rules:

    # Share of handler="receive" HTTP requests answered with a 5xx code, per job, as a percentage.
    # Fires above 5% for 5 minutes; the trailing `> 0` clause requires request traffic in the window.
    - alert: ThanosReceiveHttpRequestErrorRateHigh
      expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Receive Http Request Error Rate High (instance {{ $labels.instance }})
        description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # p99 latency of handler="receive" HTTP requests per job, alerting above 10 seconds
    # for 10 minutes. The `_count`-based clause requires that requests were observed in the window.
    - alert: ThanosReceiveHttpRequestLatencyHigh
      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Thanos Receive Http Request Latency High (instance {{ $labels.instance }})
        description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Percentage of replication attempts with result="error" per job, compared against a
    # threshold derived from the topology: floor((replication_factor + 1) / 2) / hashring_nodes,
    # also expressed as a percentage.
    # The replication-factor guard is aggregated to the job level so that both sides of `and`
    # carry the same label set ({job}); an un-aggregated guard still has instance-level labels
    # and would never match the per-job ratio. The ratio is the left operand, so $value is the
    # failure percentage that the description reports.
    - alert: ThanosReceiveHighReplicationFailures
      expr: '(sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) * 100 > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"})) * 100 and max by (job) (thanos_receive_replication_factor{job=~".*thanos-receive.*"}) > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Receive High Replication Failures (instance {{ $labels.instance }})
        description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Forward requests with result="error" as a percentage of all forward requests, per job.
    # Fires above 20% for 5 minutes; the trailing `> 0` clause requires that forward requests
    # happened in the window.
    - alert: ThanosReceiveHighForwardRequestFailures
      expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: info
      annotations:
        summary: Thanos Receive High Forward Request Failures (instance {{ $labels.instance }})
        description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Ratio of hashring file errors to hashring file refreshes, per job. Any non-zero ratio
    # sustained for 15 minutes fires. $value is a plain ratio (not multiplied by 100).
    # The trailing `> 0` clause requires that refreshes happened in the window.
    - alert: ThanosReceiveHighHashringFileRefreshFailures
      expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Receive High Hashring File Refresh Failures (instance {{ $labels.instance }})
        description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Averages the last-reload-successful metric across each job and fires when the average
    # is not exactly 1 for 5 minutes. Assuming the metric is 0 or 1 per instance (confirm),
    # this means at least one instance of the job reports a failed reload.
    - alert: ThanosReceiveConfigReloadFailure
      expr: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Receive Config Reload Failure (instance {{ $labels.instance }})
        description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # `(up - 1)` is 0 for a target that is up and -1 for one that is down. It is added, matched
    # on (job, instance), to the 3h upload increase filtered to `== 0`. A result series therefore
    # exists only for instances with zero shipper uploads in the last 3 hours.
    # The expression has no final comparison: any resulting series (value 0 or -1) makes the
    # alert pending, and it fires after the condition has held for a further 3 hours.
    - alert: ThanosReceiveNoUpload
      expr: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
      for: 3h
      labels:
        severity: critical
      annotations:
        summary: Thanos Receive No Upload (instance {{ $labels.instance }})
        description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/thanos/thanos-ruler.yml
================================================
groups:

- name: ThanosRuler

  
  rules:

    # Per-second rate of alerts dropped from the alert queue, per job and instance.
    # Any non-zero rate sustained for 5 minutes fires.
    - alert: ThanosRuleQueueIsDroppingAlerts
      expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per-second rate of alerts dropped by the alert sender, per job and instance.
    # Any non-zero rate sustained for 5 minutes fires.
    - alert: ThanosRuleSenderIsFailingAlerts
      expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Failed rule evaluations as a percentage of all rule evaluations, per job and instance.
    # Fires above 5% for 5 minutes; the trailing `> 0` clause requires that evaluations
    # happened in the window.
    - alert: ThanosRuleHighRuleEvaluationFailures
      expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} is failing to evaluate {{$value | humanize}}% of rules.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    # Per-second rate of rule evaluations that completed with warnings, per job and instance;
    # must stay above the threshold for 15 minutes.
    - alert: ThanosRuleHighRuleEvaluationWarnings
      expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05'
      for: 15m
      labels:
        severity: info
      annotations:
        summary: Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per rule group (job, instance, rule_group): fires when the duration of the last evaluation
    # exceeds the group's configured evaluation interval for 5 minutes.
    # $value is the last evaluation duration in seconds.
    - alert: ThanosRuleRuleEvaluationLatencyHigh
      expr: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Rule Rule Evaluation Latency High (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # gRPC calls handled with one of the listed server-side error codes, divided by gRPC calls
    # started, per job and instance, as a percentage. Fires above 5% for 5 minutes; the trailing
    # `> 0` clause requires that calls were started in the window.
    - alert: ThanosRuleGrpcErrorRate
      expr: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/  sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Rule Grpc Error Rate (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the last-reload-successful metric, averaged per job and instance, is not
    # exactly 1 for 5 minutes.
    - alert: ThanosRuleConfigReloadFailure
      expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
      for: 5m
      labels:
        severity: info
      annotations:
        summary: Thanos Rule Config Reload Failure (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Failed query-API DNS lookups as a percentage of all such lookups, per job and instance.
    # Fires above 1% sustained for 15 minutes; the trailing `> 0` clause requires that
    # lookups actually happened in the window.
    - alert: ThanosRuleQueryHighDNSFailures
      expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Rule Query High DNS Failures (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Failed Alertmanager DNS lookups as a percentage of all such lookups, per job and instance.
    # Fires above 1% sustained for 15 minutes; the trailing `> 0` clause requires that
    # lookups actually happened in the window.
    - alert: ThanosRuleAlertmanagerHighDNSFailures
      expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Rule Alertmanager High DNS Failures (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Per rule group: seconds since the group's last evaluation timestamp, compared against
    # 10x the group's evaluation interval. Grouping uses the `rule_group` label (the same label
    # the evaluation-latency rule in this group aggregates on), so each group is checked
    # individually; grouping by a label the series do not carry would collapse all groups of
    # an instance and hide a single stalled group behind healthy ones.
    - alert: ThanosRuleNoEvaluationFor10Intervals
      expr: 'time() -  max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
      for: 5m
      labels:
        severity: info
      annotations:
        summary: Thanos Rule No Evaluation For 10 Intervals (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when an instance has rules loaded (thanos_rule_loaded_rules summed > 0) yet its
    # rule-evaluation rate over 5 minutes is zero, sustained for a further 5 minutes.
    # The 5m rate window plus the 5m `for` is what the description's "10 minutes" refers to.
    - alert: ThanosNoRuleEvaluations
      expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos No Rule Evaluations (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/thanos/thanos-sidecar.yml
================================================
groups:

- name: ThanosSidecar

  
  rules:

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    # Per-second rate of failed object-store bucket operations, per job and instance;
    # must stay above the threshold for 5 minutes.
    - alert: ThanosSidecarBucketOperationsFailed
      expr: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0.05'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Sidecar Bucket Operations Failed (instance {{ $labels.instance }})
        description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when a sidecar reports thanos_sidecar_prometheus_up == 0 while a
    # prometheus_tsdb_data_replay_duration_seconds series with the same namespace and pod labels
    # is non-zero (presumably meaning that Prometheus has finished its startup replay; confirm).
    # The join is `on (namespace, pod)`, so both series must carry those labels
    # (e.g. Kubernetes service discovery); without them the rule cannot match as intended.
    - alert: ThanosSidecarNoConnectionToStartedPrometheus
      expr: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Sidecar No Connection To Started Prometheus (instance {{ $labels.instance }})
        description: "Thanos Sidecar {{$labels.instance}} is unhealthy.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/thanos/thanos-store.yml
================================================
groups:

- name: ThanosStore

  
  rules:

    # gRPC calls handled with one of the listed server-side error codes, divided by gRPC calls
    # started, per job, as a percentage. Fires above 5% for 5 minutes; the trailing `> 0`
    # clause requires that calls were started in the window.
    - alert: ThanosStoreGrpcErrorRate
      expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/  sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Store Grpc Error Rate (instance {{ $labels.instance }})
        description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # p99 of the series-gate duration histogram per job, alerting above 2 seconds for 10 minutes.
    # The `_count`-based clause requires that observations were recorded in the window.
    - alert: ThanosStoreSeriesGateLatencyHigh
      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Thanos Store Series Gate Latency High (instance {{ $labels.instance }})
        description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Failed object-store bucket operations as a percentage of all bucket operations, per job.
    # Fires above 5% sustained for 15 minutes; the trailing `> 0` clause requires that
    # operations actually happened in the window.
    - alert: ThanosStoreBucketHighOperationFailures
      expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) > 0'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Store Bucket High Operation Failures (instance {{ $labels.instance }})
        description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # p99 of the bucket-operation duration histogram per job, alerting above 2 seconds for
    # 10 minutes. The `_count`-based clause requires that operations were observed in the window.
    - alert: ThanosStoreObjstoreOperationLatencyHigh
      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and  sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Thanos Store Objstore Operation Latency High (instance {{ $labels.instance }})
        description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/traefik/embedded-exporter-v1.yml
================================================
groups:

- name: EmbeddedExporterV1

  
  rules:

    # Sums the per-server `up` values (0 or 1) of each backend; a sum of 0 means every server
    # of that backend is reported down. `sum` is required here: `count` returns the number of
    # series (>= 1) or nothing at all, so a `count(...) == 0` comparison can never be true.
    # The result is aggregated by backend, so it carries only the backend label.
    - alert: TraefikBackendDown
      expr: 'sum(traefik_backend_server_up) by (backend) == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Traefik backend down (instance {{ $labels.instance }})
        description: "All servers of Traefik backend {{ $labels.backend }} are down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Requests whose `code` label starts with "4" as a percentage of all requests, per backend,
    # over a 3-minute rate window. Fires above 5% for 1 minute; the trailing `> 0` clause
    # requires that the backend received requests in the window.
    - alert: TraefikHighHttp4xxErrorRateBackend
      expr: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Traefik high HTTP 4xx error rate backend (instance {{ $labels.instance }})
        description: "Traefik backend 4xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Requests whose `code` label starts with "5" as a percentage of all requests, per backend,
    # over a 3-minute rate window. Fires above 5% for 1 minute; the trailing `> 0` clause
    # requires that the backend received requests in the window.
    - alert: TraefikHighHttp5xxErrorRateBackend
      expr: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Traefik high HTTP 5xx error rate backend (instance {{ $labels.instance }})
        description: "Traefik backend 5xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/traefik/embedded-exporter-v2.yml
================================================
groups:

- name: EmbeddedExporterV2

  # Alerting rules for the traefik_service_* metrics of Traefik v2's embedded exporter.
  # Every expression is aggregated per `service`.
  rules:

    # Uses sum(), not count(): count() returns the number of series in each group, so it is
    # at least 1 for every service that reports and `== 0` could never match. The sum of the
    # per-server up gauges is 0 only when no server of the service is reported as up.
    - alert: TraefikServiceDown
      expr: 'sum(traefik_service_server_up) by (service) == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Traefik service down (instance {{ $labels.instance }})
        description: "All Traefik services are down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Share of requests answered with a 4xx code, per service, over a 3m rate window.
    # The trailing `> 0` clause keeps only services that actually received requests.
    - alert: TraefikHighHttp4xxErrorRateService
      expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
        description: "Traefik service 4xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Same computation as the 4xx rule, for 5xx codes.
    - alert: TraefikHighHttp5xxErrorRateService
      expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
        description: "Traefik service 5xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/vmware/pryorda-vmware-exporter.yml
================================================
groups:

- name: PryordaVmwareExporter

  # Virtual machine memory pressure and snapshot hygiene alerts.
  rules:

    # Warning band: the usage metric divided by 100 is at least 80 and below 90
    # (presumably the exporter reports hundredths of a percent - confirm).
    - alert: VirtualMachineMemoryWarning
      for: 5m
      expr: "vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90"
      labels: {severity: warning}
      annotations:
        summary: "Virtual Machine Memory Warning (instance {{ $labels.instance }})"
        description: "High memory usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Critical band: the same ratio at 90 or above, with a shorter hold time.
    - alert: VirtualMachineMemoryCritical
      for: 1m
      expr: "vmware_vm_mem_usage_average / 100 >= 90"
      labels: {severity: critical}
      annotations:
        summary: "Virtual Machine Memory Critical (instance {{ $labels.instance }})"
        description: "High memory usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # More than three snapshots for at least 30 minutes.
    - alert: HighNumberOfSnapshots
      for: 30m
      expr: "vmware_vm_snapshots > 3"
      labels: {severity: warning}
      annotations:
        summary: "High Number of Snapshots (instance {{ $labels.instance }})"
        description: "High snapshots number on {{ $labels.instance }}: {{ $value }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Snapshot age in days (seconds since the snapshot timestamp / 86400) of three or more.
    - alert: OutdatedSnapshots
      for: 5m
      expr: "(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3"
      labels: {severity: warning}
      annotations:
        summary: "Outdated Snapshots (instance {{ $labels.instance }})"
        description: "Outdated snapshots on {{ $labels.instance }}: {{ $value | printf \"%.0f\"}} days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/windows-server/windows-exporter.yml
================================================
groups:

- name: WindowsExporter

  
  # Host-level alerts built on windows_exporter metrics: collector health, service state,
  # CPU, memory and logical-disk usage.
  rules:

    # Fires immediately when any collector reports a success value of 0;
    # the failing collector is named by the `collector` label.
    - alert: WindowsServerCollectorError
      expr: 'windows_exporter_collector_success == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Windows Server collector Error (instance {{ $labels.instance }})
        description: "Collector {{ $labels.collector }} was not successful\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Selects the status="ok" series of each service and fires when its value is not 1.
    - alert: WindowsServerServiceStatus
      expr: 'windows_service_status{status="ok"} != 1'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Windows Server service Status (instance {{ $labels.instance }})
        description: "Windows Service state is not OK\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Busy percentage = 100 minus the per-instance average idle rate (2m window) times 100.
    # NOTE(review): `for: 0m` means a single evaluation above 80 fires the alert.
    - alert: WindowsServerCpuUsage
      expr: '100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Windows Server CPU Usage (instance {{ $labels.instance }})
        description: "CPU Usage is more than 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Used percentage = 100 minus (free physical memory / total physical memory) times 100.
    - alert: WindowsServerMemoryUsage
      expr: '100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Windows Server memory Usage (instance {{ $labels.instance }})
        description: "Memory usage is more than 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Used percentage per logical disk; the `size_bytes > 0` clause restricts the alert
    # to volumes that report a non-zero size.
    - alert: WindowsServerDiskSpaceUsage
      expr: '100 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80 and windows_logical_disk_size_bytes > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Windows Server disk Space Usage (instance {{ $labels.instance }})
        description: "Disk usage is more than 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml
================================================
groups:

- name: MindflavorPrometheusWireguardExporter

  
  # Per-peer WireGuard alerts. All three rules are driven by wireguard_latest_handshake_seconds,
  # where a value of 0 is treated as "no handshake has ever completed".
  rules:

    # The threshold of 300 seconds (5 minutes) is a rough default. WireGuard peers that are idle but reachable
    # typically re-handshake every 2 minutes. Adjust based on your keepalive interval.
    # The `> 0` guard excludes peers that have never completed a handshake (covered by a separate rule).
    - alert: WireguardPeerHandshakeTooOld
      expr: 'time() - wireguard_latest_handshake_seconds > 300 and wireguard_latest_handshake_seconds > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: WireGuard peer handshake too old (instance {{ $labels.instance }})
        description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has not had a handshake for over 5 minutes. The tunnel may be down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This alert will fire for all offline mobile/laptop peers. Consider filtering by expected-online peers.
    # The 5m hold time means the handshake timestamp must stay at 0 for five minutes before firing.
    - alert: WireguardPeerHandshakeNeverEstablished
      expr: 'wireguard_latest_handshake_seconds == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: WireGuard peer handshake never established (instance {{ $labels.instance }})
        description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has never completed a handshake. Check peer configuration and network connectivity.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This alert fires when a peer has a recent handshake but zero traffic flow.
    # May indicate routing issues or a misconfigured allowed-ips.
    # Only useful if you expect continuous traffic on all peers.
    # "Recent" is the same 300-second bound used by WireguardPeerHandshakeTooOld, so a peer
    # matches at most one of the two rules at any instant.
    - alert: WireguardNoTrafficOnPeer
      expr: '(rate(wireguard_sent_bytes_total[15m]) + rate(wireguard_received_bytes_total[15m])) == 0 and wireguard_latest_handshake_seconds > 0 and (time() - wireguard_latest_handshake_seconds) < 300'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: WireGuard no traffic on peer (instance {{ $labels.instance }})
        description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has had no traffic for 15 minutes despite an active handshake.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/zfs/node-exporter.yml
================================================
groups:

- name: NodeExporter

  # ZFS alerts based on the zpool state metric exposed by node_exporter.
  rules:

    # Fires when any state series other than "online" has a value above 0 for a pool;
    # the offending state is reported through the `state` label.
    - alert: ZfsOfflinePool
      expr: 'node_zfs_zpool_state{state!="online"} > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: ZFS offline pool (instance {{ $labels.instance }})
        description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/zfs/zfs_exporter.yml
================================================
groups:

- name: Zfs_exporter

  # Pool capacity, pool health and collector health alerts for zfs_exporter.
  rules:

    # Free-space percentage below 10 on pools that are writable and report a non-zero size.
    # The read-only check is joined on (instance, pool) so that each pool is matched against
    # its own zfs_pool_readonly series. Joining on labels the pool metrics do not carry
    # (device, mountpoint) degrades to a per-instance match, where any writable pool on the
    # host lets a read-only pool alert.
    # NOTE(review): assumes pool series are labelled with `pool` - confirm against your exporter.
    - alert: ZfsPoolOutOfSpace
      expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, pool) zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: ZFS pool out of space (instance {{ $labels.instance }})
        description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 0: ONLINE
    # 1: DEGRADED
    # 2: FAULTED
    # 3: OFFLINE
    # 4: UNAVAIL
    # 5: REMOVED
    # 6: SUSPENDED
    - alert: ZfsPoolUnhealthy
      expr: 'zfs_pool_health > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: ZFS pool unhealthy (instance {{ $labels.instance }})
        description: "ZFS pool state is {{ $value }}. See comments for more information.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the collector success metric is anything other than 1.
    - alert: ZfsCollectorFailed
      expr: 'zfs_scrape_collector_success != 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: ZFS collector failed (instance {{ $labels.instance }})
        description: "ZFS collector for {{ $labels.instance }} has failed to collect information\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/rules/zookeeper/cloudflare-kafka-zookeeper-exporter.yml
================================================
groups:

- name: CloudflareKafkaZookeeperExporter

  
  # Placeholder group: no alerting rules are defined for this exporter, so `rules` is empty.
  rules:


================================================
FILE: dist/rules/zookeeper/dabealu-zookeeper-exporter.yml
================================================
groups:

- name: DabealuZookeeperExporter

  
  # Zookeeper availability and leader-election alerts.
  rules:

    # 1m delay allows a restart without triggering an alert.
    - alert: ZookeeperDown
      expr: 'zk_up == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Zookeeper Down (instance {{ $labels.instance }})
        description: "Zookeeper down on instance {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # sum() without a grouping clause adds zk_server_leader across every scraped series, so the
    # result carries no labels. NOTE(review): if several ensembles are scraped by the same
    # Prometheus they are all summed together - confirm that is intended.
    - alert: ZookeeperMissingLeader
      expr: 'sum(zk_server_leader) == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Zookeeper missing leader (instance {{ $labels.instance }})
        description: "Zookeeper cluster has no node marked as leader\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Counterpart of ZookeeperMissingLeader: the same label-less sum, firing when it exceeds 1.
    - alert: ZookeeperTooManyLeaders
      expr: 'sum(zk_server_leader) > 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Zookeeper Too Many Leaders (instance {{ $labels.instance }})
        description: "Zookeeper cluster has too many nodes marked as leader\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires per instance once zk_ruok has been 0 for three minutes.
    - alert: ZookeeperNotOk
      expr: 'zk_ruok == 0'
      for: 3m
      labels:
        severity: warning
      annotations:
        summary: Zookeeper Not Ok (instance {{ $labels.instance }})
        description: "Zookeeper instance is not ok\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


================================================
FILE: dist/template.yml
================================================
groups:
{% comment %}
  Renders one Prometheus rule-group file for a single exporter.
  Variables read here: slug, comments and rules. Each rule provides name, query, severity and
  description, and may provide for, summary and comments.
  The group name is the slug split on dashes with every word capitalized and then joined.
  Rule names are built the same way from the space-separated rule name.
  The query is written inside a single-quoted YAML scalar, so every single quote it contains
  is doubled (the YAML escape for a quote in that style); queries without one render unchanged.
  Opening and closing tags of this comment sit flush against existing output so that the
  generated files keep exactly the same whitespace.
{% endcomment %}{% assign groupName = slug | split: '-' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
- name: {{ groupNameCamelcase | remove: ' ' | remove: '-' }}

  {% assign lines = comments | split: "
" %}{% for line in lines %}# {{ line | strip }}
  {% endfor %}
  rules:
{% for rule in rules %}{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
    {% assign lines = rule.comments | split: "
" %}{% for line in lines %}# {{ line | strip }}
    {% endfor %}- alert: {{ ruleNameCamelcase | remove: ' ' }}
      expr: '{{ rule.query | replace: "'", "''" }}'
      for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %}
      labels:
        severity: {{ rule.severity }}
      annotations:
        summary: {% if rule.summary %}{{ rule.summary }}{% else %}{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}){% endif %}
        description: "{{ rule.description | replace: '"', '\"' }}\n  VALUE = {% raw %}{{ $value }}{% endraw %}\n  LABELS = {% raw %}{{ $labels }}{% endraw %}"
{% endfor %}

================================================
FILE: docker-compose.yml
================================================
# Local preview environment: runs `jekyll serve` against the repository checkout.
version: "3"

services:
  jekyll:
    image: "jekyll/jekyll:latest"
    # Publish the port the Jekyll dev server is reached on (host:container).
    ports: ["4000:4000"]
    # Mount the repository root as the site source inside the container.
    volumes: ["./:/srv/jekyll"]
    command: "jekyll serve"


================================================
FILE: index.md
================================================

<style>
.center-image
{
    margin: 0 auto;
    display: block;
}
</style>


![Prometheus logo](/assets/prometheus-logo.png){: .center-image }


<h2>
  Hello world
</h2>

<a href="/awesome-prometheus-alerts/alertmanager">
  AlertManager configuration
</a>

<a href="/awesome-prometheus-alerts/sleep-peacefully">
  Alerting time window
</a>

<h2>
  Out of the box prometheus alerting rules
</h2>

<ul>
  {% for group in site.data.rules.groups %}
    {% comment %} Total number of rules over every exporter of every service in this group. {% endcomment %}
    {% assign groupRuleCount = 0 %}
    {% for service in group.services %}
      {% for exporter in service.exporters %}
        {% assign exporterRuleCount = exporter.rules | size %}
        {% assign groupRuleCount = groupRuleCount | plus: exporterRuleCount %}
      {% endfor %}
    {% endfor %}
    <li style="margin-top: 30px;">
      <h3>{{ group.name }} <small style="margin-left: 20px;">({{ groupRuleCount }} rules)</small></h3>
      <ul>
        {% comment %} One link per service, pointing at its anchor on the rules page. {% endcomment %}
        {% for service in group.services %}
        <li>
          <a href="/awesome-prometheus-alerts/rules#{{ service.name | replace: " ", "-" | downcase }}">
            {{ service.name }}
          </a>
        </li>
        {% endfor %}
      </ul>
    </li>
  {% endfor %}
</ul>


================================================
FILE: package.json
================================================
{
	  "scripts": {
		    "test": "awesome-lint"
	  },
	  "devDependencies": {
		    "awesome-lint": "*"
	  }
}


================================================
FILE: rules.md
================================================
<style>
  ul {
    list-style: none;
  }
</style>

<!-- CAUTIONS -->
<div style="padding: 20px 20px 10px 20px; border: solid grey 1px; border-radius: 10px;">
  <h2 style="text-align:center;">⚠️ Caution ⚠️</h2>

  <p style="text-align:center;">
    Alert thresholds depend on the nature of your applications.
    <br>
    Some queries on this page may use arbitrary tolerance thresholds.
    <br><br>
    Building an efficient and battle-tested monitoring platform takes time. 😉
  </p>
</div>

<br>
<br>

<h1></h1>

<!-- RULES -->
<ul>
  {% comment %} Walks groups, then services, then exporters, and renders one section per exporter with one entry per rule. The forloop indexes are captured to build the section numbering and the element ids used by the anchors and copy buttons. {% endcomment %}
  {% for group in site.data.rules.groups %}
  {% assign groupIndex = forloop.index %}
    {% for service in group.services %}
    {% assign serviceIndex = forloop.index %}
    {% assign nbrExporters = service.exporters | size %}
      {% for exporter in service.exporters %}
      {% assign exporterIndex = forloop.index %}
      {% assign nbrRules = exporter.rules | size %}
      <li>
        {% assign serviceId = service.name | replace: " ", "-" | downcase %}
        <h2 id="{{ serviceId }}">
          <span id="{{ serviceId }}-{{ exporterIndex }}"></span>
          <a class="anchor" href="#{{ serviceId }}-{{ exporterIndex }}">#</a>
          {{ groupIndex }}.{{ serviceIndex }}.{% if nbrExporters > 1 %}{{ exporterIndex }}.{% endif %}
          {{ service.name }}
          {% if exporter.name %}:
          {% if exporter.doc_url %}
          <a href="{{ exporter.doc_url }}">
            {{ exporter.name }}
          </a>
          {% else %}
          {{ exporter.name }}
          {% endif %}
          {% endif %}

          {% if nbrRules > 0 %}
            <small style="font-size: 60%; vertical-align: middle; margin-left: 10px;">
              ({{ nbrRules }} rules)
            </small>
            <span class="clipboard-multiple" data-clipboard-target-id="group-{{ groupIndex }}-service-{{ serviceIndex }}-exporter-{{ exporterIndex }}">[copy section]</span>
          {% endif %}
        </h2>

        {% if nbrRules == 0 %}
{% highlight javascript %}
// @TODO: Please contribute => https://github.com/samber/awesome-prometheus-alerts 👋
{% endhighlight %}
        {% else %}
{{ exporter.comments | strip | newline_to_br }}
{% highlight bash %}
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/refs/heads/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml
{% endhighlight %}
        {% endif %}

        <ul>
          {% for rule in exporter.rules %}
          {% assign ruleIndex = forloop.index %}
          {% assign comments = rule.comments | strip | newline_to_br | split: '<br />' %}
          <li>
            <h4 id="rule-{{ serviceId }}-{{ exporterIndex }}-{{ ruleIndex }}">
              <span id="rule-{{ serviceId }}-{{ ruleIndex }}"></span><!-- @deprecated -->
              <a class="anchor" href="#rule-{{ serviceId }}-{{ exporterIndex }}-{{ ruleIndex }}">#</a>
              {{ groupIndex}}.{{ serviceIndex }}.{% if nbrExporters > 1 %}{{ exporterIndex }}.{% endif %}{{ ruleIndex }}.
              {{ rule.name }}
            </h4>
            <summary>
              {{ rule.description }}
              <span class="clipboard-single" data-clipboard-target-id="group-{{ groupIndex }}-service-{{ serviceIndex }}-exporter-{{ exporterIndex }}-rule-{{ ruleIndex }}" onclick="event.preventDefault();">[copy]</span>
            </summary>
            <div id="group-{{ groupIndex }}-service-{{ serviceIndex }}-exporter-{{ exporterIndex }}-rule-{{ ruleIndex }}">
              {% comment %} The alert name is the rule name with every word capitalized and the spaces removed. The summary uses rule.summary when the rule defines one and otherwise falls back to the rule name followed by the instance label, which is the same choice dist/template.yml makes, so the page and the downloadable files agree. {% endcomment %}
              {% assign ruleName = rule.name | split: ' ' %}
              {% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}

  {% highlight yaml %}
  {% for comment in comments %}# {{ comment | strip }}
  {% endfor %}- alert: {{ ruleNameCamelcase | remove: ' ' }}
    expr: {{ rule.query }}
    for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %}
    labels:
      severity: {{ rule.severity }}
    annotations:
      summary: {% if rule.summary %}{{ rule.summary }}{% else %}{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}){% endif %}
      description: "{{ rule.description | replace: '"', '\"' }}\n  VALUE = {% raw %}{{ $value }}{% endraw %}\n  LABELS = {% raw %}{{ $labels }}{% endraw %}"

{% endhighlight %}

            </div>
            <br/>
          </li>
          {% endfor %}
        </ul>

      <hr/>
      </li>
    {% endfor %}
    {% endfor %}
  {% endfor %}
</ul>


<!-- NAVBAR -->
<div id="rules-navbar" class="affix">
  {% comment %} Side menu: one heading per rule group and, under it, one in-page link per service. The link target is the service name lowercased with spaces replaced by dashes. {% endcomment %}
  <h3>Menu</h3>
  <ul>
    {% for group in site.data.rules.groups %}
      <li>
        <h4>{{ group.name }}</h4>
        <ul>
          {% for service in group.services %}
            <li>
              <a href="#{{ service.name | replace: " ", "-" | downcase }}">
                👉 {{ service.name }}
              </a>
            </li>
          {% endfor %}
        </ul>
      </li>
    {% endfor %}
  </ul>

  <script>
    // Enables the affix behaviour with a top offset of 750 and then makes the menu visible.
    // NOTE(review): affix() is presumably the jQuery/Bootstrap plugin loaded by the layout - confirm.
    $('#rules-navbar').affix({offset: {top: 750} }).css('display', 'block');
  </script>
</div>


================================================
FILE: sleep-peacefully.md
================================================
<h1 style="text-align: center;">
  Sleep Peacefully
</h1>

## Alerting time window

In some applications, load and activity can vary over the day/week/year.

To prevent alert fatigue and a constantly buzzing pager, alerts can be disabled during certain periods (such as nights or weekends).

Examples (PromQL time functions such as `hour()` and `day_of_week()` are evaluated in UTC):

- Weekday: `node_load5 > 10 and ON() (0 < day_of_week() < 6)`
- Day time: `node_load5 > 10 and ON() (8 < hour() < 18)`
- Exclude December: `node_load5 > 10 and ON() (month() != 12)`

## Advanced time windows and timezones

```yml
# rules.yml
#
# Recording rules that derive local-time helpers (London and Paris) from Prometheus' UTC clock.
# Comparisons are written without `bool`, so a helper such as europe_london_weekday only has a
# sample while its condition holds; the `not_*` rules use absent() to produce the complement.

groups:
  - name: timezones
    rules:
    # 1 while European summer time is in effect, 0 otherwise. Branches, in order:
    #   - April to September;
    #   - March, after the last Sunday of the month;
    #   - October, before the last Sunday of the month;
    #   - the last Sunday itself: before 01:00 UTC in October, from 01:00 UTC in March;
    #   - fallback vector(0).
    - record: european_summer_time_offset
      expr: |
          (vector(1) and (month() > 3 and month() < 10))
          or
          (vector(1) and (month() == 3 and (day_of_month() - day_of_week()) >= 25) and absent((day_of_month() >= 25) and (day_of_week() == 0)))
          or
          (vector(1) and (month() == 10 and (day_of_month() - day_of_week()) < 25) and absent((day_of_month() >= 25) and (day_of_week() == 0)))
          or
          (vector(1) and ((month() == 10 and hour() < 1) or (month() == 3 and hour() > 0)) and ((day_of_month() >= 25) and (day_of_week() == 0)))
          or
          vector(0)

    # Local timestamps: UTC shifted by the summer-time hour (London) or by one hour more (Paris).
    - record: europe_london_time
      expr: time() + 3600 * european_summer_time_offset
    - record: europe_paris_time
      expr: time() + 3600 * (1 + european_summer_time_offset)

    # Local hour of the day.
    - record: europe_london_hour
      expr: hour(europe_london_time)
    - record: europe_paris_hour
      expr: hour(europe_paris_time)

    # Present only when the local day of week is between 1 and 5 (0 is Sunday).
    - record: europe_london_weekday
      expr: 0 < day_of_week(europe_london_time) < 6
    - record: europe_paris_weekday
      expr: 0 < day_of_week(europe_paris_time) < 6
    # opposite
    - record: not_europe_london_weekday
      expr: absent(europe_london_weekday)
    - record: not_europe_paris_weekday
      expr: absent(europe_paris_weekday)

    # Present only when the local hour is at least 9 and below 18.
    - record: europe_london_business_hours
      expr: 9 <= europe_london_hour < 18
    - record: europe_paris_business_hours
      expr: 9 <= europe_paris_hour < 18
    # opposite
    - record: not_europe_london_business_hours
      expr: absent(europe_london_business_hours)
    - record: not_europe_paris_business_hours
      expr: absent(europe_paris_business_hours)

    # new year's day / xmas / labor day / all saints' day / ...
    # Only fixed-date holidays are listed; the checks use Paris local time.
    - record: europe_french_public_holidays
      expr: |
          (vector(1) and month(europe_paris_time) == 1 and day_of_month(europe_paris_time) == 1)
          or
          (vector(1) and month(europe_paris_time) == 12 and day_of_month(europe_paris_time) == 25)
          or
          (vector(1) and month(europe_paris_time) == 5 and day_of_month(europe_paris_time) == 1)
          or
          (vector(1) and month(europe_paris_time) == 11 and day_of_month(europe_paris_time) == 1)
          or
          vector(0)
    # opposite
    # NOTE(review): the rule above ends with `or vector(0)`, so it always has a sample and
    # absent() here never returns anything - confirm whether a `== 0` test was intended.
    - record: not_europe_french_public_holidays
      expr: absent(europe_french_public_holidays)
```

```yml
# alerts.yml
#
# Example alerts that use the recording rules from rules.yml to restrict when they can fire.

groups:
  - name: CPU Load
    rules:
      # Can fire only on weekdays AND during business hours (in both London and Paris),
      # which is what keeps it quiet at night as well as during the weekend.
      - alert: HighLoadQuietDuringWeekendAndNight
        expr: |
            node_load5 > 10 and ON() (
              europe_london_weekday and europe_paris_weekday
              and europe_london_business_hours and europe_paris_business_hours
            )

      # Muted while the UTC hour is 2: absent() only returns a sample when `hour() == 2` is empty.
      - alert: HighLoadQuietDuringBackup
        expr: node_load5 > 10 and ON() absent(hour() == 2)

      # Two thresholds: the lower one applies on weekdays only, the higher one at any time.
      # (`and` binds tighter than `or`, so the weekday filter applies to the first line only.)
      - alert: HighLoad
        expr: |
            node_load5 > 10 and ON() (europe_london_weekday and europe_paris_weekday)
            or
            node_load5 > 20

## Sources

- [https://medium.com/@tom.fawcett/time-of-day-based-notifications-with-prometheus-and-alertmanager-1bf7a23b7695](https://medium.com/@tom.fawcett/time-of-day-based-notifications-with-prometheus-and-alertmanager-1bf7a23b7695)
- [https://promcon.io/2019-munich/slides/improved-alerting-with-prometheus-and-alertmanager.pdf](https://promcon.io/2019-munich/slides/improved-alerting-with-prometheus-and-alertmanager.pdf)