Repository: grafana/alloy-scenarios Branch: main Commit: fb8fd5ed16bb Files: 430 Total size: 1.3 MB Directory structure: gitextract_o8zi_7a5/ ├── .coda/ │ ├── coda-start.service │ ├── coda-start.sh │ └── packer-install.sh ├── .cursor/ │ ├── docker-example.mdc │ └── k8s-example.mdc ├── .github/ │ ├── k8s-scenarios.json │ ├── scenario-list.txt │ └── workflows/ │ ├── check-image-versions.yml │ ├── validate-k8s-scenarios.yml │ └── validate-scenarios.yml ├── .gitignore ├── CLAUDE.md ├── LICENSE ├── README.md ├── app-instrumentation/ │ └── logging/ │ └── popular-logging-frameworks/ │ ├── README.md │ ├── alloy/ │ │ ├── config.alloy │ │ └── helper.alloy │ ├── cpp/ │ │ ├── CMakeLists.txt │ │ ├── Dockerfile │ │ └── main.cpp │ ├── csharp/ │ │ ├── Dockerfile │ │ ├── LoggingExample.csproj │ │ └── Program.cs │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── go/ │ │ ├── Dockerfile │ │ ├── go.mod │ │ ├── go.sum │ │ └── main.go │ ├── java/ │ │ ├── App.java │ │ ├── Dockerfile │ │ └── logback.xml │ ├── javascript/ │ │ ├── Dockerfile │ │ └── app.js │ ├── loki-config.yaml │ ├── php/ │ │ ├── Dockerfile │ │ └── app.php │ └── python/ │ ├── Dockerfile │ └── app.py ├── aws-firehose-logs/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.yml │ ├── firehose_sender.py │ └── loki-config.yaml ├── blackbox-probing/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── prom-config.yaml ├── cloudwatch-metrics/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.yml │ ├── prom-config.yaml │ └── seed-metrics.py ├── coda ├── continuous-profiling/ │ ├── README.md │ ├── app/ │ │ ├── go.mod │ │ └── main.go │ ├── config.alloy │ ├── docker-compose.coda.yml │ └── docker-compose.yml ├── docker-monitoring/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.yml │ ├── grafana/ │ │ └── datasources/ │ │ └── default.yml │ └── loki-config.yaml ├── elasticsearch-monitoring/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── 
docker-compose.yml │ └── prom-config.yaml ├── faro-frontend-observability/ │ ├── README.md │ ├── app/ │ │ └── index.html │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── loki-config.yaml ├── game-of-tracing/ │ ├── AGENTS.md │ ├── CLAUDE.md │ ├── README.md │ ├── SPAN_LINKS.md │ ├── ai_opponent/ │ │ ├── CLAUDE.md │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── ai_server.py │ │ ├── requirements.txt │ │ └── telemetry.py │ ├── app/ │ │ ├── CLAUDE.md │ │ ├── Dockerfile │ │ ├── game_config.py │ │ ├── location_server.py │ │ ├── requirements.txt │ │ ├── run_game.py │ │ └── telemetry.py │ ├── config-otel.yaml │ ├── config.alloy │ ├── docker-compose-otel.yml │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── grafana/ │ │ ├── dashboards/ │ │ │ ├── War of Kingdoms-1747821967780.json │ │ │ └── dashboards.yaml │ │ └── datasources/ │ │ └── defaults.yml │ ├── loki-config.yaml │ ├── prom-config.yaml │ ├── pyroscope-config.yaml │ ├── tempo-config.yaml │ └── war_map/ │ ├── CLAUDE.md │ ├── Dockerfile │ ├── app.py │ ├── requirements.txt │ ├── static/ │ │ └── css/ │ │ └── style.css │ ├── telemetry.py │ └── templates/ │ ├── index.html │ ├── layout.html │ ├── map.html │ ├── map_picker.html │ ├── replay.html │ └── replay_session.html ├── gelf-log-ingestion/ │ ├── README.md │ ├── app/ │ │ └── main.py │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── loki-config.yaml ├── image-versions.env ├── k8s/ │ ├── README.md │ ├── events/ │ │ ├── README.md │ │ ├── alloy-config.yaml │ │ ├── alloy-deployment.yaml │ │ ├── alloy-rbac.yaml │ │ ├── grafana-values.yml │ │ ├── kind.yml │ │ └── loki-values.yml │ ├── logs/ │ │ ├── README.md │ │ ├── grafana-values.yml │ │ ├── k8s-monitoring-values.yml │ │ ├── killercoda/ │ │ │ └── loki-values.yml │ │ ├── kind.yml │ │ └── loki-values.yml │ ├── metrics/ │ │ ├── README.md │ │ ├── grafana-values.yml │ │ ├── k8s-monitoring-values.yml │ │ ├── kind.yml │ │ └── prometheus-values.yml │ ├── profiling/ │ │ 
├── README.md │ │ ├── grafana-values.yml │ │ ├── k8s-monitoring-values.yml │ │ ├── kind.yml │ │ └── pyroscope-values.yml │ └── tracing/ │ ├── README.md │ ├── grafana-values.yml │ ├── k8s-monitoring-values.yml │ ├── kind.yml │ └── tempo-values.yml ├── kafka/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── gen_log.sh │ └── loki-config.yaml ├── linux/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.yml │ ├── loki-config.yaml │ └── prom-config.yaml ├── log-api-gateway/ │ ├── README.md │ ├── app/ │ │ └── producer.py │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── loki-config.yaml ├── log-secret-filtering/ │ ├── README.md │ ├── app/ │ │ └── main.py │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── loki-config.yaml ├── logs-file/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── loki-config.yaml │ └── main.py ├── logs-tcp/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── loki-config.yaml │ └── simulator.py ├── mail-house/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── loki-config.yaml │ └── main.py ├── memcached-monitoring/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── prom-config.yaml ├── mysql-monitoring/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── prom-config.yaml ├── nginx-monitoring/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.yml │ ├── loki-config.yaml │ ├── nginx.conf │ └── prom-config.yaml ├── otel-basic-tracing/ │ ├── README.md │ ├── app/ │ │ ├── Dockerfile │ │ ├── app.py │ │ └── requirements.txt │ ├── config-otel.yaml │ ├── config.alloy │ ├── docker-compose-otel.yml │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── prom-config.yaml │ └── tempo-config.yaml ├── otel-examples/ │ ├── 
README.md │ ├── cost-control/ │ │ ├── README.md │ │ ├── app/ │ │ │ ├── Dockerfile │ │ │ ├── app.py │ │ │ └── requirements.txt │ │ ├── config-otel.yaml │ │ ├── config.alloy │ │ ├── docker-compose.coda.yml │ │ ├── docker-compose.yml │ │ ├── loki-config.yaml │ │ └── tempo-config.yaml │ ├── count-connector/ │ │ ├── README.md │ │ ├── app/ │ │ │ ├── Dockerfile │ │ │ ├── app.py │ │ │ └── requirements.txt │ │ ├── config-otel.yaml │ │ ├── config.alloy │ │ ├── docker-compose.coda.yml │ │ ├── docker-compose.yml │ │ ├── loki-config.yaml │ │ ├── prom-config.yaml │ │ └── tempo-config.yaml │ ├── filelog-processing/ │ │ ├── README.md │ │ ├── app/ │ │ │ └── generate_logs.py │ │ ├── config-otel.yaml │ │ ├── config.alloy │ │ ├── docker-compose.coda.yml │ │ ├── docker-compose.yml │ │ └── loki-config.yaml │ ├── host-metrics/ │ │ ├── README.md │ │ ├── config-otel.yaml │ │ ├── config.alloy │ │ ├── docker-compose.coda.yml │ │ ├── docker-compose.yml │ │ └── prom-config.yaml │ ├── kafka-buffer/ │ │ ├── README.md │ │ ├── app/ │ │ │ ├── Dockerfile │ │ │ ├── app.py │ │ │ └── requirements.txt │ │ ├── config-otel.yaml │ │ ├── config.alloy │ │ ├── docker-compose.coda.yml │ │ ├── docker-compose.yml │ │ └── tempo-config.yaml │ ├── multi-pipeline-fanout/ │ │ ├── README.md │ │ ├── app/ │ │ │ ├── Dockerfile │ │ │ ├── app.py │ │ │ └── requirements.txt │ │ ├── config-otel.yaml │ │ ├── config.alloy │ │ ├── docker-compose.coda.yml │ │ ├── docker-compose.yml │ │ ├── prom-config.yaml │ │ └── tempo-config.yaml │ ├── ottl-transform/ │ │ ├── README.md │ │ ├── app/ │ │ │ ├── Dockerfile │ │ │ ├── app.py │ │ │ └── requirements.txt │ │ ├── config-otel.yaml │ │ ├── config.alloy │ │ ├── docker-compose.coda.yml │ │ ├── docker-compose.yml │ │ ├── loki-config.yaml │ │ └── tempo-config.yaml │ ├── pii-redaction/ │ │ ├── README.md │ │ ├── app/ │ │ │ ├── Dockerfile │ │ │ ├── app.py │ │ │ └── requirements.txt │ │ ├── config-otel.yaml │ │ ├── config.alloy │ │ ├── docker-compose.coda.yml │ │ ├── docker-compose.yml │ │ ├── 
loki-config.yaml │ │ └── tempo-config.yaml │ ├── resource-enrichment/ │ │ ├── README.md │ │ ├── app/ │ │ │ ├── Dockerfile │ │ │ ├── app.py │ │ │ └── requirements.txt │ │ ├── config-otel.yaml │ │ ├── config.alloy │ │ ├── docker-compose.coda.yml │ │ ├── docker-compose.yml │ │ ├── prom-config.yaml │ │ └── tempo-config.yaml │ └── routing-multi-tenant/ │ ├── README.md │ ├── app/ │ │ ├── generate_logs.py │ │ └── requirements.txt │ ├── config-otel.yaml │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── loki-config.yaml ├── otel-metrics-pipeline/ │ ├── README.md │ ├── app/ │ │ └── main.py │ ├── config-otel.yaml │ ├── config.alloy │ ├── docker-compose-otel.yml │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── prom-config.yaml ├── otel-span-metrics/ │ ├── README.md │ ├── app/ │ │ ├── load.py │ │ ├── main.py │ │ └── requirements.txt │ ├── config-otel.yaml │ ├── config.alloy │ ├── docker-compose-otel.yml │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── prom-config.yaml │ └── tempo-config.yaml ├── otel-tail-sampling/ │ ├── README.md │ ├── app/ │ │ ├── Dockerfile │ │ ├── app.py │ │ └── requirements.txt │ ├── config-otel.yaml │ ├── config.alloy │ ├── docker-compose-otel.yml │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── prom-config.yaml │ └── tempo-config.yaml ├── otel-tracing-service-graphs/ │ ├── README.md │ ├── app/ │ │ ├── Dockerfile │ │ ├── app.py │ │ └── requirements.txt │ ├── config-otel.yaml │ ├── config.alloy │ ├── docker-compose-otel.yml │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── prom-config.yaml │ └── tempo-config.yaml ├── postgres-monitoring/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── prom-config.yaml ├── rabbitmq-monitoring/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── enabled_plugins │ ├── loki-config.yaml │ ├── prom-config.yaml │ └── rabbitmq.conf ├── redis-monitoring/ │ ├── 
README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ └── prom-config.yaml ├── renovate.json ├── routing/ │ ├── README.MD │ ├── config.alloy │ ├── docker-compose.yaml │ └── support/ │ ├── grafana/ │ │ └── datasources.yml │ ├── loki/ │ │ └── server.yaml │ └── promtail/ │ ├── myCustomLog.txt │ └── promtail-config.yml ├── run-example.sh ├── self-monitoring/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.yaml │ └── loki-config.yaml ├── snmp/ │ ├── Readme.md │ ├── config.alloy │ ├── docker-compose.yml │ ├── loki-config.yaml │ ├── prom-config.yaml │ └── snmp.yml ├── syslog/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── loki-config.yaml │ ├── rsyslog.conf │ └── syslog_simulator.py ├── systemd-journal/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.yml │ └── loki-config.yaml ├── trace-delivery/ │ ├── README.md │ ├── app/ │ │ ├── Dockerfile │ │ ├── app.py │ │ └── requirements.txt │ ├── config-otel.yaml │ ├── config.alloy │ ├── docker-compose-otel.yml │ ├── docker-compose.coda.yml │ ├── docker-compose.yml │ ├── prom-config.yaml │ └── tempo-config.yaml ├── vault-secrets/ │ ├── README.md │ ├── auth/ │ │ └── htpasswd │ ├── config.alloy │ ├── docker-compose.yml │ ├── nginx.conf │ ├── prom-config.yaml │ └── rotate.sh ├── windows/ │ ├── README.md │ ├── config.alloy │ ├── docker-compose.yml │ ├── loki-config.yaml │ └── prom-config.yaml └── windows-events/ ├── README.md ├── config.alloy ├── docker-compose.yml └── loki-config.yaml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .coda/coda-start.service ================================================ [Unit] Description=Coda Alloy Scenario Start After=network-online.target docker.service Wants=network-online.target Requires=docker.service [Service] Type=oneshot ExecStart=/usr/local/bin/coda-start.sh 
WorkingDirectory=/opt/alloy-scenarios StandardOutput=journal StandardError=journal RemainAfterExit=yes [Install] WantedBy=multi-user.target ================================================ FILE: .coda/coda-start.sh ================================================ #!/usr/bin/env bash set -euo pipefail SCENARIO_FILE="/etc/coda/scenario" REPO_DIR="/opt/alloy-scenarios" # Wait for the scenario file to be written by user_data echo "Waiting for ${SCENARIO_FILE}..." timeout=120 elapsed=0 while [[ ! -f "$SCENARIO_FILE" ]]; do sleep 2 elapsed=$((elapsed + 2)) if [[ $elapsed -ge $timeout ]]; then echo "Timed out waiting for ${SCENARIO_FILE} after ${timeout}s" >&2 exit 1 fi done SCENARIO="$(cat "$SCENARIO_FILE")" echo "Scenario: ${SCENARIO}" # Pull latest changes from main so new scenarios are always available. # Explicitly fetch+reset main to handle AMIs built from non-main branches. echo "Updating alloy-scenarios repo..." git -C "$REPO_DIR" fetch origin main 2>/dev/null \ && git -C "$REPO_DIR" checkout main 2>/dev/null \ && git -C "$REPO_DIR" reset --hard origin/main 2>/dev/null \ || echo "Warning: git update failed, using baked version" # Start the scenario (builds images on demand) exec "$REPO_DIR/coda" start "$SCENARIO" ================================================ FILE: .coda/packer-install.sh ================================================ #!/usr/bin/env bash # Packer provisioner: set up coda CLI and systemd services on an AMI. # # Expects the alloy-scenarios repo to already be cloned to /opt/alloy-scenarios. # This script is called by the consuming Packer template after cloning. # # It intentionally does NOT pre-build scenario images. Scenarios are built # on demand by `coda start`, so new scenarios work without re-baking the AMI. 
set -euo pipefail INSTALL_DIR="${1:-/opt/alloy-scenarios}" echo "==> Adding host aliases for alloy" grep -qxF '127.0.0.1 alloy' /etc/hosts || echo '127.0.0.1 alloy' >> /etc/hosts echo "==> Symlinking coda CLI" chmod +x "${INSTALL_DIR}/coda" ln -sf "${INSTALL_DIR}/coda" /usr/local/bin/coda echo "==> Pre-pulling common base images" # Only pull widely-shared base images to speed up first boot. # Scenario-specific images are built on demand by `coda start`. docker pull "python:3.11-slim" || true docker pull "apache/kafka:3.9.0" || true echo "==> Installing systemd services" cp "${INSTALL_DIR}/.coda/coda-start.service" /etc/systemd/system/coda-start.service install -m 0755 "${INSTALL_DIR}/.coda/coda-start.sh" /usr/local/bin/coda-start.sh systemctl daemon-reload echo "==> Done" ================================================ FILE: .cursor/docker-example.mdc ================================================ --- description: creating a new alloy docker example globs: alwaysApply: false --- # Grafana Alloy Docker Example Template This template provides a comprehensive structure for creating a new Grafana Alloy example using Docker Compose. It includes all the necessary components to monitor your application or system with the LGMT stack (Loki, Grafana, Metrics/Prometheus, Tempo). ## Directory Structure ``` your-example-name/ ├── config.alloy # Alloy configuration file ├── docker-compose.yml # Docker Compose configuration ├── loki-config.yaml # Loki configuration ├── prom-config.yaml # Prometheus configuration ├── tempo-config.yaml # Tempo configuration (optional) ├── README.md # Documentation for your example └── [additional files...] # Any additional files needed for your example ``` ## Docker Compose Template Below is a template for your `docker-compose.yml` file that includes all components of the LGMT stack. You can customize it based on your specific needs. 
```yaml version: '3.8' services: # Loki for log aggregation loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.7} ports: - 3100:3100/tcp volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml # Prometheus for metrics collection prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.10.0} command: - --web.enable-remote-write-receiver - --web.enable-otlp-receiver - --enable-feature=native-histograms - --enable-feature=exemplar-storage - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml # Memcached for Tempo memcached: image: memcached:1.6.29 container_name: memcached ports: - "11211:11211" environment: - MEMCACHED_MAX_MEMORY=64m # Set the maximum memory usage - MEMCACHED_THREADS=4 # Number of threads to use # Tempo initialization (required for file permissions) tempo-init: image: &tempoImage grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.1} user: root entrypoint: - "chown" - "10001:10001" - "/var/tempo" volumes: - ./tempo-data:/var/tempo # Tempo for tracing tempo: image: *tempoImage command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200/tcp # tempo - 4317:4317/tcp # otlp grpc - 4318:4318/tcp # otlp http - 14268:14268/tcp # jaeger thrift http - 14250:14250/tcp # jaeger grpc - 6831:6831/udp # jaeger thrift compact - 6832:6832/udp # jaeger thrift binary - 9411:9411/tcp # zipkin volumes: - ./tempo-config.yaml:/etc/tempo.yaml - ./tempo-data:/var/tempo depends_on: - tempo-init - memcached - prometheus # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-12.4.0} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p 
/etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false version: 1 editable: false - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: false version: 1 editable: false jsonData: serviceMap: datasourceUid: 'Prometheus' nodeGraph: enabled: true EOF /run.sh depends_on: - prometheus - tempo # Alloy for telemetry pipeline alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.14.0} ports: - 12345:12345 # Alloy HTTP server volumes: - ./config.alloy:/etc/alloy/config.alloy - /var/run/docker.sock:/var/run/docker.sock # For Docker monitoring (optional) command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy ``` ## Configuration Files ### Loki Configuration (loki-config.yaml) ```yaml auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true ingester: max_chunk_age: 2h ``` ### Prometheus Configuration (prom-config.yaml) ```yaml global: scrape_interval: 15s evaluation_interval: 15s scrape_configs: - job_name: 'prometheus' static_configs: - targets: ['localhost:9090'] - job_name: 'alloy' static_configs: - targets: ['alloy:12345'] otlp: # Recommended attributes to be promoted to labels. 
promote_resource_attributes: - service.instance.id - service.name - service.namespace - service.version - cloud.availability_zone - cloud.region - container.name - deployment.environment - deployment.environment.name - k8s.cluster.name - k8s.container.name - k8s.namespace.name - k8s.pod.name storage: tsdb: out_of_order_time_window: 30m ``` ### Tempo Configuration (tempo-config.yaml) ```yaml server: http_listen_port: 3200 log_level: info cache: background: writeback_goroutines: 5 caches: - roles: - frontend-search memcached: addresses: dns+memcached:11211 query_frontend: search: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 metadata_slo: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 trace_by_id: duration_slo: 100ms metrics: max_duration: 200h # maximum duration of a metrics query, increase for local setups query_backend_after: 5m duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 distributor: receivers: jaeger: protocols: thrift_http: endpoint: "tempo:14268" grpc: endpoint: "tempo:14250" thrift_binary: endpoint: "tempo:6832" thrift_compact: endpoint: "tempo:6831" zipkin: endpoint: "tempo:9411" otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" opencensus: endpoint: "tempo:55678" ingester: max_block_duration: 5m compactor: compaction: block_retention: 720h # Note: The metrics_generator section below can be enabled for built-in service graphs. # Alternatively, use Alloy's servicegraph connector as shown in the otel-tracing-service-graphs example. 
# metrics_generator: # registry: # external_labels: # source: tempo # cluster: docker-compose # storage: # path: /var/tempo/generator/wal # remote_write: # - url: http://prometheus:9090/api/v1/write # send_exemplars: true # traces_storage: # path: /var/tempo/generator/traces # processor: # local_blocks: # filter_server_spans: false # flush_to_storage: true storage: trace: backend: local wal: path: /var/tempo/wal local: path: /var/tempo/blocks # Note: Service graph generation is commented out to allow using Alloy for this purpose. # overrides: # defaults: # metrics_generator: # processors: [service-graphs, span-metrics, local-blocks] # generate_native_histograms: both ``` ### Alloy Configuration with Service Graph Generation (config.alloy) ```river /* * Alloy Configuration for OpenTelemetry Trace Collection with Service Graph Generation */ // Receive OpenTelemetry traces otelcol.receiver.otlp "default" { http {} grpc {} output { traces = [otelcol.processor.batch.default.input] } } // Batch processor to improve performance otelcol.processor.batch "default" { output { traces = [ otelcol.connector.servicegraph.default.input, otelcol.exporter.otlp.tempo.input, ] } } // Service Graph Generator otelcol.connector.servicegraph "default" { metrics_flush_interval = "10s" dimensions = ["http.method"] output { metrics = [otelcol.exporter.otlphttp.prometheus.input] } } // Send service graph metrics to Prometheus via OTLP otelcol.exporter.otlphttp "prometheus" { client { endpoint = "http://prometheus:9090/api/v1/otlp" tls { insecure = true } } } // Send traces to Tempo for storage and visualization otelcol.exporter.otlp "tempo" { client { endpoint = "tempo:4317" tls { insecure = true } } } ``` ## README Template The README.md file for your example should include: 1. A brief description of what the example demonstrates 2. Instructions for running the example 3. What to expect after running the example 4. 
Any additional steps or configuration needed Example: ```markdown # Your Example Name Brief description of what this example demonstrates and its purpose. ## Overview The example includes: - Component 1 (brief description) - Component 2 (brief description) - ... ## Running the Demo 1. Clone the repository: ``` git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios ``` 2. Navigate to this example directory: ``` cd your-example-name ``` 3. Run using Docker Compose: ``` docker compose up -d ``` Or use the centralized image management: ``` cd .. ./run-example.sh your-example-name ``` 4. Access Grafana at http://localhost:3000 ## What to Expect Describe what the user should see after running the example, including: - What metrics/logs are being collected - Any dashboards that are automatically set up - How to interact with the example ## Service Graphs (if applicable) If your example includes service graph visualization capabilities: 1. Open Grafana (http://localhost:3000) 2. Navigate to Explore 3. Select the Tempo data source 4. Click on the "Service Graph" tab 5. You should see a visual representation of the relationships between services ## Architecture ``` ┌────────────┐ ┌──────────┐ ┌───────┐ ┌─────────┐ │ Component1 │────▶│ Component2│─────▶│Component3│──▶│ Grafana │ └────────────┘ └──────────┘ └───┬───┘ └─────────┘ │ ▲ ▼ │ ┌─────────┐ │ │Component4│───────────┘ └─────────┘ ``` Brief explanation of the architecture and data flow. ## Additional Configuration Any additional steps or configuration that might be needed. ``` ## Customizing Your Example To create your own example: 1. Create a new directory with your example name at the root of the repository 2. Copy the template files from this template 3. Customize the files for your specific use case 4. Update the README.md with specific instructions for your example 5. 
Add your example to the main README.md table with a link and description ================================================ FILE: .cursor/k8s-example.mdc ================================================ --- description: globs: alwaysApply: false --- # Grafana Alloy Kubernetes Example Template This template provides a comprehensive structure for creating a new Grafana Alloy example using Kubernetes. It is based on the Kubernetes Monitoring Helm chart, which abstracts the need to configure Alloy and deploys it with best practices for monitoring Kubernetes clusters. ## Directory Structure ``` your-k8s-example-name/ ├── k8s-monitoring-values.yml # K8s monitoring helm chart values ├── loki-values.yml # Loki helm chart values ├── grafana-values.yml # Grafana helm chart values ├── kind.yml # Kind cluster configuration (optional) ├── README.md # Documentation for your example └── [additional files...] # Any additional files needed for your example ``` ## Kubernetes Configuration Files ### Kind Cluster Configuration (kind.yml) If you're using Kind for local development, you can use this configuration: ```yaml kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: control-plane extraPortMappings: - containerPort: 30000 hostPort: 30000 ``` ### K8s Monitoring Helm Chart Values (k8s-monitoring-values.yml) This file configures the Kubernetes Monitoring Helm chart with Alloy settings: ```yaml --- cluster: name: example-monitoring destinations: - name: loki type: loki url: http://loki-gateway.meta.svc.cluster.local/loki/api/v1/push # Cluster Events Collection clusterEvents: enabled: true collector: alloy-logs namespaces: - meta - default # Node Logs Collection nodeLogs: enabled: true collector: alloy-logs # Pod Logs Collection podLogs: enabled: true gatherMethod: kubernetesApi collector: alloy-logs labelsToKeep: ["app_kubernetes_io_name","container","instance","job","level","namespace","service_name","service_namespace","deployment_environment","deployment_environment_name"] 
structuredMetadata: pod: pod # Set structured metadata "pod" from label "pod" namespaces: - meta - default # Node Metrics Collection nodeMetrics: enabled: true collector: alloy-metrics # Pod Metrics Collection podMetrics: enabled: true collector: alloy-metrics namespaces: - meta - default # Kubernetes API Server Metrics kubernetesMetrics: enabled: true collector: alloy-metrics # Traces Collection (if applicable) traces: enabled: true collector: alloy-receiver namespaces: - meta - default # Profiles Collection (if applicable) profiles: enabled: true collector: alloy-profiles namespaces: - meta - default # Collectors Configuration alloy-singleton: enabled: false alloy-metrics: enabled: true alloy: clustering: enabled: true alloy-logs: enabled: true alloy: mounts: varlog: true clustering: enabled: true alloy-profiles: enabled: true alloy: clustering: enabled: true alloy-receiver: enabled: true alloy: clustering: enabled: true ``` ### Loki Helm Chart Values (loki-values.yml) Configuration for the Loki Helm chart: ```yaml --- loki: auth_enabled: false commonConfig: replication_factor: 1 schemaConfig: configs: - from: 2024-01-01 store: tsdb object_store: s3 schema: v13 index: prefix: loki_index_ period: 24h ingester: chunk_encoding: snappy tracing: enabled: true pattern_ingester: enabled: true limits_config: allow_structured_metadata: true volume_enabled: true ruler: enable_api: true querier: max_concurrent: 4 minio: enabled: true deploymentMode: SingleBinary singleBinary: replicas: 1 resources: limits: cpu: 4 memory: 4Gi requests: cpu: 2 memory: 2Gi extraEnv: - name: GOMEMLIMIT value: 3750MiB chunksCache: writebackSizeLimit: 10MB # Zero out replica counts of other deployment modes backend: replicas: 0 read: replicas: 0 write: replicas: 0 ingester: replicas: 0 querier: replicas: 0 queryFrontend: replicas: 0 queryScheduler: replicas: 0 distributor: replicas: 0 compactor: replicas: 0 indexGateway: replicas: 0 bloomCompactor: replicas: 0 bloomGateway: replicas: 0 ``` ### 
Grafana Helm Chart Values (grafana-values.yml) Configuration for the Grafana Helm chart: ```yaml --- persistence: type: pvc enabled: true # DO NOT DO THIS IN PRODUCTION USECASES adminUser: admin adminPassword: adminadminadmin # CONSIDER USING AN EXISTING SECRET # admin: # existingSecret: "" # userKey: admin-user # passwordKey: admin-password service: enabled: true type: ClusterIP datasources: datasources.yaml: apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki-gateway.meta.svc.cluster.local:80 basicAuth: false isDefault: false version: 1 editable: false - name: Prometheus type: prometheus access: proxy orgId: 1 url: http://prometheus-server.meta.svc.cluster.local:80 basicAuth: false isDefault: true version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo.meta.svc.cluster.local:80 basicAuth: false isDefault: false version: 1 editable: false ``` ## README Template Here's a template for your example's README.md: ```markdown # Your Kubernetes Example Name Brief description of what this example demonstrates and its purpose. ## Prerequisites - Kubernetes cluster (or Kind for local development) - Helm (v3.x) - kubectl ## Setup ### 1. Create a Kubernetes Cluster (Optional, if using Kind) ```bash kind create cluster --config kind.yml ``` ### 2. Create a Namespace for Monitoring ```bash kubectl create namespace meta ``` ### 3. Install Loki Add the Grafana Helm repository if you haven't already: ```bash helm repo add grafana https://grafana.github.io/helm-charts helm repo update ``` Install Loki: ```bash helm install --values loki-values.yml loki grafana/loki -n meta ``` ### 4. Install Grafana ```bash helm install --values grafana-values.yml grafana grafana/grafana --namespace meta ``` ### 5. 
Install Kubernetes Monitoring (with Alloy) ```bash helm install --values ./k8s-monitoring-values.yml k8s grafana/k8s-monitoring -n meta ``` ## Accessing the Dashboard ### Port Forward Grafana ```bash kubectl port-forward -n meta svc/grafana 3000:80 ``` Navigate to http://localhost:3000 in your browser. The default credentials are: - Username: admin - Password: adminadminadmin ## What to Expect Describe what the user should see after setting up the example, including: - What metrics/logs are being collected - Any dashboards that are automatically set up - How to interact with the example ## Cleanup To remove the deployed resources: ```bash helm uninstall k8s -n meta helm uninstall grafana -n meta helm uninstall loki -n meta kubectl delete namespace meta ``` If you created a Kind cluster: ```bash kind delete cluster ``` ``` ## Customizing Your Example To create your own example: 1. Create a new directory with your example name under the `k8s/` directory of the repository 2. Copy the template files from this template 3. Customize the files for your specific use case 4. Update the README.md with specific instructions for your example 5. Add your example to the main README.md table with a link and description ## Typical Use Cases for Kubernetes Examples 1. **Logs Collection**: Collecting and analyzing logs from applications running in Kubernetes 2. **Metrics Monitoring**: Monitoring application and infrastructure metrics 3. **Tracing**: Distributed tracing for microservices 4. **Profiling**: Performance profiling of applications 5. 
**Combined Observability**: Demonstrating how to use all telemetry types together ## Special Considerations for Kubernetes - **Resource Limits**: Adjust resource requests and limits based on your cluster capacity - **Persistent Storage**: Configure appropriate storage classes for your environment - **Security**: In production environments, use proper authentication methods - **Network Policies**: Consider adding network policies if required for your environment ================================================ FILE: .github/k8s-scenarios.json ================================================ { "metrics": [ { "release": "prometheus", "chart": "prometheus-community/prometheus", "values": "prometheus-values.yml" }, { "release": "grafana", "chart": "grafana/grafana", "values": "grafana-values.yml" }, { "release": "k8s", "chart": "grafana/k8s-monitoring", "values": "k8s-monitoring-values.yml", "version": "^4.0.0" } ], "logs": [ { "release": "loki", "chart": "grafana/loki", "values": "loki-values.yml" }, { "release": "grafana", "chart": "grafana/grafana", "values": "grafana-values.yml" }, { "release": "k8s", "chart": "grafana/k8s-monitoring", "values": "k8s-monitoring-values.yml", "version": "^4.0.0" } ], "tracing": [ { "release": "tempo", "chart": "grafana/tempo", "values": "tempo-values.yml" }, { "release": "grafana", "chart": "grafana/grafana", "values": "grafana-values.yml" }, { "release": "k8s", "chart": "grafana/k8s-monitoring", "values": "k8s-monitoring-values.yml", "version": "^4.0.0" } ], "profiling": [ { "release": "pyroscope", "chart": "grafana/pyroscope", "values": "pyroscope-values.yml" }, { "release": "grafana", "chart": "grafana/grafana", "values": "grafana-values.yml" }, { "release": "k8s", "chart": "grafana/k8s-monitoring", "values": "k8s-monitoring-values.yml", "version": "^4.0.0" } ], "events": [ { "release": "loki", "chart": "grafana/loki", "values": "loki-values.yml" }, { "release": "grafana", "chart": "grafana/grafana", "values": "grafana-values.yml" 
} ] } ================================================ FILE: .github/scenario-list.txt ================================================ aws-firehose-logs blackbox-probing continuous-profiling docker-monitoring elasticsearch-monitoring faro-frontend-observability game-of-tracing gelf-log-ingestion kafka linux log-api-gateway log-secret-filtering logs-file logs-tcp mail-house memcached-monitoring mysql-monitoring nginx-monitoring otel-basic-tracing otel-metrics-pipeline otel-span-metrics otel-tail-sampling otel-tracing-service-graphs postgres-monitoring redis-monitoring routing self-monitoring snmp syslog systemd-journal trace-delivery vault-secrets windows windows-events ================================================ FILE: .github/workflows/check-image-versions.yml ================================================ name: check-image-versions # Drift guard: every ${VAR:-default} fallback in a docker-compose file # must match the value of VAR in image-versions.env. # # Without this check, renovate's docker manager (which updates fallbacks # in compose files) and the customManager in renovate.json (which # updates image-versions.env) can fall out of lockstep — leaving anyone # who runs `docker compose up` without `--env-file image-versions.env` # on stale versions. 
on: pull_request: paths: - '**/docker-compose.yml' - '**/docker-compose.yaml' - '**/docker-compose.coda.yml' - '**/docker-compose.coda.yaml' - 'image-versions.env' - '.github/workflows/check-image-versions.yml' push: branches: [main] permissions: contents: read jobs: check: name: Compose fallbacks vs image-versions.env runs-on: ubuntu-latest timeout-minutes: 3 steps: - name: Harden runner uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 with: egress-policy: block allowed-endpoints: > api.github.com:443 github.com:443 - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Compare fallbacks against image-versions.env run: | set -euo pipefail # Build a map of VAR=value from image-versions.env declare -A want while IFS='=' read -r k v; do [[ "$k" =~ ^[A-Z_]+_VERSION$ ]] || continue want[$k]="$v" done < <(grep -E '^[A-Z_]+_VERSION=' image-versions.env) echo "Tracking ${#want[@]} version variables:" for k in "${!want[@]}"; do echo " $k=${want[$k]}" done echo # Scan every fallback. Pattern: ${VAR:-default} mismatches=0 while IFS= read -r -d '' f; do while IFS= read -r line; do if [[ "$line" =~ \$\{([A-Z_]+_VERSION):-([^}]+)\} ]]; then var="${BASH_REMATCH[1]}" fallback="${BASH_REMATCH[2]}" expected="${want[$var]:-}" if [ -z "$expected" ]; then echo "::warning file=$f::unknown variable $var (not in image-versions.env)" continue fi if [ "$fallback" != "$expected" ]; then echo "::error file=$f::\${$var:-$fallback} should be \${$var:-$expected}" mismatches=$((mismatches+1)) fi fi done < "$f" done < <(find . -type f \ \( -name 'docker-compose.yml' -o -name 'docker-compose.yaml' \ -o -name 'docker-compose.coda.yml' -o -name 'docker-compose.coda.yaml' \) \ -not -path '*/k8s/*' -not -path '*/.git/*' -print0) if [ "$mismatches" -gt 0 ]; then echo echo "::error::Found $mismatches drift(s). Update either the fallback in the compose file or image-versions.env." 
exit 1 fi echo "OK — all fallbacks match image-versions.env" ================================================ FILE: .github/workflows/validate-k8s-scenarios.yml ================================================ name: validate-k8s-scenarios # Lightweight validation for k8s scenarios under k8s/. Mirrors the # defense-in-depth posture of validate-scenarios.yml (docker), but # without paying the cost of a real cluster on every PR: # # validate (every PR): helm template + kubeconform per chart per scenario. # Renders offline, validates against k8s API schemas. # kind-integration: opt-in via workflow_dispatch only. Boots kind, # helm-installs all charts, waits for pods Ready. # # Defense-in-depth (same as the docker workflow): # - permissions: contents: read (no token write, no secrets) # - harden-runner egress allowlist (compromised tool can't phone home) # - third-party actions SHA-pinned (tag pushes can't sneak in) # - direct binary downloads, version-pinned (helm, kubeconform) # - github-hosted ephemeral runners # - pull_request, not pull_request_target on: pull_request: paths: - 'k8s/**' - '.github/k8s-scenarios.json' - '.github/workflows/validate-k8s-scenarios.yml' workflow_dispatch: inputs: kind_integration: description: 'Run the kind-cluster integration job after validation' type: boolean default: false scenario: description: 'Which scenario(s) to run kind-integration for ("all" or comma-separated subset, e.g. "metrics,logs")' type: string default: 'all' permissions: contents: read concurrency: group: validate-k8s-${{ github.event.pull_request.number || github.run_id }} cancel-in-progress: true env: HELM_VERSION: 'v4.1.4' KUBECONFORM_VERSION: 'v0.6.7' KUBERNETES_VERSION: '1.31.0' jobs: # ────────────────────────────────────────────────────────────────── # validate: helm template + kubeconform per chart for each # scenario in the matrix below. Pure offline — no API server, no real cluster.
# ────────────────────────────────────────────────────────────────── validate: name: Validate ${{ matrix.scenario }} runs-on: ubuntu-latest timeout-minutes: 8 strategy: fail-fast: false matrix: scenario: [metrics, logs, tracing, profiling, events] steps: - name: Harden runner uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 with: egress-policy: block allowed-endpoints: > api.github.com:443 github.com:443 objects.githubusercontent.com:443 release-assets.githubusercontent.com:443 raw.githubusercontent.com:443 get.helm.sh:443 grafana.github.io:443 prometheus-community.github.io:443 charts.bitnami.com:443 pypi.org:443 files.pythonhosted.org:443 - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install helm + kubeconform + yamllint run: | set -euo pipefail # Helm — pinned by version. Upstream tarball, verify by sha would # be ideal but Helm doesn't publish stable per-tag checksums in a # consumable way; pinning the version + restricting egress is the # workable compromise. curl -fsSL "https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz" \ | tar -xz -C /tmp sudo install -m 0755 /tmp/linux-amd64/helm /usr/local/bin/helm # kubeconform — pinned. The release archive contains just the # binary; we extract it directly. curl -fsSL "https://github.com/yannh/kubeconform/releases/download/${KUBECONFORM_VERSION}/kubeconform-linux-amd64.tar.gz" \ | sudo tar -xz -C /usr/local/bin/ kubeconform # yamllint — preinstalled python3 + pip on ubuntu-latest. sudo pip install --quiet yamllint helm version --short kubeconform -v yamllint --version - name: Helm repo bootstrap run: | set -euo pipefail helm repo add grafana https://grafana.github.io/helm-charts helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo update - name: yamllint values files # Loose ruleset — values files commonly use long datasource URLs and # don't need a leading `---`. 
Run as advisory: don't fail the job on # style; we want it for hygiene signal, not blocking. continue-on-error: true run: | yamllint -d "{extends: relaxed, rules: {line-length: disable, document-start: disable}}" \ k8s/${{ matrix.scenario }}/ - name: Helm template + kubeconform per chart run: | set -euo pipefail mkdir -p /tmp/rendered fail=0 # Iterate the scenario's chart list. helm template against the # remote chart triggers values.schema.json validation upstream # (most grafana charts ship a schema), then kubeconform validates # the rendered Kubernetes API objects against the target version. while IFS= read -r entry; do release=$(jq -r '.release' <<<"$entry") chart=$(jq -r '.chart' <<<"$entry") values=$(jq -r '.values' <<<"$entry") version=$(jq -r '.version // ""' <<<"$entry") values_path="k8s/${{ matrix.scenario }}/$values" ver_arg=() [ -n "$version" ] && ver_arg=(--version "$version") echo "::group::helm template $release ($chart${version:+ @$version})" out="/tmp/rendered/${{ matrix.scenario }}-$release.yaml" if ! helm template "$release" "$chart" "${ver_arg[@]}" \ -f "$values_path" > "$out" 2> "/tmp/rendered/${{ matrix.scenario }}-$release.err"; then echo "::error::helm template failed for $release" cat "/tmp/rendered/${{ matrix.scenario }}-$release.err" fail=1 echo "::endgroup::" continue fi lines=$(wc -l < "$out") echo "Rendered $lines lines to $out" echo "::endgroup::" echo "::group::kubeconform $release" # -ignore-missing-schemas: skip CRDs whose schemas aren't in the # datree catalog (catching built-in K8s API drift is the real # signal; CRD validation is the chart maintainer's responsibility). if ! 
kubeconform -strict -summary \ -kubernetes-version "$KUBERNETES_VERSION" \ -schema-location default \ -schema-location 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{.Group}}/{{.ResourceKind}}_{{.ResourceAPIVersion}}.json' \ -ignore-missing-schemas \ "$out"; then echo "::error::kubeconform failed for $release" fail=1 fi echo "::endgroup::" done < <(jq -c --arg s "${{ matrix.scenario }}" '.[$s][]' .github/k8s-scenarios.json) exit $fail - name: Upload rendered manifests if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: rendered-${{ matrix.scenario }} path: /tmp/rendered/ retention-days: 7 # ────────────────────────────────────────────────────────────────── # kind-integration: Boots a real kind cluster and helm-installs all # charts for the scenario. Heavy — only on workflow_dispatch. # ────────────────────────────────────────────────────────────────── kind-integration: name: Kind integration ${{ matrix.scenario }} if: github.event_name == 'workflow_dispatch' && inputs.kind_integration == true runs-on: ubuntu-latest timeout-minutes: 25 strategy: fail-fast: false matrix: scenario: [metrics, logs, tracing, profiling, events] steps: - name: Harden runner uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 with: egress-policy: block # Adds image registries on top of the validate allowlist — # helm install actually pulls images for kind to schedule. 
allowed-endpoints: > api.github.com:443 github.com:443 objects.githubusercontent.com:443 release-assets.githubusercontent.com:443 raw.githubusercontent.com:443 get.helm.sh:443 grafana.github.io:443 prometheus-community.github.io:443 charts.bitnami.com:443 registry-1.docker.io:443 auth.docker.io:443 production.cloudflare.docker.com:443 ghcr.io:443 quay.io:443 cdn.quay.io:443 grafana.com:443 mcr.microsoft.com:443 public.ecr.aws:443 - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Filter by scenario input id: filter # User-supplied workflow_dispatch input is passed via env, NOT # interpolated directly into the run block, to prevent # template-injection (zizmor: template-injection rule). # matrix.scenario IS safe to interpolate directly because it's # constrained to the static list above. env: USER_SCENARIO: ${{ inputs.scenario }} MATRIX_SCENARIO: ${{ matrix.scenario }} run: | set -euo pipefail if [ "$USER_SCENARIO" = "all" ]; then echo "run=true" >> "$GITHUB_OUTPUT" exit 0 fi if grep -qx "$MATRIX_SCENARIO" <(tr ',' '\n' <<<"$USER_SCENARIO"); then echo "run=true" >> "$GITHUB_OUTPUT" else echo "run=false" >> "$GITHUB_OUTPUT" echo "::notice::Skipping $MATRIX_SCENARIO (not in user-selected subset '$USER_SCENARIO')" fi - name: Install helm if: steps.filter.outputs.run == 'true' run: | curl -fsSL "https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz" \ | tar -xz -C /tmp sudo install -m 0755 /tmp/linux-amd64/helm /usr/local/bin/helm - name: Create kind cluster if: steps.filter.outputs.run == 'true' uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 with: config: k8s/${{ matrix.scenario }}/kind.yml cluster_name: ${{ matrix.scenario }} - name: Helm bootstrap + install all charts if: steps.filter.outputs.run == 'true' run: | set -euo pipefail helm repo add grafana https://grafana.github.io/helm-charts helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo 
update kubectl create namespace meta || true while IFS= read -r entry; do release=$(jq -r '.release' <<<"$entry") chart=$(jq -r '.chart' <<<"$entry") values=$(jq -r '.values' <<<"$entry") version=$(jq -r '.version // ""' <<<"$entry") values_path="k8s/${{ matrix.scenario }}/$values" ver_arg=() [ -n "$version" ] && ver_arg=(--version "$version") echo "::group::helm install $release ($chart)" helm install "$release" "$chart" "${ver_arg[@]}" \ -f "$values_path" -n meta --create-namespace \ --wait --timeout 5m echo "::endgroup::" done < <(jq -c --arg s "${{ matrix.scenario }}" '.[$s][]' .github/k8s-scenarios.json) - name: Wait for pods Ready in meta namespace if: steps.filter.outputs.run == 'true' run: | if ! kubectl wait --for=condition=Ready pods --all -n meta --timeout=10m; then echo "::error::Pods did not become Ready" kubectl get pods -n meta -o wide kubectl describe pods -n meta exit 1 fi kubectl get pods -n meta -o wide ================================================ FILE: .github/workflows/validate-scenarios.yml ================================================ name: validate-scenarios # Boots every scenario whose files were touched by the PR, after a CVE # scan of every image the scenario will run. Designed to make renovate # dependency PRs reviewable on signal rather than diff-eyeballing alone. # # Defense-in-depth (intentional, not paranoia): # - permissions: contents: read — no token write, no secrets # - third-party actions SHA-pinned — tag pushes can't sneak in # - trivy advisory scan before boot — known-bad images flagged in PR # - github-hosted ephemeral runners — runner state is not persisted # # Triggered on pull_request (NOT pull_request_target): fork PRs run # without secrets, which is the safe default. Updating this file # requires the same scrutiny as updating any third-party action SHA. 
on: pull_request: paths: - '*/docker-compose.yml' - '*/docker-compose.yaml' - '*/docker-compose.coda.yml' - '*/Dockerfile' - '*/config.alloy' - '*/app/**' - '*/*/Dockerfile' - '*/*/requirements.txt' - '*/*/package.json' - '*/*/*.csproj' - 'image-versions.env' - '.github/scenario-list.txt' - '.github/workflows/validate-scenarios.yml' # Manual trigger — runs the full matrix without the sampling cap, so a # maintainer can validate a cross-cutting change (e.g. an LGMT bump # that touches every scenario) before merging. PRs auto-sample when # affected count exceeds MATRIX_CAP; workflow_dispatch always runs all. workflow_dispatch: {} env: # Maximum scenarios to validate on a PR before sampling kicks in. # Picked so a typical big update finishes within ~30 min wall-clock # at the configured max-parallel; bypassed by workflow_dispatch. MATRIX_CAP: '8' permissions: contents: read concurrency: # `pull_request.number || run_id` keeps PR runs grouped (and superseded # by force-pushes) while still giving every workflow_dispatch run its # own slot — manual full-matrix runs shouldn't cancel each other. group: validate-scenarios-${{ github.event.pull_request.number || github.run_id }} cancel-in-progress: true jobs: # ────────────────────────────────────────────────────────────────── # detect: Map changed files to top-level scenario directories. # Pure shell — no third-party action — to keep the supply-chain # surface minimal. 
# ────────────────────────────────────────────────────────────────── detect: name: Detect affected scenarios runs-on: ubuntu-latest timeout-minutes: 5 outputs: scenarios: ${{ steps.filter.outputs.scenarios }} count: ${{ steps.filter.outputs.count }} count_full: ${{ steps.filter.outputs.count_full }} sampled: ${{ steps.filter.outputs.sampled }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 - name: Compute affected scenarios id: filter env: EVENT_NAME: ${{ github.event_name }} BASE_SHA: ${{ github.event.pull_request.base.sha }} HEAD_SHA: ${{ github.event.pull_request.head.sha }} run: | set -euo pipefail if [ "$EVENT_NAME" = "workflow_dispatch" ]; then # Manual run: validate every scenario in the canonical list. # No diff to compute; sampling cap is bypassed. cp .github/scenario-list.txt /tmp/affected.txt else # The base sha may not be in the local clone with a shallow # checkout; fetch-depth: 0 avoids that, but be belt-and-braces. git fetch origin "$BASE_SHA" "$HEAD_SHA" --depth=200 2>/dev/null || true # Map every changed file to its first path segment. Empty lines # come from root-level files (no segment); awk drops those. git diff --name-only "$BASE_SHA" "$HEAD_SHA" \ | awk -F/ 'NF>1 {print $1}' \ | sort -u > /tmp/segments.txt # Intersect with the canonical scenario list. `|| true` keeps # the pipeline alive when there's no overlap (e.g. a PR that # only touches docs/CI). grep -Fxf /tmp/segments.txt .github/scenario-list.txt \ | sort -u > /tmp/affected.txt || true fi count_full=$(wc -l < /tmp/affected.txt | tr -d ' ') sampled=false # Sampling cap: when a single PR touches more than MATRIX_CAP # scenarios (typical for image-versions.env / shared-base # changes), validate a deterministic representative subset # rather than the full matrix. Maintainers can run the full # matrix via workflow_dispatch before merging if signal on # every scenario is wanted. # # Determinism: sort by the SHA-256 of "<scenario><commit-sha>".
# Same commit → same subset, so re-runs are stable. Different # commits get different subsets, so coverage rotates over # time across many big-update PRs. if [ "$EVENT_NAME" != "workflow_dispatch" ] \ && [ "$count_full" -gt "$MATRIX_CAP" ]; then sampled=true commit_hash="${HEAD_SHA:-$GITHUB_SHA}" while read -r line; do [ -z "$line" ] && continue key=$(printf "%s%s" "$line" "$commit_hash" \ | sha256sum | head -c 16) printf "%s\t%s\n" "$key" "$line" done < /tmp/affected.txt \ | sort | head -n "$MATRIX_CAP" | cut -f2 > /tmp/active.txt else cp /tmp/affected.txt /tmp/active.txt fi count=$(wc -l < /tmp/active.txt | tr -d ' ') scenarios=$(jq -Rsc 'split("\n") | map(select(length>0))' /tmp/active.txt) echo "scenarios=$scenarios" >> "$GITHUB_OUTPUT" echo "count=$count" >> "$GITHUB_OUTPUT" echo "count_full=$count_full" >> "$GITHUB_OUTPUT" echo "sampled=$sampled" >> "$GITHUB_OUTPUT" { echo "## Affected scenarios" echo if [ "$count_full" = "0" ]; then echo "_None — PR does not touch any scenario directory._" elif [ "$sampled" = "true" ]; then echo "**$count_full** scenarios affected; sampled **$count** for validation (cap is \`$MATRIX_CAP\`)." echo echo "Trigger \`workflow_dispatch\` on this branch to validate the full matrix." echo echo "Sampled subset:" echo '```' cat /tmp/active.txt echo '```' echo echo "
<details><summary>Full affected list ($count_full)</summary>" echo echo '```' cat /tmp/affected.txt echo '```' echo echo "</details>"
 else echo "Count: \`$count\`" echo echo '```' cat /tmp/active.txt echo '```' fi } >> "$GITHUB_STEP_SUMMARY" if [ "$sampled" = "true" ]; then echo "::warning::Sampled $count of $count_full affected scenarios. Run workflow_dispatch on this branch to validate them all." fi # ────────────────────────────────────────────────────────────────── # scan: For each affected scenario, resolve every explicit `image:` # reference via `docker compose config --format json`, then trivy-scan # each one. Advisory only: HIGH/CRITICAL CVEs that have a fix available # are reported but never fail the job. # ────────────────────────────────────────────────────────────────── scan: name: Scan images needs: detect if: needs.detect.outputs.count != '0' runs-on: ubuntu-latest timeout-minutes: 10 strategy: fail-fast: false max-parallel: 6 matrix: scenario: ${{ fromJSON(needs.detect.outputs.scenarios) }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Compute today's UTC date for cache key id: date run: echo "today=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT" - name: Restore trivy DB cache # Trivy fetches a fresh vulnerability DB on every cold scan # (~30 MB, ~5-10 s per scenario from mirror.gcr.io). Caching # the DB shaves the cold-pull off every matrix entry after the # first one of the day. Key rotates daily so the DB stays # fresh; the restore-keys fallback is intentional — even a # stale-by-hours DB is far better than a cold fetch. uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 with: path: /tmp/trivy-cache key: trivy-db-${{ steps.date.outputs.today }} restore-keys: | trivy-db- - name: Resolve images for ${{ matrix.scenario }} id: images run: | set -euo pipefail # Try .yml first, fall back to .yaml (some scenarios use either).
compose_file="" for ext in yml yaml; do f="${{ matrix.scenario }}/docker-compose.$ext" if [ -f "$f" ]; then compose_file="$f"; break; fi done if [ -z "$compose_file" ]; then echo "No docker-compose found for ${{ matrix.scenario }}" >&2 exit 1 fi # `docker compose config --images` returns service-name defaults # like `game-of-tracing-ai-opponent` for `build:`-only services # — those don't exist in any registry, so trivy fails with # UNAUTHORIZED. Filter to services with an explicit `image:` # field (third-party registry artifacts only). Locally-built # images aren't directly scanned; their FROM base image (e.g. # python:3.11-slim) lives in the Dockerfile and is tracked # separately by renovate's docker manager. docker compose -f "$compose_file" \ --env-file image-versions.env \ config --format json \ | jq -r '.services | to_entries[] | select(.value.image != null) | .value.image' \ | sort -u > /tmp/images.txt echo "Images to scan:" cat /tmp/images.txt if [ ! -s /tmp/images.txt ]; then echo "::notice::No third-party images to scan in this scenario (all services build locally)." fi - name: Trivy scan each image (advisory) # Run trivy via its own docker image (digest-pinned). No # docker.sock mount: trivy pulls the target image itself rather # than reaching into the host's docker — keeps the trivy # container from gaining root-equivalent access on the runner. # # Advisory mode: HIGH/CRITICAL findings are reported via the job # log + step summary table + ::warning:: annotations, but the # step always exits 0. These are demo scenarios; upstream LGMT # images regularly carry HIGH-with-fix findings between releases # and blocking every PR until they catch up isn't useful. Treat # the report as a signal to bump base images, not a merge gate. 
env: # Suppress ANSI escapes so the log + summary parse cleanly NO_COLOR: '1' run: | set -euo pipefail TRIVY_IMAGE='aquasec/trivy:0.66.0@sha256:086971aaf400beebd94e8300fd8ea623774419597169156cec56eec5b00dfb1e' # Pre-pull once so loop iterations don't re-resolve. docker pull "$TRIVY_IMAGE" mkdir -p /tmp/trivy-cache report_log=/tmp/trivy-output.log : > "$report_log" while IFS= read -r img; do [ -z "$img" ] && continue echo "::group::Scanning $img" echo "=== $img ===" >> "$report_log" # `|| true` so a non-zero trivy exit (had findings) doesn't # abort the loop — we want to scan every image. docker run --rm \ -e NO_COLOR=1 \ -v /tmp/trivy-cache:/root/.cache/trivy \ "$TRIVY_IMAGE" image \ --severity HIGH,CRITICAL \ --ignore-unfixed \ --no-progress \ --timeout 5m \ "$img" 2>&1 | tee -a "$report_log" || true echo "::endgroup::" done < /tmp/images.txt # Per-image summary table for the PR's step summary. { echo "## CVE scan: ${{ matrix.scenario }}" echo if [ ! -s /tmp/images.txt ]; then echo "_No third-party images to scan (all services build locally)._" else echo "| Image | HIGH | CRITICAL |" echo "|---|---:|---:|" current="" h=0; c=0 while IFS= read -r line; do if [[ "$line" =~ ^===\ (.+)\ ===$ ]]; then if [ -n "$current" ]; then echo "| \`$current\` | $h | $c |" fi current="${BASH_REMATCH[1]}" h=0; c=0 elif [[ "$line" =~ Total:\ [0-9]+\ \(HIGH:\ ([0-9]+),\ CRITICAL:\ ([0-9]+)\) ]]; then h=$((h + ${BASH_REMATCH[1]})) c=$((c + ${BASH_REMATCH[2]})) fi done < "$report_log" if [ -n "$current" ]; then echo "| \`$current\` | $h | $c |" fi echo echo "_HIGH+CRITICAL counts are unfixed CVEs with patches available upstream. Findings here don't block merge — see the job log for the full per-CVE table. Upgrade base images via the relevant renovate PR when fixes appear in a published release._" fi } >> "$GITHUB_STEP_SUMMARY" # Emit a single ::warning:: if anything was found, so the PR # gets an inline annotation pointing at the job summary. 
if grep -qE 'Total:\ [^0]' "$report_log"; then echo "::warning::trivy found HIGH/CRITICAL unfixed CVEs in scanned images for ${{ matrix.scenario }}. See job summary for per-image counts and the log for details." fi # ────────────────────────────────────────────────────────────────── # smoke: For each affected scenario, boot it via run-example.sh, # wait until something healthy answers (Grafana, then Alloy, then # Prometheus), then tear down. # ────────────────────────────────────────────────────────────────── smoke: name: Smoke test needs: [detect, scan] runs-on: ubuntu-latest timeout-minutes: 15 strategy: fail-fast: false max-parallel: 4 matrix: scenario: ${{ fromJSON(needs.detect.outputs.scenarios) }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Boot ${{ matrix.scenario }} run: | set -euo pipefail chmod +x ./run-example.sh ./run-example.sh "${{ matrix.scenario }}" - name: Wait for a healthy endpoint (Grafana, Alloy, or Prometheus) run: | set -euo pipefail # Probe in priority order. Most scenarios expose Grafana on # :3000; self-monitoring exposes Alloy on :12345 instead; # routing remaps Alloy. Grafana wins when present, else any # ready endpoint counts as bring-up success. probes=( "http://localhost:3000/api/health" "http://localhost:12345/-/ready" "http://localhost:9090/-/ready" ) deadline=$(( $(date +%s) + 180 )) # 3 min total while [ "$(date +%s)" -lt "$deadline" ]; do for url in "${probes[@]}"; do code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "$url" 2>/dev/null || true) if [ "$code" = "200" ]; then echo "Healthy: $url" exit 0 fi done sleep 5 done echo "::error::No probe endpoint became healthy within 3 min" exit 1 - name: Verify no exited containers run: | set -euo pipefail cd "${{ matrix.scenario }}" # `docker compose ps --status exited` lists any container that # crashed during bring-up. An empty list is the pass case. 
exited=$(docker compose ps --status exited --format '{{.Name}}' || true) if [ -n "$exited" ]; then echo "::error::Exited containers detected:" echo "$exited" exit 1 fi - name: Dump container logs on failure if: failure() run: | cd "${{ matrix.scenario }}" docker compose logs --no-color || true - name: Tear down if: always() run: | cd "${{ matrix.scenario }}" docker compose down --volumes --remove-orphans || true ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/latest/usage/project/#working-with-version-control .pdm.toml .pdm-python .pdm-build/ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/ ================================================ FILE: CLAUDE.md ================================================ # CLAUDE.md This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. ## Project Overview This is a collection of self-contained demonstration scenarios for **Grafana Alloy**, the telemetry collection and processing pipeline. Each scenario lives in its own top-level directory and showcases a specific monitoring use case using the **LGMT stack** (Loki, Grafana, Metrics/Prometheus, Tempo). ## Running Scenarios ```bash # Option 1: Direct (uses default image versions in docker-compose.yml) cd <scenario-dir> && docker compose up -d # Option 2: Centralized image versions (from repo root) ./run-example.sh <scenario-dir> # Stop a scenario cd <scenario-dir> && docker compose down ``` Image versions are centralized in `image-versions.env` at the repo root. Docker-compose files reference these via `${VAR:-default}` syntax. Kubernetes scenarios (under `k8s/`) use Helm charts instead of Docker Compose — see their individual READMEs. ## Scenario Structure Every Docker-based scenario follows this layout: ``` scenario-name/ ├── docker-compose.yml # LGMT stack + Alloy (infrastructure only) ├── docker-compose.coda.yml # Demo app services (run via coda CLI or -f flag) ├── config.alloy # Alloy pipeline configuration (River/HCL syntax) ├── loki-config.yaml # Loki backend config ├── prom-config.yaml # Prometheus backend config ├── tempo-config.yaml # Tempo config (if tracing is involved) ├── README.md # What the scenario demonstrates and how to use it └── app/ # Optional demo application (typically Python/Flask) ``` ## Alloy Configuration Language `config.alloy` files use Alloy's River syntax (HCL-like). Pipelines follow a consistent pattern: 1. **Receivers/Sources** — ingest data (`loki.source.*`, `otelcol.receiver.*`, `prometheus.exporter.*`) 2. **Processors/Transformers** — parse, relabel, batch (`loki.process.*`, `discovery.relabel`, `otelcol.processor.*`) 3.
**Writers/Exporters** — send to backends (`loki.write.*`, `prometheus.remote_write.*`, `otelcol.exporter.*`) Components are wired together by passing outputs to inputs (e.g., `forward_to = [loki.write.default.receiver]`). ## Creating a New Scenario Templates exist in `.cursor/docker-example.mdc` (Docker) and `.cursor/k8s-example.mdc` (Kubernetes) with full boilerplate for all config files. Checklist for a new scenario: 1. Create a new top-level directory named after the scenario 2. Include `docker-compose.yml`, `config.alloy`, backend configs, and `README.md` 3. Use `${VAR:-default}` for image versions matching `image-versions.env` keys 4. Grafana service should auto-provision datasources via entrypoint script (see template) 5. Add the scenario to the main `README.md` table 6. Alloy UI is available at `http://localhost:12345` for debugging pipelines ## Key Conventions - Grafana runs on port 3000 with anonymous admin auth enabled (no login required) - Alloy HTTP server runs on port 12345 - Python demo apps use OpenTelemetry SDK for instrumentation (`telemetry.py` pattern) - Backend configs (loki, prometheus, tempo) are minimal single-instance dev configs — not production-ready ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================

Grafana Alloy Scenarios Banner

# Grafana Alloy Scenarios A collection of self-contained, runnable scenarios demonstrating how to use [Grafana Alloy](https://grafana.com/docs/alloy/) for telemetry collection and processing. Each scenario includes a full LGMT stack (Loki, Grafana, Mimir, Tempo) with pre-configured dashboards so you can explore immediately. ## Getting Started ### Prerequisites - [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://docs.docker.com/compose/install/) ### Run a scenario ```bash # Option 1: Navigate to the scenario directory cd <scenario-directory> && docker compose up -d # Option 2: Use centralized image management (from repo root) ./run-example.sh <scenario-directory> ``` The centralized approach manages all Docker image versions in a single `image-versions.env` file, making it easy to update images across all scenarios. ### Access the stack Once a scenario is running: - **Grafana**: [http://localhost:3000](http://localhost:3000) (no login required) - **Alloy UI**: [http://localhost:12345](http://localhost:12345) (pipeline debugging) ### Run with the Coda app overlay Most scenarios include a `docker-compose.coda.yml` file that defines the demo application services separately from the infrastructure stack. This lets you run just the observability backend on its own, or layer in the app when you're ready: ```bash # Infrastructure only cd <scenario-directory> && docker compose up -d # Infrastructure + demo app cd <scenario-directory> && docker compose -f docker-compose.yml -f docker-compose.coda.yml up -d ``` If you have the `coda` CLI installed, it manages the app overlay automatically: ```bash coda start # Start app containers coda stop # Stop app containers coda status # Show container status coda list # List all available scenarios ``` ### Stop a scenario ```bash cd <scenario-directory> && docker compose down ``` ## Scenarios ### Logs | Scenario | Description | | -------- | ----------- | | [GELF log ingestion](gelf-log-ingestion/) | Ingest structured logs from applications using the GELF (Graylog Extended Log Format) protocol over UDP.
| | [Kafka logs](kafka/) | Consume and process logs from Apache Kafka topics. | | [Log API gateway](log-api-gateway/) | Use Alloy as a centralized log gateway that accepts logs via a Loki-compatible push API endpoint. | | [Log routing](routing/) | Route logs from multiple sources to different Loki tenants based on log content and origin. | | [Log secret filtering](log-secret-filtering/) | Automatically redact sensitive credentials and secrets from logs using pattern matching before storage. | | [Logs from file](logs-file/) | Monitor and tail log files using Alloy. | | [Logs over TCP](logs-tcp/) | Receive and process TCP logs in JSON format. | | [Popular logging frameworks](app-instrumentation/logging/popular-logging-frameworks/) | Parse logs from popular logging frameworks across 7 programming languages. | | [Structured log parsing](mail-house/) | Parse structured logs into labels and structured metadata. | | [Syslog monitoring](syslog/) | Monitor non-RFC5424 compliant syslog messages using `rsyslog` and Alloy. | ### Tracing | Scenario | Description | | -------- | ----------- | | [Distributed tracing](trace-delivery/) | Learn distributed tracing through a sofa delivery workflow from order to doorstep. | | [Game of tracing](game-of-tracing/) | An interactive strategy game teaching distributed tracing, sampling, and service graphs. | | [OpenTelemetry basic tracing](otel-basic-tracing/) | Collect and visualize OpenTelemetry traces using Alloy and Tempo. | | [OpenTelemetry service graphs](otel-tracing-service-graphs/) | Generate service graphs using the Alloy `servicegraph` connector. | | [OpenTelemetry span metrics](otel-span-metrics/) | Generate RED metrics (Request rate, Error rate, Duration) from OpenTelemetry traces using the span metrics connector. | | [OpenTelemetry tail sampling](otel-tail-sampling/) | Apply tail sampling policies to OpenTelemetry traces with Alloy and Tempo. 
| ### Metrics | Scenario | Description | | -------- | ----------- | | [Blackbox probing](blackbox-probing/) | Monitor endpoint availability and response times using synthetic HTTP probes. | | [OTel metrics pipeline](otel-metrics-pipeline/) | Forward OpenTelemetry metrics from applications through Alloy with batching and transformation into Prometheus. | ### Profiling | Scenario | Description | | -------- | ----------- | | [Continuous profiling](continuous-profiling/) | Collect and visualize CPU, memory, and goroutine profiles from Go applications using Grafana Pyroscope. | ### Secrets and configuration | Scenario | Description | | -------- | ----------- | | [Vault secrets](vault-secrets/) | Pull `prometheus.remote_write` basic_auth credentials from HashiCorp Vault at runtime using `remote.vault`, with hot-reload on rotation. | ### Frontend | Scenario | Description | | -------- | ----------- | | [Faro frontend observability](faro-frontend-observability/) | Collect frontend web telemetry (logs, errors, web vitals) from browser applications using the Faro Web SDK. | ### Cloud Monitoring | Scenario | Description | | -------- | ----------- | | [CloudWatch metrics](cloudwatch-metrics/) | Pull AWS CloudWatch metrics into Prometheus via `prometheus.exporter.cloudwatch`. Uses LocalStack for offline reproducibility — no AWS account required. | ### Infrastructure Monitoring | Scenario | Description | | -------- | ----------- | | [Docker monitoring](docker-monitoring/) | Monitor Docker container metrics and logs. | | [Monitor Linux](linux/) | Monitor a Linux server's system metrics using Alloy. | | [Monitor Windows](windows/) | Monitor Windows system metrics and Event Logs. | | [Self-monitoring](self-monitoring/) | Configure Alloy to monitor itself, collecting its own metrics and logs. | | [SNMP monitoring](snmp/) | Monitor SNMP devices using the Alloy SNMP exporter. 
| ### Database and Cache Monitoring | Scenario | Description | | -------- | ----------- | | [Elasticsearch monitoring](elasticsearch-monitoring/) | Monitor Elasticsearch cluster health, node status, and performance metrics. | | [Memcached monitoring](memcached-monitoring/) | Monitor Memcached instance metrics including connections, memory usage, and command performance. | | [MySQL monitoring](mysql-monitoring/) | Monitor MySQL database server metrics and performance indicators. | | [PostgreSQL monitoring](postgres-monitoring/) | Monitor PostgreSQL transaction statistics, connections, and server configuration. | | [RabbitMQ monitoring](rabbitmq-monitoring/) | Monitor RabbitMQ queue, connection, and channel metrics plus broker container logs. | | [Redis monitoring](redis-monitoring/) | Monitor Redis instance metrics including connections, memory usage, and command throughput. | ### Kubernetes | Scenario | Description | | -------- | ----------- | | [Kubernetes](k8s/) | A series of scenarios demonstrating Alloy setup using the Kubernetes monitoring Helm chart. See subdirectories for telemetry-specific examples. | ### OTel Engine Examples (Experimental) Alloy v1.14+ includes an experimental **OTel Engine** that runs standard OpenTelemetry Collector YAML configs directly. These scenarios use `alloy otel` instead of River/HCL syntax. See the [OTel examples README](otel-examples/) for details. | Scenario | Description | | -------- | ----------- | | [File log processing](otel-examples/filelog-processing/) | Collect and parse mixed-format log files using the OTel `filelog` receiver with operator chains. | | [PII redaction](otel-examples/pii-redaction/) | Scrub credit cards, emails, and IPs from traces and logs using OTTL `replace_pattern`. | | [Multi-tenant routing](otel-examples/routing-multi-tenant/) | Route logs to different Loki tenants based on resource attributes using fan-out and filter. 
| | [Cost control](otel-examples/cost-control/) | Drop health checks, filter debug logs, and apply probabilistic sampling to cut telemetry volume. | | [Resource enrichment](otel-examples/resource-enrichment/) | Auto-attach host, OS, and Docker metadata to all signals via `resourcedetection`. | | [Count connector](otel-examples/count-connector/) | Derive request rate and error rate metrics from traces and logs using the `count` connector. | | [OTTL transform cookbook](otel-examples/ottl-transform/) | A cookbook of OTTL patterns: JSON parsing, severity mapping, attribute promotion, truncation. | | [Host metrics](otel-examples/host-metrics/) | Collect CPU, memory, disk, and network metrics using the `hostmetrics` receiver. | | [Multi-pipeline fan-out](otel-examples/multi-pipeline-fanout/) | Send traces to two backends with different processing per destination. | | [Kafka buffer](otel-examples/kafka-buffer/) | Buffer traces through Kafka for durability and backpressure handling. | ## Contributing Contributions of scenarios or improvements to scenarios are welcome. You can contribute in several ways: ### Suggest a scenario If you have an idea for a scenario but don't have time to implement it: 1. Open an [issue](https://github.com/grafana/alloy-scenarios/issues/new) with the label `scenario-suggestion` 2. Describe the scenario and what it would demonstrate 3. Explain why this would be valuable to the community 4. Outline any special requirements or considerations ### Contribute a scenario If you'd like to contribute a complete scenario: 1. Fork this repository and create a branch 2. Create a directory in the root of this repository with a descriptive name for your scenario 3. Follow the [scenario template](#scenario-template) below 4. Submit a pull request with your scenario ### Improve a scenario To improve a scenario: 1. Fork this repository and create a branch 2. Make your improvements to the scenario 3. 
Submit a pull request with a clear description of your changes ### Scenario template When creating a scenario, include the following files: - `docker-compose.yml` - Docker Compose file with the LGMT stack - `docker-compose.coda.yml` - Docker Compose override with the demo app services (for use with the `coda` CLI or `-f` flag) - `config.alloy` - Alloy configuration file for the scenario - `README.md` - Documentation explaining the scenario - Any additional files needed for your scenario, such as scripts or data files ### Scenario checklist Before submitting your scenario, ensure that you have: - [ ] Created a directory in the root of this repository with a descriptive name - [ ] Included a docker-compose.yml file with the necessary components, such as LGMT stack or subset - [ ] Created a complete config.alloy file that demonstrates the monitoring approach - [ ] Written a README.md with: - A clear description of what the scenario demonstrates - Prerequisites for running the demo - Step-by-step instructions for running the demo - Expected output and what to look for - Screenshots if applicable - Explanation of key configuration elements - [ ] Added the scenario to the table in this README.md - [ ] Ensured the scenario works with the centralized image management system - [ ] Verified all components start correctly with `docker compose up -d` ### Best practices for scenarios - Keep the scenario focused on demonstrating one concept - Use clear, descriptive component and variable names - Add comments to explain complex parts of your Alloy configuration - Consider including a "Customizing" section in your README.md - Provide sample queries for Grafana/Prometheus/Loki/Tempo that work with your scenario - Use environment variables for versions and configurable parameters ## Get help If you have questions about creating a scenario or need help with Alloy: - Join the [Grafana Labs Community Forums](https://community.grafana.com/) - Check the [Grafana Alloy 
documentation](https://grafana.com/docs/alloy/) ## License This repository is licensed under the Apache License, Version 2.0. Refer to [LICENSE](LICENSE) for the full license text. ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/README.md ================================================ # App Instrumentation - Structured Logging with Alloy Parsing This directory contains a comprehensive **Alloy tutorial** demonstrating how to parse structured logs from 7 popular programming languages using modern logging frameworks. Each language uses industry-standard structured logging libraries, and all logs are processed through a unified Alloy pipeline for collection, parsing, and storage in Loki. ## 🎯 Tutorial Objectives - **Learn Alloy log parsing**: Understand how to parse different log formats using `loki.process` stages - **Multi-language support**: Handle logs from 7 different programming languages in a single pipeline - **Structured logging**: Demonstrate modern logging practices with structured data - **Real-world scenarios**: Show practical log parsing for containerized applications ## Languages and Modern Logging Frameworks | Language | Logging Framework | Type | Key Features | Docker Base Image | |----------|------------------|------|--------------|-------------------| | **JavaScript** | `Pino` | JSON structured | High performance, child loggers, ndjson output | `node:22-alpine` | | **Python** | `logging` module | Structured text | Built-in standard library with custom formatting | `python:3.12-slim` | | **Java** | `SLF4J + Logback` | Structured text | Parameterized messages, MDC context, thread info | `openjdk:26-slim` | | **C#** | `Microsoft.Extensions.Logging` | Structured text | .NET standard framework, event IDs, structured data | `mcr.microsoft.com/dotnet/*:9.0` | | **C++** | `spdlog` | Structured text | High performance, source location, thread-safe | `ubuntu:24.04` | | **Go** | `Zap` | JSON 
structured | High performance, named loggers, structured fields | `golang:1.23-alpine` | | **PHP** | `Monolog` | Structured text | Context arrays, processors, multiple handlers | `php:8.3-cli-alpine` | ## Directory Structure ``` app-instrumentation/logging/popular-logging-frameworks/ ├── alloy/ │ ├── config.alloy # Main Alloy configuration │ └── helper.alloy # Language-specific log parsers ├── javascript/ │ ├── app.js # Pino structured logging │ └── Dockerfile ├── python/ │ ├── app.py # Python logging with custom format │ └── Dockerfile ├── java/ │ ├── App.java # SLF4J + Logback │ ├── logback.xml │ └── Dockerfile ├── csharp/ │ ├── Program.cs # Microsoft.Extensions.Logging │ ├── LoggingExample.csproj │ └── Dockerfile ├── cpp/ │ ├── main.cpp # spdlog structured logging │ ├── CMakeLists.txt │ └── Dockerfile ├── go/ │ ├── main.go # Zap JSON logging │ ├── go.mod │ ├── go.sum │ └── Dockerfile ├── php/ │ ├── app.php # Monolog with context │ └── Dockerfile ├── docker-compose.yml # Complete stack with Loki + Grafana ├── loki-config.yaml └── README.md ``` ## 🔍 Alloy Parsing Features Demonstrated ### Core Alloy Components Used - **`loki.source.docker`**: Automatic Docker container log discovery - **`loki.process`**: Multi-stage log parsing pipeline - **`discovery.docker`**: Container metadata extraction - **`discovery.relabel`**: Label transformation and routing ### Advanced Parsing Techniques Each language parser demonstrates different Alloy parsing capabilities: - **Regex parsing** (`stage.regex`): Extract structured fields from text logs - **JSON parsing** (`stage.json`): Handle native JSON log formats - **Multiline handling** (`stage.multiline`): Process stack traces and exception logs - **Label management** (`stage.labels`): Efficient indexing for filtering - **Structured metadata** (`stage.structured_metadata`): Searchable non-indexed data - **Timestamp parsing** (`stage.timestamp`): Multiple timestamp format support - **Template formatting** (`stage.template`): Custom 
output formatting - **Conditional logic**: Level conversion, error prioritization ### Language-Specific Parsing Examples | Language | Primary Challenge | Alloy Solution | |----------|------------------|----------------| | **JavaScript (Pino)** | JSON numeric levels | Template stage for level conversion | | **Python** | Custom text format | Regex extraction with line numbers | | **Java (Logback)** | Multi-line stack traces | Multiline stage + regex parsing | | **C#** | Event IDs and namespaces | Regex parsing with structured metadata | | **C++** | Source location details | Complex regex for file:line extraction | | **Go (Zap)** | Unix timestamps | Timestamp parsing with fractional seconds | | **PHP (Monolog)** | Nested JSON context | Multiple JSON parsing stages | ## 🚀 Quick Start Tutorial ### Step 1: Clone the Repository ```bash git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios/app-instrumentation/logging/popular-logging-frameworks ``` ### Step 2: Launch the Complete Stack ```bash # Build and run all applications with Alloy + Loki + Grafana docker compose up --build # Or run in detached mode for cleaner terminal output docker compose up --build -d ``` This starts: - **7 language applications** generating structured logs - **Alloy** parsing and forwarding logs to Loki - **Loki** storing parsed logs with labels and metadata - **Grafana** for log visualization and querying ### Step 3: Explore the Logs - Head to http://localhost:3000/a/grafana-lokiexplore-app to see the logs in Grafana - Each language has its own service name / app, so you can pick the language whose parsed logs you would like to see ## 📚 Learning Outcomes After completing this tutorial, you'll understand: ### Alloy Concepts - **Multi-stage processing**: How to chain `loki.process` stages for complex parsing - **Component composition**: Using `import.file` to modularize configurations - **Discovery patterns**: Automatic service discovery with Docker integration - **Label vs.
metadata strategy**: When to use indexed labels vs. structured metadata ### Log Parsing Techniques - **Regex mastery**: Complex pattern matching for text log formats - **JSON handling**: Extracting nested fields from structured logs - **Timestamp parsing**: Supporting multiple timestamp formats across languages - **Multiline processing**: Handling stack traces and exception logs - **Conditional formatting**: Template logic for log transformation ### Real-World Patterns - **Language-specific challenges**: Understanding unique parsing requirements per language - **Performance considerations**: Efficient labeling and metadata strategies - **Observability best practices**: Structured logging principles across tech stacks - **Container log collection**: Production-ready log aggregation patterns ## 🔧 Configuration Details ### Language-Specific Parsing Challenges Each language presents unique parsing requirements: #### JavaScript (Pino) ```alloy // Challenge: Numeric log levels (10, 20, 30, 40, 50, 60) stage.template { source = "level" template = "{{- if eq .level_num \"30\" -}}info{{- else if eq .level_num \"50\" -}}error{{- end -}}" } ``` #### Java (Logback) ```alloy // Challenge: Multi-line stack traces stage.multiline { firstline = "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}" } ``` #### Go (Zap) ```alloy // Challenge: Unix timestamp with fractional seconds stage.timestamp { source = "ts" format = "1750342991.0445938" } ``` ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/alloy/config.alloy ================================================ // ############################### // #### Main Logging Configuration #### // ############################### // Import the custom log parsing helper module. // This gives us access to the "app_logs_parser" component that handles // language-specific log parsing for Python, JavaScript, Go, Java, C#, PHP, and C++. 
import.file "helper" {
  filename = "/etc/alloy/helper.alloy"
}

// Container discovery.
// Talks to the local Docker daemon over its Unix socket and exports the
// containers it finds as targets, so log sources need not be listed by hand.
discovery.docker "linux" {
  host = "unix:///var/run/docker.sock"
}

// Relabel rules that derive a "service_name" label from the container name.
// The language-specific parsing stages select their input on this label.
//
// targets is intentionally empty: this component relabels nothing itself.
// Only its exported `rules` are consumed (see relabel_rules on
// loki.source.docker).
discovery.relabel "logs_integrations_docker" {
  targets = []

  // Docker reports container names with a leading slash (e.g. "/python").
  // Capture everything after it so the label becomes service_name="python".
  rule {
    target_label  = "service_name"
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
  }
}

// Custom parser component declared in helper.alloy.
// Everything it emits is delivered to the Loki writer via write_to.
helper.app_logs_parser "default" {
  write_to = [loki.write.local.receiver]
}

// Log collection: stream the logs of the discovered Docker containers and
// hand the raw lines to the parser instantiated above.
loki.source.docker "default" {
  host          = "unix:///var/run/docker.sock"                      // Connect to Docker daemon
  targets       = discovery.docker.linux.targets                     // Use discovered containers
  labels        = {"platform" = "docker"}                            // Add platform label to all logs
  relabel_rules = discovery.relabel.logs_integrations_docker.rules   // Apply container name transformation
  forward_to    = [helper.app_logs_parser.default.parser_input]      // Send raw logs to our parser
}

// Configure Loki write endpoint for log storage.
// This is where all parsed and enriched logs are finally stored for querying,
// alerting, and analysis in Grafana or other tools.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push" // Loki's standard push API endpoint
  }
}

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/alloy/helper.alloy
================================================
declare "app_logs_parser" {
  // argument.write_to is a required argument that specifies where parsed
  // log lines are sent.
  //
  // The value of the argument is retrieved in this file with
  // argument.write_to.value.
  argument "write_to" {
    optional = false
  }

  // loki.process.app_logs_parser is our component which executes the parsing,
  // passing parsed logs to argument.write_to.value.
  loki.process "app_logs_parser" {

    // ## Python Processing ##
    // Let only python logs pass through this stage. This is done via the label match on the service_name label.
    stage.match {
      pipeline_name = "python"
      selector      = "{service_name=\"python\"}"

      // Extract the timestamp, file, line number, level, and message from the log line.
      // Python logs format: "2025-06-17 09:54:15,283 - main.py:25 - INFO - Starting application"
      // Every capture group is named: the group name is the key under which the
      // value is stored for the stages below (timestamp, file, line_num, level, msg).
      stage.regex {
        expression = "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}) - (?P<file>[^:]+):(?P<line_num>\\d+) - (?P<level>[^ ]+) - (?P<msg>.*)"
      }

      // Set the file and level as labels for efficient filtering and querying in Loki.
      // Labels are indexed, so they should only hold low-cardinality values.
stage.labels {
  values = {
    file  = "",
    level = "",
  }
}

// Use the timestamp captured from the log line as the entry's timestamp.
// The layout is Go reference-time notation; ",000" is the millisecond part.
stage.timestamp {
  source = "timestamp"
  format = "2006-01-02 15:04:05,000"
}

// Attach the line number as structured metadata rather than as a label.
stage.structured_metadata {
  values = {
    line_num = "",
  }
}

// Build a new value named "output" from the extracted fields, keeping a
// layout close to the original line but without the timestamp prefix.
stage.template {
  source   = "output"
  template = "{{.file}} - {{.line_num}} - {{.level}} - {{.msg}}"
}

// Replace the log body with the "output" value built above.
stage.output {
  source = "output"
}
}

// ## Node.js Processing ##
// Let only node.js logs pass through this stage. This is done via the label match on the service_name label.
stage.match {
  pipeline_name = "javascript"
  selector      = "{service_name=\"javascript\"}"

  // Extract fields from JSON-formatted Pino logs.
  // Left side: key in the extracted map; right side: expression evaluated
  // against the JSON line (dotted paths reach into nested objects).
  // Pino's numeric "level" is stored as level_num so that a readable "level"
  // can be derived from it afterwards.
  stage.json {
    expressions = {
      level_num        = "level",
      time             = "time",
      pid              = "pid",
      hostname         = "hostname",
      msg              = "msg",
      obj              = "obj",
      counter          = "counter",
      component        = "component",
      query            = "query",
      duration         = "duration",
      version          = "version",
      method           = "method",
      path             = "path",
      status           = "status",
      nested_obj       = "nested.obj",
      nested_timestamp = "nested.timestamp",
      err_type         = "err.type",
      err_message      = "err.message",
      err_stack        = "err.stack",
    }
  }

  // Convert Pino's numeric log levels to human-readable strings.
// Pino uses numbers: 10=trace, 20=debug, 30=info, 40=warn, 50=error, 60=fatal
// Any other value is mapped to "unknown".
stage.template {
  source   = "level"
  template = "{{- if eq .level_num \"10\" -}}trace{{- else if eq .level_num \"20\" -}}debug{{- else if eq .level_num \"30\" -}}info{{- else if eq .level_num \"40\" -}}warn{{- else if eq .level_num \"50\" -}}error{{- else if eq .level_num \"60\" -}}fatal{{- else -}}unknown{{- end -}}"
}

// Set important fields as labels for efficient querying.
// hostname and component help identify log sources, level enables filtering by severity.
// Only keys produced by the stage.json / stage.template stages above are
// listed; this pipeline never extracts a "file" value, so it is not promoted.
stage.labels {
  values = {
    hostname  = "",
    component = "",
    level     = "",
  }
}

// Set the timestamp from Pino's "time" field (Unix epoch in milliseconds).
stage.timestamp {
  source = "time"
  format = "UnixMs"
}

// Store the remaining extracted fields as structured metadata:
// process info, request details, and error information.
stage.structured_metadata {
  values = {
    level_num        = "",
    pid              = "",
    query            = "",
    duration         = "",
    version          = "",
    method           = "",
    path             = "",
    status           = "",
    nested_obj       = "",
    nested_timestamp = "",
    err_type         = "",
    err_message      = "",
    err_stack        = "",
  }
}

// Build the output line; when an error message was extracted it takes the
// place of the regular message.
stage.template {
  source   = "output"
  template = "{{.hostname}} - {{.level}} - {{ if .err_message }}{{ .err_message }}{{ else }}{{ .msg }}{{ end }}"
}

// Apply the formatted output as the final log message sent to Loki.
stage.output {
  source = "output"
}
}

// ## Go Processing ##
// Let only go logs pass through this stage. This is done via the label match on the service_name label.
stage.match {
  pipeline_name = "go"
  selector      = "{service_name=\"go\"}"

  // Extract fields from Zap's JSON-structured logs.
  // Zap outputs detailed JSON logs with structured fields for better observability.
stage.json {
  expressions = {
    level            = "level",
    ts               = "ts",
    logger           = "logger",
    caller           = "caller",
    msg              = "msg",
    answer           = "answer",
    obj              = "obj",
    counter          = "counter",
    feature          = "feature",
    query            = "query",
    duration         = "duration",
    method           = "method",
    path             = "path",
    status           = "status",
    requestId        = "requestId",
    context1         = "context1",
    context2         = "context2",
    error            = "error",
    stacktrace       = "stacktrace",
    nested_obj       = "nested.obj",
    nested_timestamp = "nested.timestamp",
  }
}

// Set logger name and level as indexed labels for efficient filtering.
// This enables quick filtering by specific loggers (e.g., database, api) and log levels.
stage.labels {
  values = {
    logger = "",
    level  = "",
  }
}

// Parse Zap's "ts" field: Unix epoch seconds with a fractional part
// (e.g. 1750342991.0445938). `format` must be a layout or one of the
// predefined format names, not a sample value; "Unix" is the predefined
// format for epoch seconds and accepts fractions.
stage.timestamp {
  source = "ts"
  format = "Unix"
}

// Store all contextual information as structured metadata.
// The right-hand side is a key in the extracted map (not a JSON path), so
// the nested values are referenced by the names stage.json stored them under.
stage.structured_metadata {
  values = {
    caller           = "caller",
    answer           = "answer",
    obj              = "obj",
    counter          = "counter",
    feature          = "feature",
    query            = "query",
    duration         = "duration",
    method           = "method",
    path             = "path",
    status           = "status",
    requestId        = "requestId",
    context1         = "context1",
    context2         = "context2",
    error            = "error",
    stacktrace       = "stacktrace",
    nested_obj       = "nested_obj",
    nested_timestamp = "nested_timestamp",
  }
}

// Create a clean, consistent output format showing logger, level, and message.
// This maintains readability while preserving structured data in metadata.
stage.template {
  source   = "output"
  template = "{{.logger}} - {{.level}} - {{.msg}}"
}

// Apply the formatted output as the final log message.
stage.output {
  source = "output"
}
}

// ## Java Processing ##
// Let only java logs pass through this stage. This is done via the label match on the service_name label.
stage.match {
  pipeline_name = "java"
  selector      = "{service_name=\"java\"}"

  // Handle multi-line Java stack traces by identifying the start of new log entries.
  // Lines that do not match `firstline` are appended to the previous entry.
  stage.multiline {
    firstline = "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\+\\d{4}\\[[^\\]]+\\]\\s+[A-Z]+\\s+\\w+\\s+-\\s+"
  }

  // Parse Logback's log format: timestamp, thread, level, logger, message and
  // an optional stack trace on the following lines.
  // Format: "2024-01-15T14:41:02.423+0000[main] INFO App - Starting application"
  // Group names are the keys read by the stages below. The stacktrace group
  // uses (?s:...) so that "." also matches newlines; without it only the
  // first line after the message would be captured.
  stage.regex {
    expression = "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\+\\d{4})\\[(?P<thread>[^\\]]+)\\] (?P<level>[A-Z]+)\\s+(?P<logger>[^ ]+) - (?P<msg>[^\n]*)(?:\\n(?P<stacktrace>(?s:.*)))?"
  }

  // Set logger and level as indexed labels for efficient log filtering.
  stage.labels {
    values = {
      logger = "",
      level  = "",
    }
  }

  // Parse the ISO 8601 timestamp, including its numeric timezone offset.
  stage.timestamp {
    source = "timestamp"
    format = "2006-01-02T15:04:05.000-0700"
  }

  // Store thread information and stack traces as structured metadata.
  stage.structured_metadata {
    values = {
      thread     = "",
      stacktrace = "",
    }
  }

  // Format output to show logger, level, and message.
  // The stack trace is kept in structured metadata only.
  stage.template {
    source   = "output"
    template = "{{.logger}} - {{.level}} - {{.msg}}"
  }

  // Apply the formatted output as the log body.
  stage.output {
    source = "output"
  }
}

// ## C# Processing ##
// Let only c# logs pass through this stage. This is done via the label match on the service_name label.
stage.match {
  pipeline_name = "csharp"
  selector      = "{service_name=\"csharp\"}"

  // Handle multi-line .NET logs and exception stack traces.
// .NET logging can span multiple lines, especially with structured logging and exceptions.
// A new entry starts with "<timestamp> <level>: <category>[<event id>]".
stage.multiline {
  firstline = "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3} [a-z]+: [^\\[]+\\[\\d+\\]"
}

// Parse the console logger format with event IDs. The message sits on the
// line after the header, indented:
//   "2024-01-15 14:41:02.423 info: Microsoft.Extensions.Hosting[1]"
//   "      Starting application"
// Group names are the keys read by the stages below.
stage.regex {
  expression = "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) (?P<level>[a-z]+): (?P<logger>[^\\[]+)\\[(?P<event_id>\\d+)\\]\\n\\s+(?P<msg>.*)"
}

// Set logger namespace and level as indexed labels for filtering.
// .NET uses hierarchical logger names (e.g., Microsoft.Extensions.Hosting) for categorization.
stage.labels {
  values = {
    logger = "",
    level  = "",
  }
}

// Parse the timestamp; the layout carries no timezone information.
stage.timestamp {
  source = "timestamp"
  format = "2006-01-02 15:04:05.000"
}

// Store the event ID as structured metadata.
stage.structured_metadata {
  values = {
    event_id = "",
  }
}

// Create consistent output format showing logger namespace, level, and message.
stage.template {
  source   = "output"
  template = "{{.logger}} - {{.level}} - {{.msg}}"
}

// Apply the formatted output to maintain consistency with other language logs.
stage.output {
  source = "output"
}
}

// ## PHP Processing ##
// Let only php logs pass through this stage. This is done via the label match on the service_name label.
stage.match {
  pipeline_name = "php"
  selector      = "{service_name=\"php\"}"

  // Parse Monolog's default line format with timestamp, logger, level, message, context, and extra data.
  // Format: "[2024-01-15T14:41:02.123456+00:00] app.INFO: hello world {"counter":42} {"environment":"production"}"
  // context_json and extra_json hold the two trailing JSON blobs; they are
  // decoded by the stage.json stages further down.
  stage.regex {
    expression = "^\\[(?P<timestamp>[^\\]]+)\\] (?P<logger>[^.]+)\\.(?P<level>[A-Z]+): (?P<msg>.*?) (?P<context_json>\\[\\]|\\{.*?\\}) (?P<extra_json>\\{.*?\\})$"
  }

  // Set logger name and level as indexed labels for efficient querying.
// PHP applications often use multiple named loggers (app, database, api, etc.).
stage.labels {
  values = {
    logger = "",
    level  = "",
  }
}

// Parse the ISO 8601 timestamp with microseconds and a "+hh:mm" offset.
stage.timestamp {
  source = "timestamp"
  format = "2006-01-02T15:04:05.000000-07:00"
}

// Decode the context JSON captured by the regex stage ("context_json") and
// pull out the application-specific fields listed below.
stage.json {
  source      = "context_json"
  expressions = {
    counter          = "counter",
    obj              = "obj",
    query            = "query",
    duration         = "duration",
    method           = "method",
    path             = "path",
    status           = "status",
    exception        = "exception",
    error_code       = "error_code",
    affected_service = "affected_service",
  }
}

// Decode the extra JSON captured by the regex stage ("extra_json").
// Only the "environment" field is extracted.
stage.json {
  source      = "extra_json"
  expressions = {
    environment = "environment",
  }
}

// Store every field extracted from the two JSON blobs as structured metadata.
stage.structured_metadata {
  values = {
    counter          = "",
    obj              = "",
    query            = "",
    duration         = "",
    method           = "",
    path             = "",
    status           = "",
    exception        = "",
    error_code       = "",
    affected_service = "",
    environment      = "",
  }
}

// Create clean output format showing logger, level, and message.
// Detailed context remains accessible in structured metadata.
stage.template {
  source   = "output"
  template = "{{.logger}} - {{.level}} - {{.msg}}"
}

// Replace the log body with the formatted "output" value.
stage.output {
  source = "output"
}
}

// ## C++ Processing ##
// Let only cpp logs pass through this stage. This is done via the label match on the service_name label.
stage.match {
  pipeline_name = "cpp"
  selector      = "{service_name=\"cpp\"}"

  // Parse C++ structured logging format with detailed source location information.
// Format: "2024-01-15 14:41:02.423 [info] [logger] [thread 1] [main.cpp:25 main] - Starting application"
// Group names are the keys read by the stages below; they follow the order
// of the bracketed fields in the format above.
stage.regex {
  expression = "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) \\[(?P<level>[^\\]]+)\\] \\[(?P<logger>[^\\]]+)\\] \\[(?P<thread>[^\\]]+)\\] \\[(?P<file>[^:]+):(?P<line_num>\\d+) (?P<function>[^\\]]+)\\] - (?P<msg>.*)"
}

// Set logger, level, and source file as indexed labels for debugging.
stage.labels {
  values = {
    logger = "",
    level  = "",
    file   = "",
  }
}

// Parse the timestamp (millisecond precision, no timezone in the layout).
stage.timestamp {
  source = "timestamp"
  format = "2006-01-02 15:04:05.000"
}

// Store thread, line number, and function name as structured metadata.
stage.structured_metadata {
  values = {
    thread   = "",
    line_num = "",
    function = "",
  }
}

// Create detailed output showing file location, function, level, and message.
stage.template {
  source   = "output"
  template = "{{.file}}:{{.line_num}} {{.function}} - {{.level}} - {{.msg}}"
}

// Replace the log body with the formatted "output" value.
stage.output {
  source = "output"
}
}

// Send processed logs to our argument.
forward_to = argument.write_to.value
}

// export.parser_input exports a value to the module consumer.
export "parser_input" {
  // Expose the receiver of loki.process so the module importer can send
  // logs to our loki.process component.
value = loki.process.app_logs_parser.receiver } } ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/cpp/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.16) project(LoggingExample) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) # Find required packages find_package(PkgConfig REQUIRED) find_package(Threads REQUIRED) # Add spdlog include(FetchContent) FetchContent_Declare( spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git GIT_TAG v1.12.0 ) FetchContent_MakeAvailable(spdlog) # Create executable add_executable(logging_example main.cpp) # Link libraries target_link_libraries(logging_example PRIVATE spdlog::spdlog Threads::Threads ) # Compiler-specific options if(MSVC) target_compile_options(logging_example PRIVATE /W4) else() target_compile_options(logging_example PRIVATE -Wall -Wextra -Wpedantic) endif() ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/cpp/Dockerfile ================================================ FROM ubuntu:26.04@sha256:f3d28607ddd78734bb7f71f117f3c6706c666b8b76cbff7c9ff6e5718d46ff64 # Install build dependencies RUN apt-get update && apt-get install -y \ cmake \ g++ \ make \ git \ pkg-config \ && rm -rf /var/lib/apt/lists/* WORKDIR /app COPY . . # Build the application RUN cmake -B build -S . && \ cmake --build build --config Release # Run the application CMD ["./build/logging_example"] ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/cpp/main.cpp ================================================ #include #include #include #include int main() { auto console = spdlog::stdout_color_mt("logger"); spdlog::set_default_logger(console); spdlog::set_level(spdlog::level::debug); spdlog::set_pattern( "%Y-%m-%d %H:%M:%S.%e [%^%l%$] [%n] [thread %t] [%s:%# %!] 
- %v" ); int counter = 0; SPDLOG_LOGGER_INFO(console, "Starting C++ basic logging example"); SPDLOG_LOGGER_INFO(console, "Demonstrating spdlog formatting"); while (true) { counter++; int logType = counter % 5; switch (logType) { case 0: SPDLOG_LOGGER_DEBUG(console, "Basic debug message, counter: {}", counter); break; case 1: SPDLOG_LOGGER_INFO(console, "Information message, counter: {}", counter); break; case 2: SPDLOG_LOGGER_WARN(console, "Warning message, counter: {}", counter); break; case 3: SPDLOG_LOGGER_ERROR(console, "Error message, counter: {}", counter); break; case 4: SPDLOG_LOGGER_CRITICAL(console, "Critical message, counter: {}", counter); break; } std::this_thread::sleep_for(std::chrono::seconds(1)); } return 0; } ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/csharp/Dockerfile ================================================ FROM mcr.microsoft.com/dotnet/sdk:9.0@sha256:0300d42309afd86168fa57d62db79020a34ee396d39c9634844b9c0ab285ea55 AS build WORKDIR /app COPY *.csproj . RUN dotnet restore COPY . . RUN dotnet publish -c Release -o out FROM mcr.microsoft.com/dotnet/runtime:9.0@sha256:7590f1b7e124fe7a4b7cffa5f6f9958f2c02a22bf5bd7a0387a84b88cddf4057 WORKDIR /app COPY --from=build /app/out . ENTRYPOINT ["dotnet", "LoggingExample.dll"] ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/csharp/LoggingExample.csproj ================================================ Exe net9.0 enable ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/csharp/Program.cs ================================================ using Microsoft.Extensions.Logging; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; using System; using System.Threading.Tasks; namespace LoggingExample { class Program { private static ILogger? 
_logger; static async Task Main(string[] args) { // Configure logging with proper formatting using var host = Host.CreateDefaultBuilder(args) .ConfigureLogging(logging => { logging.ClearProviders(); logging.AddConsole(options => { options.TimestampFormat = "yyyy-MM-dd HH:mm:ss.fff "; options.IncludeScopes = false; }); logging.SetMinimumLevel(LogLevel.Debug); }) .Build(); _logger = host.Services.GetRequiredService>(); int counter = 0; _logger.LogInformation("Starting C# basic logging example"); _logger.LogInformation("Demonstrating Microsoft.Extensions.Logging"); // Infinite loop with different log levels while (true) { counter++; // Cycle through different log levels int logType = counter % 5; switch (logType) { case 0: _logger.LogDebug("Basic debug message, counter: {Counter}", counter); break; case 1: _logger.LogInformation("Information message, counter: {Counter}", counter); break; case 2: _logger.LogWarning("Warning message, counter: {Counter}", counter); break; case 3: _logger.LogError("Error message, counter: {Counter}", counter); break; case 4: _logger.LogCritical("Critical message, counter: {Counter}", counter); break; } // Wait 1 second before next log await Task.Delay(1000); } } } } ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/docker-compose.coda.yml ================================================ services: javascript-logging: build: context: ./javascript dockerfile: Dockerfile container_name: javascript environment: - NODE_ENV=production restart: unless-stopped python-logging: build: context: ./python dockerfile: Dockerfile container_name: python environment: - PYTHON_ENV=production restart: unless-stopped java-logging: build: context: ./java dockerfile: Dockerfile container_name: java environment: - JAVA_ENV=production restart: unless-stopped csharp-logging: build: context: ./csharp dockerfile: Dockerfile container_name: csharp environment: - DOTNET_ENVIRONMENT=Production restart: 
unless-stopped cpp-logging: build: context: ./cpp dockerfile: Dockerfile container_name: cpp environment: - CPP_ENV=production restart: unless-stopped go-logging: build: context: ./go dockerfile: Dockerfile container_name: go environment: - GO_ENV=production restart: unless-stopped php-logging: build: context: ./php dockerfile: Dockerfile container_name: php environment: - PHP_ENV=production restart: unless-stopped ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/docker-compose.yml ================================================ version: '3.8' services: javascript-logging: build: context: ./javascript dockerfile: Dockerfile container_name: javascript environment: - NODE_ENV=production restart: unless-stopped python-logging: build: context: ./python dockerfile: Dockerfile container_name: python environment: - PYTHON_ENV=production restart: unless-stopped java-logging: build: context: ./java dockerfile: Dockerfile container_name: java environment: - JAVA_ENV=production restart: unless-stopped csharp-logging: build: context: ./csharp dockerfile: Dockerfile container_name: csharp environment: - DOTNET_ENVIRONMENT=Production restart: unless-stopped cpp-logging: build: context: ./cpp dockerfile: Dockerfile container_name: cpp environment: - CPP_ENV=production restart: unless-stopped go-logging: build: context: ./go dockerfile: Dockerfile container_name: go environment: - GO_ENV=production restart: unless-stopped php-logging: build: context: ./php dockerfile: Dockerfile container_name: php environment: - PHP_ENV=production restart: unless-stopped loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} container_name: loki ports: - "3100:3100" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} container_name: grafana environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true 
- GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat < /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false version: 1 editable: false EOF /run.sh alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} container_name: alloy ports: - 12345:12345 - 4317:4317 - 4318:4318 volumes: - ./alloy/:/etc/alloy/ - /var/run/docker.sock:/var/run/docker.sock command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy networks: default: name: logging-examples-network ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/go/Dockerfile ================================================ FROM golang:1.26-alpine@sha256:91eda9776261207ea25fd06b5b7fed8d397dd2c0a283e77f2ab6e91bfa71079d WORKDIR /app # Copy go.mod and go.sum for better caching COPY go.mod go.sum ./ RUN go mod download # Copy source code COPY main.go . 
RUN go build -o logging_example main.go CMD ["./logging_example"] ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/go/go.mod ================================================ module logging-example go 1.23 require go.uber.org/zap v1.28.0 require go.uber.org/multierr v1.10.0 // indirect ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/go/go.sum ================================================ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ= go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.28.0 h1:IZzaP1Fv73/T/pBMLk4VutPl36uNC+OSUh3JLG3FIjo= go.uber.org/zap v1.28.0/go.mod h1:rDLpOi171uODNm/mxFcuYWxDsqWSAVkFdX4XojSKg/Q= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/go/main.go ================================================ package main import 
( "errors" "time" "go.uber.org/zap" "go.uber.org/zap/zapcore" ) func main() { // Configure Zap logger for JSON output to stdout config := zap.NewProductionConfig() config.Level = zap.NewAtomicLevelAt(zap.DebugLevel) config.OutputPaths = []string{"stdout"} logger, err := config.Build() if err != nil { panic(err) } defer logger.Sync() // Create child loggers for different components appLogger := logger.Named("app") dbLogger := logger.Named("database") apiLogger := logger.Named("api") counter := 0 appLogger.Info("Starting Go basic logging example with Zap") appLogger.Info("Demonstrating Zap structured logging features") // Infinite loop with different logging examples for { counter++ // Cycle through different logging examples logType := counter % 12 switch logType { case 0: appLogger.Info("hello world") case 1: appLogger.Error("this is at error level") case 2: appLogger.Info("the answer is 42", zap.Int("answer", 42)) case 3: appLogger.Info("hello world", zap.Int("obj", 42)) case 4: appLogger.Info("hello world with counter", zap.Int("obj", 42), zap.Int("counter", counter)) case 5: appLogger.Info("nested object", zap.Object("nested", zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error { enc.AddInt("obj", 42) enc.AddTime("timestamp", time.Now()) return nil }))) case 6: appLogger.Error("simulated error", zap.Error(errors.New("kaboom"))) case 7: appLogger.Info("hello from app component!") case 8: dbLogger.Warn("slow query detected", zap.String("query", "SELECT * FROM users"), zap.Duration("duration", 250*time.Millisecond)) case 9: apiLogger.Info("API request completed", zap.String("method", "GET"), zap.String("path", "/api/users"), zap.Int("status", 200)) case 10: tempChild := appLogger.With(zap.String("requestId", "req-"+string(rune(counter)))) tempChild.Debug("this is a debug statement via child") case 11: appLogger.Error("error with additional context", zap.Error(errors.New("kaboom")), zap.String("context1", "additional"), zap.String("context2", 
"information")) } // Occasionally demonstrate sugar logger if counter%15 == 0 { sugar := logger.Sugar() sugar.Infow("using sugar logger", "counter", counter, "feature", "sugar") } // Occasionally demonstrate different log levels if counter%20 == 0 { appLogger.Debug("this is a debug message", zap.Int("counter", counter)) appLogger.Warn("this is a warning message", zap.Int("counter", counter)) } // Wait 1 second before next log time.Sleep(1 * time.Second) } } ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/java/App.java ================================================ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.MDC; public class App { private static final Logger logger = LoggerFactory.getLogger(App.class); private static final Logger appLogger = LoggerFactory.getLogger("app"); private static final Logger dbLogger = LoggerFactory.getLogger("database"); private static final Logger apiLogger = LoggerFactory.getLogger("api"); public static void main(String[] args) { int counter = 0; logger.info("Starting Java basic logging example with SLF4J + Logback"); logger.info("Demonstrating SLF4J structured logging features"); // Infinite loop with different logging examples while (true) { counter++; // Cycle through different logging examples int logType = counter % 12; switch (logType) { case 0: logger.info("hello world"); break; case 1: logger.error("this is at error level"); break; case 2: logger.info("the answer is {}", 42); break; case 3: logger.info("hello world with obj {}", 42); break; case 4: logger.info("hello world with counter {} and obj {}", counter, 42); break; case 5: logger.info("nested object with timestamp {} and value {}", java.time.LocalDateTime.now(), 42); break; case 6: Exception simulatedError = new RuntimeException("kaboom"); logger.error("simulated error", simulatedError); break; case 7: appLogger.info("hello from app component!"); break; case 8: 
dbLogger.warn("slow query detected: {} took {}ms", "SELECT * FROM users", 250); break; case 9: apiLogger.info("API request completed: {} {} status={}", "GET", "/api/users", 200); break; case 10: // Using MDC (Mapped Diagnostic Context) for contextual logging MDC.put("requestId", "req-" + counter); logger.debug("this is a debug statement with MDC context"); MDC.clear(); break; case 11: Exception error = new RuntimeException("kaboom"); logger.error("error with additional context: {} {}", "additional", "information", error); break; } // Occasionally demonstrate different log levels if (counter % 15 == 0) { logger.debug("this is a debug message with counter {}", counter); logger.warn("this is a warning message with counter {}", counter); } // Occasionally demonstrate MDC usage if (counter % 20 == 0) { MDC.put("userId", "user123"); MDC.put("sessionId", "session456"); logger.info("using MDC for contextual logging"); MDC.clear(); } // Wait 1 second before next log try { Thread.sleep(1000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); logger.warn("Thread interrupted: {}", e.getMessage()); break; } } } } ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/java/Dockerfile ================================================ FROM openjdk:26-slim@sha256:63814a9d8bbea6d39d5ce9c91843bec5e9d9d1d1bc2bade4bb57ba70c0839553 WORKDIR /app # Download SLF4J API, Logback dependencies, and Jackson for JSON encoding RUN apt-get update && apt-get install -y wget && \ wget https://repo1.maven.org/maven2/org/slf4j/slf4j-api/2.0.9/slf4j-api-2.0.9.jar && \ wget https://repo1.maven.org/maven2/ch/qos/logback/logback-classic/1.4.14/logback-classic-1.4.14.jar && \ wget https://repo1.maven.org/maven2/ch/qos/logback/logback-core/1.4.14/logback-core-1.4.14.jar && \ wget https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.16.1/jackson-core-2.16.1.jar && \ wget 
https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-databind/2.16.1/jackson-databind-2.16.1.jar && \ wget https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-annotations/2.16.1/jackson-annotations-2.16.1.jar && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY App.java . COPY logback.xml . RUN javac -cp "slf4j-api-2.0.9.jar:logback-classic-1.4.14.jar:logback-core-1.4.14.jar:jackson-core-2.16.1.jar:jackson-databind-2.16.1.jar:jackson-annotations-2.16.1.jar" App.java CMD ["java", "-cp", ".:slf4j-api-2.0.9.jar:logback-classic-1.4.14.jar:logback-core-1.4.14.jar:jackson-core-2.16.1.jar:jackson-databind-2.16.1.jar:jackson-annotations-2.16.1.jar", "App"] ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/java/logback.xml ================================================ %d{yyyy-MM-dd'T'HH:mm:ss.SSSZ}[%thread] %-5level %logger{36} - %msg%n ================================================ FILE: app-instrumentation/logging/popular-logging-frameworks/javascript/Dockerfile ================================================ FROM node:24-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f WORKDIR /app # Create package.json and install pino with pino-pretty for better output formatting RUN echo '{"name": "logging-example", "version": "1.0.0", "dependencies": {"pino": "^8.17.2", "pino-pretty": "^10.3.1"}}' > package.json RUN npm install COPY app.js . 
/**
 * Emit one demo log entry and advance the global `counter`.
 *
 * The entry is chosen by `counter % 12`, cycling through plain messages,
 * printf-style interpolation, merging objects, errors and child loggers.
 * Every 20th call briefly lowers the level to `debug`, and every 25th call
 * emits a single `trace` line. Scheduled once per second via `setInterval`.
 */
function logMessage() {
  counter++;

  // Cycle through different logging examples
  const logType = counter % 12;

  switch (logType) {
    case 0:
      pino.info('hello world');
      break;
    case 1:
      pino.error('this is at error level');
      break;
    case 2:
      pino.info('the answer is %d', 42);
      break;
    case 3:
      pino.info({ obj: 42 }, 'hello world');
      break;
    case 4:
      pino.info({ obj: 42, counter: counter }, 'hello world with counter');
      break;
    case 5:
      pino.info({ nested: { obj: 42, timestamp: new Date() } }, 'nested object');
      break;
    case 6:
      pino.error(new Error('simulated error'));
      break;
    case 7:
      appLogger.info('hello from app component!');
      break;
    case 8:
      dbLogger.warn({ query: 'SELECT * FROM users', duration: 250 }, 'slow query detected');
      break;
    case 9:
      apiLogger.info({ method: 'GET', path: '/api/users', status: 200 }, 'API request completed');
      break;
    case 10: {
      // Braces give the `const` its own scope instead of leaking it into the
      // other `case` clauses of this switch.
      const tempChild = pino.child({ requestId: `req-${counter}` });
      // NOTE(review): the root logger is created with no options, so at Pino's
      // default `info` level this debug line is presumably filtered out unless
      // the level has been lowered; confirm that is the intended demo.
      tempChild.debug('this is a debug statement via child');
      break;
    }
    case 11:
      // Pino only interpolates extra arguments into printf-style placeholders;
      // without `%s` the trailing 'additional' and 'context' would be dropped.
      pino.info(new Error('kaboom'), 'with %s %s', 'additional', 'context');
      break;
  }

  // Occasionally demonstrate level changes
  if (counter % 20 === 0) {
    pino.level = 'debug';
    pino.debug('switched to debug level - this should now be visible');
    // Restore the level half a second later, before the next tick.
    setTimeout(() => {
      pino.level = 'info';
      pino.info('switched back to info level');
    }, 500);
  }

  // Occasionally demonstrate trace level
  if (counter % 25 === 0) {
    const originalLevel = pino.level;
    pino.level = 'trace';
    pino.trace('this is a trace statement');
    pino.level = originalLevel;
  }
}

// Log every 1 second infinitely
setInterval(logMessage, 1000);
// Add a processor to inject an 'environment' extra field into every log entry.
// Monolog 3 (required by this scenario's composer.json) hands processors a
// Monolog\LogRecord object whose `extra` property is writable, so set it
// directly instead of going through the legacy array-access layer.
$logger->pushProcessor(function ($record) {
    $record->extra['environment'] = 'production'; // You can set any value or use getenv() etc.
    return $record;
});

// Create component-specific loggers if you want.
// withName() returns a copy of the logger under a different channel name.
$appLogger = $logger->withName('app');
$dbLogger = $logger->withName('database');
$apiLogger = $logger->withName('api');

$counter = 0;

$logger->info("Starting PHP basic logging example with Monolog");
$logger->info("Demonstrating Monolog structured logging features");

// Emit one sample per second forever, cycling through six examples.
while (true) {
    $counter++;

    $logType = $counter % 6;

    switch ($logType) {
        case 0:
            $logger->info("hello world");
            break;
        case 1:
            $logger->error("this is at error level");
            break;
        case 2:
            // The second argument is the context array attached to the record.
            $logger->info("hello world with counter", [
                'counter' => $counter,
                'obj' => 42
            ]);
            break;
        case 3:
            $dbLogger->warning("slow query detected", [
                'query' => 'SELECT * FROM users',
                'duration' => 250
            ]);
            break;
        case 4:
            $apiLogger->info("API request completed", [
                'method' => 'GET',
                'path' => '/api/users',
                'status' => 200
            ]);
            break;
        case 5:
            // Fatal error with stack trace
            $fatalException = new Exception("Critical system failure - database connection lost");
            $logger->emergency("System encountered a fatal error", [
                'exception' => $fatalException,
                'error_code' => 'DB_CONNECTION_LOST',
                'affected_service' => 'user_authentication'
            ]);
            break;
    }

    // Wait 1 second before next log
    sleep(1);
}
def main():
    """Log one message per second forever, rotating through all five levels.

    The level is picked by ``counter % 5``: 0 is debug, 1 info, 2 warning,
    3 error and 4 critical. The function never returns.
    """
    logger.info("Starting Python basic logging example")
    logger.info("Demonstrating Python logging module")

    counter = 0

    # One emitter per residue of ``counter % 5``. Each closure reads ``counter``
    # when it is called, so it always reports the current tick.
    emitters = (
        lambda: logger.debug(f"Basic debug message, counter: {counter}"),
        lambda: logger.info(f"Information message, counter: {counter}"),
        lambda: logger.warning(f"Warning message, counter: {counter}"),
        lambda: logger.error(f"Error message, counter: {counter}"),
        lambda: logger.critical(f"Critical message, counter: {counter}"),
    )

    while True:
        counter += 1
        emitters[counter % 5]()

        # Wait 1 second before next log
        time.sleep(1)
## Architecture - **`alloy`** runs `loki.source.awsfirehose` on port `:9999`, listening at `/awsfirehose/api/v1/push` - **`firehose-sender`** (Python) generates synthetic CloudWatch-style log batches every 5 seconds and POSTs them to Alloy in the documented Firehose delivery format (records array with gzip-compressed, base64-encoded data fields) - **`loki`** + **`grafana`** for storage and visualization, with the Loki datasource auto-provisioned Each batch goes to one of three log streams, picked at random: 1. VPC flow logs on `eni-0abc1234-all` (log group `/aws/vpc/flowlogs`) 2. VPC flow logs on `eni-0def5678-all` (same log group, different stream) 3. Lambda invocation logs on `2026/04/28/[$LATEST]abc` (log group `/aws/lambda/checkout-service`) ## Running ```bash # From this directory docker compose up -d # Or from the repo root ./run-example.sh aws-firehose-logs ``` ## Accessing - **Grafana**: http://localhost:3000 (no login) - **Alloy UI**: http://localhost:12345 — confirm components healthy, use livedebugging to watch records flow through - **Firehose endpoint**: http://localhost:9999/awsfirehose/api/v1/push (POSTable from your laptop) - **Loki API**: http://localhost:3100 ## Trying it out Within ~10 seconds of bring-up, the sender starts producing batches. In Grafana Explore on Loki: ```logql # All Firehose-delivered logs {log_group=~".+"} # Just VPC flow logs {log_group="/aws/vpc/flowlogs"} # A specific ENI {log_group="/aws/vpc/flowlogs", log_stream="eni-0abc1234-all"} # Lambda invocations {log_group="/aws/lambda/checkout-service"} # Just the data records (vs control messages) {msg_type="DATA_MESSAGE"} ``` The promoted labels `log_group`, `log_stream`, and `msg_type` come from the CloudWatch envelope — `loki.source.awsfirehose` automatically attaches `__aws_cw_log_group`, `__aws_cw_log_stream`, and `__aws_cw_msg_type` discovery labels when the records contain a CloudWatch subscription filter envelope; this scenario's `loki.relabel` block promotes them.
## Send your own records The receiver is just an HTTP endpoint. From your laptop: ```bash curl -X POST http://localhost:9999/awsfirehose/api/v1/push \ -H 'Content-Type: application/json' \ -d '{ "requestId": "test-1", "timestamp": 1234567890, "records": [ {"data": "'$(printf '{"messageType":"DATA_MESSAGE","logGroup":"/manual","logStream":"laptop","logEvents":[{"id":"x","timestamp":1234567890000,"message":"hi from curl"}]}' | gzip | base64 | tr -d '\n')'"} ] }' ``` This adds a one-off entry visible at `{log_group="/manual"}`. The `tr -d '\n'` matters on Linux: GNU `base64` wraps its output at 76 columns, and a raw line break inside the `data` string would make the request body invalid JSON. ## Differences from real Firehose This scenario emulates the wire format. A real Firehose delivery stream has a few additional concerns the demo doesn't cover: - **Authentication**: real Firehose includes an `X-Amz-Firehose-Access-Key` header that the receiver validates. `loki.source.awsfirehose` supports this via the `access_key` argument; we leave it disabled in the demo for ease of trying it from curl. In production, **always** set an access key. - **TLS**: real Firehose requires HTTPS. Add `tls { cert_file = ..., key_file = ... }` to the Alloy `http` block in production. - **Retry semantics**: real Firehose retries on 5xx and partial successes. The Python sender here just logs failures and moves on. - **Custom labels via header**: real Firehose can set `X-Amz-Firehose-Common-Attributes` (label names prefixed `lbl_`). Try adding this to your own producer to see additional discovery labels appear. ## Stopping ```bash docker compose down -v ``` ================================================ FILE: aws-firehose-logs/config.alloy ================================================ // AWS Kinesis Data Firehose → Loki, no AWS account required. // // `loki.source.awsfirehose` is just an HTTP endpoint that accepts // Firehose's documented delivery format (a `records` array of base64 // blobs). A small Python sender container in this scenario fakes the // producer side, posting CloudWatch-style log batches every few // seconds.
The component auto-detects the CloudWatch envelope and // attaches the `__aws_cw_*` discovery labels we relabel below. livedebugging { enabled = true } // CloudWatch envelope discovery labels are exposed by // `loki.source.awsfirehose` only via its `relabel_rules` argument // (same pattern as `loki.source.journal`). They are NOT attached to // outgoing entries by default — running them through a standalone // `loki.relabel` after the source would see no `__aws_cw_*` labels. loki.relabel "firehose" { forward_to = [] rule { source_labels = ["__aws_cw_log_group"] target_label = "log_group" } rule { source_labels = ["__aws_cw_log_stream"] target_label = "log_stream" } rule { source_labels = ["__aws_cw_msg_type"] target_label = "msg_type" } } loki.source.awsfirehose "fake" { http { listen_address = "0.0.0.0" listen_port = 9999 } relabel_rules = loki.relabel.firehose.rules forward_to = [loki.write.local.receiver] } loki.write "local" { endpoint { url = "http://loki:3100/loki/api/v1/push" } } ================================================ FILE: aws-firehose-logs/docker-compose.yml ================================================ services: loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - "3100:3100/tcp" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - "3000:3000/tcp" entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat < /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - "12345:12345" - "9999:9999" volumes: - ./config.alloy:/etc/alloy/config.alloy command: run 
def vpc_flow_line(eni: str = "eni-0abc1234") -> str:
    """Return one synthetic, space-separated VPC-flow-style log record.

    Args:
        eni: Interface id written into the record. Defaults to the id that
            was previously hard-coded, so calling with no arguments is
            unchanged.

    Returns:
        A 14-field line with random 10.0.x.y source/destination addresses,
        random packet/byte counts, a 30-second window ending now, and an
        action that is ACCEPT roughly nine times out of ten.
    """
    src = f"10.0.{random.randint(0,255)}.{random.randint(1,254)}"
    dst = f"10.0.{random.randint(0,255)}.{random.randint(1,254)}"
    bytes_ = random.randint(40, 65000)
    pkts = random.randint(1, 50)
    action = random.choices(ACTIONS, weights=[9, 1])[0]
    now = int(time.time())
    return f"2 123456789012 {eni} {src} {dst} 12345 443 6 {pkts} {bytes_} {now-30} {now} {action} OK"


def lambda_log_line() -> str:
    """Return one synthetic Lambda-style log line.

    Returns:
        ``"<UTC ISO-8601 timestamp>Z <LEVEL> RequestId: <uuid4> processing checkout"``
        where LEVEL is INFO three times out of five, otherwise WARN or ERROR.
    """
    # Local import: the module only imports `datetime` from the datetime package.
    from datetime import timezone

    levels = ["INFO", "INFO", "INFO", "WARN", "ERROR"]
    level = random.choice(levels)
    request_id = str(uuid.uuid4())
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop the tzinfo so isoformat() still renders without an
    # offset, keeping the literal "Z" suffix below correct.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return f"{now_utc.isoformat()}Z {level} RequestId: {request_id} processing checkout"


def cloudwatch_envelope(log_group: str, log_stream: str, line_fn) -> dict:
    """Build a CloudWatch logs subscription delivery envelope.

    Args:
        log_group: Value for the envelope's ``logGroup`` field.
        log_stream: Value for the envelope's ``logStream`` field.
        line_fn: Zero-argument callable returning one log message; called
            once per event, EVENTS_PER_BATCH times in total.

    Returns:
        A JSON-serialisable dict with ``messageType`` "DATA_MESSAGE" and a
        ``logEvents`` list whose entries carry a random id, the current time
        in epoch milliseconds and a generated message.

    See: https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/SubscriptionFilters.html
    """
    return {
        "messageType": "DATA_MESSAGE",
        "owner": "123456789012",
        "logGroup": log_group,
        "logStream": log_stream,
        "subscriptionFilters": ["AlloyDemo"],
        "logEvents": [
            {
                "id": str(uuid.uuid4()),
                "timestamp": int(time.time() * 1000),
                "message": line_fn(),
            }
            for _ in range(EVENTS_PER_BATCH)
        ],
    }


def encode_record(envelope: dict) -> dict:
    """Wrap an envelope as a single Firehose record.

    CloudWatch subscription delivery is gzip-compressed JSON, then
    base64-encoded inside the Firehose record `data` field.

    Args:
        envelope: JSON-serialisable dict to encode.

    Returns:
        ``{"data": <base64 text of the gzip-compressed JSON>}``.

    See: https://docs.aws.amazon.com/firehose/latest/dev/httpdeliveryrequestresponse.html
    """
    raw = json.dumps(envelope).encode()
    compressed = gzip.compress(raw)
    return {"data": base64.b64encode(compressed).decode()}


def send_batch() -> None:
    """POST one randomly chosen log-stream batch to the Alloy endpoint.

    Picks a (log group, log stream) pair from LOG_GROUPS, generates
    EVENTS_PER_BATCH lines for it and sends them as a one-record Firehose
    request. Failures are printed and swallowed so the caller's loop keeps
    running; there is no retry.
    """
    log_group, log_stream = random.choice(LOG_GROUPS)
    if "lambda" in log_group:
        line_fn = lambda_log_line
    else:
        # The flow-log stream names here are "<eni-id>-all"; strip the suffix
        # so the interface id inside each record matches its stream.
        eni = log_stream.rsplit("-", 1)[0]
        line_fn = lambda: vpc_flow_line(eni)
    envelope = cloudwatch_envelope(log_group, log_stream, line_fn)

    body = {
        "requestId": str(uuid.uuid4()),
        "timestamp": int(time.time() * 1000),
        "records": [encode_record(envelope)],
    }
    # Supplying `data` makes urllib issue a POST.
    req = urlrequest.Request(
        ENDPOINT,
        data=json.dumps(body).encode(),
        headers={
            "Content-Type": "application/json",
            "X-Amz-Firehose-Request-Id": body["requestId"],
        },
    )
    try:
        with urlrequest.urlopen(req, timeout=5) as resp:
            print(f"POST {log_group}/{log_stream}: {resp.status}", flush=True)
    except Exception as e:
        # Deliberately best-effort: log the failure and let the loop continue.
        print(f"POST {log_group}/{log_stream}: FAILED {e}", flush=True)


def main() -> int:
    """Send a batch every INTERVAL seconds, forever."""
    # Wait briefly so Alloy's HTTP listener is up before the first POST.
    time.sleep(3)
    while True:
        send_batch()
        time.sleep(INTERVAL)


if __name__ == "__main__":
    sys.exit(main() or 0)
// --- Remote Write to Prometheus ---
// Sink for every sample produced below: pushes to the `prometheus` service's
// remote-write endpoint.
prometheus.remote_write "remote" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}

// --- Blackbox Exporter Configuration ---
// `config` is an inline blackbox_exporter configuration written as flow-style
// YAML. It defines a single module, `http_2xx`, that uses the HTTP prober with
// a 5 s timeout. Each `target` block names one endpoint to probe with that
// module; both addresses are services from this scenario's compose file.
prometheus.exporter.blackbox "default" {
  config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }"

  target {
    name    = "nginx"
    address = "http://nginx:80"
    module  = "http_2xx"
  }

  target {
    name    = "prometheus"
    address = "http://prometheus:9090"
    module  = "http_2xx"
  }
}

// --- Blackbox Scrape Configuration ---
// Scrapes the exporter's targets every 15 s and forwards the resulting
// samples to the remote_write component above.
prometheus.scrape "blackbox_targets" {
  scrape_interval = "15s"
  targets         = prometheus.exporter.blackbox.default.targets
  forward_to      = [prometheus.remote_write.remote.receiver]
}

// --- Enable Live Debugging ---
livedebugging {
  enabled = true
}
================================================ FILE: blackbox-probing/docker-compose.coda.yml ================================================ services: nginx: image: nginx:latest@sha256:1881968aff6f7cdcc4b888c00a11f4ce241ad7ec957e0cb4a9e19e93a3ff87ea ports: - 8080:80/tcp ================================================ FILE: blackbox-probing/docker-compose.yml ================================================ services: nginx: image: nginx:latest@sha256:1881968aff6f7cdcc4b888c00a11f4ce241ad7ec957e0cb4a9e19e93a3ff87ea ports: - 8080:80/tcp prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat < /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy ================================================ FILE: blackbox-probing/prom-config.yaml ================================================ # Minimal Prometheus configuration global: scrape_interval: 15s evaluation_interval: 15s ================================================ FILE: cloudwatch-metrics/README.md ================================================ # AWS CloudWatch metrics — no AWS account required Demonstrates `prometheus.exporter.cloudwatch`, Alloy's built-in 
wrapper around [YACE](https://github.com/nerdswords/yet-another-cloudwatch-exporter). **No real AWS account or live infrastructure needed** — [LocalStack](https://localstack.cloud/) emulates the CloudWatch and STS APIs locally, and a small Python seeder container plants synthetic `EC2/CPUUtilization` data points every 30 s. This is the same offline-reproducibility pattern used by [`aws-firehose-logs/`](../aws-firehose-logs/). ## Architecture ``` metric-seeder (Python) └── put_metric_data → LocalStack CloudWatch (:4566) ↑ Alloy prometheus.exporter.cloudwatch ↓ prometheus.scrape → prometheus.remote_write ↓ Prometheus (:9090) ↑ Grafana (:3000) ``` - **`localstack`** — emulates `cloudwatch` + `sts` APIs; no AWS credentials required - **`metric-seeder`** — pushes `CPUUtilization` (random 5–85 %) for `i-1234567890abcdef0` every 30 s - **`alloy`** — runs `prometheus.exporter.cloudwatch` pointed at LocalStack via `AWS_ENDPOINT_URL`; scrapes every 60 s and remote-writes to Prometheus - **`prometheus`** — stores and serves metrics - **`grafana`** — visualises with Prometheus datasource auto-provisioned ## Running ```bash # From this directory docker compose up -d # Or from the repo root ./run-example.sh cloudwatch-metrics ``` LocalStack and the metric-seeder start first; Alloy waits for LocalStack to be healthy before scraping. ## Accessing | Service | URL | |---|---| | **Grafana** | http://localhost:3000 (no login) | | **Prometheus** | http://localhost:9090 | | **Alloy UI** | http://localhost:12345 | | **LocalStack** | http://localhost:4566/_localstack/health | ## Trying it out Within ~90 s of bring-up (LocalStack ready → seeder plants first points → Alloy scrapes → Prometheus ingests), metrics appear in Prometheus. 
// AWS CloudWatch metrics → Prometheus — no AWS account required.
//
// Uses LocalStack to emulate CloudWatch locally. A companion `metric-seeder`
// container pushes synthetic EC2/CPUUtilization data points every 30 s so
// Alloy has real data to scrape immediately on start-up.
//
// `prometheus.exporter.cloudwatch` wraps YACE and honours AWS SDK v2 endpoint
// overrides; we point it at LocalStack via AWS_ENDPOINT_URL in docker-compose.

livedebugging {
  enabled = true
}

// Static job: no live EC2 discovery needed — we target the exact InstanceId
// that the metric-seeder plants in LocalStack CloudWatch.
// The namespace, metric name and InstanceId dimension must stay in sync with
// the values seed-metrics.py passes to put_metric_data.
prometheus.exporter.cloudwatch "localstack" {
  sts_region = "us-east-1"

  static "ec2_cpu" {
    regions   = ["us-east-1"]
    namespace = "AWS/EC2"

    dimensions = {
      "InstanceId" = "i-1234567890abcdef0",
    }

    // One series per requested statistic; the README queries them as
    // aws_ec2_cpuutilization_average and aws_ec2_cpuutilization_maximum.
    metric {
      name       = "CPUUtilization"
      statistics = ["Average", "Maximum"]
      period     = "1m"
    }
  }
}

// Scrape the exporter every 60 s — CloudWatch data points are coarse-grained
// so there is no benefit in scraping more frequently.
prometheus.scrape "cloudwatch" {
  targets         = prometheus.exporter.cloudwatch.localstack.targets
  forward_to      = [prometheus.remote_write.local.receiver]
  scrape_interval = "60s"
}

// Remote-write to the local Prometheus instance.
prometheus.remote_write "local" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}
metric-seeder: image: python:${PYTHON_VERSION:-3.11-slim} volumes: - ./seed-metrics.py:/seed-metrics.py:ro environment: - AWS_ACCESS_KEY_ID=test - AWS_SECRET_ACCESS_KEY=test - AWS_DEFAULT_REGION=us-east-1 - AWS_ENDPOINT_URL=http://localstack:4566 - INTERVAL_SECONDS=30 command: > sh -c "pip install boto3 --quiet && python -u /seed-metrics.py" depends_on: localstack: condition: service_healthy restart: unless-stopped prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --config.file=/etc/prometheus/prometheus.yml volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml ports: - "9090:9090" grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - "3000:3000/tcp" entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat < /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh depends_on: - prometheus alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - "12345:12345" volumes: - ./config.alloy:/etc/alloy/config.alloy environment: # Point AWS SDK v2 at LocalStack instead of real AWS endpoints. 
# Connection settings come from the environment so the compose file can
# redirect the client to LocalStack; the defaults match that setup.
ENDPOINT = os.getenv("AWS_ENDPOINT_URL", "http://localstack:4566")
REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
INTERVAL = int(os.getenv("INTERVAL_SECONDS", "30"))

# Must match the InstanceId dimension configured in config.alloy.
INSTANCE_ID = "i-1234567890abcdef0"

cw = boto3.client(
    "cloudwatch",
    endpoint_url=ENDPOINT,
    region_name=REGION,
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID", "test"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY", "test"),
    config=Config(retries={"max_attempts": 5}),
)

print(f"Seeder started — pushing to {ENDPOINT} every {INTERVAL}s", flush=True)

# Push one random CPUUtilization sample (5-85 %) per interval, forever.
while True:
    cpu = round(random.uniform(5.0, 85.0), 2)
    try:
        cw.put_metric_data(
            Namespace="AWS/EC2",
            MetricData=[
                {
                    "MetricName": "CPUUtilization",
                    "Dimensions": [{"Name": "InstanceId", "Value": INSTANCE_ID}],
                    "Value": cpu,
                    "Unit": "Percent",
                }
            ],
        )
    except Exception as exc:
        # Best-effort seeding: once the client's own retries are exhausted,
        # report the failure and try again on the next tick. Exiting would
        # restart the container and re-run `pip install` every time.
        print(f" ✗ put_metric_data failed: {exc}", flush=True)
    else:
        print(f" → CPUUtilization={cpu}% instance={INSTANCE_ID}", flush=True)
    time.sleep(INTERVAL)
pipefail # On Coda VMs the repo lives at /opt/alloy-scenarios and this script is # symlinked from /usr/local/bin/coda. For local dev use the script's own # directory (works when invoked directly, not via symlink). if [[ -d /opt/alloy-scenarios ]]; then REPO_DIR="/opt/alloy-scenarios" else REPO_DIR="$(cd "$(dirname "$0")" && pwd)" fi ENV_FILE="${REPO_DIR}/image-versions.env" SCENARIO_FILE="/etc/coda/scenario" usage() { cat < [scenario] Commands: start [scenario] Start app containers for a scenario stop [scenario] Stop app containers for a scenario status [scenario] Show container status for a scenario list List all available scenarios If no scenario is given, reads from ${SCENARIO_FILE}. EOF exit 1 } resolve_scenario() { local scenario="${1:-}" if [[ -z "$scenario" ]]; then if [[ -f "$SCENARIO_FILE" ]]; then scenario="$(cat "$SCENARIO_FILE")" else echo "Error: no scenario specified and ${SCENARIO_FILE} not found" >&2 exit 1 fi fi echo "$scenario" } compose_args() { local scenario="$1" local dir="${REPO_DIR}/${scenario}" local compose_file="${dir}/docker-compose.coda.yml" if [[ ! 
-f "$compose_file" ]]; then
    echo "Error: ${compose_file} not found" >&2
    exit 1
  fi

  # Sanitize project name: replace / with -
  local project_name="coda-${scenario//\//-}"

  # Publish the flags as an argv array instead of a flat string. Callers
  # expand it with "${COMPOSE_ARGS[@]}", so paths containing spaces stay
  # intact and nothing is re-parsed as shell code (no eval needed).
  COMPOSE_ARGS=(-f "$compose_file" --env-file "$ENV_FILE" -p "$project_name")
}

# cmd_start [SCENARIO]
# Build and start the scenario's app containers in the background.
cmd_start() {
  local scenario
  scenario="$(resolve_scenario "${1:-}")"
  # Called directly (not in a subshell) so that its `exit 1` on a missing
  # compose file terminates the script and COMPOSE_ARGS is visible here.
  compose_args "$scenario"
  echo "Starting scenario: ${scenario}"
  docker compose "${COMPOSE_ARGS[@]}" up -d --build
}

# cmd_stop [SCENARIO]
# Stop and remove the scenario's app containers.
cmd_stop() {
  local scenario
  scenario="$(resolve_scenario "${1:-}")"
  compose_args "$scenario"
  echo "Stopping scenario: ${scenario}"
  docker compose "${COMPOSE_ARGS[@]}" down
}

# cmd_status [SCENARIO]
# Show container status for the scenario.
cmd_status() {
  local scenario
  scenario="$(resolve_scenario "${1:-}")"
  compose_args "$scenario"
  docker compose "${COMPOSE_ARGS[@]}" ps
}

# cmd_list
# List every directory under REPO_DIR that ships a docker-compose.coda.yml,
# printed relative to REPO_DIR and sorted.
cmd_list() {
  echo "Available scenarios:"
  find "$REPO_DIR" -name docker-compose.coda.yml 2>/dev/null \
    | sed "s|^${REPO_DIR}/||; s|/docker-compose.coda.yml||" \
    | sort \
    | while read -r s; do echo "  $s"; done
}

[[ $# -lt 1 ]] && usage

command="$1"
shift

case "$command" in
  start)  cmd_start "$@" ;;
  stop)   cmd_stop "$@" ;;
  status) cmd_status "$@" ;;
  list)   cmd_list ;;
  *)      usage ;;
esac

================================================
FILE: continuous-profiling/README.md
================================================
# Continuous Profiling

This scenario demonstrates continuous profiling of a Go application using Grafana Alloy's `pyroscope.scrape` and `pyroscope.write` components, with Grafana Pyroscope as the profiling backend.

## Overview

The example includes:

- **demo-app** -- A Go application that performs CPU-intensive and memory-intensive work, exposing standard pprof endpoints on port 6060
- **alloy** -- Grafana Alloy configured to scrape pprof profiles from the demo app and forward them to Pyroscope
- **pyroscope** -- Grafana Pyroscope for storing and querying profiling data
- **grafana** -- Grafana with the Pyroscope datasource pre-configured for visualizing profiles

## Running the Demo

1.
Clone the repository: ``` git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios ``` 2. Navigate to this example directory: ``` cd continuous-profiling ``` 3. Run using Docker Compose: ``` docker compose up -d ``` Or use the centralized image management: ``` cd .. ./run-example.sh continuous-profiling ``` 4. Access Grafana at http://localhost:3000 ## What to Expect After starting the scenario, Alloy will scrape the following profile types from the demo app every 15 seconds: - **CPU** -- Identifies functions consuming the most CPU time (the `cpuIntensive` goroutine) - **Memory (heap)** -- Shows memory allocation patterns (the `memoryIntensive` goroutine allocating 1MB chunks) - **Goroutine** -- Displays active goroutines and their stack traces - **Mutex** -- Captures mutex contention profiles - **Block** -- Captures blocking operation profiles To view profiles: 1. Open Grafana at http://localhost:3000 2. Navigate to **Explore** 3. Select the **Pyroscope** datasource 4. Choose a profile type (e.g., `process_cpu`) and the `demo-app` service 5. 
You should see flame graphs showing where the application spends its time and allocates memory ## Architecture ``` ┌───────────┐ scrape pprof ┌───────────┐ push profiles ┌────────────┐ │ demo-app │◀─────────────────────│ Alloy │─────────────────────▶│ Pyroscope │ │ :6060 │ /debug/pprof/* │ :12345 │ │ :4040 │ └───────────┘ └───────────┘ └─────┬──────┘ │ ▼ ┌──────────┐ │ Grafana │ │ :3000 │ └──────────┘ ``` ## Useful Links - Alloy UI: http://localhost:12345 -- Inspect the Alloy pipeline and component status - Grafana: http://localhost:3000 -- Explore profiles via the Pyroscope datasource - Pyroscope: http://localhost:4040 -- Direct access to the Pyroscope UI - Demo app pprof index: http://localhost:6060/debug/pprof/ -- Raw pprof endpoints ================================================ FILE: continuous-profiling/app/go.mod ================================================ module demo go 1.23 ================================================ FILE: continuous-profiling/app/main.go ================================================ package main import ( "fmt" "math/rand" "net/http" _ "net/http/pprof" "time" ) func cpuIntensive() { for { sum := 0 for i := 0; i < 1000000; i++ { sum += rand.Intn(100) } time.Sleep(100 * time.Millisecond) } } func memoryIntensive() { var data [][]byte for { chunk := make([]byte, 1024*1024) // 1MB for i := range chunk { chunk[i] = byte(rand.Intn(256)) } data = append(data, chunk) if len(data) > 50 { data = data[1:] } time.Sleep(500 * time.Millisecond) } } func main() { go cpuIntensive() go memoryIntensive() fmt.Println("Demo app running on :6060 with pprof endpoints") http.ListenAndServe(":6060", nil) } ================================================ FILE: continuous-profiling/config.alloy ================================================ livedebugging { enabled = true } // Scrape pprof profiles from the demo Go application pyroscope.scrape "default" { targets = [ {"__address__" = "demo-app:6060", "service_name" = "demo-app"}, ] scrape_interval 
= "15s"

  profiling_config {
    profile.process_cpu {
      enabled = true
    }

    profile.memory {
      enabled = true
    }

    profile.goroutine {
      enabled = true
    }

    profile.mutex {
      enabled = true
    }

    profile.block {
      enabled = true
    }
  }

  forward_to = [pyroscope.write.default.receiver]
}

pyroscope.write "default" {
  endpoint {
    url = "http://pyroscope:4040"
  }
}

================================================
FILE: continuous-profiling/docker-compose.coda.yml
================================================
services:
  demo-app:
    image: golang:1.26@sha256:2981696eed011d747340d7252620932677929cce7d2d539602f56a8d7e9b660b
    ports:
      - 6060:6060
    volumes:
      - ./app:/app
    working_dir: /app
    command: go run main.go

================================================
FILE: continuous-profiling/docker-compose.yml
================================================
services:
  # Demo Go application with pprof endpoints
  demo-app:
    image: golang:1.26@sha256:2981696eed011d747340d7252620932677929cce7d2d539602f56a8d7e9b660b
    ports:
      - 6060:6060
    volumes:
      - ./app:/app
    working_dir: /app
    command: go run main.go

  # Pyroscope for continuous profiling storage and visualization
  pyroscope:
    image: grafana/pyroscope:2.0.1@sha256:704889ae04768d982a0a71935bb054948993ddc3fe80234611d20877ba8be4c9
    ports:
      - 4040:4040

  # Grafana for visualization
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        # Heredoc: everything up to the EOF marker is written to ds.yaml.
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Pyroscope
          type: grafana-pyroscope-datasource
          access: proxy
          orgId: 1
          url: http://pyroscope:4040
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh
    depends_on:
      - pyroscope

  # Alloy for telemetry pipeline
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345 # Alloy HTTP server
    volumes:
      -
./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - demo-app - pyroscope ================================================ FILE: docker-monitoring/README.md ================================================ # Docker Monitoring with Grafana Alloy This example demonstrates how to monitor Docker containers using Grafana Alloy. ## Prerequisites - Docker - Docker Compose - Git ## Running the Demo ### Step 1: Clone the repository ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` ### Step 2: Deploy the monitoring stack ```bash cd alloy-scenarios/docker-monitoring docker-compose up -d ``` > **Note (macOS Docker Desktop):** If Alloy cannot connect to the Docker socket, you may need to change the volume mount in `docker-compose.yml` from `/var/run/docker.sock` to `/var/run/docker.sock.raw`. This is a workaround specific to some versions of Docker Desktop on macOS. ### Step 3: Access Grafana Alloy UI Open your browser and go to `http://localhost:12345`. ### Step 4: Access Grafana UI Open your browser and go to `http://localhost:3000`. ================================================ FILE: docker-monitoring/config.alloy ================================================ // ############################### // #### Metrics Configuration #### // ############################### // Host Cadvisor on the Docker socket to expose container metrics. prometheus.exporter.cadvisor "example" { docker_only = true } discovery.relabel "example" { targets = prometheus.exporter.cadvisor.example.targets rule { target_label = "job" replacement = "integrations/docker" } rule { target_label = "instance" replacement = constants.hostname } } // Configure a prometheus.scrape component to collect cadvisor metrics. 
prometheus.scrape "scraper" {
  targets    = discovery.relabel.example.output
  forward_to = [ prometheus.remote_write.demo.receiver ]

  scrape_interval = "10s"
}

// Configure a prometheus.remote_write component to send metrics to a Prometheus server.
prometheus.remote_write "demo" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}

// ###############################
// #### Logging Configuration ####
// ###############################

// Discover Docker containers and extract metadata.
discovery.docker "linux" {
  host = "unix:///var/run/docker.sock"
}

// Define relabeling rules for container logs: copy the Docker container name
// (minus the leading "/" matched by the regex) into a `container_name` label,
// and set `instance` to this host's name.
// `targets` is intentionally empty: only the exported `rules` are consumed,
// by loki.source.docker below.
discovery.relabel "logs_integrations_docker" {
  targets = []

  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "container_name"
  }

  rule {
    target_label = "instance"
    replacement  = constants.hostname
  }
}

// Configure a loki.source.docker component to collect logs from Docker containers.
// Targets come from discovery.docker; the relabel rules above are applied to each
// log stream before it is forwarded to the processing stage.
loki.source.docker "default" {
  host          = "unix:///var/run/docker.sock"
  targets       = discovery.docker.linux.targets
  relabel_rules = discovery.relabel.logs_integrations_docker.rules
  forward_to    = [loki.process.docker_logs.receiver]
}

// Process and filter Docker logs before sending to Loki.
// Example: Drop logs from infrastructure containers.
// Modify the regex pattern to match container names you want to exclude.
loki.process "docker_logs" {
  forward_to = [loki.write.local.receiver]

  stage.drop {
    source     = "container_name"
    expression = "(alloy|grafana|loki)"
  }
}

loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

================================================
FILE: docker-monitoring/docker-compose.yml
================================================
version: '3'

services:
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  # Grafana with anonymous Admin access. The entrypoint writes a datasource
  # provisioning file (Loki plus Prometheus as default), then hands off to /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        # Heredoc: everything up to the EOF marker is written to ds.yaml.
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    privileged: true
    ports:
      - 12345:12345
      - 4317:4317
      - 4318:4318
    environment:
      ALLOY_DEPLOY_MODE: docker
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - /proc:/rootproc:ro
      - /var/run/docker.sock:/var/run/docker.sock
      - /sys:/sys:ro
      - /:/rootfs:ro
      - /dev/disk/:/dev/disk:ro
      - /var/lib/docker/:/var/lib/docker:ro
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    extra_hosts:
      - "host.docker.internal:host-gateway"
    devices:
      - /dev/kmsg

================================================
FILE:
docker-monitoring/grafana/datasources/default.yml ================================================ apiVersion: 1 datasources: - name: Loki type: loki access: proxy url: http://loki:3100 ================================================ FILE: docker-monitoring/loki-config.yaml ================================================ # This is a complete configuration to deploy Loki backed by the filesystem. # The index will be shipped to the storage via tsdb-shipper. auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true distributor: otlp_config: # List of default otlp resource attributes to be picked as index labels # CLI flag: -distributor.otlp.default_resource_attributes_as_index_labels default_resource_attributes_as_index_labels: [service.name service.namespace service.instance.id deployment.environment deployment.environment.name cloud.region cloud.availability_zone k8s.cluster.name k8s.namespace.name k8s.container.name container.name k8s.replicaset.name k8s.deployment.name k8s.statefulset.name k8s.daemonset.name k8s.cronjob.name k8s.job.name] server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true ================================================ FILE: elasticsearch-monitoring/README.md ================================================ # Elasticsearch Monitoring with Grafana Alloy This scenario demonstrates how to monitor an Elasticsearch instance using Grafana Alloy's built-in `prometheus.exporter.elasticsearch` component. 
## Architecture - **Elasticsearch** - The monitored Elasticsearch instance (single-node, security disabled) - **Grafana Alloy** - Collects Elasticsearch metrics via `prometheus.exporter.elasticsearch` and remote writes them to Prometheus - **Prometheus** - Stores the scraped metrics - **Grafana** - Visualizes Elasticsearch metrics (auto-provisioned with Prometheus datasource) ## Running ```bash # From this directory docker compose up -d # Or from the repo root using centralized image versions ./run-example.sh elasticsearch-monitoring ``` ## Accessing - **Grafana**: http://localhost:3000 (no login required) - **Alloy UI**: http://localhost:12345 - **Prometheus**: http://localhost:9090 - **Elasticsearch**: http://localhost:9200 ## Key Metrics Once running, you can query Elasticsearch metrics in Grafana or Prometheus. Some useful metrics include: - `elasticsearch_cluster_health_status` - Cluster health (green/yellow/red) - `elasticsearch_cluster_health_number_of_nodes` - Number of nodes in the cluster - `elasticsearch_indices_docs_total` - Total number of documents - `elasticsearch_indices_store_size_bytes` - Total store size - `elasticsearch_jvm_memory_used_bytes` - JVM memory usage - `elasticsearch_process_cpu_percent` - CPU usage - `elasticsearch_breakers_tripped` - Circuit breaker trip count Metrics are scraped every 30s by default — adjust `scrape_interval` in `config.alloy` if you need finer or coarser resolution. ## Stopping ```bash docker compose down ``` ================================================ FILE: elasticsearch-monitoring/config.alloy ================================================ // Elasticsearch Monitoring with Grafana Alloy // This configuration scrapes Elasticsearch metrics using the built-in prometheus.exporter.elasticsearch component // and remote writes them to Prometheus. 
livedebugging {
  enabled = true
}

prometheus.exporter.elasticsearch "default" {
  address = "http://elasticsearch:9200"
}

prometheus.scrape "elasticsearch" {
  targets         = prometheus.exporter.elasticsearch.default.targets
  forward_to      = [prometheus.remote_write.default.receiver]
  scrape_interval = "30s"
}

prometheus.remote_write "default" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}

================================================
FILE: elasticsearch-monitoring/docker-compose.coda.yml
================================================
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0@sha256:2f602552550869fb29b6fd5848c5118d3ef3a2e1d5d45802e3ab9088cb2de8e2
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms512m -Xmx512m
    ports:
      - "9200:9200"

================================================
FILE: elasticsearch-monitoring/docker-compose.yml
================================================
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0@sha256:2f602552550869fb29b6fd5848c5118d3ef3a2e1d5d45802e3ab9088cb2de8e2
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms512m -Xmx512m
    ports:
      - "9200:9200"

  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  # Grafana with anonymous Admin access. The entrypoint writes a datasource
  # provisioning file pointing at Prometheus, then hands off to /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        # Heredoc: everything up to the EOF marker is written to ds.yaml.
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh
alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - elasticsearch - prometheus ================================================ FILE: elasticsearch-monitoring/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s ================================================ FILE: faro-frontend-observability/README.md ================================================ # Faro Frontend Observability This scenario demonstrates collecting frontend web telemetry using Grafana Alloy's `faro.receiver` component and the [Grafana Faro Web SDK](https://github.com/grafana/faro-web-sdk). The Faro Web SDK runs in the browser and captures logs, errors, events, and web vitals, then sends them to Alloy's Faro receiver endpoint. Alloy forwards the collected telemetry to Loki for storage and querying. ## Architecture ``` Browser (Faro Web SDK) --> Alloy (faro.receiver :12347) --> Loki (:3100) | Grafana (:3000) ``` ## Getting Started 1. Start all services: ```bash docker compose up -d ``` 2. Open the demo web page at [http://localhost:8080](http://localhost:8080). 3. Click the buttons to generate telemetry: - **Send Log** -- pushes an info-level log message - **Throw Error** -- catches and reports a JavaScript error - **Send Event** -- sends a custom event with metadata - **Unhandled Error** -- triggers an uncaught exception (automatically captured by Faro) 4. 
View the collected telemetry in Grafana: - Open [http://localhost:3000](http://localhost:3000) - Go to **Explore** and select the **Loki** datasource - Query with `{service_name="faro-demo"}` to see all frontend telemetry ## Services | Service | URL | Description | |---------|-----|-------------| | Web (nginx) | [http://localhost:8080](http://localhost:8080) | Demo frontend page with Faro Web SDK | | Alloy | [http://localhost:12345](http://localhost:12345) | Alloy UI for pipeline debugging | | Alloy Faro Receiver | `http://localhost:12347/collect` | Faro SDK collection endpoint | | Loki | [http://localhost:3100](http://localhost:3100) | Log aggregation backend | | Grafana | [http://localhost:3000](http://localhost:3000) | Visualization and querying | ## Alloy Pipeline The `config.alloy` pipeline is straightforward: 1. **`faro.receiver`** -- listens on port 12347 for Faro Web SDK payloads with CORS enabled for all origins 2. **`loki.write`** -- forwards the received logs to Loki ## Cleanup ```bash docker compose down ``` ================================================ FILE: faro-frontend-observability/app/index.html ================================================ Faro Frontend Observability Demo

Faro Frontend Observability Demo

Click the buttons below to generate frontend telemetry. Check Grafana Loki for the collected data.

Telemetry output will appear here...
================================================ FILE: faro-frontend-observability/config.alloy ================================================ livedebugging { enabled = true } // Receive frontend telemetry from the Faro Web SDK faro.receiver "default" { server { listen_address = "0.0.0.0" listen_port = 12347 cors_allowed_origins = ["*"] } output { logs = [loki.write.local.receiver] } } loki.write "local" { endpoint { url = "http://loki:3100/loki/api/v1/push" } } ================================================ FILE: faro-frontend-observability/docker-compose.coda.yml ================================================ services: web: image: nginx:latest@sha256:1881968aff6f7cdcc4b888c00a11f4ce241ad7ec957e0cb4a9e19e93a3ff87ea ports: - 8080:80 volumes: - ./app:/usr/share/nginx/html:ro ================================================ FILE: faro-frontend-observability/docker-compose.yml ================================================ services: # Nginx web server serving the demo frontend page web: image: nginx:latest@sha256:1881968aff6f7cdcc4b888c00a11f4ce241ad7ec957e0cb4a9e19e93a3ff87ea ports: - 8080:80 volumes: - ./app:/usr/share/nginx/html:ro # Alloy telemetry pipeline — receives Faro Web SDK telemetry and forwards logs to Loki alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 - 12347:12347 volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data --stability.level=experimental /etc/alloy/config.alloy depends_on: - loki # Loki for log aggregation loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - 3100:3100/tcp volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp 
entrypoint:
  - sh
  - -euc
  - |
    mkdir -p /etc/grafana/provisioning/datasources
    # Heredoc: everything up to the EOF marker is written to ds.yaml.
    cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
    apiVersion: 1
    datasources:
    - name: Loki
      type: loki
      access: proxy
      orgId: 1
      url: http://loki:3100
      basicAuth: false
      isDefault: true
      version: 1
      editable: false
    EOF
    /run.sh
depends_on:
  - loki

================================================
FILE: faro-frontend-observability/loki-config.yaml
================================================
auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

server:
  http_listen_port: 3100

common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

ingester:
  max_chunk_age: 2h

================================================
FILE: game-of-tracing/AGENTS.md
================================================
# Game of Tracing — Agent Guide

> Canonical guide for any AI coding agent working inside this scenario. Tool-agnostic (Cursor, Codex, Cline, Aider, Claude Code). Claude-specific dispatch lives in `CLAUDE.md`.

## What this scenario is

**Game of Tracing** (titled *War of Kingdoms* in the UI) is a distributed-tracing tutorial game in the `alloy-scenarios` repository. It is substantially more elaborate than other scenarios in the repo: 10 Python/Flask services, two kingdoms competing over 8 territories, an algorithmic AI opponent, and the full LGMT stack (Loki, Grafana, Metrics/Prometheus, Tempo) sitting behind Grafana Alloy.
The **headline feature** is **span-link-driven game replay**: every player and AI action stores its `trace_id`/`span_id` in SQLite; the next action creates an OpenTelemetry `trace.Link` to the previous one, producing a causal chain of traces that can be replayed from Tempo. See `SPAN_LINKS.md` for the full spec and `README.md` for the player-facing tutorial narrative. ## Architecture at a glance ``` Players ──► war-map (8080) ──┐ │ AI Opponent (8081) ──────────┤──► 8 Location Services (5001-5008) │ southern-capital, northern-capital, │ village-1 … village-6 │ All services ──OTLP──► Alloy (4317 gRPC / 4318 HTTP) ──► Tempo (3200) ├─► Loki (3100) └─► Prom (9090) │ Grafana (3000) ──datasources──► Tempo (default), Loki, Prometheus ``` All services push OTLP to Alloy; Alloy fans out by signal (traces→Tempo, logs→Loki, metrics→Prometheus). Grafana is auto-provisioned with all three datasources plus traces↔logs↔metrics correlation. ## Services and ports | Service | Port(s) | Build context | Image version env | Purpose | |---|---|---|---|---| | `loki` | 3100 | — | `GRAFANA_LOKI_VERSION` (default 3.6.7) | Log storage | | `prometheus` | 9090 | — | `PROMETHEUS_VERSION` (default v3.10.0) | Metrics storage + OTLP receiver | | `tempo` | 3200 | — | `GRAFANA_TEMPO_VERSION` (default 2.10.1) | Trace storage + metrics generator | | `grafana` | 3000 | — | `GRAFANA_VERSION` (default 12.4.0) | Visualization (anonymous admin) | | `alloy` | 12345, 4317, 4318 | — | `GRAFANA_ALLOY_VERSION` (default v1.14.0) | Telemetry pipeline | | `southern-capital` | 5001 | `./app` | — | Capital location service | | `northern-capital` | 5002 | `./app` | — | Capital location service | | `village-1` … `village-6` | 5003-5008 | `./app` | — | Village location services | | `war-map` | 8080 | `./war_map` | — | Game UI + span-link broker | | `ai-opponent` | 8081 | `./ai_opponent` | — | Algorithmic AI opponent | Image versions are centralized in `image-versions.env` at the repository root (`../image-versions.env` relative to this scenario) — edit
that file, not the compose files (they use `${VAR:-default}` syntax). ## Submodules (each has its own CLAUDE.md) - **`app/`** — the 8 location Flask services. See [`app/CLAUDE.md`](app/CLAUDE.md). - **`ai_opponent/`** — the algorithmic strategic AI (not LLM). See [`ai_opponent/CLAUDE.md`](ai_opponent/CLAUDE.md). - **`war_map/`** — the Flask UI and the owner of span-link reconstruction logic. See [`war_map/CLAUDE.md`](war_map/CLAUDE.md). ## Shared state One Docker volume, `game-data`, mounted at `/data`. **Two SQLite databases live under it, with different owners — do not confuse them:** | File | Owner | Mode | Purpose | |---|---|---|---| | `game_state.db` | All 8 location services (shared) | WAL | Canonical game state: resources, armies, faction per location | | `game_sessions.db` | `war_map/` only | default | `game_actions` table: per-action `trace_id`, `span_id`, `action_sequence`, `game_session_id` — drives span linking | Overriding `DATABASE_FILE` (game_state) or `GAME_SESSIONS_DB` (game_sessions) env vars on `war_map` is supported. ### Extra tables added for multi-map support `game_state.db` also holds: - **`game_config`** — key/value store; the `active_map_id` row is authoritative at runtime. `war_map`'s `/select_map` route writes this; every location service reads it on boot and `/reload`. - **`faction_economy`** — `(faction, corpses)`. Holds the White Walkers' corpse pool on the WWA map. Populated by the post-battle hook in `LocationServer.receive_army` and by the passive corpse tick at the WW fortress. Consumed by `LocationServer.create_army` when the faction's currency is `corpses`. - **`wall_hold`** — `(map_id, faction, ticks, last_update)`. Written by `war_map`'s `_wall_tick_thread`. Non-zero rows mean that faction currently holds every wall keep on that map. `game_sessions.db` has a `map_id` column added to the `game_actions` table so replay queries can filter by map. 
Fresh installs seed `map_id=NULL` for any legacy rows; an additive `ALTER TABLE` migration runs on first boot after the upgrade. ## Maps `app/game_config.py` defines a `MAPS` dict with two entries: | Map id | Players | Factions | Win | Notable rules | |---|---|---|---|---| | `war_of_kingdoms` (default) | 2 | `southern`, `northern`, `neutral` | Capture enemy capital | Classic — 30 resources per army, 20 resource/collect at capitals, village passive +10/15 s | | `white_walkers_attack` | 1 (player is `nights_watch`) | `nights_watch`, `white_walkers`, `barbarian`, `neutral` | Hold every `wall` keep for 5 × 30 s ticks | `wall` settlement type doubles defenders; WW spends 5 corpses per army (no resources); barbarian villages grow +1 army every 30 s; WW fortress passively +1 corpse every 15 s | Each map also defines a **slot assignments** dict (`slot_1` → logical location id) so the 8 physical containers can serve either map. See "Slot identity" below. ### Slot identity Each location container has a fixed `SLOT_ID` env var (`slot_1` … `slot_8`). On boot, the container: 1. Reads the shared `active_map_id` from `game_state.db`'s `game_config` table. 2. Looks up `MAPS[active_map_id]["slot_assignments"][SLOT_ID]` → its logical `location_id`. 3. Loads config from `MAPS[active_map_id]["locations"][location_id]`. The container's **SERVICE_NAME** (used by Grafana dashboards) stays stable (`southern-capital`, `village-1`, etc.) regardless of the map — the *logical* location id is published as the `location.id` span attribute, not the service name. Runtime map switching: `war_map/select_map` writes a new `active_map_id`, POSTs `/reset` to any one container to wipe the `locations` table, then POSTs `/reload` to every container so they rebind in place without a restart. ## Two Alloy configurations ### Default — River (HCL) ```bash cd game-of-tracing && docker compose up -d ``` Uses `config.alloy`. Alloy runs with `run /etc/alloy/config.alloy`. 
### Alternate — OTel Collector YAML ```bash cd game-of-tracing && docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d ``` Uses `config-otel.yaml`. Alloy runs with its OTel Engine mode: `otel --config=/etc/alloy/config-otel.yaml`. The pipeline is functionally identical; this variant demonstrates Alloy's ability to accept OTel Collector syntax. ## OpenTelemetry patterns you must respect Every service has its own `telemetry.py` exposing a `GameTelemetry` class that wires up all three signals. - **Traces** — OTLP gRPC → `alloy:4317`, `BatchSpanProcessor(max_export_batch_size=1)`. The batch size of 1 is **intentional** for demo timing; do not tune it. - **Logs** — OTLP HTTP → `alloy:4318/v1/logs`, `BatchLogRecordProcessor(max_queue_size=30, max_export_batch_size=5)`. - **Metrics** — OTLP HTTP → `alloy:4318/v1/metrics`, `PeriodicExportingMetricReader(export_interval_millis=10000)`, `TraceBasedExemplarFilter` (so metric exemplars link to trace IDs). ### Context propagation is manual Incoming requests extract W3C trace context from headers; outgoing requests inject it: ```python # Incoming (every route handler): ctx = extract(request.headers) with tracer.start_as_current_span("name", context=ctx, ...) as span: # Outgoing (canonical helper at app/location_server.py:327-352): inject(headers) requests.post(url, headers=headers, ...) ``` ### Background threads MUST capture context explicitly Python threads do not inherit OpenTelemetry context. The scenario's canonical pattern is to capture before spawning and attach inside the thread: ```python # app/location_server.py:209-271 (_continue_army_movement) — canonical example: ctx = get_current() def move(): token = attach(ctx) try: with self.tracer.start_as_current_span("army_movement", ...): ... finally: detach(token) Thread(target=move).start() ``` The same pattern appears in `_transfer_resources_along_path` at `app/location_server.py:273-325`. 
If a background span shows up with a missing or different `trace_id`, the `get_current()` / `attach` / `detach` pair is the first thing to check. ## Span links — the headline feature Span links are the mechanism that turns a sequence of discrete player actions into a replayable narrative. See `SPAN_LINKS.md` for the full design. **Flow:** 1. Player selects a faction → `war_map/app.py` creates a `game_session_id` (UUID). 2. Every action handler (`/api/collect_resources`, `/api/create_army`, `/api/move_army`) does: - Looks up the previous action for this session via `get_previous_action_context()` at `war_map/app.py:130-170`. That function reads `trace_id` and `span_id` from the `game_actions` SQLite table and rebuilds a `trace.SpanContext(..., is_remote=True, trace_flags=TraceFlags.SAMPLED)`. - Wraps the context in a link via `create_span_link_from_context()` at `war_map/app.py:172-189`, attaching `link.type="game_sequence"`, `link.relation="follows"`, `game.sequence="true"`. - Starts its own action span with that link, then calls `store_game_action()` to record its own `trace_id`/`span_id` for the next action to link back to. 3. The AI opponent uses the same primitive with a different link type — `link.type="ai_decision_trigger"` — to link its decision span to the action execution span it spawns (see `ai_opponent/ai_server.py`). 4. The replay UI queries Tempo: - `GET /api/v2/search/tag/game.session.id/values` to enumerate sessions. - `GET /api/search?q={game.session.id="<session-id>"}` to pull every trace in a session. - SQLite `game_actions` is the fallback if Tempo is unavailable.
## Custom metrics reference ### From `app/telemetry.py` | Metric | Type | Attributes | Notes | |---|---|---|---| | `game.resources` | observable gauge | `location`, `location_type` | Current resource pool per location | | `game.army_size` | observable gauge | `location`, `location_type`, `faction` | Current army strength | | `game.battles` | counter | `attacker_faction`, `defender_faction`, `result`, `location` | `result ∈ {attacker_victory, defender_victory, stalemate, reinforcement}` | | `game.resource_transfer_cooldown` | observable gauge | `location` | Seconds remaining | | `game.location_control` | observable gauge | `location`, `location_type`, `faction` | `northern=1, southern=2, neutral=0, unknown=-1` | ### From `ai_opponent/telemetry.py` | Metric | Type | Attributes | |---|---|---| | `ai.decisions` | counter | `action_type`, `phase`, `reason` | | `ai.plans_created` | counter | `goal` | | `ai.plans_abandoned` | counter | `reason` | | `ai.decision_cycle_duration_seconds` | histogram | `phase` | | `ai.territory_count` | observable gauge | `faction` | | `ai.total_army` | observable gauge | `faction` | ### Span attributes used by the provisioned Grafana dashboard Preserve these when adding new spans — the dashboard's TraceQL filters depend on them: - `span.resource.movement = true` - `span.battle.occurred = true` - `span.player.action = true` ## Common tasks ```bash # Start everything cd game-of-tracing && docker compose up -d # Stop (preserves volume) docker compose down # Stop and wipe game state docker compose down -v # Rebuild only one service after code change docker compose up -d --build war-map # Switch to the OTel Engine variant docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # Tail a trace end-to-end # 1. Game UI: http://localhost:8080 # 2. Grafana: http://localhost:3000 (anonymous admin) # 3. Alloy debug: http://localhost:12345/debug/livedebugging # 4. 
Tempo API: http://localhost:3200 ``` ## Gotchas - **Hyphens vs underscores.** Service names are hyphenated (`southern-capital`, set via `SERVICE_NAME` resource attribute); location IDs in game_config.py and DB rows are underscored (`southern_capital`). Code that bridges them uses `location_id.replace('_', '-')`. Do not cross them. - **Two compose files — `docker-compose.yml` and `docker-compose.coda.yml`.** The coda variant redefines the same 10 app-layer services already defined in the main compose file, for use with the `coda` CLI. When editing app services, update both. - **Image versions.** Live in `../image-versions.env` (at the repository root). Compose files use `${VAR:-default}` — edit the env file, not the compose. - **Grafana is auto-provisioned** via `grafana/datasources/defaults.yml`. Tempo is the default datasource; service map, traces-to-logs (Loki `trace_id` label), traces-to-metrics, and exemplars are pre-wired. Do not add datasources via UI — edit the YAML. - **Tempo metrics generator is enabled** in `tempo-config.yaml` with processors `service-graphs`, `span-metrics`, `local-blocks`, writing to `prometheus:9090/api/v1/write`. Ingester `max_block_duration: 5m` and 720h compactor retention are demo-tuned, not production values. - **`grafana-traces-app` plugin** is installed via `GF_INSTALL_PLUGINS` at container start. If Grafana is slow on first boot, that is why. - **`war-map` strips `X-Frame-Options`** in an `@app.after_request` hook (`war_map/app.py:191-194`) so the UI can be embedded in Grafana iframes. Intentional — do not remove. ## Keep docs current **Any change to this scenario must land in the same work unit as a doc update.** Stale line-number anchors, removed symbols, or new services that nobody documents are treated as regressions, not cleanup tasks.
Files that must be checked whenever the scenario changes: - `game-of-tracing/AGENTS.md` (this file) - `game-of-tracing/CLAUDE.md` - `game-of-tracing/app/CLAUDE.md` - `game-of-tracing/ai_opponent/CLAUDE.md` - `game-of-tracing/war_map/CLAUDE.md` - `.claude/agents/game-of-tracing-expert.md` (cheat-sheet references) Triggers that require a doc update: new service, renamed function, new/changed span attribute, new env var, added/removed metric, port change, dependency bump, new action type in the span-link chain, change to any cited line-number anchor. The Claude sub-agent at `.claude/agents/game-of-tracing-expert.md` owns this responsibility end-to-end for Claude Code sessions. For non-Claude agents: before returning a response that involved a code edit, grep the six files above for any outdated references and update them. ## Verification After any meaningful change, run through this sequence: 1. **Smoke the scenario.** `cd game-of-tracing && docker compose up -d`; wait ~20s for all 10 services to be healthy (`docker compose ps` — all should be `(healthy)` or `Up`). 2. **Confirm Alloy ingest.** Open `http://localhost:12345/debug/livedebugging`. Select the `otelcol.receiver.otlp.default` component and confirm non-zero signal counts for traces/logs/metrics. 3. **Trigger a player action.** Open `http://localhost:8080`, pick a faction, collect resources, create an army, move it to a neutral village. 4. **Inspect the resulting trace.** Grafana at `http://localhost:3000` → Explore → Tempo → Search by `game.session.id` tag. Verify: - Parent player-action span in `war-map`. - Child CLIENT span with propagated trace context. - SERVER span in the target location (`village-X` etc.). - Background `army_movement` span sharing the same `trace_id` (confirms `get_current()`/`attach` worked). - A span link back to the previous action span (the headline feature). 5. 
**Dashboard check.** Open the provisioned *War of Kingdoms* dashboard; TraceQL filters like `{span.resource.movement = true}` should return traces. 6. **Shutdown.** `docker compose down` (add `-v` to wipe volumes). ## Cross-references - Full span-link design: [`SPAN_LINKS.md`](SPAN_LINKS.md) - Player-facing tutorial: [`README.md`](README.md) - Generic scenario conventions: [`../CLAUDE.md`](../CLAUDE.md) - Submodule guides: [`app/CLAUDE.md`](app/CLAUDE.md), [`ai_opponent/CLAUDE.md`](ai_opponent/CLAUDE.md), [`war_map/CLAUDE.md`](war_map/CLAUDE.md) ================================================ FILE: game-of-tracing/CLAUDE.md ================================================ # CLAUDE.md — Game of Tracing (Claude Code) > Claude-specific workflow for this scenario. For architecture, services, OpenTelemetry patterns, span-link mechanics, and gotchas, **read [`./AGENTS.md`](AGENTS.md) first**. This file only covers what's different when the agent is Claude Code. ## Start here 1. Read `./AGENTS.md` for the scenario overview — including the **Maps** and **Slot identity** sections. 2. Read the submodule `CLAUDE.md` matching the area you are touching: [`app/CLAUDE.md`](app/CLAUDE.md), [`ai_opponent/CLAUDE.md`](ai_opponent/CLAUDE.md), [`war_map/CLAUDE.md`](war_map/CLAUDE.md). 3. If the task involves span links, trace replay, cross-service context propagation, or AI decision logic — delegate to the sub-agent below. ### Two maps, one stack The scenario ships **two maps** selected via an in-UI picker at game start: `war_of_kingdoms` (default 2-player) and `white_walkers_attack` (single-player Night's Watch vs AI White Walkers with `wall` keeps, corpse economy, and a 5-tick hold-to-win condition). Both reuse the same 8 location containers — each container has a constant `SLOT_ID` env and picks up its logical identity from `MAPS[active_map_id]["slot_assignments"][SLOT_ID]` in `app/game_config.py`. 
Changing maps writes a new `active_map_id` to the shared `game_config` table and POSTs `/reload` to every slot. ## Sub-agent dispatch A specialized sub-agent lives at [`../.claude/agents/game-of-tracing-expert.md`](../.claude/agents/game-of-tracing-expert.md). Use it (via `Task` tool, `subagent_type: game-of-tracing-expert`) for any non-trivial question about: - Reconstructing or debugging span contexts / span links - Cross-service or cross-thread OpenTelemetry context propagation - The `StrategicAI` priority cascade, game phases, or AI metric instrumentation - Tempo TraceQL queries used by the replay UI - Why a trace is orphaned, missing, or appears duplicated in Grafana The sub-agent is read-only (no Write/Edit tools) — it reports; the parent agent does the writes. It **also owns keeping the docs in sync with the code** — see "Keep docs current" below. ## Tool preferences - **Use `Read`, not `cat`**, for the large files in this scenario. Use `offset` / `limit` to target line ranges rather than reading the whole file: - `app/location_server.py` (~52 KB, ~1200 lines) - `ai_opponent/ai_server.py` (~46 KB) - `war_map/app.py` (~64 KB) - `war_map/templates/map.html` (~50 KB) - `war_map/templates/replay_session.html` (~28 KB) - `SPAN_LINKS.md` (~17 KB) - **Use `Grep`, not `grep | head`** for pattern search across the scenario. - For the Alloy pipeline debug UI (`http://localhost:12345`), the stack has to be running — either ask the user to `docker compose up -d` or check `docker compose ps` first. 
## Read-before-edit checklist Before editing any service, open these files to ground yourself: | Change area | Open first | |---|---| | Location server behavior | `app/telemetry.py`, relevant route handler in `app/location_server.py`, `app/game_config.py`, the service block in `docker-compose.yml` | | AI decision logic | `ai_opponent/telemetry.py`, `ai_opponent/ai_server.py`, `ai_opponent/README.md` | | UI, sessions, or replay | `war_map/telemetry.py`, `war_map/app.py` (especially `:130-189` for span-link plumbing), relevant template under `war_map/templates/` | | Telemetry pipeline | `config.alloy` (default) or `config-otel.yaml` (OTel variant), `tempo-config.yaml`, `loki-config.yaml`, `prom-config.yaml` | | Datasources / dashboards | `grafana/datasources/defaults.yml`, `grafana/dashboards/*.json` | | Image versions | `../image-versions.env` | ## Keep docs current **Whenever a change to this scenario ships, the matching docs must ship in the same change.** The sub-agent (`game-of-tracing-expert`) enforces this during its work; Claude Code in the main loop is responsible whenever the sub-agent is not invoked. Triggers that require a doc update in the same commit: - New service, renamed function, relocated symbol (line-number anchors shift) - New, removed, or renamed span attribute — especially the ones that feed the Grafana dashboard TraceQL (`span.resource.movement`, `span.battle.occurred`, `span.player.action`) - New or removed env var - New or removed metric - Port change - Dependency version bump (update `image-versions.env` *and* any docs that quote a version) - New action type in the span-link chain (both `war_map/app.py` handler and `replay_session.html` renderer) Files to sweep on every scenario change: 1. `game-of-tracing/AGENTS.md` 2. `game-of-tracing/CLAUDE.md` (this file) 3. `game-of-tracing/app/CLAUDE.md` 4. `game-of-tracing/ai_opponent/CLAUDE.md` 5. `game-of-tracing/war_map/CLAUDE.md` 6. 
`.claude/agents/game-of-tracing-expert.md` Stale line-number anchors are treated as regressions, not cleanup tasks. If a cited `file:line` range no longer resolves to the referenced symbol, fix it. ## Relationship to the repo root - The repo-root [`../CLAUDE.md`](../CLAUDE.md) covers the generic multi-scenario conventions (run commands, scenario directory layout, Alloy pipeline shape). - This file overrides nothing; it extends the root with the patterns that are unique to this scenario (manual context propagation, background-thread context capture, span-link-driven replay, AI instrumentation). ================================================ FILE: game-of-tracing/README.md ================================================ --- title: A Game of Traces menuTitle: A Game of Traces description: A grand strategy game with distributed tracing weight: 600 killercoda: title: A Game of Traces description: A grand strategy game with distributed tracing details: intro: foreground: docker-compose-update.sh backend: backend: imageid: ubuntu --- # War of Kingdoms: A Distributed Tracing Tutorial Game
Game of Tracing
This educational game demonstrates distributed tracing concepts through an interactive strategy game built with OpenTelemetry and Grafana Alloy. Players learn about trace sampling, service graphs, and observability while competing for territory control. ## Educational Goals This game teaches several key concepts in distributed tracing: 1. **Distributed System Architecture** - Multiple microservices (locations) communicating via HTTP - Shared state management - Event-driven updates - Real-time data propagation 2. **OpenTelemetry Concepts** - Trace context propagation - Span creation and attributes - Service naming and resource attributes - Manual instrumentation techniques 3. **Observability Patterns** - Trace sampling strategies - Error tracking and monitoring - Performance measurement - Service dependencies visualization ## Game Overview Open the scenario at `http://localhost:8080` and you land on a **map picker**. Two maps ship today: ### War of Kingdoms (default, 2-player) Two rival kingdoms — Southern and Northern — race to capture the enemy capital. Players: - Collect resources from their territories - Build armies (30 resources per unit) to expand their influence - Capture neutral villages (6 of them) - Send resources back to their capital - Launch strategic attacks on enemy territories **Win condition:** capture the enemy capital. ### White Walkers Attack (single-player) The Long Night has come. The human plays the **Night's Watch** (player faction); the AI opponent plays the **White Walkers**. A new **Barbarian** faction controls two villages on the flanks — passive, slowly accruing army units, good raid targets. New mechanics: - **Wall settlements** run across the middle of the map. Defenders count **2×** when a wall is attacked, making them hard to dislodge. - **Corpse economy.** White Walkers spend **corpses** (not resources) to raise new armies at their fortress. 
Corpses come from winning battles (every unit killed on either side becomes a corpse) plus a slow passive tick at the fortress itself. Cost: 5 corpses per unit. - **Barbarians** never attack. They accrue +1 army every 30 s — easy farm for White Walkers, but they also harass unguarded Night's Watch supply lines. **Win condition:** hold *every* wall settlement continuously for **5 ticks** (150 s, since the tick is 30 s). Any wall changing hands resets the counter. Both maps share the same 8 location containers — the active map lives in `game_state.db`, and the `/reload` endpoint on each service rebinds the slot's identity when the player switches maps via the picker. Each action in the game generates traces that can be analyzed in Grafana Tempo, demonstrating how distributed tracing works in a real application. ## Technical Components The application consists of: - **Location Servers**: Python Flask microservices representing different map locations - **War Map UI**: Web interface for game interaction - **AI Opponent**: Intelligent computer player for single-player mode - **Telemetry Pipeline**: - OpenTelemetry SDK for instrumentation - `pyroscope-otel` bridge for linking traces to CPU profiles - Grafana Alloy for trace/log/metric/profile processing - Tempo for trace storage - Prometheus for metrics - Loki for logs - Pyroscope for continuous profiling - Grafana for visualization ## Running the Demo 1. Clone the repository: ```bash git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios ``` 2. Navigate to this example: ```bash cd game-of-tracing ``` 3. Run using Docker Compose: ```bash docker compose up -d ``` 4. 
Access the components: - Game UI: [http://localhost:8080](http://localhost:8080) - Grafana: [http://localhost:3000](http://localhost:3000) - Prometheus: [http://localhost:9090](http://localhost:9090) - Pyroscope: [http://localhost:4040](http://localhost:4040) - Alloy Debug: [http://localhost:12345/debug/livedebugging](http://localhost:12345/debug/livedebugging) 5. Multiplayer Access: - The game supports multiple players simultaneously - Players can join using: - `http://localhost:8080` from the same machine - `http://<host-ip>:8080` from other machines on the network (replace `<host-ip>` with the IP address of the machine running Docker) - Each player can choose either the Southern or Northern faction - The game prevents multiple players from selecting the same faction 6. Single-Player Mode: - Toggle "Enable AI Opponent" in the game interface - The AI will automatically control the faction not chosen by the player - The AI provides a balanced challenge with adaptive strategies - For two-player games, keep the AI toggle disabled ## Setting Up the Dashboard 1. Open Grafana at http://localhost:3000 (anonymous admin auth is enabled, no login required). 2. The **War of Kingdoms** dashboard is auto-provisioned at startup — no manual import needed. Find it under Dashboards → Browse. 3. Data sources (Prometheus, Loki, Tempo, **Pyroscope**) are auto-provisioned too. The Tempo datasource is pre-wired to Loki (traces-to-logs), Prometheus (traces-to-metrics), and Pyroscope (traces-to-profiles), so every span in Explore gets a "View profile" link. 4. The dashboard provides: - Real-time army and resource metrics - Battle analytics - Territory control visualization - Service dependency mapping - Trace analytics for game events ### Viewing Profiles With every player action the app emits CPU pprof samples via the `pyroscope-otel` bridge. Each span carries a `pyroscope.profile.id` attribute that Grafana uses to jump directly from a span to its flamegraph. - Explore → **Pyroscope** datasource → pick a service (e.g. `war-map`) → flamegraph renders.
- Explore → **Tempo** → open a recent trace → right-click a span → **View Profile**. > **OTel-engine variant note**: when running the alternate pipeline via `docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d`, Alloy's OTel-engine mode has no native Pyroscope receiver. The Python services still profile themselves, but the default profile endpoint (`http://alloy:9999`) won't exist. Override with `PYROSCOPE_SERVER_ADDRESS=http://pyroscope:4040` in the environment to push profiles straight to Pyroscope. ## Learning Through Play ### 1. Trace Context Propagation Watch how actions propagate through the system: - Resource collection triggers spans across services - Army movements create trace chains - Battle events generate nested spans ### 2. Service Graph Analysis Learn how services interact: - Village-to-capital resource flows - Army movement paths - Battle resolution chains ## Observability Features ### 1. Resource Movement Tracing ```console {span.resource.movement = true} ``` Track resource transfers between locations with detailed timing and amounts. ### 2. Battle Analysis ```console {span.battle.occurred = true} ``` Analyze combat events, outcomes, and participating forces. ### 3. Player Actions ```console {span.player.action = true} ``` Monitor player interactions and their impact on the game state. ## Architecture Deep Dive ### Trace Flow Example: Army Movement 1. Player initiates move (UI span) 2. Source location processes request (source span) 3. Movement calculation (path span) 4. Target location receives army (target span) 5. Battle resolution if needed (battle span) 6. State updates propagate (update spans) Each step generates spans with relevant attributes, demonstrating trace context propagation in a distributed system. 
## Educational Use This project is designed for educational purposes to teach: - Distributed systems concepts - Observability practices - Microservice architecture - Real-time data flow - System instrumentation ## Contributing We welcome contributions! Please see our [contribution guidelines](CONTRIBUTING.md) for details. ## License This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. ## Disclaimer This is an educational project focused on teaching distributed tracing concepts. Any resemblance to existing games or properties is coincidental and falls under fair use for educational purposes. ## Further Resources - [OpenTelemetry Documentation](https://opentelemetry.io/docs/) - [Grafana Alloy Documentation](https://grafana.com/docs/alloy/latest/) - [Distributed Tracing Guide](https://opentelemetry.io/docs/concepts/observability-primer/#distributed-traces) ================================================ FILE: game-of-tracing/SPAN_LINKS.md ================================================ # Span Links Implementation in Game of Tracing This document explains how span links are implemented in the Game of Tracing game to enable game replay functionality. ## What Are Span Links? Span links allow you to create relationships between spans that aren't in a direct parent-child hierarchy. Unlike parent-child relationships (which are synchronous and hierarchical), links are more flexible and can connect spans across different traces or time periods. ## Implementation Overview ### Game Session Tracking Each player gets a unique `game_session_id` when they select a faction. 
This ID is used to track all their actions throughout the game: ```python # Generated when player selects faction session['game_session_id'] = str(uuid.uuid4()) session['action_sequence'] = 0 ``` ### Action Storage Every significant game action is stored in a SQLite database with its trace information: ```sql CREATE TABLE game_actions ( id INTEGER PRIMARY KEY AUTOINCREMENT, game_session_id TEXT NOT NULL, action_sequence INTEGER NOT NULL, action_type TEXT NOT NULL, player_name TEXT, faction TEXT, trace_id TEXT NOT NULL, span_id TEXT NOT NULL, location_id TEXT, target_location_id TEXT, timestamp INTEGER NOT NULL, game_state_after TEXT ) ``` ### Span Link Creation Each new action creates a span link to the previous action in the sequence: ```python # Get previous action's span context previous_span_context = get_previous_action_context(game_session_id, current_sequence) # Create span link using official OpenTelemetry API if previous_span_context: link = trace.Link( previous_span_context, attributes={ "link.type": "game_sequence", "link.relation": "follows", "game.sequence": "true" } ) links.append(link) # Create new span with links with tracer.start_as_current_span( "move_army", kind=SpanKind.SERVER, links=links, # Links to previous actions attributes={ "game.session.id": game_session_id, "game.action.type": "move_army", "game.action.sequence": current_sequence + 1 } ) as span: # ... action logic ... ``` ## Supported Actions The following game actions create span links: 1. **collect_resources** - Collecting resources at a location 2. **create_army** - Creating armies at capitals 3. **move_army** - Moving armies between locations 4. **all_out_attack** - Launching all-out attacks ## Battle Mechanics The game uses simple but effective battle calculations: ### Combat Rules 1. **Same Faction**: Reinforcement - Armies combine: `final_army = attacking_army + defending_army` - Used for friendly army movements and reinforcements 2. 
**Different Factions**: Combat - **Attacker Victory**: `remaining_army = attacking_army - defending_army` - **Defender Victory**: `remaining_army = defending_army - attacking_army` - **Stalemate**: `remaining_army = 0` (equal armies destroy each other) ### All-Out Attack Special Rules - All-out attacks automatically collect armies from friendly villages along the path - This simulates gathering reinforcements during the march to enemy territory - Example: 5 armies + 2 village armies = 7 armies continuing to target ### Battle Calculation Code ```python def _handle_battle(self, attacking_army: int, attacking_faction: str, defending_army: int, defending_faction: str) -> tuple[str, int, str]: # Same faction = reinforcement if attacking_faction == defending_faction: return "reinforcement", attacking_army + defending_army, attacking_faction # Actual combat if attacking_army > defending_army: remaining = attacking_army - defending_army return "attacker_victory", remaining, attacking_faction elif defending_army > attacking_army: remaining = defending_army - attacking_army return "defender_victory", remaining, defending_faction else: return "stalemate", 0, defending_faction ``` ## Game Restart Functionality The restart system ensures complete game state reset: ### What Gets Reset 1. **Game State Variables** - `GAME_OVER`, `WINNER`, `VICTORY_MESSAGE` flags - Global game state in war map 2. **Span Links Database** - All game action records cleared - Fresh start for span link chains 3. **Faction Assignments** - Player faction selections cleared - All factions become available 4. **AI Opponent** - AI automatically deactivated - Prevents ghost AI actions 5. 
**Location Database** - All locations reset to initial state - Resources, armies, and factions restored ### Restart Process ```python def reset_game_data(): # Reset local game state reset_game_state() # Deactivate AI requests.post(f"{AI_SERVICE_URL}/deactivate") # Clear faction assignments release_all_factions() # Clear span links database cursor.execute("DELETE FROM game_actions") # Reset location database make_api_request('southern_capital', 'reset', method='POST') ``` ### Verification Use the debug endpoint to verify complete reset: ```bash curl http://localhost:8080/api/debug/restart_verification ``` Expected response: ```json { "success": true, "all_systems_reset": true, "details": { "game_state_reset": true, "span_links_cleared": true, "faction_assignments_cleared": true, "ai_deactivated": true, "database_reset": true } } ``` ## Game Replay Chain With span links, you can trace the complete game narrative: ``` Game Start → Collect Resources → Create Army → Move Army → Battle → Victory ↑ ↑ ↑ ↑ ↑ ↑ [trace_1] [trace_2] [trace_3] [trace_4] [trace_5] [trace_6] ↑ ↑ ↑ ↑ ↑ [links to] [links to] [links to] [links to] [links to] trace_1 trace_2 trace_3 trace_4 trace_5 ``` ## Game Replay Through Tempo ### TraceQL Queries for Replay #### 1. Find All Game Sessions ```traceql {game.session.id!=""} ``` #### 2. Get Specific Game Session ```traceql {game.session.id="abc-123-def"} ``` #### 3. Find Actions with Span Links ```traceql {link.type="game_sequence"} ``` #### 4. Find Game Actions by Type ```traceql {game.action.type="move_army"} ``` #### 5. Find Actions by Player ```traceql {player.name="Alice" && game.session.id!=""} ``` #### 6. Find Battle Outcomes ```traceql {span.battle.occurred=true} ``` ### Tempo API Integration The replay system uses Tempo's HTTP API: ```python # 1. Search for game sessions GET /api/search?q={game.session.id!=""} # 2. Get specific session traces GET /api/search?q={game.session.id="session-id"} # 3. 
Get full trace details GET /api/traces/{trace-id} # 4. Extract span links from trace data for span in trace['batches'][0]['spans']: for ref in span.get('references', []): if ref.get('refType') == 'FOLLOWS_FROM': # This is a span link linked_span_id = ref.get('spanID') ``` ### Replay Engine Architecture ```python class GameReplayEngine: def find_game_sessions(self) -> List[str]: """Query Tempo for all game sessions""" def get_session_traces(self, session_id: str) -> List[Dict]: """Get all traces for a specific session""" def extract_game_actions(self, traces: List[Dict]) -> List[GameAction]: """Parse traces into game actions""" def verify_span_links(self, actions: List[GameAction]) -> None: """Verify span link chain integrity""" def replay_session_step_by_step(self, session: GameSession) -> None: """Replay game session action by action""" ``` ### Web UI Replay The game includes web endpoints for replay: - `GET /api/replay/sessions` - List available game sessions - `GET /api/replay/session/{id}` - Get detailed replay data - `GET /replay` - Replay dashboard page - `GET /replay/{session-id}` - Specific session replay ### Replay Data Structure ```json { "session_id": "abc-123-def", "player_name": "Alice", "faction": "southern", "actions": [ { "sequence": 1, "action_type": "collect_resources", "trace_id": "trace-1", "span_id": "span-1", "span_links": [], // First action has no links "timestamp": "2024-01-01T10:00:00Z", "location_id": "southern_capital" }, { "sequence": 2, "action_type": "create_army", "trace_id": "trace-2", "span_id": "span-2", "span_links": ["span-1"], // Links to previous action "timestamp": "2024-01-01T10:01:00Z", "location_id": "southern_capital" } ], "span_link_chain": [ {"sequence": 1, "valid_chain": true, "note": "First action"}, {"sequence": 2, "valid_chain": true, "note": "Correctly links to action 1"} ] } ``` ## Querying Span Links ### In Grafana Tempo Search for traces with game session information: ``` {game.session.id!=""} ``` Find spans 
with links: ``` {link.type="game_sequence"} ``` ### Trace Attributes Each span includes these attributes for game replay: - `game.session.id` - Unique session identifier - `game.action.type` - Type of action (move_army, create_army, etc.) - `game.action.sequence` - Sequence number in the game - `link.type` - Type of link (game_sequence) - `link.relation` - Relationship (follows) ## Testing Run the test script to verify span links are working: ```bash cd game-of-tracing python debug_span_links.py ``` This will: 1. Select a faction 2. Perform a sequence of actions 3. Each action will link to the previous one 4. Provide instructions for viewing the links in Grafana 5. Test battle calculation mechanics 6. Verify restart functionality Test the replay functionality: ```bash cd game-of-tracing python war_map/replay.py ``` ## Educational Value Span links demonstrate: - **Cross-trace relationships** - Actions in different traces can be related - **Historical context** - Each action knows what came before it - **Game narrative** - Complete story of how the game unfolded - **Advanced OpenTelemetry** - Real-world use of span links feature - **Tempo integration** - How to query and reconstruct trace relationships ## Game Replay Benefits 1. **Debugging** - Understand what led to game outcomes 2. **Analytics** - Analyze player behavior patterns 3. **Education** - Show distributed tracing concepts in action 4. **Auditing** - Verify game logic and fairness 5. 
**Entertainment** - Watch epic games unfold step by step ## Future Enhancements Potential additions: - AI action links to player actions that triggered them - Battle outcome links to the actions that led to the battle - Resource transfer chains across multiple locations - Victory condition traces showing the sequence that led to game end - Interactive replay UI with game map visualization - Export replay data for external analysis ## Troubleshooting Replay Functionality ### Tempo API Query Strategy The replay system uses a **two-step approach** to work reliably with Tempo: #### **Step 1: Discover Game Sessions** Uses Tempo's tag values API to find all available game session IDs: ```bash GET /api/v2/search/tag/game.session.id/values?start=<start-unix-seconds>&end=<end-unix-seconds>&limit=50 ``` This returns all unique values for the `game.session.id` tag, giving us a list of available sessions. #### **Step 2: Query Each Session** For each discovered session ID, queries for its traces: ```bash GET /api/search?q={game.session.id="specific-session-id"}&limit=100 ``` This approach avoids complex TraceQL queries that might fail with 400 errors. ### Common Issues and Solutions #### 1. Tempo Query Errors (400 Bad Request) **Problem**: Getting 400 errors when querying Tempo with complex TraceQL **Solutions**: - **New approach**: Use tag values API first, then simple session-specific queries - **Fallback**: System automatically falls back to local SQLite database - **Logging**: Enhanced logging shows exactly which queries are being attempted #### 2. Missing Span Attributes **Problem**: Custom span attributes like `game.session.id` may not be indexed in Tempo **Solutions**: - **Attribute verification**: Check that spans are being created with correct attributes - **Hybrid approach**: Local database stores action sequence as backup - **Index configuration**: Ensure Tempo is configured to index custom attributes #### 3.
Time Range Issues **Solutions**: - **4-hour window**: System now uses 4-hour time windows for discovery - **Unix timestamps**: Uses seconds-based timestamps for better compatibility - **Configurable ranges**: Time ranges can be adjusted based on game session length ### Data Source Fallbacks The replay system has multiple data sources in order of preference: 1. **`tempo_tag_values`** - Primary approach using tag values API 2. **`tempo_search_only`** - Basic span data from search results only 3. **`local_db_fallback`** - SQLite database as final fallback ### Debug Tools #### 1. Replay Debug Script ```bash cd game-of-tracing python debug_replay.py ``` This comprehensive script tests: - Tempo connection and version - Basic TraceQL query functionality - Game-specific attribute queries - Replay API endpoints - Local database fallback #### 2. Manual Tempo Queries Test Tempo directly using curl: ```bash # Basic connectivity curl http://localhost:3200/ready # Simple trace search curl "http://localhost:3200/api/search?q={span.name!=\"\"}&limit=5" # Game-specific search curl "http://localhost:3200/api/search?q={span.name=\"collect_resources\"}&limit=10" ``` #### 3. Replay API Testing ```bash # Get available sessions curl http://localhost:8080/api/replay/sessions # Get specific session curl http://localhost:8080/api/replay/session/your-session-id # Check local database health curl http://localhost:8080/api/debug/health ``` ### Replay System Architecture The improved replay system uses a **hybrid approach**: #### 1. Primary Data Source: Tempo - Queries Tempo using multiple TraceQL approaches - Extracts complete span information including links - Provides full distributed tracing context #### 2. Fallback Data Source: Local SQLite - Stores essential game action metadata - Always available even if Tempo queries fail - Enables replay functionality regardless of Tempo state #### 3. 
Query Strategy ```python # Multiple query attempts with increasing specificity queries = [ '{span.name="collect_resources" || span.name="create_army" || span.name="move_army" || span.name="all_out_attack"}', '{resource.service.name="war_map"}', '{game.action.type!=""}', '{span.name!=""}' # Fallback to any spans ] ``` ### Performance Optimizations #### 1. Time Window Optimization - **Before**: 24-hour windows with nanosecond precision - **After**: 1-hour windows with Unix second precision - **Result**: Faster queries, reduced timeout errors #### 2. Query Prioritization - Try specific game queries first - Fall back to broader queries if needed - Use local database if all Tempo queries fail #### 3. Response Caching - Session metadata cached in local database - Reduces repeated Tempo queries - Improves UI responsiveness ### Access After Game Reset The replay page is now accessible from the faction selection screen: **Location**: [http://localhost:8080](http://localhost:8080) → "View Game Replays" button **Benefits**: - No need to be in an active game session - Available immediately after game reset - Persistent access to historical game data ### Expected Response Format #### Successful Tempo Response ```json { "success": true, "sessions": [ { "session_id": "abc-123-def", "player_name": "Alice", "faction": "southern", "start_time": 1234567890000000000, "action_count": 5, "last_action": "move_army" } ], "query_method": "tempo", "total_sessions": 1 } ``` #### Fallback Local Database Response ```json { "success": true, "sessions": [...], "query_method": "local_db_fallback", "warning": "Tempo query failed: connection timeout" } ``` ### Tempo Configuration Requirements For optimal replay functionality, ensure Tempo is configured with: ```yaml # tempo-config.yaml query_frontend: search: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 stream_over_http_enabled: true ``` And in docker-compose.yml: ```yaml environment: - TEMPO_URL=http://tempo:3200 ``` ### TraceQL Query 
Examples Based on the [Tempo API documentation](https://grafana.com/docs/tempo/latest/api_docs/), these queries should work: #### Basic Queries ```traceql # Find any spans with duration {duration>1ms} # Find spans by name {span.name="collect_resources"} # Find spans by service {resource.service.name="war_map"} ``` #### Game-Specific Queries ```traceql # Find game actions (if attributes are indexed) {game.action.type!=""} # Find player actions (if attributes are indexed) {player.name!=""} # Combine conditions {span.name="move_army" && player.faction="southern"} ``` ### Integration with Grafana Once the replay data is accessible, you can: 1. **View in Grafana Tempo**: Search for game session traces directly 2. **Create dashboards**: Visualize game progression over time 3. **Set up alerts**: Monitor for specific game events 4. **Analyze patterns**: Study player behavior across multiple games ================================================ FILE: game-of-tracing/ai_opponent/CLAUDE.md ================================================ # ai_opponent/ — Strategic AI Decision Engine > Algorithmic opponent (not LLM-based) that plays the faction not chosen by a human player. This doc is read by any AI coding agent. For scenario-wide context read [`../AGENTS.md`](../AGENTS.md) first. ## Purpose `ai-opponent` is a Flask service on port **8081** that takes control of a faction and makes strategic decisions on a recurring loop. It is activated by `war_map` via `POST /activate` with JSON body `{"faction": ..., "map_id": ...}` — on the WoK map the player toggles it on manually; on WWA it auto-activates as `white_walkers` the moment the player picks the map. Two AI variants dispatch off the `faction` field at activation time: - **`StrategicAI`** — classic WoK opponent (southern / northern). 6-step priority cascade: capital defense → zero-risk captures → resource transfers → plan execution → plan creation → fallback. - **`WhiteWalkerAI(StrategicAI)`** — single-player WWA opponent. 
Different cascade: defend fortress → capture unowned wall → reinforce weakest wall (non-capital neighbours preferred; capital is a fallback when no other source has spare army, since `move_army` empties the source) → raid barbarian village (for corpses) → raise army from corpses at the fortress (only requires the capital to still belong to the AI; no minimum garrison) → idle. Reads its corpse pool via `GET /faction_economy?faction=white_walkers` on any location service; spends 5 corpses per army unit instead of 30 resources. Common to both variants, the AI: - Fetches the state of all 8 locations. - Runs a priority cascade of checks to decide the next action (defend, capture, transfer, plan, fallback). - Executes the action via the same HTTP API the player uses (against the location services on 5001-5008). - Emits fully-linked traces so the replay UI can narrate the AI's reasoning alongside the human player's. - Adapts its loop cadence (2-15 s) to the current game phase. **This is deterministic code, not an LLM.** No `anthropic`, `openai`, or other model SDKs are imported. ## File map | File | Size | Purpose | |---|---|---| | `ai_server.py` | ~46 KB | Main decision engine: `StrategicAI`, `PhaseDetector`, `Planner`, `MapAnalyzer`, Flask routes, decision loop. | | `telemetry.py` | ~7.7 KB | `AITelemetry` class for `ai-opponent` — traces, logs, AI-specific metrics, plus Pyroscope profiling with OTel span-profile linkage. | | `README.md` | ~2.6 KB | Feature doc. | | `Dockerfile` | small | `python:3.11-slim`, `pip install -r requirements.txt`, sets `FLASK_APP=ai_server.py` and `IN_DOCKER=1`, runs `flask run --host=0.0.0.0 --port=8081`. | | `requirements.txt` | small | Flask 3.1.3, requests 2.33.1, OpenTelemetry SDK/API + exporters, `pyroscope-io` + `pyroscope-otel` for profiling. | ## Decision model ### Priority cascade — `StrategicAI.decide()` Executed every cycle; returns the first non-null action: 1. 
**Capital defense.** If the capital is under threat (enemy army adjacent with path-army-estimate exceeding capital garrison), react: build army, pull army back, or preempt. 2. **Zero-risk captures.** Grab any neutral village reachable with overwhelming numerical advantage. 3. **Resource transfers.** Move resources from villages to the capital when the capital is running low. 4. **Plan execution.** If a multi-step plan is active and valid, advance to the next step. 5. **Plan creation.** Propose a new plan targeting the most valuable enemy territory. 6. **Fallback.** Collect resources at the capital. ### Phase detection — `PhaseDetector.detect()` at `ai_server.py:195-212` Five phases drive cadence and aggressiveness: | Phase | Condition | Cadence (seconds) | |---|---|---| | `READY_TO_ATTACK` | `total_army >= 8` | 3-8 | | `DESPERATE` | `my_count <= 1` | 2-5 | | `DEFENSIVE` | `my_count < enemy_count` | medium | | `DOMINATING` | `my_count > enemy_count + 1` | 5-15 | | `BALANCED` | everything else | 5-15 | Cadence is set by `StrategicAI.get_pause_time()`; faster in crisis, slower in stability. ### Supporting classes - **`MapAnalyzer`** (`ai_server.py:64-135`) — precomputes BFS distances between all location pairs at startup. Used by `path_army_estimate()` to sum enemy armies along shortest path to a target — enabling threat assessment. - **`Planner`** (`ai_server.py:216+`) — multi-step goal sequences like `[create_army, create_army, create_army, move_army(target)]`. Validated every cycle via `Planner.validate()`; abandoned if preconditions break (e.g., capital lost, source location flipped). - **`GameMemory`** — tracks territory-loss history, failed attacks, enemy push directions; used by `territory_lost_recently()` etc. at `ai_server.py:180-191` to adjust reactive behavior. 
## Custom metrics | Metric | Type | Attributes | Emitter | |---|---|---|---| | `ai.decisions` | counter | `action_type`, `phase`, `reason` | `decide()` / `execute_strategic_action()` | | `ai.plans_created` | counter | `goal` | `Planner.set_plan` | | `ai.plans_abandoned` | counter | `reason` | `Planner.abandon` | | `ai.decision_cycle_duration_seconds` | histogram | `phase` | Each decision cycle | | `ai.territory_count` | observable gauge | `faction` | Callback into live state | | `ai.total_army` | observable gauge | `faction` | Callback into live state | ## Span events Significant state transitions are emitted as events on the active decision span (rather than as standalone spans): - `phase_transition` — with `previous_phase`, `new_phase`, `territory_count`, `total_army` attributes - `territory_change` — with `territories_gained` / `territories_lost` (stringified territory lists) and `current_count` - `plan_abandoned` — with `previous_goal` and `reason` - `threat_detected` — with `threat_location`, `threat_army`, `capital_army`, `armies_needed` Locations: `ai_server.py:299-327`. ## Span links unique to `ai_opponent/` The AI opponent instruments its own causal chain **inside a single decision cycle**: - `ai_decision_cycle` span (SpanKind.INTERNAL) wraps the whole cycle. - `ai_decision` span (child, INTERNAL) captures the cascade evaluation and chosen action. - `execute_ai_action` span (INTERNAL) is the action execution — it starts with a `Link` back to the `ai_decision` span's context, with `link.type="ai_decision_trigger"`. This allows the replay UI to jump from the executed action back to the reasoning that produced it. The linking logic lives around `ai_server.py:888-901`. The AI does **not** participate in the cross-session `game_sequence` chain that `war_map` builds — that is player-only. 
## Environment | Var | Default | Purpose | |---|---|---| | `PORT` | `8081` | Flask listen port | | `IN_DOCKER` | unset | When set, location URLs resolve via container DNS (`southern-capital:5001`) instead of `localhost:5001` | Telemetry endpoints are hard-coded in `telemetry.py` to `alloy:4317` (gRPC traces) and `alloy:4318` (HTTP logs + metrics). The service resource is registered with `SERVICE_NAME="ai-opponent"`. ## Activation flow 1. `war_map` calls `POST http://ai-opponent:8081/activate` with a JSON body such as `{"faction": "northern", "map_id": "war_of_kingdoms"}`. 2. The handler constructs the AI variant selected by the `faction` field — `StrategicAI(faction, map_id)` for the WoK factions, `WhiteWalkerAI` for `white_walkers` — and starts `ai_decision_loop()` in a daemon thread. 3. The loop runs until `/deactivate` is called or the game is marked over. 4. Each cycle captures a span, logs, and increments the appropriate metrics. ## Common edits **Tune aggressiveness.** Adjust thresholds in `PhaseDetector.detect()` at `ai_server.py:195-212`, or the cadence ranges in `get_pause_time()`. **Change the priority cascade.** Edit `StrategicAI.decide()`. Each priority is its own helper (`_check_capital_defense`, `_find_zero_risk_captures`, `_do_resource_transfers`, plan steps). Reorder by reshuffling the cascade. **Add a new AI metric.** Mirror the observable-gauge pattern in `telemetry.py` and wire a callback that reads from `StrategicAI` live state (via a registered state accessor, same pattern as `app/telemetry.py`). **Add a new span event.** Call `span.add_event("event_name", attributes={...})` inside the decision span. Keep the existing four event names stable — they feed replay UI rendering. ## Keep this doc current Per the sub-agent rule, any change to the priority cascade, phase thresholds, metric set, env vars, or the line-number anchors above must land in the same work unit. Before returning a response that touched `ai_opponent/`, grep this file for references to anything you changed. 
## Cross-references - [`../AGENTS.md`](../AGENTS.md) — scenario-wide architecture and patterns - [`../app/CLAUDE.md`](../app/CLAUDE.md) — the location-server HTTP API this AI calls - [`../war_map/CLAUDE.md`](../war_map/CLAUDE.md) — the orchestrator that activates/deactivates this service - [`../SPAN_LINKS.md`](../SPAN_LINKS.md) — span-link design, including the `ai_decision_trigger` link type ================================================ FILE: game-of-tracing/ai_opponent/Dockerfile ================================================ FROM python:3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt COPY . . ENV FLASK_APP=ai_server.py ENV FLASK_DEBUG=0 ENV IN_DOCKER=1 EXPOSE 8081 CMD ["flask", "run", "--host=0.0.0.0", "--port=8081"] ================================================ FILE: game-of-tracing/ai_opponent/README.md ================================================ # AI Opponent for War of Kingdoms This Flask-based AI service provides an intelligent opponent for single-player games in the War of Kingdoms distributed tracing tutorial. ## Features ### Adaptive Strategy The AI adapts its strategy based on the game phase: - **Early Game (0-5 minutes)**: Focuses on resource collection and capturing neutral villages - **Mid Game (5-15 minutes)**: Balances expansion with army building and defense - **Late Game (15+ minutes)**: Shifts to aggressive tactics and all-out attacks ### Natural Behavior - Takes 15-45 second pauses between actions to simulate human thinking time - Uses weighted random decisions to avoid predictable patterns - Reacts to player threats by reinforcing endangered locations - Manages resources by transferring them from villages to capitals ### Decision Making The AI analyzes the game state to make intelligent decisions: 1. **Threat Analysis**: Identifies enemy armies near its territories 2. 
**Expansion Targets**: Finds neutral villages and weak enemy locations 3. **Resource Management**: Collects resources and creates armies when needed 4. **Strategic Movement**: Reinforces threatened locations and attacks vulnerable targets ### OpenTelemetry Integration All AI actions are fully instrumented with OpenTelemetry: - Traces show decision-making process - Spans include game phase, threats, and chosen actions - Integrates with the game's distributed tracing pipeline ## API Endpoints - `POST /activate` - Activate the AI for a specific faction - `POST /deactivate` - Deactivate the AI - `GET /status` - Get current AI status - `GET /health` - Health check endpoint ## How It Works 1. When activated, the AI starts a background thread that runs the decision loop 2. Every 15-45 seconds, it: - Fetches the current game state from all locations - Analyzes threats and opportunities - Makes a weighted random decision based on the game phase - Executes the chosen action via location server APIs 3. The AI automatically stops when it detects game over ## Configuration The AI difficulty is set to "normal" and provides a balanced challenge. Decision weights can be adjusted in the `DECISION_WEIGHTS` dictionary to make the AI more aggressive or defensive. ## Usage The AI is integrated with the War Map UI: 1. Players can toggle "Enable AI Opponent" in the game interface 2. The AI automatically takes control of the faction not chosen by the player 3. 
For two-player games, keep the AI toggle off ## Observability Monitor AI behavior through: - **Traces**: View AI decision-making and action execution - **Logs**: Track AI state changes and decisions - **Service Map**: See AI interactions with location servers ================================================ FILE: game-of-tracing/ai_opponent/ai_server.py ================================================ import os import time import random import requests import threading import atexit from collections import deque from flask import Flask, jsonify, request from telemetry import AITelemetry from opentelemetry import trace, baggage from opentelemetry.trace import SpanKind, Link from opentelemetry.propagate import inject from datetime import datetime, timedelta from enum import Enum app = Flask(__name__) # Initialize telemetry telemetry = AITelemetry() logger = telemetry.get_logger() tracer = telemetry.get_tracer() atexit.register(telemetry.shutdown) # ─── Constants ───────────────────────────────────────────────────────────────── # Per-map adjacency lists. Keep keys in sync with # game-of-tracing/app/game_config.py's MAPS[*]["locations"][*]["connections"]. 
MAP_GRAPHS_BY_MAP = {
    "war_of_kingdoms": {
        "southern_capital": ["village_1", "village_3"],
        "northern_capital": ["village_2", "village_6"],
        "village_1": ["southern_capital", "village_2", "village_4"],
        "village_2": ["northern_capital", "village_1", "village_5"],
        "village_3": ["southern_capital", "village_5", "village_6"],
        "village_4": ["village_1", "village_5"],
        "village_5": ["village_2", "village_3", "village_4", "village_6"],
        "village_6": ["northern_capital", "village_3", "village_5"],
    },
    "white_walkers_attack": {
        "nights_watch_fortress": [
            "wall_west",
            "wall_center_west",
            "wall_center_east",
            "wall_east",
        ],
        "white_walker_fortress": [
            "wall_west",
            "wall_center_west",
            "wall_center_east",
            "wall_east",
        ],
        "wall_west": [
            "nights_watch_fortress",
            "white_walker_fortress",
            "wall_center_west",
            "barbarian_village_west",
        ],
        "wall_center_west": [
            "nights_watch_fortress",
            "white_walker_fortress",
            "wall_west",
            "wall_center_east",
        ],
        "wall_center_east": [
            "nights_watch_fortress",
            "white_walker_fortress",
            "wall_center_west",
            "wall_east",
        ],
        "wall_east": [
            "nights_watch_fortress",
            "white_walker_fortress",
            "wall_center_east",
            "barbarian_village_east",
        ],
        "barbarian_village_west": ["wall_west"],
        "barbarian_village_east": ["wall_east"],
    },
}

# Per-map capital mapping (faction -> location_id of that faction's capital).
CAPITALS_BY_MAP = {
    "war_of_kingdoms": {
        "southern": "southern_capital",
        "northern": "northern_capital",
    },
    "white_walkers_attack": {
        "nights_watch": "nights_watch_fortress",
        "white_walkers": "white_walker_fortress",
    },
}

# Per-map location type lookup (capital / village / wall).
LOCATION_TYPES_BY_MAP = {
    "war_of_kingdoms": {
        "southern_capital": "capital",
        "northern_capital": "capital",
        "village_1": "village",
        "village_2": "village",
        "village_3": "village",
        "village_4": "village",
        "village_5": "village",
        "village_6": "village",
    },
    "white_walkers_attack": {
        "nights_watch_fortress": "capital",
        "white_walker_fortress": "capital",
        "wall_west": "wall",
        "wall_center_west": "wall",
        "wall_center_east": "wall",
        "wall_east": "wall",
        "barbarian_village_west": "village",
        "barbarian_village_east": "village",
    },
}

# Per-map location faction (static initial ownership — what the AI reasons
# about for walls-are-neutral / barbarian-villages-are-barbarian etc.).
INITIAL_FACTIONS_BY_MAP = {
    "war_of_kingdoms": {
        "southern_capital": "southern",
        "northern_capital": "northern",
        "village_1": "neutral",
        "village_2": "neutral",
        "village_3": "neutral",
        "village_4": "neutral",
        "village_5": "neutral",
        "village_6": "neutral",
    },
    "white_walkers_attack": {
        "nights_watch_fortress": "nights_watch",
        "white_walker_fortress": "white_walkers",
        "wall_west": "neutral",
        "wall_center_west": "neutral",
        "wall_center_east": "neutral",
        "wall_east": "neutral",
        "barbarian_village_west": "barbarian",
        "barbarian_village_east": "barbarian",
    },
}

# Per-map army cost per faction. Matches app/game_config.py's rules.army_cost.
ARMY_COST_BY_MAP = {
    "war_of_kingdoms": {"default": 30},
    "white_walkers_attack": {"default": 30, "white_walkers": 5},
}

# Backward-compat alias: legacy code that references MAP_GRAPH still sees WoK.
MAP_GRAPH = MAP_GRAPHS_BY_MAP["war_of_kingdoms"]
ARMY_COST = 30
VILLAGE_INCOME_PER_MIN = 40  # ~10 resources every 15s
RESOURCE_TRANSFER_THRESHOLD = 30

# Single port table keyed by location id (same ports are shared across maps
# because a slot's port is fixed and each map just renames the slot).
LOCATION_PORTS = {
    "southern_capital": 5001,
    "northern_capital": 5002,
    "village_1": 5003,
    "village_2": 5004,
    "village_3": 5005,
    "village_4": 5006,
    "village_5": 5007,
    "village_6": 5008,
    # White Walkers Attack aliases (same physical slot → same port).
    "nights_watch_fortress": 5001,
    "white_walker_fortress": 5002,
    "wall_west": 5003,
    "wall_center_west": 5004,
    "wall_center_east": 5005,
    "wall_east": 5006,
    "barbarian_village_west": 5007,
    "barbarian_village_east": 5008,
}

# Container hostname per logical location id (resolves HTTP URLs in docker).
CONTAINER_FOR_LOCATION_ID = {
    # WoK ids are their own container names.
    "southern_capital": "southern-capital",
    "northern_capital": "northern-capital",
    "village_1": "village-1",
    "village_2": "village-2",
    "village_3": "village-3",
    "village_4": "village-4",
    "village_5": "village-5",
    "village_6": "village-6",
    # WWA ids share containers with their slot peer.
    "nights_watch_fortress": "southern-capital",
    "white_walker_fortress": "northern-capital",
    "wall_west": "village-1",
    "wall_center_west": "village-2",
    "wall_center_east": "village-3",
    "wall_east": "village-4",
    "barbarian_village_west": "village-5",
    "barbarian_village_east": "village-6",
}


def get_map_graph(map_id):
    """Return the adjacency list for ``map_id``; unknown ids get the WoK graph."""
    if map_id in MAP_GRAPHS_BY_MAP:
        return MAP_GRAPHS_BY_MAP[map_id]
    return MAP_GRAPH


def get_capitals(map_id):
    """Return the faction -> capital mapping for ``map_id`` (WoK when unknown)."""
    if map_id in CAPITALS_BY_MAP:
        return CAPITALS_BY_MAP[map_id]
    return CAPITALS_BY_MAP["war_of_kingdoms"]


def get_location_types(map_id):
    """Return the location -> type mapping for ``map_id`` (WoK when unknown)."""
    if map_id in LOCATION_TYPES_BY_MAP:
        return LOCATION_TYPES_BY_MAP[map_id]
    return LOCATION_TYPES_BY_MAP["war_of_kingdoms"]


def get_initial_factions(map_id):
    """Return the location -> starting faction mapping for ``map_id`` (WoK when unknown)."""
    if map_id in INITIAL_FACTIONS_BY_MAP:
        return INITIAL_FACTIONS_BY_MAP[map_id]
    return INITIAL_FACTIONS_BY_MAP["war_of_kingdoms"]


def get_army_cost_for(map_id, faction):
    """Return the per-unit army cost for ``faction`` on ``map_id``.

    Unknown maps use the WoK cost table; factions without their own entry use
    that table's ``"default"`` cost.
    """
    if map_id in ARMY_COST_BY_MAP:
        cost_table = ARMY_COST_BY_MAP[map_id]
    else:
        cost_table = ARMY_COST_BY_MAP["war_of_kingdoms"]
    # Read the default first: every cost table is required to carry one.
    default_cost = cost_table["default"]
    if faction in cost_table:
        return cost_table[faction]
    return default_cost


# ─── Game Phase ────────────────────────────────────────────────────────────────

class GamePhase(Enum):
    """Strategic posture of the AI; each member's value is its lowercase label."""

    DESPERATE = "desperate"
    DEFENSIVE = "defensive"
    BALANCED = "balanced"
    DOMINATING = "dominating"
    READY_TO_ATTACK = "ready_to_attack"


# ─── 
# ─── Map Analyzer ──────────────────────────────────────────────────────────────

class MapAnalyzer:
    """Precomputed map analysis: BFS distances, strategic values, path army estimation."""

    def __init__(self, graph=None, capitals=None):
        # ``graph`` defaults to WoK to preserve legacy behaviour; new callers
        # pass the active map's adjacency list. ``capitals`` is the map's
        # faction→capital dict (needed for the strategic-value heuristic).
        self.graph = graph if graph is not None else MAP_GRAPH
        self.capitals = capitals if capitals is not None else CAPITALS_BY_MAP["war_of_kingdoms"]
        self.distances = self._compute_all_distances()
        self.strategic_values = self._compute_strategic_values()

    def _bfs_distances(self, start):
        """BFS from start node, returns dict {node: distance}.

        A node that appears only as somebody's neighbour (no adjacency entry
        of its own) is treated as a dead end instead of raising ``KeyError``.
        """
        visited = {start: 0}
        queue = deque([start])
        while queue:
            node = queue.popleft()
            for neighbor in self.graph.get(node, ()):
                if neighbor not in visited:
                    visited[neighbor] = visited[node] + 1
                    queue.append(neighbor)
        return visited

    def _compute_all_distances(self):
        """Precompute all-pairs BFS distances."""
        return {loc: self._bfs_distances(loc) for loc in self.graph}

    def _compute_strategic_values(self):
        """Score each location by connectivity + centrality.

        High connectivity or short distance to either capital = valuable.
        Works identically across maps because it reads capitals from the
        per-map mapping rather than hardcoding WoK's capital names.
        """
        values = {}
        capital_ids = list(self.capitals.values())
        for loc in self.graph:
            connections = len(self.graph[loc])
            if capital_ids:
                avg_capital_dist = sum(
                    self.distances[loc].get(cap, 99) for cap in capital_ids
                ) / float(len(capital_ids))
            else:
                avg_capital_dist = 99
            # max(..., 1) keeps the centrality bonus bounded when the average
            # distance is below one hop.
            values[loc] = connections + (4.0 / max(avg_capital_dist, 1))
        return values

    def distance(self, a, b):
        """Return the hop count from ``a`` to ``b``; 99 when either is unknown or unreachable."""
        # Same sentinel for an unknown source as for an unknown target, so a
        # location id from another map cannot abort a decision cycle.
        return self.distances.get(a, {}).get(b, 99)

    def neighbors(self, loc):
        """Return the adjacency list of ``loc`` (empty list for unknown ids)."""
        return self.graph.get(loc, [])

    def path_army_estimate(self, game_state, from_loc, to_loc, my_faction):
        """Estimate total enemy army along BFS shortest path from from_loc to to_loc.

        Returns 999 when ``to_loc`` cannot be reached from ``from_loc``
        (including when ``from_loc`` is not part of this map). Every location
        on the path, except ``from_loc``, whose faction differs from
        ``my_faction`` contributes its ``army`` value.
        """
        parent = {from_loc: None}
        queue = deque([from_loc])
        while queue:
            node = queue.popleft()
            if node == to_loc:
                break
            for neighbor in self.graph.get(node, ()):
                if neighbor not in parent:
                    parent[neighbor] = node
                    queue.append(neighbor)

        if to_loc not in parent:
            return 999  # unreachable

        # Walk path and sum enemy armies (excluding from_loc)
        path = []
        node = to_loc
        while node is not None:
            path.append(node)
            node = parent[node]
        path.reverse()

        enemy_army = 0
        for loc in path[1:]:  # skip from_loc
            loc_data = game_state.get(loc, {})
            if loc_data.get('faction') != my_faction:
                enemy_army += loc_data.get('army', 0)
        return enemy_army


# ─── Game Memory ───────────────────────────────────────────────────────────────

class GameMemory:
    """Tracks territory changes, failed attacks, and enemy push direction."""

    def __init__(self):
        self.territory_history = []  # list of (timestamp, my_territories set)
        self.failed_attacks = {}  # {target_loc: last_failure_time}
        self.enemy_push_direction = None
        self.last_enemy_territories = set()

    def update(self, game_state, my_faction):
        """Record a territory snapshot and return ``(my_territories, enemy_territories)``.

        A location is "mine" when its faction equals ``my_faction`` and
        "enemy" when its faction is anything other than ``my_faction`` or
        ``'neutral'``. Only the 20 most recent snapshots are kept.
        """
        now = time.time()
        my_territories = set()
        enemy_territories = set()
        for loc_id, data in game_state.items():
            if data.get('faction') == my_faction:
                my_territories.add(loc_id)
            elif data.get('faction') not in (my_faction, 'neutral'):
                enemy_territories.add(loc_id)

        self.territory_history.append((now, my_territories.copy()))
        if len(self.territory_history) > 20:
            self.territory_history = self.territory_history[-20:]

        # Detect enemy push direction: remember one newly enemy-held location.
        # No distance information is available here, so the choice is not
        # "closest to our capital"; the smallest id is taken so the result
        # does not depend on set iteration order.
        new_enemy = enemy_territories - self.last_enemy_territories
        if new_enemy:
            self.enemy_push_direction = min(new_enemy, key=str)
        self.last_enemy_territories = enemy_territories

        return my_territories, enemy_territories

    def record_failed_attack(self, target):
        """Remember the current time as the last failed attack on ``target``."""
        self.failed_attacks[target] = time.time()

    def recently_failed(self, target, cooldown=60):
        """Return True if an attack on ``target`` failed less than ``cooldown`` seconds ago."""
        last = self.failed_attacks.get(target)
        if last is None:
            return False
        return (time.time() - last) < cooldown

    def territory_lost_recently(self, seconds=30):
        """Check if we lost territory in the last N seconds."""
        if len(self.territory_history) < 2:
            return False
        now = time.time()
        current = self.territory_history[-1][1]
        # Walk older snapshots newest-first; stop once they fall outside the window.
        for ts, territories in reversed(self.territory_history[:-1]):
            if now - ts > seconds:
                break
            if len(territories) > len(current):
                return True
        return False


# ─── Phase Detector ────────────────────────────────────────────────────────────

class PhaseDetector:
    """State-based phase detection using territory count and total army."""

    @staticmethod
    def detect(my_territories, enemy_territories, total_army):
        """Map territory counts and total army to a ``GamePhase``.

        The army check runs first, so a faction holding at least 8 army is
        ``READY_TO_ATTACK`` regardless of how few territories it owns.
        """
        my_count = len(my_territories)
        enemy_count = len(enemy_territories)

        if total_army >= 8:
            return GamePhase.READY_TO_ATTACK
        if my_count <= 1:
            return GamePhase.DESPERATE
        elif my_count < enemy_count:
            return GamePhase.DEFENSIVE
        elif my_count > enemy_count + 1:
            return GamePhase.DOMINATING
        else:
            return GamePhase.BALANCED


# ─── Planner ───────────────────────────────────────────────────────────────────

class Planner:
    """Multi-step goal planning: sequences like [create_army x3, move_army(target)]."""

    def __init__(self):
        self.steps = []
        self.goal = None

    @property
    def active(self):
        """True while at least one step remains in the plan."""
        return len(self.steps) > 0

    def set_plan(self, goal, steps):
        """Replace the current plan; ``steps`` is copied into a new list."""
        self.goal = goal
        self.steps = list(steps)

    def next_step(self):
        """Return the pending step without consuming it, or None if there is none."""
        if self.steps:
            return self.steps[0]
        return None

    def advance(self):
        """Drop the pending step, if any."""
        if self.steps:
            self.steps.pop(0)

    def abandon(self, reason=""):
        """Clear the plan and its goal. ``reason`` is accepted but not stored."""
        self.steps = []
        self.goal = None

    def validate(self, game_state, my_faction, my_capital):
        """Check if the current plan is still valid. Abandon if not.

        Only the pending (first) step is inspected.
        """
        if not self.active:
            return
        step = self.steps[0]
        action = step.get("action")

        if action == "create_army":
            cap_data = game_state.get(my_capital, {})
            if cap_data.get('faction') != my_faction:
                self.abandon("lost capital")
        elif action == "move_army":
            from_loc = step.get("from")
            loc_data = game_state.get(from_loc, {})
            if loc_data.get('faction') != my_faction or loc_data.get('army', 0) == 0:
                self.abandon("lost staging location or no army")
        elif action == "all_out_attack":
            cap_data = game_state.get(my_capital, {})
            if cap_data.get('faction') != my_faction or cap_data.get('army', 0) < 3:
                self.abandon("insufficient army for all-out attack")


# ─── Strategic AI ──────────────────────────────────────────────────────────────

class StrategicAI:
    """Main decision engine with priority cascade."""

    def __init__(self, faction, map_id="war_of_kingdoms"):
        self.faction = faction
        self.map_id = map_id
        capitals = get_capitals(map_id)
        self.my_capital = capitals.get(faction)
        enemies = [cap for fac, cap in capitals.items() if fac != faction]
        self.enemy_capital = enemies[0] if enemies else None
        self.map = MapAnalyzer(graph=get_map_graph(map_id), capitals=capitals)
        self.memory = GameMemory()
        self.planner = Planner()
        self.phase = GamePhase.BALANCED
        self.my_territories = set()
        self.enemy_territories = set()
        self.total_army = 0
        self._previous_phase = None
        self._previous_territories = set()
        self._last_evaluated = []
        # Army cost for this faction on this map.
        self.army_cost = get_army_cost_for(map_id, faction)

    def decide(self, game_state):
        """Run the priority cascade and return an action dict or None."""
        # Update memory and phase
        self.my_territories, self.enemy_territories = self.memory.update(game_state, self.faction)
        self.total_army = sum(
            data.get('army', 0) for loc, data in game_state.items()
            if data.get('faction') == self.faction
        )
        self.phase = PhaseDetector.detect(self.my_territories, self.enemy_territories, self.total_army)

        # Span events: phase transition
        span = trace.get_current_span()
        if self._previous_phase is not None and self.phase != self._previous_phase:
            span.add_event("phase_transition", attributes={
                "previous_phase": self._previous_phase.value,
                "new_phase": self.phase.value,
                "territory_count": len(self.my_territories),
                "total_army": self.total_army,
            })
        self._previous_phase = self.phase

        # Span events: territory change
        current_territory_set = set(self.my_territories)
        gained = current_territory_set - self._previous_territories
        lost = self._previous_territories - current_territory_set
        if gained or lost:
            span.add_event("territory_change", attributes={
                "territories_gained": str(list(gained)),
                "territories_lost": str(list(lost)),
                "current_count": len(current_territory_set),
            })
        self._previous_territories = current_territory_set

        # Validate active plan (track if it gets abandoned)
        had_plan = self.planner.active
        previous_goal = self.planner.goal
        self.planner.validate(game_state, self.faction, self.my_capital)
        if had_plan and not self.planner.active:
            span.add_event("plan_abandoned", attributes={
                "previous_goal": previous_goal or "unknown",
                "reason": "validation_failed",
            })
            telemetry.record_plan_abandoned("validation_failed")

        # Priority cascade with alternatives tracking
        evaluated = []

        action = self._check_capital_defense(game_state)
        if action:
            evaluated.append(f"capital_defense: TRIGGERED ({action.get('reason', '')})")
            self._last_evaluated = evaluated
            return action
        evaluated.append("capital_defense: skipped")

        action = self._find_zero_risk_captures(game_state)
        if action:
            evaluated.append(f"zero_risk_capture: TRIGGERED ({action.get('reason', '')})")
            self._last_evaluated = evaluated
            return action
        evaluated.append("zero_risk_capture: skipped")

        action = self._do_resource_transfers(game_state)
        if action:
            evaluated.append(f"resource_transfer: TRIGGERED ({action.get('reason', '')})")
            self._last_evaluated = evaluated
            return action
        evaluated.append("resource_transfer: skipped")

        action = self._execute_plan_step(game_state)
        if action:
            evaluated.append(f"execute_plan: TRIGGERED ({action.get('reason', '')})")
            self._last_evaluated = evaluated
            return action
        evaluated.append("execute_plan: skipped")

        action = self._create_new_plan(game_state)
        if action:
            evaluated.append(f"create_plan: TRIGGERED ({action.get('reason', '')})")
            self._last_evaluated = evaluated
            return action
        evaluated.append("create_plan: skipped")

        evaluated.append("fallback: TRIGGERED")
        self._last_evaluated = evaluated
        return self._fallback(game_state)

    # ── Priority 1: Capital Defense ────────────────────────────────────────────

    def _check_capital_defense(self, game_state):
        """If enemies adjacent to capital, create armies or reinforce.

        Returns a ``create_army`` or ``move_army`` action dict, or None when
        the capital is not ours, faces no adjacent hostile army, or already
        holds at least ``max_threat + 2`` army.
        """
        cap_data = game_state.get(self.my_capital, {})
        if not cap_data or cap_data.get('faction') != self.faction:
            return None

        my_army = cap_data.get('army', 0)
        neighbors = self.map.neighbors(self.my_capital)
        max_threat = 0
        threat_loc = None

        for n in neighbors:
            n_data = game_state.get(n, {})
            if n_data.get('faction') not in (self.faction, 'neutral') and n_data.get('army', 0) > 0:
                if n_data['army'] > max_threat:
                    max_threat = n_data['army']
                    threat_loc = n

        if max_threat == 0:
            return None

        needed = max_threat + 2
        trace.get_current_span().add_event("threat_detected", attributes={
            "threat_location": threat_loc,
            "threat_army": max_threat,
            "capital_army": my_army,
            "armies_needed": needed,
        })

        if my_army < needed:
            # NOTE(review): this uses the global ARMY_COST rather than
            # self.army_cost — confirm that is intended for factions whose
            # per-map cost differs.
            if cap_data.get('resources', 0) >= ARMY_COST:
                armies_to_create = min(
                    needed - my_army,
                    cap_data['resources'] // ARMY_COST
                )
                return {
                    "action": "create_army",
                    "location": self.my_capital,
                    "count": max(1, armies_to_create),
                    "reason": f"capital_defense against {max_threat} at {threat_loc}"
                }
            return self._reinforce_capital(game_state)
        return None

    def _reinforce_capital(self, game_state):
        """Move friendly armies within 2 hops toward capital.

        Picks the friendly location (other than the capital) with the largest
        army at distance <= 2 and returns a ``move_army`` action for its first
        step toward the capital, or None when no such source or step exists.
        """
        best_source = None
        best_army = 0
        # Iterate the active map's locations. The module-level MAP_GRAPH alias
        # is always the WoK graph, so using it here would never match a
        # location on any other map.
        for loc in self.map.graph:
            if loc == self.my_capital:
                continue
            loc_data = game_state.get(loc, {})
            if loc_data.get('faction') == self.faction and loc_data.get('army', 0) > 0:
                dist = self.map.distance(loc, self.my_capital)
                if dist <= 2 and loc_data['army'] > best_army:
                    best_army = loc_data['army']
                    best_source = loc

        if best_source:
            target = self._step_toward(best_source, self.my_capital)
            if target:
                return {
                    "action": "move_army",
                    "from": best_source,
                    "to": target,
                    "reason": f"reinforce capital from {best_source}"
                }
        return None

    def _step_toward(self, from_loc, toward_loc):
        """Return the neighbor of from_loc that is closest to toward_loc."""
        # Must consult the *active map's* adjacency, not the global
        # ``MAP_GRAPH`` (which is hard-coded to WoK). On WWA the from_loc is
        # e.g. ``white_walker_fortress`` — absent from the WoK graph and
        # raises ``KeyError`` mid-cascade, leaving the AI stuck.
best = None best_dist = 99 for n in self.map.graph[from_loc]: d = self.map.distance(n, toward_loc) if d < best_dist: best_dist = d best = n return best # ── Priority 2: Zero-Risk Captures ───────────────────────────────────────── def _find_zero_risk_captures(self, game_state): """Capture locations where our army > target army + 1, sorted by strategic value.""" candidates = [] for loc in MAP_GRAPH: loc_data = game_state.get(loc, {}) if loc_data.get('faction') == self.faction: continue target_army = loc_data.get('army', 0) for neighbor in MAP_GRAPH[loc]: n_data = game_state.get(neighbor, {}) if n_data.get('faction') == self.faction and n_data.get('army', 0) > target_army + 1: # Don't attack from capital if it would leave it defenseless if neighbor == self.my_capital: cap_threatened = False for cap_n in MAP_GRAPH[self.my_capital]: cn_data = game_state.get(cap_n, {}) if cn_data.get('faction') not in (self.faction, 'neutral') and cn_data.get('army', 0) > 0: cap_threatened = True break if cap_threatened: continue if self.memory.recently_failed(loc): continue candidates.append({ "target": loc, "from": neighbor, "our_army": n_data['army'], "their_army": target_army, "strategic_value": self.map.strategic_values.get(loc, 0), "is_neutral": loc_data.get('faction') == 'neutral', }) if not candidates: return None candidates.sort(key=lambda c: (-c['is_neutral'], -c['strategic_value'])) best = candidates[0] return { "action": "move_army", "from": best["from"], "to": best["target"], "reason": f"zero_risk_capture {best['target']} (our {best['our_army']} vs {best['their_army']})" } # ── Priority 3: Resource Transfers ───────────────────────────────────────── def _do_resource_transfers(self, game_state): """Transfer resources from ALL villages above threshold to capital, every cycle.""" transfer_targets = [] for loc in MAP_GRAPH: if loc == self.my_capital: continue loc_data = game_state.get(loc, {}) if (loc_data.get('faction') == self.faction and 'village' in loc and 
loc_data.get('resources', 0) > RESOURCE_TRANSFER_THRESHOLD): transfer_targets.append(loc) if not transfer_targets: return None return { "action": "resource_transfer", "locations": transfer_targets, "reason": f"transfer resources from {len(transfer_targets)} villages" } # ── Priority 4: Execute Active Plan Step ─────────────────────────────────── def _execute_plan_step(self, game_state): """Execute next step of active plan.""" if not self.planner.active: return None step = self.planner.next_step() if not step: return None action = step.get("action") if action == "create_army": cap_data = game_state.get(self.my_capital, {}) if cap_data.get('resources', 0) >= ARMY_COST: self.planner.advance() return { "action": "create_army", "location": self.my_capital, "count": 1, "reason": f"plan step: {self.planner.goal}" } else: return { "action": "collect_resources", "location": self.my_capital, "reason": "waiting for resources for plan" } elif action == "move_army": from_loc = step.get("from") to_loc = step.get("to") loc_data = game_state.get(from_loc, {}) if loc_data.get('faction') == self.faction and loc_data.get('army', 0) > 0: self.planner.advance() return { "action": "move_army", "from": from_loc, "to": to_loc, "reason": f"plan step: {self.planner.goal}" } else: reason = "can't execute move step" self.planner.abandon(reason) trace.get_current_span().add_event("plan_abandoned", attributes={ "reason": reason, }) telemetry.record_plan_abandoned(reason) return None elif action == "all_out_attack": self.planner.advance() return { "action": "all_out_attack", "location": self.my_capital, "reason": f"plan step: {self.planner.goal}" } self.planner.advance() return None # ── Priority 5: Create New Plan ──────────────────────────────────────────── def _create_new_plan(self, game_state): """Create a new plan based on current phase.""" # Sub-priority: if total army < 3, always build armies first if self.total_army < 3: armies_needed = 3 - self.total_army steps = [{"action": 
"create_army"} for _ in range(armies_needed)] goal = f"build {armies_needed} armies" self.planner.set_plan(goal, steps) trace.get_current_span().add_event("plan_created", attributes={ "goal": goal, "step_count": len(steps), }) telemetry.record_plan_created(goal) return self._execute_plan_step(game_state) # Sub-priority: capturable targets exist -> plan capture capture_plan = self._plan_capture(game_state) if capture_plan: return capture_plan # Sub-priority: READY_TO_ATTACK + feasible all-out if self.phase == GamePhase.READY_TO_ATTACK: attack_plan = self._plan_all_out_attack(game_state) if attack_plan: return attack_plan # Sub-priority: DESPERATE -> emergency build if self.phase == GamePhase.DESPERATE: cap_data = game_state.get(self.my_capital, {}) if cap_data.get('resources', 0) >= ARMY_COST: goal = "emergency army build" steps = [{"action": "create_army"}] self.planner.set_plan(goal, steps) trace.get_current_span().add_event("plan_created", attributes={ "goal": goal, "step_count": len(steps), }) telemetry.record_plan_created(goal) return self._execute_plan_step(game_state) # Sub-priority: concentrate isolated armies concentrate = self._concentrate_forces(game_state) if concentrate: return concentrate return None def _plan_capture(self, game_state): """Plan a capture: build N armies then move toward target.""" targets = self._find_capturable_targets(game_state) if not targets: return None target = targets[0] target_loc = target["target"] target_army = game_state.get(target_loc, {}).get('army', 0) needed_army = target_army + 3 steps = [] # Build armies if needed armies_to_build = max(0, needed_army - self.total_army) for _ in range(min(armies_to_build, 5)): # cap at 5 to avoid over-planning steps.append({"action": "create_army"}) # Move one hop from capital toward target next_hop = self._step_toward(self.my_capital, target_loc) if next_hop: steps.append({"action": "move_army", "from": self.my_capital, "to": next_hop}) if steps: goal = f"capture {target_loc}" 
self.planner.set_plan(goal, steps) trace.get_current_span().add_event("plan_created", attributes={ "goal": goal, "step_count": len(steps), }) telemetry.record_plan_created(goal) return self._execute_plan_step(game_state) return None def _find_capturable_targets(self, game_state): """Find targets we could capture, prioritizing low-defense neutrals for income.""" targets = [] for loc in MAP_GRAPH: loc_data = game_state.get(loc, {}) if loc_data.get('faction') == self.faction: continue if self.memory.recently_failed(loc): continue target_army = loc_data.get('army', 0) is_neutral = loc_data.get('faction') == 'neutral' strat_value = self.map.strategic_values.get(loc, 0) # Find best staging location (closest of our territories) best_staging = None best_staging_dist = 99 for our_loc in self.my_territories: dist = self.map.distance(our_loc, loc) if dist < best_staging_dist: best_staging_dist = dist best_staging = our_loc path_enemy = self.map.path_army_estimate( game_state, best_staging, loc, self.faction ) if best_staging else 999 targets.append({ "target": loc, "staging": best_staging, "target_army": target_army, "path_enemy": path_enemy, "is_neutral": is_neutral, "strategic_value": strat_value, "distance": best_staging_dist, }) # Sort: neutrals first, then by lowest defense, then by strategic value targets.sort(key=lambda t: ( not t['is_neutral'], t['target_army'], -t['strategic_value'], )) return targets def _plan_all_out_attack(self, game_state): """Plan an all-out attack if feasible (expected remaining army > 2).""" path_enemy = self.map.path_army_estimate( game_state, self.my_capital, self.enemy_capital, self.faction ) expected_remaining = self.total_army - path_enemy if expected_remaining > 2: goal = "all-out attack on enemy capital" steps = [{"action": "all_out_attack"}] self.planner.set_plan(goal, steps) trace.get_current_span().add_event("plan_created", attributes={ "goal": goal, "step_count": len(steps), }) telemetry.record_plan_created(goal) return 
self._execute_plan_step(game_state) return None def _concentrate_forces(self, game_state): """Move isolated friendly armies toward threats or strategic hub (V5).""" target_loc = self.memory.enemy_push_direction or "village_5" for loc in MAP_GRAPH: if loc == self.my_capital: continue loc_data = game_state.get(loc, {}) if loc_data.get('faction') == self.faction and loc_data.get('army', 0) > 0: # Check if this army is isolated (no enemy neighbors) has_enemy_neighbor = False for n in MAP_GRAPH[loc]: n_data = game_state.get(n, {}) if n_data.get('faction') not in (self.faction, 'neutral'): has_enemy_neighbor = True break if not has_enemy_neighbor: next_hop = self._step_toward(loc, target_loc) if next_hop and next_hop != loc: n_data = game_state.get(next_hop, {}) if n_data.get('faction') == self.faction or n_data.get('army', 0) < loc_data['army']: return { "action": "move_army", "from": loc, "to": next_hop, "reason": f"concentrate forces from {loc} toward {target_loc}" } return None # ── Priority 6: Fallback ─────────────────────────────────────────────────── def _fallback(self, game_state): """Collect resources at capital.""" return { "action": "collect_resources", "location": self.my_capital, "reason": "fallback: collect resources" } # ── Adaptive Timing ──────────────────────────────────────────────────────── def get_pause_time(self): """Adaptive loop timing based on phase.""" if self.phase == GamePhase.DESPERATE or self.memory.territory_lost_recently(): return random.randint(2, 5) elif self.phase == GamePhase.READY_TO_ATTACK: return random.randint(3, 8) else: return random.randint(5, 15) # ─── White Walkers AI ───────────────────────────────────────────────────────── class WhiteWalkerAI(StrategicAI): """Single-player opponent on the White Walkers Attack map. Economy: corpses, not resources. Corpses come from winning battles and passive generation at the fortress. Army units cost ``ARMY_COST_BY_MAP["white_walkers_attack"]["white_walkers"]`` corpses. 
Priority cascade (replaces ``StrategicAI.decide``): 1. Defend the fortress when enemies are adjacent and the garrison is outnumbered. 2. Capture any wall that the White Walkers do not already control, preferring the wall that needs the fewest attacking troops to beat its 2× defender multiplier. 3. Reinforce the weakest White Walker-held wall. 4. Raid the nearest barbarian village whose army is less than or equal to the closest White Walker garrison — a clean harvest for corpses. 5. If corpses are at or above the army cost and the fortress holds any troops, raise a new undead unit. 6. No-op fallback (corpse stream keeps flowing via the passive tick). """ def decide(self, game_state): self.my_territories, self.enemy_territories = self.memory.update( game_state, self.faction ) self.total_army = sum( data.get('army', 0) for loc, data in game_state.items() if data.get('faction') == self.faction ) self.phase = PhaseDetector.detect( self.my_territories, self.enemy_territories, self.total_army ) span = trace.get_current_span() span.set_attribute("ai.variant", "white_walkers") span.set_attribute("game.map.id", self.map_id) corpses = fetch_faction_corpses(self.faction) span.set_attribute("ai.corpse_pool", corpses) evaluated = [] action = self._defend_fortress(game_state) if action: evaluated.append(f"defend_fortress: TRIGGERED ({action.get('reason', '')})") self._last_evaluated = evaluated return action evaluated.append("defend_fortress: skipped") action = self._capture_unowned_wall(game_state) if action: evaluated.append(f"capture_wall: TRIGGERED ({action.get('reason', '')})") self._last_evaluated = evaluated return action evaluated.append("capture_wall: skipped") action = self._reinforce_weakest_wall(game_state) if action: evaluated.append(f"reinforce_wall: TRIGGERED ({action.get('reason', '')})") self._last_evaluated = evaluated return action evaluated.append("reinforce_wall: skipped") action = self._raid_barbarian(game_state) if action: evaluated.append(f"raid_barbarian: 
TRIGGERED ({action.get('reason', '')})") self._last_evaluated = evaluated return action evaluated.append("raid_barbarian: skipped") action = self._raise_army_from_corpses(game_state, corpses) if action: evaluated.append(f"raise_army: TRIGGERED ({action.get('reason', '')})") self._last_evaluated = evaluated return action evaluated.append("raise_army: skipped") self._last_evaluated = evaluated return self._passive_fallback() # ── Cascade helpers ─────────────────────────────────────────────────────── def _defend_fortress(self, game_state): cap_data = game_state.get(self.my_capital, {}) if not cap_data or cap_data.get('faction') != self.faction: return None garrison = cap_data.get('army', 0) max_threat = 0 threat_loc = None for n in self.map.neighbors(self.my_capital): n_data = game_state.get(n, {}) n_faction = n_data.get('faction') if n_faction and n_faction != self.faction and n_faction != 'barbarian': if n_data.get('army', 0) > max_threat: max_threat = n_data['army'] threat_loc = n if max_threat == 0 or max_threat <= garrison: return None # Pull back from the strongest adjacent wall we own (if any). best_source = None best_army = 0 for wall in self._walls(): w_data = game_state.get(wall, {}) if w_data.get('faction') == self.faction and w_data.get('army', 0) > best_army: best_source = wall best_army = w_data['army'] if best_source: return { "action": "move_army", "from": best_source, "to": self.my_capital, "reason": f"defend fortress vs {threat_loc} ({max_threat} army)", } return None def _capture_unowned_wall(self, game_state): best = None best_cost = float("inf") for wall in self._walls(): w_data = game_state.get(wall, {}) if w_data.get('faction') == self.faction: continue defender = w_data.get('army', 0) # Wall multiplier = 2 — must exceed 2 * defender to take it. 
needed = 2 * defender + 1 source, source_army = self._nearest_source_with_army(game_state, wall, needed) if source is None: continue total_cost = needed if total_cost < best_cost: best_cost = total_cost best = (source, wall, defender) if best is None: return None source, wall, defender = best return { "action": "move_army", "from": source, "to": self._step_toward(source, wall), "reason": f"capture {wall} (defender {defender}, needed {best_cost})", } def _reinforce_weakest_wall(self, game_state): mine = [ (w, game_state.get(w, {}).get('army', 0)) for w in self._walls() if game_state.get(w, {}).get('faction') == self.faction ] if not mine: return None weakest, weakest_army = min(mine, key=lambda item: item[1]) # Prefer non-capital neighbours so corpse-driven army production at # the capital isn't drained on every tick. Capital is a fallback # below — without it the AI gets stuck post-capture, since # ``move_army`` moves *all* army, leaving walls at 0 and capital as # the only source. capital_neighbour = None for n in self.map.neighbors(weakest): n_data = game_state.get(n, {}) if n_data.get('faction') != self.faction: continue n_army = n_data.get('army', 0) if n_army <= 1: continue if n == self.my_capital: capital_neighbour = (n, n_army) continue return { "action": "move_army", "from": n, "to": weakest, "reason": f"reinforce {weakest} from {n}", } # Capital fallback. Only fire if (a) the capital has more than the # weakest wall (otherwise it's not really reinforcing) and (b) the # capital has enough to spare — leaving 0 garrison is fine because # ``_raise_army_from_corpses`` no longer requires a non-zero # garrison to wrap a fresh unit around. 
if capital_neighbour is not None: cap_loc, cap_army = capital_neighbour if cap_army > weakest_army + 1: return { "action": "move_army", "from": cap_loc, "to": weakest, "reason": f"reinforce {weakest} from capital ({cap_army} → wall {weakest_army})", } return None def _raid_barbarian(self, game_state): targets = [ loc for loc, t in get_location_types(self.map_id).items() if t == "village" and get_initial_factions(self.map_id).get(loc) == "barbarian" and game_state.get(loc, {}).get('faction') == "barbarian" ] if not targets: return None best = None best_margin = -1 for target in targets: defender = game_state.get(target, {}).get('army', 0) source, source_army = self._nearest_source_with_army( game_state, target, defender + 1 ) if source is None: continue margin = source_army - defender if margin > best_margin: best_margin = margin best = (source, target, defender) if best is None: return None source, target, defender = best return { "action": "move_army", "from": source, "to": self._step_toward(source, target), "reason": f"raid {target} (defender {defender}) for corpses", } def _raise_army_from_corpses(self, game_state, corpses): # Capital must still belong to us — if NW captured it the AI has # soft-lost. The earlier `army >= 1` gate has been dropped: it # blocked the AI's primary economic loop after every capital→wall # reinforcement (move_army drains the source to 0), leaving the AI # idle until corpses overflowed. cap_data = game_state.get(self.my_capital, {}) if cap_data.get('faction') != self.faction: return None if corpses < self.army_cost: return None return { "action": "create_army", "location": self.my_capital, "count": 1, "reason": f"raise undead ({corpses} corpses, cost {self.army_cost})", } def _passive_fallback(self): # No-op for White Walkers: the passive corpse tick handles "idle". 
return { "action": "noop", "reason": "passive: corpses accumulate at fortress", } # ── Utility ─────────────────────────────────────────────────────────────── def _walls(self): types = get_location_types(self.map_id) return [loc for loc, t in types.items() if t == "wall"] def _nearest_source_with_army(self, game_state, target, needed): """Return the (location_id, army) of the closest friendly node with at least ``needed`` troops, or ``(None, 0)`` if nothing qualifies. """ best = (None, 0) best_dist = float("inf") for loc, data in game_state.items(): if data.get('faction') != self.faction: continue if data.get('army', 0) < needed: continue dist = self.map.distance(loc, target) if dist < best_dist: best = (loc, data.get('army', 0)) best_dist = dist return best # ─── AI State ────────────────────────────────────────────────────────────────── class AIState: def __init__(self): self.faction = None self.map_id = "war_of_kingdoms" self.active = False self.last_action_time = None self.game_start_time = None self.strategic_ai = None self.decision_thread = None self.stop_flag = threading.Event() ai_state = AIState() # ─── Preserved Helpers ───────────────────────────────────────────────────────── def get_location_url(location_id): """Get the URL for a location's API. Container hostnames in docker-compose are the stable WoK names (``southern-capital``, ``village-1`` …). On WWA the *logical* location id differs (``wall_west`` → still lives on container ``village-1``), so we look up the container via ``CONTAINER_FOR_LOCATION_ID`` rather than naively hyphenating the location id. """ if os.environ.get('IN_DOCKER'): host = CONTAINER_FOR_LOCATION_ID.get(location_id, location_id.replace('_', '-')) else: host = 'localhost' port = LOCATION_PORTS[location_id] return f"http://{host}:{port}" def fetch_faction_corpses(faction): """Query any location service for the faction's corpse pool. 
Returns 0 on error.""" # Use slot_1 (southern-capital container); any container is fine since # the DB is shared. try: if os.environ.get('IN_DOCKER'): base = "http://southern-capital:5001" else: base = "http://localhost:5001" resp = requests.get(f"{base}/faction_economy", params={"faction": faction}, timeout=2) resp.raise_for_status() return int(resp.json().get("corpses", 0)) except Exception: return 0 def make_api_request(location_id, endpoint, method='GET', data=None): """Make an API request to a location server with trace context""" url = f"{get_location_url(location_id)}/{endpoint}" headers = {"Content-Type": "application/json"} with tracer.start_as_current_span( "ai_api_request", kind=SpanKind.CLIENT, attributes={ "location.id": location_id, "location.endpoint": endpoint, "http.method": method } ) as span: inject(headers) # Inject trace context try: if method == 'GET': response = requests.get(url, headers=headers) else: # POST response = requests.post(url, json=data, headers=headers) span.set_attribute("http.status_code", response.status_code) response.raise_for_status() result = response.json() if not result.get("success", True): span.set_status(trace.StatusCode.ERROR, result.get("message", "Unknown error")) return result except requests.RequestException as e: span.record_exception(e) span.set_status(trace.StatusCode.ERROR, str(e)) logger.error("API request failed", extra={"error": str(e)}) return {"error": str(e)} def get_game_state(parent_ctx): """Get the current state of every location on the currently active map.""" # Which set of location ids belongs to this AI's map? Fall back to # WoK's 8 ids if AI isn't initialised yet. 
if ai_state.strategic_ai is not None: location_ids = list(get_map_graph(ai_state.strategic_ai.map_id).keys()) else: location_ids = list(MAP_GRAPH.keys()) with tracer.start_as_current_span( "get_game_state", kind=SpanKind.INTERNAL, context=parent_ctx, attributes={"location_count": len(location_ids)} ) as span: game_state = {} error_count = 0 for location_id in location_ids: data = make_api_request(location_id, '') if 'error' not in data: game_state[location_id] = data else: error_count += 1 span.add_event( "location_fetch_error", attributes={ "location": location_id, "error": str(data.get('error', 'Unknown error')) } ) span.set_attribute("locations_retrieved", len(game_state)) span.set_attribute("errors", error_count) if error_count > 0: span.set_status(trace.StatusCode.ERROR, f"Failed to fetch {error_count} locations") return game_state # ─── Action Executor ─────────────────────────────────────────────────────────── def execute_strategic_action(action, game_state, parent_ctx, decision_link=None): """Execute an action returned by StrategicAI.decide().""" if not action: return action_type = action.get("action") reason = action.get("reason", "") links = [] if decision_link: links = [Link(decision_link, attributes={"link.type": "ai_decision_trigger"})] with tracer.start_as_current_span( "execute_ai_action", kind=SpanKind.INTERNAL, context=parent_ctx, links=links, attributes={ "action_type": action_type, "reason": reason, } ) as span: try: if action_type == "create_army": location = action.get("location", ai_state.strategic_ai.my_capital) count = action.get("count", 1) armies_created = 0 for i in range(count): result = make_api_request(location, 'create_army', method='POST') if result.get('success'): armies_created += 1 logger.info("AI created army", extra={"army_number": armies_created, "total_requested": count, "reason": reason}) else: logger.warning("Failed to create army", extra={"message": result.get('message', 'unknown')}) break if i < count - 1: time.sleep(0.5) 
span.set_attribute("armies_created", armies_created) span.set_attribute("armies_requested", count) elif action_type == "move_army": from_loc = action["from"] to_loc = action["to"] result = make_api_request( from_loc, 'move_army', method='POST', data={"target_location": to_loc} ) success = result.get('success', False) span.set_attribute("from_location", from_loc) span.set_attribute("target_location", to_loc) span.set_attribute("move_success", success) logger.info("AI move army", extra={"from_location": from_loc, "to_location": to_loc, "reason": reason, "success": success}) if not success: ai_state.strategic_ai.memory.record_failed_attack(to_loc) elif action_type == "all_out_attack": location = action.get("location", ai_state.strategic_ai.my_capital) result = make_api_request(location, 'all_out_attack', method='POST') span.set_attribute("all_out_attack", True) logger.info("AI all-out attack", extra={"location": location, "reason": reason}) elif action_type == "collect_resources": location = action.get("location", ai_state.strategic_ai.my_capital) result = make_api_request(location, 'collect_resources', method='POST') logger.info("AI collected resources", extra={"location": location, "reason": reason}) elif action_type == "resource_transfer": locations = action.get("locations", []) for loc in locations: result = make_api_request(loc, 'send_resources_to_capital', method='POST') logger.info("AI transferred resources", extra={"from_location": loc}) span.set_attribute("transfers_count", len(locations)) elif action_type == "noop": # WhiteWalkerAI uses ``noop`` as a quiet-tick fallback when # corpses are accruing but no actionable move exists. Still # emit a span so replay shows the AI was awake but chose not # to act. 
span.set_attribute("ai.cycle.idle", True) logger.debug("AI idle cycle", extra={"reason": reason}) except Exception as e: span.record_exception(e) span.set_status(trace.StatusCode.ERROR, str(e)) logger.error("Error executing AI action", extra={"error": str(e), "action_type": action_type}) # ─── Decision Loop ───────────────────────────────────────────────────────────── def ai_decision_loop(): """Main AI decision loop that runs in a separate thread""" logger.info("AI decision loop started", extra={"faction": ai_state.faction}) decision_count = 0 while ai_state.active and not ai_state.stop_flag.is_set(): decision_count += 1 with tracer.start_as_current_span( "ai_decision_cycle", kind=SpanKind.INTERNAL, attributes={ "faction": ai_state.faction, "game_phase": ai_state.strategic_ai.phase.value if ai_state.strategic_ai else "unknown", "cycle_number": decision_count, "cycle_start": datetime.now().isoformat(), "session_start": ai_state.game_start_time.isoformat() if ai_state.game_start_time else None } ) as cycle_span: parent_ctx = baggage.set_baggage("context", "parent") cycle_start_time = time.time() try: # Get current game state game_state = get_game_state(parent_ctx) my_capital = ai_state.strategic_ai.my_capital # Check if game is over if my_capital not in game_state or game_state[my_capital].get('faction') != ai_state.faction: logger.info("AI detected game over", extra={"faction": ai_state.faction, "cycle_number": decision_count}) cycle_span.set_attribute("game_over_detected", True) cycle_span.set_attribute("final_cycle", True) ai_state.active = False break # Make decision using StrategicAI decision_context = None with tracer.start_as_current_span( "ai_decision", kind=SpanKind.INTERNAL, context=parent_ctx, attributes={"game_phase": ai_state.strategic_ai.phase.value} ) as decision_span: action = ai_state.strategic_ai.decide(game_state) decision_context = decision_span.get_span_context() if action: decision_span.set_attribute("chosen_action", action.get("action", 
"none")) decision_span.set_attribute("reason", action.get("reason", "")) # Strategic context on spans decision_span.set_attribute("my_territories", str(list(ai_state.strategic_ai.my_territories))) decision_span.set_attribute("enemy_territories", str(list(ai_state.strategic_ai.enemy_territories))) decision_span.set_attribute("total_army", ai_state.strategic_ai.total_army) decision_span.set_attribute("game_phase", ai_state.strategic_ai.phase.value) decision_span.set_attribute("priorities_evaluated", str(ai_state.strategic_ai._last_evaluated)) if action: action_type = action.get("action", "none") telemetry.record_decision(action_type, ai_state.strategic_ai.phase.value) execute_strategic_action(action, game_state, parent_ctx, decision_link=decision_context) ai_state.last_action_time = datetime.now() cycle_span.set_attribute("action_executed", True) cycle_span.set_attribute("action_type", action_type) else: cycle_span.set_attribute("no_action_taken", True) cycle_span.set_attribute("cycle_complete", True) # Session metrics if ai_state.game_start_time: elapsed_time = (datetime.now() - ai_state.game_start_time).total_seconds() cycle_span.set_attribute("session_elapsed_seconds", elapsed_time) # Record cycle duration telemetry.record_cycle_duration(time.time() - cycle_start_time) # Adaptive pause pause_time = ai_state.strategic_ai.get_pause_time() cycle_span.set_attribute("pause_duration_seconds", pause_time) logger.info("AI waiting", extra={"pause_seconds": pause_time, "phase": ai_state.strategic_ai.phase.value}) if ai_state.stop_flag.wait(pause_time): cycle_span.set_attribute("interrupted", True) break if not ai_state.active: cycle_span.set_attribute("ai_deactivated", True) break except Exception as e: cycle_span.record_exception(e) cycle_span.set_status(trace.StatusCode.ERROR, str(e)) logger.error("Error in AI decision cycle", extra={"error": str(e), "cycle_number": decision_count}) time.sleep(5) # ─── Flask Endpoints 
@app.route('/activate', methods=['POST'])
def activate_ai():
    """Activate the AI for a specific faction on a specific map.

    Accepts ``{"faction": ..., "map_id": ...}``. Defaults to War of Kingdoms
    when ``map_id`` is omitted (backward compat). Dispatches to
    ``WhiteWalkerAI`` when the requested faction is ``white_walkers``;
    otherwise uses the classic ``StrategicAI``.

    Returns:
        A JSON response with ``success`` and ``message``. Every rejection
        (bad faction/map, AI already active, previous session still
        stopping) is answered with HTTP 400.
    """
    # silent=True: a missing, malformed or non-JSON body yields None here and
    # is reported through the "Invalid faction" response below, rather than
    # through an error raised by get_json() itself.
    payload = request.get_json(silent=True)
    data = payload if isinstance(payload, dict) else {}
    faction = data.get('faction')
    map_id = data.get('map_id', 'war_of_kingdoms')

    valid_factions = set()
    for capitals in CAPITALS_BY_MAP.values():
        valid_factions.update(capitals.keys())

    # The isinstance guards keep unhashable JSON values (lists, objects) from
    # raising TypeError inside the membership tests.
    if not isinstance(faction, str) or faction not in valid_factions:
        return jsonify({"success": False, "message": "Invalid faction"}), 400
    if not isinstance(map_id, str) or map_id not in MAP_GRAPHS_BY_MAP:
        return jsonify({"success": False, "message": f"Unknown map_id: {map_id}"}), 400
    if faction not in get_capitals(map_id):
        return jsonify({
            "success": False,
            "message": f"Faction {faction} is not valid on map {map_id}"
        }), 400

    if ai_state.active:
        return jsonify({"success": False, "message": "AI already active"}), 400

    # /deactivate only waits 5 s for the loop to exit. If the old thread is
    # still running, clearing stop_flag below would revive it next to the new
    # thread, so refuse until it has actually finished.
    previous_thread = ai_state.decision_thread
    if previous_thread is not None and previous_thread.is_alive():
        return jsonify({
            "success": False,
            "message": "Previous AI session is still shutting down"
        }), 400

    # Build the AI before touching shared state: if the constructor raises,
    # ai_state must not be left marked active with no decision thread.
    if faction == "white_walkers":
        strategic_ai = WhiteWalkerAI(faction, map_id=map_id)
    else:
        strategic_ai = StrategicAI(faction, map_id=map_id)

    ai_state.faction = faction
    ai_state.map_id = map_id
    ai_state.strategic_ai = strategic_ai
    ai_state.game_start_time = datetime.now()
    ai_state.stop_flag.clear()
    ai_state.active = True

    # Register state callback for observable gauges
    telemetry.set_state_callback(lambda: {
        "territory_count": len(ai_state.strategic_ai.my_territories),
        "total_army": ai_state.strategic_ai.total_army,
        "faction": ai_state.faction or "unknown",
    } if ai_state.strategic_ai else None)

    # Corpse-pool gauge: only meaningful for White Walkers. For other AIs
    # the callback returns None so the gauge stays unobserved.
    def _corpse_cb():
        if ai_state.faction == "white_walkers":
            return ("white_walkers", fetch_faction_corpses("white_walkers"))
        return None
    telemetry.set_corpse_callback(_corpse_cb)

    # Start AI decision thread
    ai_state.decision_thread = threading.Thread(target=ai_decision_loop, daemon=True)
    ai_state.decision_thread.start()

    variant = type(ai_state.strategic_ai).__name__
    logger.info(
        "AI activated",
        extra={"faction": faction, "map_id": map_id, "variant": variant},
    )
    return jsonify({
        "success": True,
        "message": f"AI activated for {faction} faction on {map_id}",
        "map_id": map_id,
        "variant": variant,
    })


@app.route('/deactivate', methods=['POST'])
def deactivate_ai():
    """Deactivate the AI and wait briefly for its decision thread to exit.

    Returns HTTP 400 when the AI is not active. The join is bounded (5 s), so
    the thread may still be finishing an in-flight cycle when this returns.
    """
    if not ai_state.active:
        return jsonify({"success": False, "message": "AI not active"}), 400

    ai_state.active = False
    ai_state.stop_flag.set()

    # Wait for thread to stop (with timeout)
    worker = ai_state.decision_thread
    if worker:
        worker.join(timeout=5)
        if worker.is_alive():
            # Keep the reference so /activate can see the loop is still alive.
            logger.warning(
                "AI decision thread did not stop within timeout",
                extra={"faction": ai_state.faction},
            )
        else:
            ai_state.decision_thread = None

    logger.info("AI deactivated", extra={"faction": ai_state.faction})
    return jsonify({"success": True, "message": "AI deactivated"})


@app.route('/status', methods=['GET'])
def ai_status():
    """Get current AI status.

    ``game_phase`` is only reported while the AI is active; ``last_action``
    is an ISO-8601 timestamp or ``None`` if no action has been executed yet.
    """
    last_action = ai_state.last_action_time
    phase = None
    if ai_state.active and ai_state.strategic_ai:
        phase = ai_state.strategic_ai.phase.value
    return jsonify({
        "active": ai_state.active,
        "faction": ai_state.faction,
        "last_action": last_action.isoformat() if last_action else None,
        "game_phase": phase
    })


@app.route('/health', methods=['GET'])
def health_check():
    """Health check endpoint"""
    return jsonify({"status": "healthy"})


if __name__ == '__main__':
    port = int(os.environ.get('PORT', 8081))
    app.run(host='0.0.0.0', port=port, debug=False)
class AITelemetry:
    """OpenTelemetry (logs, traces, metrics) and Pyroscope wiring for the AI opponent.

    Constructing an instance installs the global logger, tracer and meter
    providers, attaches an OTLP log handler to the root logger, configures
    Pyroscope, and creates the ``ai.*`` instruments used by the ``record_*``
    helpers and the observable-gauge callbacks.
    """

    def __init__(self, service_name="ai-opponent", logging_endpoint="http://alloy:4318", tracing_endpoint="http://alloy:4317", metrics_endpoint="http://alloy:4318"):
        """Set up all telemetry signals for ``service_name``.

        Args:
            service_name: Value of the ``SERVICE_NAME`` resource attribute; also
                used as the Pyroscope application name and the logger name.
            logging_endpoint: Base URL for the OTLP log exporter.
            tracing_endpoint: Endpoint for the OTLP gRPC span exporter.
            metrics_endpoint: Base URL for the OTLP metric exporter.
        """
        self.service_name = service_name
        self.logging_endpoint = logging_endpoint
        self.tracing_endpoint = tracing_endpoint
        self.metrics_endpoint = metrics_endpoint

        # Both gauge callbacks start unset and are initialised together, before
        # any observable gauge is registered in _setup_metrics().
        self._state_callback = None
        self._corpse_callback = None

        self.resource = Resource.create(attributes={
            SERVICE_NAME: service_name,
            "ai.difficulty": "normal",
            "ai.version": "1.0"
        })

        self._setup_logging()
        self._setup_tracing()
        self._setup_metrics()
        self._setup_profiling()

    def _setup_logging(self):
        """Configure OpenTelemetry logging"""
        self.logger_provider = LoggerProvider(resource=self.resource)
        set_logger_provider(self.logger_provider)

        log_exporter = OTLPLogExporter(
            endpoint=f"{self.logging_endpoint}/v1/logs"
        )
        self.logger_provider.add_log_record_processor(
            BatchLogRecordProcessor(
                exporter=log_exporter,
                max_queue_size=30,
                max_export_batch_size=5
            )
        )

        # Setup root logger
        handler = LoggingHandler(
            level=logging.NOTSET,
            logger_provider=self.logger_provider
        )
        logging.getLogger().addHandler(handler)
        logging.getLogger().setLevel(logging.INFO)

        self.logger = logging.getLogger(self.service_name)

    def _setup_tracing(self):
        """Configure OpenTelemetry tracing"""
        trace.set_tracer_provider(TracerProvider(resource=self.resource))

        # NOTE(review): this is the gRPC exporter; the "/v1/traces" suffix is an
        # OTLP/HTTP convention and is presumably ignored here — confirm.
        otlp_exporter = OTLPSpanExporter(
            endpoint=f"{self.tracing_endpoint}/v1/traces",
            insecure=True
        )
        span_processor = BatchSpanProcessor(
            span_exporter=otlp_exporter,
            max_export_batch_size=1
        )
        trace.get_tracer_provider().add_span_processor(span_processor)

        self.tracer = trace.get_tracer(__name__)

    def _setup_profiling(self):
        """Configure Pyroscope profiling + OTel span-profile linkage."""
        pyroscope.configure(
            application_name=self.service_name,
            server_address=os.getenv("PYROSCOPE_SERVER_ADDRESS", "http://alloy:9999"),
            tags={"service_name": self.service_name},
            oncpu=True,
            gil_only=True,
        )
        trace.get_tracer_provider().add_span_processor(PyroscopeSpanProcessor())

    def _setup_metrics(self):
        """Configure OpenTelemetry metrics"""
        self.metric_exporter = OTLPMetricExporter(
            endpoint=f"{self.metrics_endpoint}/v1/metrics"
        )
        self.metric_reader = PeriodicExportingMetricReader(
            self.metric_exporter,
            export_interval_millis=10000
        )
        self.meter_provider = MeterProvider(
            metric_readers=[self.metric_reader],
            resource=self.resource,
            exemplar_filter=TraceBasedExemplarFilter()
        )
        metrics.set_meter_provider(self.meter_provider)
        self.meter = metrics.get_meter(__name__)

        # Counters
        self._decisions_counter = self.meter.create_counter(
            name="ai.decisions",
            description="Number of AI decisions made",
            unit="1"
        )
        self._plans_created_counter = self.meter.create_counter(
            name="ai.plans_created",
            description="Number of plans created",
            unit="1"
        )
        self._plans_abandoned_counter = self.meter.create_counter(
            name="ai.plans_abandoned",
            description="Number of plans abandoned",
            unit="1"
        )

        # Histogram
        self._cycle_duration_histogram = self.meter.create_histogram(
            name="ai.decision_cycle_duration_seconds",
            description="Duration of AI decision cycles",
            unit="s"
        )

        # Observable gauges
        self.meter.create_observable_gauge(
            name="ai.territory_count",
            description="Number of territories controlled by faction",
            callbacks=[self._observe_territory_count],
            unit="1"
        )
        self.meter.create_observable_gauge(
            name="ai.total_army",
            description="Total army size for faction",
            callbacks=[self._observe_total_army],
            unit="1"
        )

        # White Walkers Attack metrics (additive; only populate when the
        # relevant callback is wired).
        self._walls_captured_counter = self.meter.create_counter(
            name="ai.walls_captured",
            description="Number of wall keeps captured by this AI variant",
            unit="1",
        )
        self.meter.create_observable_gauge(
            name="ai.corpse_pool",
            description="White Walker corpse pool (cost pool for raising armies)",
            callbacks=[self._observe_corpse_pool],
            unit="1",
        )

    def _observe_state_field(self, field):
        """Return a one-element observation list for ``state[field]``.

        Returns an empty list when no state callback is registered, when the
        callback returns a falsy value, or when anything raises. Failures are
        logged at DEBUG and never propagated.
        """
        if not self._state_callback:
            return []
        try:
            state = self._state_callback()
            if not state:
                return []
            return [Observation(
                value=state[field],
                attributes={"faction": state["faction"]}
            )]
        except Exception:
            self.logger.debug(
                "AI state gauge callback failed",
                extra={"gauge_field": field},
                exc_info=True,
            )
            return []

    def _observe_territory_count(self, options: CallbackOptions) -> Iterable[Observation]:
        """Callback for territory count observable gauge"""
        return self._observe_state_field("territory_count")

    def _observe_total_army(self, options: CallbackOptions) -> Iterable[Observation]:
        """Callback for total army observable gauge"""
        return self._observe_state_field("total_army")

    def set_state_callback(self, fn):
        """Register a callback that returns current AI state for observable gauges"""
        self._state_callback = fn

    def set_corpse_callback(self, fn):
        """Register a callback that returns ``(faction, corpses)`` for the
        ``ai.corpse_pool`` gauge. ``fn`` should return ``None`` when the
        current AI variant does not use the corpse economy.
        """
        self._corpse_callback = fn

    def _observe_corpse_pool(self, options: CallbackOptions) -> Iterable[Observation]:
        """Callback for the ``ai.corpse_pool`` gauge.

        Emits nothing when no corpse callback is set, when it returns a falsy
        value, or when it raises (logged at DEBUG).
        """
        if not self._corpse_callback:
            return []
        try:
            result = self._corpse_callback()
            if not result:
                return []
            faction, corpses = result
            return [Observation(value=int(corpses), attributes={"faction": faction})]
        except Exception:
            self.logger.debug("AI corpse gauge callback failed", exc_info=True)
            return []

    def record_wall_captured(self, wall_id, source):
        """Increment the walls-captured counter. ``source`` is the AI variant name."""
        self._walls_captured_counter.add(
            1, {"wall_id": wall_id, "variant": source}
        )

    def record_decision(self, action_type, phase):
        """Record an AI decision metric"""
        self._decisions_counter.add(1, {"action_type": action_type, "phase": phase})

    def record_plan_created(self, goal):
        """Record a plan creation metric"""
        self._plans_created_counter.add(1, {"goal": goal})

    def record_plan_abandoned(self, reason):
        """Record a plan abandonment metric"""
        self._plans_abandoned_counter.add(1, {"reason": reason})

    def record_cycle_duration(self, seconds):
        """Record decision cycle duration"""
        self._cycle_duration_histogram.record(seconds)

    def collect_metrics(self):
        """Force collection and export of all metrics (best-effort)."""
        try:
            self.metric_reader.collect()
            self.meter_provider.force_flush()
        except Exception:
            # Forced export is an optimisation; failures must not break callers.
            self.logger.debug("Forced metric collection failed", exc_info=True)

    def get_tracer(self):
        """Get the configured tracer"""
        return self.tracer

    def get_logger(self):
        """Get the configured logger"""
        return self.logger

    def shutdown(self):
        """Flush and shutdown all telemetry providers.

        Each provider is shut down independently so a failure in one does not
        prevent the others from flushing.
        """
        try:
            trace.get_tracer_provider().shutdown()
        except Exception:
            pass
        try:
            self.meter_provider.shutdown()
        except Exception:
            pass
        try:
            self.logger_provider.shutdown()
        except Exception:
            pass
================================================ # app/ — Location Servers > 8 Flask microservices representing map territories in the *War of Kingdoms* game. This doc is read by any AI coding agent (Claude, Cursor, Codex, Cline). For scenario-wide context read [`../AGENTS.md`](../AGENTS.md) first. ## Purpose All 8 locations run the same codebase. A container's **slot** (set via `SLOT_ID` env var, `slot_1` … `slot_8`) is fixed for the container's lifetime; the **logical identity** it serves (`southern_capital`, `wall_west`, `barbarian_village_east`, …) is resolved at boot and on `/reload` from the active map in `game_state.db`. Each location: - Owns a row in the shared `game_state.db` (resources, army, faction). - Exposes an HTTP API for collecting resources, creating armies, moving armies, and launching attacks. - Instruments every route with OpenTelemetry traces, logs, and five custom game metrics. - Runs passive resource generation for villages (every 15 s) and handles cooldowns for capitals. - On the White Walkers Attack map, also runs: passive barbarian army growth (every 30 s at barbarian villages), passive corpse generation (every 15 s at the White Walker fortress), passive resource generation at the Night's Watch capital (+5 every 10 s — WWA has no friendly villages, so this supplements the click-only economy), and the wall multiplier (defenders count 2× at `wall`-type locations). Ports 5001-5008 (the location ids below are the War of Kingdoms slot assignments; White Walkers Attack reuses the same ports with its own ids): | Location ID | Service name | Port | Type | |---|---|---|---| | `southern_capital` | `southern-capital` | 5001 | capital | | `northern_capital` | `northern-capital` | 5002 | capital | | `village_1` | `village-1` | 5003 | village | | `village_2` | `village-2` | 5004 | village | | `village_3` | `village-3` | 5005 | village | | `village_4` | `village-4` | 5006 | village | | `village_5` | `village-5` | 5007 | village | | `village_6` | `village-6` | 5008 | village | Service names (hyphenated) match the `SERVICE_NAME` resource attribute used in traces. 
Location IDs (underscored) are what DB rows and `game_config.py` use. Bridge: `location_id.replace('_', '-')`. ## File map | File | Size | Purpose | |---|---|---| | `game_config.py` | ~3 KB | `MAPS` dict: per-map `slot_assignments`, `locations` (name, type, faction, connections, initial resources/army, port) and `rules` (costs, currencies, wall multiplier, passive intervals), plus accessor helpers. `LOCATIONS`, `RESOURCE_GENERATION` and `COSTS` remain as War of Kingdoms backward-compat aliases. | | `telemetry.py` | ~11 KB | `GameTelemetry` class — traces, logs, metrics (5 observable gauges + 1 counter for game state), plus Pyroscope profiling with OTel span-profile linkage. | | `location_server.py` | ~52 KB (~1200 lines) | `LocationServer` class — Flask app, routes, DB access, pathfinding, battle resolution, background-thread movement. | | `run_game.py` | — | CLI to run all 8 services as separate local processes (non-Docker). | | `Dockerfile` | small | `python:3.11-slim`, `pip install -r requirements.txt`; the default `CMD` is `python run_game.py`. | | `requirements.txt` | small | Flask 3.1.3, requests 2.33.1, OpenTelemetry SDK/API + OTLP gRPC/HTTP exporters, `pyroscope-io` + `pyroscope-otel` for profiling. 
| ## Routes | Method | Path | Handler span name | Purpose | |---|---|---|---| | `GET` | `/` | `get_location_info` | Location state + optional cooldown | | `POST` | `/collect_resources` | `collect_resources` | Capital-only; 5 s cooldown; +20 resources | | `POST` | `/create_army` | `create_army` | Capital-only; costs 30 resources → +1 army unit | | `POST` | `/move_army` | `move_army_request` | Move army to adjacent location; spawns background movement thread | | `POST` | `/all_out_attack` | `all_out_attack` | Capital-to-capital attack via `_find_path(target, ATTACK)` | | `POST` | `/receive_army` | `receive_army` | Target of `_continue_army_movement`; resolves battle via `_handle_battle` | | `POST` | `/receive_resources` | `receive_resources` | Target of `_transfer_resources_along_path` | | `GET` | `/health` | — | Docker health check; returns `{"status":"ok"}` | | `POST` | `/send_resources_to_capital` | — | Village → friendly capital resource forwarding (used by AI) | | `POST` | `/reload` | — | Re-read `active_map_id` + rebind slot identity in place (war_map calls this after `/select_map`) | | `GET` | `/faction_economy?faction=...` | — | Read a faction's corpse pool (AI uses it) | ## Key algorithms ### Dijkstra pathfinding — `_find_path()` at `location_server.py:128-182` Faction-aware edge weights: | Mode | Friendly | Neutral | Enemy | |---|---|---|---| | `PathType.RESOURCE` | 1 | 2 | ∞ (unreachable) | | `PathType.ATTACK` | 1 | 2 | 3 | Resource routing only returns a path if the source is a capital of a known faction. Attack routing allows crossing enemy terrain at a cost. 
### Battle resolution — `_handle_battle()` at `location_server.py:184-207` | Case | Outcome | New army | New faction | |---|---|---|---| | Same faction | `reinforcement` | `attacking + defending` | defender's | | `attacking > defending` | `attacker_victory` | `attacking - defending` | attacker's | | `defending > attacking` | `defender_victory` | `defending - attacking` | defender's | | equal | `stalemate` | `0` | defender's (territory held by default) | Every outcome calls `telemetry.record_battle(attacker_faction, defender_faction, result)`, which increments the `game.battles` counter and force-flushes metrics. ### Atomic state updates — `_update_location_state()` Forces metric collection at `location_server.py:124` on important changes (`faction`, `resources`, or `army` mutated), so the dashboard reflects state within ~1 s of the mutating request instead of waiting for the 10 s `PeriodicExportingMetricReader` cycle. ## OpenTelemetry patterns specific to `app/` ### HTTP clients go through one helper `_make_request_with_trace()` at `location_server.py:327-352` is the only place outbound HTTP happens. It wraps every call in a CLIENT span, sets `http.url` and `http.status_code` attributes, and calls `inject(headers)` to propagate W3C trace context downstream. If you add a new outbound call, use this helper — do not call `requests.post` directly. ### Background threads capture context explicitly Two methods spawn background threads for delayed operations: - `_continue_army_movement()` at `location_server.py:209-271` — 5 s delay before the army arrives at the next location. - `_transfer_resources_along_path()` at `location_server.py:273-325` — 5 s delay before the resources arrive. Both follow the canonical pattern: ```python ctx = get_current() # capture before Thread().start() def work(): token = attach(ctx) # re-attach inside the thread try: with tracer.start_as_current_span("..."): ... 
# span now belongs to the captured trace finally: detach(token) Thread(target=work).start() ``` If you add a new background thread, replicate this pattern. Python threads will **not** inherit OTel context on their own — the span will be orphaned with a fresh trace_id. ### Span attributes that feed the Grafana dashboard Preserve these when adding or modifying spans (the provisioned dashboard's TraceQL filters depend on them): - `span.resource.movement = true` — any resource transfer span - `span.battle.occurred = true` — any span that triggers `_handle_battle` - `span.player.action = true` — any span caused by a human player action ## Custom metrics — `telemetry.py` See `AGENTS.md` for the full cross-service metrics table. `app/`-specific: | Metric | Type | Callback location in `telemetry.py` | |---|---|---| | `game.resources` | observable gauge | `_observe_resources` at `:176-193` | | `game.army_size` | observable gauge | `_observe_army_size` at `:195-213` | | `game.battles` | counter | `record_battle` at `:274-290` | | `game.resource_transfer_cooldown` | observable gauge | `_observe_resource_cooldown` at `:215-233` | | `game.location_control` | observable gauge | `_observe_location_control` at `:235-260` (values: `northern=1`, `southern=2`, `neutral=0`, unknown=`-1`) | The gauge callbacks read from live server state via `_get_location_state()`, which the `LocationServer` registers on the telemetry instance at construction time. ## New mechanics (White Walkers Attack) All defined in `app/game_config.py`'s `MAPS["white_walkers_attack"]["rules"]`. All behave as no-ops on `war_of_kingdoms`. - **Wall defender multiplier** — `_handle_battle` accepts a `location_type` argument and scales `defending_army` by `rules["wall_multiplier"]` (2.0 on WWA, 1.0 on WoK) when the location type is `wall`. Remaining defender count is converted back to physical units after the fight. 
- **Corpse economy** — when the battle winner is `white_walkers`, the post-battle hook in `receive_army` calls `self._add_corpses(attacking + defending - remaining, "white_walkers")`. `create_army` reads `get_army_currency(map_id, faction)` and, for `currency == "corpses"`, atomically decrements via `_spend_corpses` instead of touching `resources`. The corpse pool lives in `faction_economy` (persistent) so a `/reload` doesn't wipe it. - **Barbarian passive growth** — `_start_barbarian_growth(interval_s)` runs when `faction == "barbarian"`; adds +1 army every `rules["barbarian_army_growth_interval_s"]` (30 s). Guards each iteration against identity changes via `/reload`. - **Captured-camp resource generation** — `_start_passive_generation()` is launched for *every* `type == "village"` slot at boot (including barbarian Free Folk camps). The per-iteration `faction != "barbarian"` guard keeps it a no-op while the camp is still barbarian, then it starts producing the standard village amount the moment the player captures it. Without this fallthrough, captured camps stayed unproductive because the thread was never started on barbarian slots. - **White Walker passive corpses** — `_start_white_walker_corpse_tick(interval_s)` runs at the WW fortress, +1 corpse every `rules["white_walker_passive_corpse_interval_s"]` (15 s). - **Night's Watch passive resources** — `_start_nights_watch_capital_resource_tick(interval_s, amount)` runs at Castle Black on WWA (`faction == "nights_watch"`, `loc_type == "capital"`), adding `rules["nights_watch_capital_passive_amount"]` resources every `rules["nights_watch_capital_passive_interval_s"]` (5 per 10 s). Manual `/collect_resources` (+20, 5 s cooldown) still works alongside. ## DB additions (live in `game_state.db`) - **`game_config`** — `(key, value)` key/value store. The `active_map_id` row is authoritative; containers re-read it on boot and on `/reload`. - **`faction_economy`** — `(faction, corpses)`. 
Updated by `_add_corpses` / `_spend_corpses`. Read by the AI via `/faction_economy?faction=white_walkers`. ## Environment | Var | Default | Purpose | |---|---|---| | `SLOT_ID` | — (required, `slot_1` … `slot_8`) | Fixed physical slot this container occupies | | `LOCATION_ID` | — (legacy; no longer authoritative) | Kept for backward-compat with `run_game.py` local dev | | `PORT` | derived from `LOCATION_ID` | HTTP listen port | | `IN_DOCKER` | unset | When set, location URLs resolve via container DNS (`village-2:5004`) instead of `localhost:5004` | | `DATABASE_FILE` | `/data/game_state.db` (Docker) / `./game_state.db` (local) | SQLite WAL-mode DB | ## Common edits **Add a new location.** 1. Add an entry to the target map's `locations` under `MAPS` in `game_config.py` (name, type, faction, connections list, initial resources/army, port) and point a slot at it in that map's `slot_assignments`; a ninth location also needs a new id in `SLOT_IDS`. 2. Add a `village-N` service in both `docker-compose.yml` and `docker-compose.coda.yml`. 3. Add to the `LOCATION_PORTS` dict in `war_map/app.py` and `ai_opponent/ai_server.py`. 4. Update the services-and-ports table in `../AGENTS.md` and the location table at the top of this file. **Add a new metric.** 1. Add an observable gauge (or counter) in `telemetry.py` next to the existing ones. 2. If it reads from location state, register a callback that calls `self._get_location_state(...)`. 3. Add a row to the metrics table in this doc and in `../AGENTS.md`. **Add a new route.** 1. Wrap the handler in `tracer.start_as_current_span(..., context=extract(request.headers), ...)`. 2. Add `"span.player.action": True` (if triggered by a player) so the dashboard picks it up. 3. If the route spawns a background thread, follow the `get_current()` / `attach` / `detach` pattern from `location_server.py:209-271`. ## Keep this doc current Per the sub-agent rule, any change to routes, metrics, span attributes, env vars, or the line-number anchors above must land in the same work unit. Before returning a response that touched `app/`, grep this file for references to anything you changed. 
## Cross-references - [`../AGENTS.md`](../AGENTS.md) — scenario-wide architecture and patterns - [`../war_map/CLAUDE.md`](../war_map/CLAUDE.md) — the consumer of this service's HTTP API on behalf of the player - [`../ai_opponent/CLAUDE.md`](../ai_opponent/CLAUDE.md) — the other consumer of this API (autonomous) - [`../SPAN_LINKS.md`](../SPAN_LINKS.md) — how action spans chain across services ================================================ FILE: game-of-tracing/app/Dockerfile ================================================ FROM python:3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt COPY . . ENV FLASK_APP=location_server.py # Default to running the main server script CMD ["python", "run_game.py"] ================================================ FILE: game-of-tracing/app/game_config.py ================================================ """Game configuration for all maps in the game-of-tracing scenario. Each entry in ``MAPS`` describes a playable map. A map has: - ``display_name`` / ``description`` — surfaced by the map picker UI. - ``single_player`` + ``player_faction`` / ``ai_faction`` — the map picker uses these to skip faction selection and auto-activate the AI when appropriate. - ``factions`` — the valid faction strings for this map. - ``slot_assignments`` — maps the fixed container slot ids (``slot_1`` … ``slot_8``) to the logical location id that slot serves on this map. The 8 location containers carry only their ``SLOT_ID`` — their in-game identity is resolved at boot (and on ``/reload``) via this table. - ``locations`` — per-location config (name, type, faction, connections, initial resources/army, port). - ``rules`` — map-wide game rules (army costs and currency per faction, wall multiplier, tick interval, hold-to-win ticks, passive growth intervals). 
The active map id is stored at runtime in the shared ``game_state.db`` in the ``game_config`` key-value table (written by ``war_map`` on ``/select_map``). Both ``location_server`` and ``war_map`` read it to resolve per-service state. """ from __future__ import annotations DATABASE_FILE = "game_state.db" DEFAULT_MAP_ID = "war_of_kingdoms" # Each of the 8 location containers has a fixed SLOT_ID env var # (slot_1 .. slot_8). Its in-game identity is resolved through the active # map's slot_assignments table, so the same container can serve "village_1" on # War of Kingdoms and "wall_west" on White Walkers Attack. SLOT_IDS = tuple(f"slot_{i}" for i in range(1, 9)) MAPS = { "war_of_kingdoms": { "display_name": "War of Kingdoms", "description": ( "Northern and Southern kingdoms clash for dominance. " "Capture the enemy capital to win." ), "single_player": False, "factions": ["northern", "southern"], "slot_assignments": { "slot_1": "southern_capital", "slot_2": "northern_capital", "slot_3": "village_1", "slot_4": "village_2", "slot_5": "village_3", "slot_6": "village_4", "slot_7": "village_5", "slot_8": "village_6", }, "locations": { "southern_capital": { "name": "Southern Capital", "type": "capital", "faction": "southern", "connections": ["village_1", "village_3"], "initial_resources": 100, "initial_army": 1, "port": 5001, }, "northern_capital": { "name": "Northern Capital", "type": "capital", "faction": "northern", "connections": ["village_2", "village_6"], "initial_resources": 100, "initial_army": 1, "port": 5002, }, "village_1": { "name": "Village 1", "type": "village", "faction": "neutral", "connections": ["southern_capital", "village_2", "village_4"], "initial_resources": 50, "initial_army": 2, "port": 5003, }, "village_2": { "name": "Village 2", "type": "village", "faction": "neutral", "connections": ["northern_capital", "village_1", "village_5"], "initial_resources": 50, "initial_army": 3, "port": 5004, }, "village_3": { "name": "Village 3", "type": "village", 
"faction": "neutral", "connections": ["southern_capital", "village_5", "village_6"], "initial_resources": 50, "initial_army": 2, "port": 5005, }, "village_4": { "name": "Village 4", "type": "village", "faction": "neutral", "connections": ["village_1", "village_5"], "initial_resources": 50, "initial_army": 1, "port": 5006, }, "village_5": { "name": "Village 5", "type": "village", "faction": "neutral", "connections": ["village_2", "village_3", "village_4", "village_6"], "initial_resources": 50, "initial_army": 4, "port": 5007, }, "village_6": { "name": "Village 6", "type": "village", "faction": "neutral", "connections": ["northern_capital", "village_3", "village_5"], "initial_resources": 50, "initial_army": 2, "port": 5008, }, }, "rules": { "resource_generation": {"capital": 20, "village": 10}, "army_cost": {"default": 30}, "army_currency": {"default": "resources"}, "wall_multiplier": 1.0, "barbarian_army_growth_interval_s": 0, "white_walker_passive_corpse_interval_s": 0, "tick_interval_s": 0, "win_hold_ticks": 0, }, }, "white_walkers_attack": { "display_name": "White Walkers Attack", "description": ( "The Long Night has come. As the Night's Watch, hold every Wall " "keep for 5 ticks (150 s) before the White Walkers do. Single-player." 
), "single_player": True, "player_faction": "nights_watch", "ai_faction": "white_walkers", "factions": ["nights_watch", "white_walkers", "barbarian"], "slot_assignments": { "slot_1": "nights_watch_fortress", "slot_2": "white_walker_fortress", "slot_3": "wall_west", "slot_4": "wall_center_west", "slot_5": "wall_center_east", "slot_6": "wall_east", "slot_7": "barbarian_village_west", "slot_8": "barbarian_village_east", }, "locations": { "nights_watch_fortress": { "name": "Castle Black", "type": "capital", "faction": "nights_watch", "connections": [ "wall_west", "wall_center_west", "wall_center_east", "wall_east", ], "initial_resources": 150, "initial_army": 3, "port": 5001, }, "white_walker_fortress": { "name": "The Lands of Always Winter", "type": "capital", "faction": "white_walkers", "connections": [ "wall_west", "wall_center_west", "wall_center_east", "wall_east", ], # White Walkers spend corpses, not resources. Keep the column # populated so the DB row shape stays uniform; the create_army # handler reads currency from the map rules. 
"initial_resources": 0, "initial_army": 2, "port": 5002, }, "wall_west": { "name": "Westwatch", "type": "wall", "faction": "neutral", "connections": [ "nights_watch_fortress", "white_walker_fortress", "wall_center_west", "barbarian_village_west", ], "initial_resources": 0, "initial_army": 1, "port": 5003, }, "wall_center_west": { "name": "Queensgate", "type": "wall", "faction": "neutral", "connections": [ "nights_watch_fortress", "white_walker_fortress", "wall_west", "wall_center_east", ], "initial_resources": 0, "initial_army": 1, "port": 5004, }, "wall_center_east": { "name": "Deep Lake", "type": "wall", "faction": "neutral", "connections": [ "nights_watch_fortress", "white_walker_fortress", "wall_center_west", "wall_east", ], "initial_resources": 0, "initial_army": 1, "port": 5005, }, "wall_east": { "name": "Eastwatch-by-the-Sea", "type": "wall", "faction": "neutral", "connections": [ "nights_watch_fortress", "white_walker_fortress", "wall_center_east", "barbarian_village_east", ], "initial_resources": 0, "initial_army": 1, "port": 5006, }, "barbarian_village_west": { "name": "Free Folk Camp (West)", "type": "village", "faction": "barbarian", "connections": ["wall_west"], "initial_resources": 0, "initial_army": 2, "port": 5007, }, "barbarian_village_east": { "name": "Free Folk Camp (East)", "type": "village", "faction": "barbarian", "connections": ["wall_east"], "initial_resources": 0, "initial_army": 2, "port": 5008, }, }, "rules": { # Night's Watch capital collects resources on the classic schedule. # White Walker fortress ignores resource_generation (uses corpses). 
"resource_generation": {"capital": 20, "village": 10}, "army_cost": { "default": 30, "white_walkers": 5, }, "army_currency": { "default": "resources", "white_walkers": "corpses", }, "wall_multiplier": 2.0, "barbarian_army_growth_interval_s": 30, "white_walker_passive_corpse_interval_s": 15, # WWA gives the Night's Watch no friendly villages, so its only # income source is /collect_resources at Castle Black. Add a slow # passive trickle so the resource HUD ticks up without click-spam. # Keep it well below the click rate (+20 per 5 s) — passive should # supplement, not replace, active play. "nights_watch_capital_passive_amount": 5, "nights_watch_capital_passive_interval_s": 10, "tick_interval_s": 30, "win_hold_ticks": 5, }, }, } # Backward-compat exports: unchanged shape for callers that don't know about # maps yet. These always reflect the War of Kingdoms defaults. LOCATIONS = MAPS[DEFAULT_MAP_ID]["locations"] RESOURCE_GENERATION = MAPS[DEFAULT_MAP_ID]["rules"]["resource_generation"] COSTS = {"create_army": MAPS[DEFAULT_MAP_ID]["rules"]["army_cost"]["default"]} def get_map(map_id): """Return the full map-config dict for ``map_id``.""" if map_id not in MAPS: raise KeyError(f"Unknown map_id: {map_id}") return MAPS[map_id] def resolve_slot(map_id, slot_id): """Return the location_id the given slot serves on the given map.""" return MAPS[map_id]["slot_assignments"][slot_id] def get_location_config(map_id, location_id): """Return the per-location config dict for (map_id, location_id).""" return MAPS[map_id]["locations"][location_id] def get_rules(map_id): """Return the ``rules`` dict for ``map_id``.""" return MAPS[map_id]["rules"] def get_army_cost(map_id, faction): """Return the army-creation cost for ``faction`` on ``map_id``.""" costs = MAPS[map_id]["rules"]["army_cost"] return costs.get(faction, costs["default"]) def get_army_currency(map_id, faction): """Return ``"resources"`` or ``"corpses"`` for ``faction`` on ``map_id``.""" currencies = 
def locations_by_type(map_id, type_name):
    """Collect the location ids on ``map_id`` whose ``type`` equals ``type_name``.

    Ids come back in the order the map's ``locations`` dict yields them; an
    empty list means no location of that type exists on the map.
    """
    matching_ids = []
    for location_id, settings in MAPS[map_id]["locations"].items():
        if settings["type"] == type_name:
            matching_ids.append(location_id)
    return matching_ids
raw = slot_or_location or os.environ.get('SLOT_ID') if raw in SLOT_IDS: self.slot_id = raw elif raw in MAPS[DEFAULT_MAP_ID]["locations"]: # Legacy: caller passed a War of Kingdoms location id; resolve to # its slot via the reverse map. inverse = {v: k for k, v in MAPS[DEFAULT_MAP_ID]["slot_assignments"].items()} self.slot_id = inverse[raw] else: raise ValueError( f"Cannot determine SLOT_ID from {raw!r}; expected one of {SLOT_IDS} " f"or a War of Kingdoms location id." ) self.app = Flask(__name__) self.last_resource_collection = {} self.resource_cooldown = {} self.lock = Lock() # SERVICE_NAME must stay stable across map switches so Grafana # dashboards keep their series. Prefer the explicit LOCATION_NAME env # (matches container name in docker-compose); else synthesise from the # slot id. service_name = os.environ.get('LOCATION_NAME') or self.slot_id.replace('_', '-') self.telemetry = GameTelemetry(service_name=service_name) self.logger = self.telemetry.get_logger() self.tracer = self.telemetry.get_tracer() # Give telemetry access to location state self.telemetry._get_location_state = self._get_location_state # And access to faction-scoped economy (for the corpse gauge). self.telemetry._get_corpse_count = self._get_corpses self.db_path = os.environ.get('DATABASE_FILE', DATABASE_FILE) # Populated by _load_identity(). 
self.map_id = DEFAULT_MAP_ID self.location_id = None self.location_info = None self._passive_thread_started = False self._barbarian_thread_started = False self._corpse_thread_started = False self._nw_capital_thread_started = False self._initialize_database() self._load_identity() self.setup_routes() atexit.register(self.telemetry.shutdown) # ---------------------------------------------------------------- # Map / slot identity resolution # ---------------------------------------------------------------- def _current_locations(self) -> Dict: """Return the active map's ``location_id → config`` dict.""" return MAPS[self.map_id]["locations"] def _current_rules(self) -> Dict: return MAPS[self.map_id]["rules"] def _read_active_map_id(self) -> str: conn = self._get_db_connection() try: row = conn.execute( "SELECT value FROM game_config WHERE key = 'active_map_id'" ).fetchone() finally: conn.close() return row['value'] if row else DEFAULT_MAP_ID def _load_identity(self): """Resolve slot → (map, location_id, config); seed this slot's row.""" self.map_id = self._read_active_map_id() self.location_id = resolve_slot(self.map_id, self.slot_id) self.location_info = get_location_config(self.map_id, self.location_id) # Publish live identity to the telemetry instance so the observable # gauges report the currently-served id, not whatever id was derived # from the container's SERVICE_NAME at boot. self.telemetry._location_id = self.location_id self.telemetry._location_type = self.location_info["type"] # Seed this slot's row in the locations table if missing. Idempotent: # INSERT OR IGNORE handles the case where war_map already re-seeded. 
conn = self._get_db_connection() try: conn.execute( "INSERT OR IGNORE INTO locations (id, resources, army, faction) VALUES (?, ?, ?, ?)", ( self.location_id, self.location_info["initial_resources"], self.location_info["initial_army"], self.location_info["faction"], ), ) conn.commit() finally: conn.close() self._start_passive_threads_if_needed() self.logger.info( f"Identity loaded: slot={self.slot_id} map={self.map_id} " f"location_id={self.location_id} type={self.location_info['type']} " f"faction={self.location_info['faction']}" ) def _start_passive_threads_if_needed(self): """Kick off whichever passive loop matches this slot's identity. Threads are started at most once per process lifetime. If a slot's identity changes through ``/reload``, the *old* thread keeps running but becomes a no-op because it guards each iteration against the current location type/faction. """ loc_type = self.location_info["type"] faction = self.location_info["faction"] rules = self._current_rules() # Launch the village resource thread for *every* village, including # barbarian-faction slots (Free Folk camps). The thread guards each # iteration on ``faction != "barbarian"``, so it stays a no-op while # the camp is still barbarian and starts producing for the player # the moment they capture it. Without this fallthrough, captured # camps stay unproductive because the thread was never started. 
if loc_type == "village" and not self._passive_thread_started: self._start_passive_generation() self._passive_thread_started = True if faction == "barbarian" and not self._barbarian_thread_started: interval = rules.get("barbarian_army_growth_interval_s", 0) or 0 if interval > 0: self._start_barbarian_growth(interval) self._barbarian_thread_started = True if ( loc_type == "capital" and faction == "white_walkers" and not self._corpse_thread_started ): interval = rules.get("white_walker_passive_corpse_interval_s", 0) or 0 if interval > 0: self._start_white_walker_corpse_tick(interval) self._corpse_thread_started = True if ( loc_type == "capital" and faction == "nights_watch" and not self._nw_capital_thread_started ): interval = rules.get("nights_watch_capital_passive_interval_s", 0) or 0 amount = rules.get("nights_watch_capital_passive_amount", 0) or 0 if interval > 0 and amount > 0: self._start_nights_watch_capital_resource_tick(interval, amount) self._nw_capital_thread_started = True # ---------------------------------------------------------------- # Corpse economy (faction-scoped; lives in faction_economy table) # ---------------------------------------------------------------- def _get_corpses(self, faction: str = "white_walkers") -> int: conn = self._get_db_connection() try: row = conn.execute( "SELECT corpses FROM faction_economy WHERE faction = ?", (faction,) ).fetchone() finally: conn.close() return int(row['corpses']) if row else 0 def _add_corpses(self, delta: int, faction: str = "white_walkers"): if delta <= 0: return conn = self._get_db_connection() try: conn.execute( "INSERT INTO faction_economy (faction, corpses) VALUES (?, ?) " "ON CONFLICT(faction) DO UPDATE SET corpses = corpses + excluded.corpses", (faction, delta), ) conn.commit() finally: conn.close() def _spend_corpses(self, amount: int, faction: str = "white_walkers") -> bool: """Atomically decrement ``faction``'s corpse pool. 
Returns True on success.""" conn = self._get_db_connection() try: cursor = conn.execute( "UPDATE faction_economy SET corpses = corpses - ? " "WHERE faction = ? AND corpses >= ?", (amount, faction, amount), ) conn.commit() return cursor.rowcount > 0 finally: conn.close() def _find_capital(self, faction: str) -> Optional[str]: """Return the location_id of the capital with the given faction in the active map, by static config.""" for loc_id, cfg in self._current_locations().items(): if cfg["type"] == "capital" and cfg["faction"] == faction: return loc_id return None def _find_enemy_capital(self, faction: str) -> Optional[str]: """Return the location_id of a capital not belonging to ``faction`` (and not barbarian), by static config.""" for loc_id, cfg in self._current_locations().items(): if cfg["type"] == "capital" and cfg["faction"] not in (faction, "barbarian"): return loc_id return None def _get_db_connection(self): # ``timeout`` applies before the first PRAGMA runs, so concurrent # boot of all 8 containers doesn't race on ``PRAGMA journal_mode=WAL`` # (which briefly acquires an exclusive lock to switch modes). conn = sqlite3.connect(self.db_path, timeout=15) conn.execute("PRAGMA busy_timeout=5000") conn.execute("PRAGMA journal_mode=WAL") conn.row_factory = sqlite3.Row return conn def _initialize_database(self): conn = self._get_db_connection() cursor = conn.cursor() # Canonical per-location state. cursor.execute(''' CREATE TABLE IF NOT EXISTS locations ( id TEXT PRIMARY KEY, resources INTEGER NOT NULL, army INTEGER NOT NULL, faction TEXT NOT NULL ) ''') # Key/value game-wide config; holds active_map_id (authoritative at # runtime; overrides whatever the process started with). 
cursor.execute(''' CREATE TABLE IF NOT EXISTS game_config ( key TEXT PRIMARY KEY, value TEXT NOT NULL ) ''') cursor.execute( "INSERT OR IGNORE INTO game_config (key, value) VALUES ('active_map_id', ?)", (DEFAULT_MAP_ID,), ) # Faction-scoped economy (White Walkers' corpse pool today; room for # additional faction-level currencies later). cursor.execute(''' CREATE TABLE IF NOT EXISTS faction_economy ( faction TEXT PRIMARY KEY, corpses INTEGER NOT NULL DEFAULT 0 ) ''') conn.commit() conn.close() def _get_location_state(self, location_id): conn = self._get_db_connection() cursor = conn.cursor() cursor.execute("SELECT * FROM locations WHERE id = ?", (location_id,)) row = cursor.fetchone() state = None if row: state = { "resources": row['resources'], "army": row['army'], "faction": row['faction'] } conn.close() return state def _update_location_state(self, location_id, resources=None, army=None, faction=None): set_clauses = [] params = [] if resources is not None: set_clauses.append("resources = ?") params.append(resources) if army is not None: set_clauses.append("army = ?") params.append(army) if faction is not None: set_clauses.append("faction = ?") params.append(faction) if not set_clauses: return False params.append(location_id) conn = self._get_db_connection() cursor = conn.cursor() cursor.execute( f"UPDATE locations SET {', '.join(set_clauses)} WHERE id = ?", params ) conn.commit() conn.close() # Force metric collection on important state changes if faction is not None or resources is not None or army is not None: self.telemetry.collect_metrics() return True def _find_path(self, target: str, path_type: PathType) -> Optional[List[str]]: """Unified pathfinding for both resources and armies on the active map.""" locations = self._current_locations() location_state = self._get_location_state(self.location_id) faction = location_state["faction"] # Resource routing only makes sense for factions that have a resource # economy. 
``barbarian`` and ``white_walkers`` don't send resources. resource_factions = {"southern", "northern", "nights_watch"} if path_type == PathType.RESOURCE and faction not in resource_factions: return None distances = {loc: float('infinity') for loc in locations.keys()} distances[self.location_id] = 0 previous = {loc: None for loc in locations.keys()} unvisited = set(locations.keys()) def get_weight(loc_id: str) -> float: state = self._get_location_state(loc_id) loc_faction = state["faction"] if state else "neutral" if path_type == PathType.RESOURCE: if loc_faction == faction: return 1 elif loc_faction == "neutral": return 2 return float('infinity') else: # PathType.ATTACK if loc_faction == faction: return 1 elif loc_faction == "neutral": return 2 return 3 while unvisited: current = min(unvisited, key=lambda loc: distances[loc]) if current == target: break unvisited.remove(current) for neighbor in locations[current]["connections"]: if neighbor in unvisited: weight = get_weight(neighbor) distance = distances[current] + weight if distance < distances[neighbor]: distances[neighbor] = distance previous[neighbor] = current if previous[target] is None: return None path = [] current = target while current is not None: path.append(current) current = previous[current] return list(reversed(path)) def _handle_battle(self, attacking_army: int, attacking_faction: str, defending_army: int, defending_faction: str, location_type: Optional[str] = None) -> tuple[str, int, str]: """Handle battle between armies and return ``(result, remaining_army, new_faction)``. ``location_type`` lets the active map's rules modify the fight. For ``wall`` settlements on a map with ``wall_multiplier`` > 1 the defender's effective strength is scaled up — the physical garrison plays harder to dislodge, but the ``remaining_army`` reported back is converted back to physical units so DB rows stay honest. """ # Same faction = reinforcement. Multiplier never applies. 
if attacking_faction == defending_faction: self.logger.info(f"Reinforcement battle between {attacking_faction} armies") self.telemetry.record_battle(attacking_faction, defending_faction, "reinforcement") return "reinforcement", attacking_army + defending_army, attacking_faction multiplier = 1.0 if location_type == "wall": multiplier = float(self._current_rules().get("wall_multiplier", 1.0) or 1.0) effective_defender = int(defending_army * multiplier) if attacking_army > effective_defender: remaining = attacking_army - effective_defender self.logger.info( f"Attacker victory: {attacking_army} vs {defending_army} " f"(effective {effective_defender}, mult {multiplier}) -> {remaining}" ) self.telemetry.record_battle(attacking_faction, defending_faction, "attacker_victory") return "attacker_victory", remaining, attacking_faction elif effective_defender > attacking_army: # Convert defender's surviving *effective* strength back to physical. effective_remaining = effective_defender - attacking_army remaining = max(1, int(effective_remaining / multiplier)) if multiplier > 0 else effective_remaining self.logger.info( f"Defender victory: {defending_army} vs {attacking_army} " f"(effective {effective_defender}, mult {multiplier}) -> {remaining}" ) self.telemetry.record_battle(attacking_faction, defending_faction, "defender_victory") return "defender_victory", remaining, defending_faction else: self.logger.info( f"Stalemate: {attacking_army} vs {defending_army} " f"(effective {effective_defender}, mult {multiplier})" ) self.telemetry.record_battle(attacking_faction, defending_faction, "stalemate") return "stalemate", 0, defending_faction def _continue_army_movement(self, army_size: int, faction: str, current_loc: str, next_loc: str, remaining_path: List[str], is_attack_move: bool = False) -> Dict: """Continue army movement to next location.""" # Capture the full context before spawning the thread ctx = get_current() def move(): token = attach(ctx) try: time.sleep(5) # Wait 5 
seconds before moving with self.tracer.start_as_current_span( "army_movement", kind=SpanKind.SERVER, attributes={ "source_location": current_loc, "target_location": next_loc, "army_size": army_size, "is_attack_move": is_attack_move } ) as movement_span: target_url = f"{self.get_location_url(next_loc)}/receive_army" self.logger.info(f"Moving army from {current_loc} to {next_loc}") result = self._make_request_with_trace( 'post', target_url, { "army_size": army_size, "faction": faction, "source_location": current_loc, "remaining_path": remaining_path, "is_attack_move": is_attack_move }, span_name="http_request.move_army" ) if not result.get("success", False): movement_span.set_status(trace.StatusCode.ERROR, "Army movement failed") movement_span.set_attribute("error", result.get("message", "Unknown error")) self.logger.error(f"Army movement failed: {result.get('message', 'Unknown error')}") else: # Force metric collection after successful army movement self.telemetry.collect_metrics() except Exception as e: self.logger.error(f"Failed to move army to {next_loc}: {str(e)}") raise finally: detach(token) # Start movement in background thread Thread(target=move).start() # Force metric collection at the start of movement self.telemetry.collect_metrics() # Return immediate response indicating movement has started return { "success": True, "message": f"Army movement started from {current_loc} to {next_loc}", "is_attack_move": is_attack_move } def _transfer_resources_along_path(self, resources: int, path: List[str]) -> bool: """Transfer resources along a path with delays.""" if not path or len(path) < 2: return False # Capture the full context before spawning the thread ctx = get_current() def transfer(): current_loc = path[0] next_loc = path[1] token = attach(ctx) try: time.sleep(5) # Wait before starting transfer with self.tracer.start_as_current_span( "resource_movement", kind=SpanKind.SERVER, attributes={ "source_location": current_loc, "target_location": next_loc, 
"resources_amount": resources } ) as movement_span: target_url = f"{self.get_location_url(next_loc)}/receive_resources" result = self._make_request_with_trace( 'post', target_url, { "resources": resources, "source_location": current_loc, "remaining_path": path[1:], "faction": self._get_location_state(self.location_id)["faction"] }, span_name="http_request.transfer_resources" ) if result.get("success", False): current_loc_resources = self._get_location_state(current_loc)['resources'] self._update_location_state(current_loc, resources=current_loc_resources - resources) # Force metric collection after successful resource transfer self.telemetry.collect_metrics() else: movement_span.set_status(trace.StatusCode.ERROR, "Resource transfer failed") except Exception as e: self.logger.error(f"Failed to send resources to {next_loc} from {current_loc}: {str(e)}") finally: detach(token) Thread(target=transfer).start() return True def _make_request_with_trace(self, method: str, url: str, json_data: Optional[Dict] = None, span_name: str = "http_request") -> Dict: """Make HTTP request with trace context propagated in headers.""" headers = {"Content-Type": "application/json"} with self.tracer.start_as_current_span( span_name, kind=SpanKind.CLIENT, attributes={"http.url": url} ) as request_span: inject(headers) # This will now inject the current request_span's context try: if method.lower() == 'get': response = requests.get(url, headers=headers) elif method.lower() == 'post': response = requests.post(url, json=json_data, headers=headers) else: raise ValueError(f"Unsupported method: {method}") request_span.set_attribute("http.status_code", response.status_code) response.raise_for_status() return response.json() except requests.RequestException as e: request_span.set_status(trace.StatusCode.ERROR, str(e)) self.logger.error(f"Request failed: {str(e)}") raise def _can_collect_resources(self) -> tuple[bool, Optional[str], Optional[int]]: """Check if location can collect resources. 
def get_location_url(self, location_id):
    """Build the HTTP base URL of the service that serves ``location_id``.

    The port is taken from the active map when it knows the location and
    from the default map's config otherwise. The host is the name returned
    by ``_container_for`` when either ``IN_DOCKER`` or ``LOCATION_ID`` is
    set to a non-empty value in the environment, else ``localhost``.
    """
    active_locations = self._current_locations()
    # Only fall back to the default map when the active one lacks the id.
    port_source = (
        active_locations
        if location_id in active_locations
        else MAPS[DEFAULT_MAP_ID]["locations"]
    )
    port = port_source[location_id]["port"]

    running_in_container = os.environ.get('IN_DOCKER') or os.environ.get('LOCATION_ID')
    if not running_in_container:
        return f"http://localhost:{port}"

    docker_service_name = self._container_for(location_id)
    return f"http://{docker_service_name}:{port}"
We reverse-look up the slot that currently serves ``location_id`` on the active map, then translate that slot back to its container hostname using the WoK default slot assignments (which match docker-compose service names). """ active = MAPS[self.map_id]["slot_assignments"] wok = MAPS[DEFAULT_MAP_ID]["slot_assignments"] for slot, active_loc in active.items(): if active_loc == location_id: return wok[slot].replace('_', '-') # Unknown id — best-effort: use the hyphenated form. return location_id.replace('_', '-') def _start_passive_generation(self): def generate_resources(): while True: time.sleep(15) # Static identity guards against /reload moving this slot off # of a village type entirely. if self.location_info["type"] != "village": continue # Live-DB guard: gate on the *current* faction, not the # boot-time identity, so a captured Free Folk camp starts # producing for the new owner the moment its row flips. The # static ``self.location_info["faction"]`` is set at boot # from MAPS config and never updates on battle. location_state = self._get_location_state(self.location_id) if location_state is None: continue if location_state["faction"] == "barbarian": continue amount = self._current_rules()["resource_generation"]["village"] with self.tracer.start_as_current_span( "passive_resource_generation", attributes={ "location.id": self.location_id, "resources_gained": amount, "game.map.id": self.map_id, "owner.faction": location_state["faction"], } ): new_resources = location_state["resources"] + amount self._update_location_state(self.location_id, resources=new_resources) self.telemetry.collect_metrics() Thread(target=generate_resources, daemon=True).start() def _start_barbarian_growth(self, interval_s: int): """Barbarian villages grow +1 army every ``interval_s`` seconds. Barbarians never initiate combat; they exist to pressure the map and feed the White Walker corpse economy. 
The thread self-gates against identity changes so it becomes a no-op if /reload moves this slot off a barbarian role. """ def grow(): while True: time.sleep(interval_s) if self.location_info["faction"] != "barbarian": continue with self.tracer.start_as_current_span( "barbarian_passive_growth", attributes={ "location.id": self.location_id, "game.map.id": self.map_id, "army_gained": 1, } ): state = self._get_location_state(self.location_id) if state is None: continue # Only grow while still barbarian-controlled. if state["faction"] != "barbarian": continue self._update_location_state(self.location_id, army=state["army"] + 1) self.telemetry.collect_metrics() Thread(target=grow, daemon=True).start() def _start_nights_watch_capital_resource_tick(self, interval_s: int, amount: int): """Passive resource generation at the Night's Watch capital (WWA only). WWA gives the player no friendly villages, so /collect_resources at Castle Black is the only income source — leading to click-spam UX. A slow passive tick supplements that without removing the incentive to actively collect (manual is +20 per 5 s; passive is +amount per interval_s, configured well below that). """ def tick(): while True: time.sleep(interval_s) if (self.location_info["faction"] != "nights_watch" or self.location_info["type"] != "capital"): continue with self.tracer.start_as_current_span( "nights_watch_passive_resource", attributes={ "location.id": self.location_id, "game.map.id": self.map_id, "resources_gained": amount, } ): state = self._get_location_state(self.location_id) if state is None: continue if state["faction"] != "nights_watch": continue self._update_location_state( self.location_id, resources=state["resources"] + amount ) self.telemetry.collect_metrics() Thread(target=tick, daemon=True).start() def _start_white_walker_corpse_tick(self, interval_s: int): """Passive corpse generation at the White Walker fortress. 
def reset_database(self):
    """Reset every location row and the corpse pool to the active map's initial state.

    All rows in ``locations`` are replaced by one row per location of the
    active map, seeded from its ``initial_resources`` / ``initial_army`` /
    ``faction`` config, and ``faction_economy`` is emptied. Everything is
    committed as one transaction.

    Raises:
        KeyError: If a location config lacks one of the seed fields. This
            is raised before the database is touched.
        Whatever the database driver raises. The connection is closed in
        that case too, so a failed reset does not leave a handle with an
        open write transaction behind; the uncommitted changes are
        discarded.
    """
    # Build the seed rows first so a malformed map config fails fast,
    # before any row has been deleted.
    seed_rows = [
        (
            loc_id,
            loc_info["initial_resources"],
            loc_info["initial_army"],
            loc_info["faction"],
        )
        for loc_id, loc_info in self._current_locations().items()
    ]

    conn = self._get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute("DELETE FROM locations")
        # Name the columns explicitly so the insert does not depend on the
        # table's physical column order.
        cursor.executemany(
            "INSERT INTO locations (id, resources, army, faction) VALUES (?, ?, ?, ?)",
            seed_rows,
        )
        cursor.execute("DELETE FROM faction_economy")
        conn.commit()
    finally:
        conn.close()

    self.logger.info(f"Database reset to initial state for map {self.map_id}")
"faction": location_state["faction"], "connections": self.location_info["connections"], "resources": location_state["resources"], "army": location_state["army"], "resource_cooldown": cooldown_info }) @self.app.route('/health', methods=['GET']) def health(): return jsonify({"status": "ok"}) @self.app.route('/collect_resources', methods=['POST']) def collect_resources(): """Collect resources from a location""" # Extract trace context from request headers context = extract(request.headers) with self.tracer.start_as_current_span( "collect_resources", context=context, kind=SpanKind.SERVER, attributes={ "location_name": self.location_info["name"], "location_type": self.location_info["type"] } ) as span: can_collect, message, cooldown_seconds = self._can_collect_resources() if not can_collect: span.set_status(trace.StatusCode.ERROR, message) span.set_attribute("cooldown_seconds", cooldown_seconds or 0) return jsonify({ "success": False, "message": message, "cooldown": True, "cooldown_seconds": cooldown_seconds }), 200 # Return 200 for cooldown, as it's an expected state location_type = self.location_info["type"] resources_gained = self._current_rules()["resource_generation"].get(location_type, 0) location_state = self._get_location_state(self.location_id) new_resources = location_state["resources"] + resources_gained self._update_location_state(self.location_id, resources=new_resources) span.set_attribute("resources_gained", resources_gained) span.set_attribute("new_resources_total", new_resources) with self.lock: self.last_resource_collection[self.location_id] = datetime.now() # Force metric collection after resource update self.telemetry.collect_metrics() return jsonify({ "success": True, "message": f"Collected {resources_gained} resources", "current_resources": new_resources, "cooldown": False }) @self.app.route('/create_army', methods=['POST']) def create_army(): # Extract trace context from request headers context = extract(request.headers) with 
self.tracer.start_as_current_span( "create_army", context=context, kind=SpanKind.SERVER, attributes={ "location_name": self.location_info["name"], "location_type": self.location_info["type"], "game.map.id": self.map_id, } ) as span: if self.location_info["type"] != "capital": span.set_status(trace.StatusCode.ERROR, "Only capitals can create armies") return jsonify({ "success": False, "message": "Only capitals can create armies" }), 403 location_state = self._get_location_state(self.location_id) current_resources = location_state["resources"] current_army = location_state["army"] faction = location_state["faction"] currency = get_army_currency(self.map_id, faction) cost = get_army_cost(self.map_id, faction) span.set_attribute("current_resources", current_resources) span.set_attribute("current_army", current_army) span.set_attribute("army_cost", cost) span.set_attribute("army_currency", currency) span.set_attribute("faction", faction) if currency == "corpses": # White Walkers spend corpses from the faction pool, not # resources from the location. if not self._spend_corpses(cost, faction): available = self._get_corpses(faction) span.set_status(trace.StatusCode.ERROR, "Insufficient corpses") return jsonify({ "success": False, "message": f"Not enough corpses. Need {cost}, have {available}" }), 400 new_resources = current_resources new_army = current_army + 1 self._update_location_state(self.location_id, army=new_army) span.set_attribute("game.corpses.spent", cost) span.set_attribute("corpses_remaining", self._get_corpses(faction)) else: if current_resources < cost: span.set_status(trace.StatusCode.ERROR, "Insufficient resources") return jsonify({ "success": False, "message": f"Not enough resources. 
Need {cost}, have {current_resources}" }), 400 new_resources = current_resources - cost new_army = current_army + 1 self._update_location_state( self.location_id, resources=new_resources, army=new_army ) span.set_attribute("new_resources", new_resources) span.set_attribute("new_army", new_army) self.telemetry.collect_metrics() return jsonify({ "success": True, "message": "Army created", "current_army": new_army, "current_resources": new_resources, "currency": currency, }) @self.app.route('/move_army', methods=['POST']) def move_army(): # Extract trace context from request headers context = extract(request.headers) with self.tracer.start_as_current_span( "move_army_request", context=context, kind=SpanKind.SERVER, attributes={ "location_name": self.location_info["name"], "location_type": self.location_info["type"] } ) as move_span: data = request.get_json() if not data or 'target_location' not in data: move_span.set_status(trace.StatusCode.ERROR, "Target location not specified") return jsonify({"success": False, "message": "Target location not specified"}), 400 target_location = data['target_location'] remaining_path = data.get('remaining_path', []) is_attack_move = data.get('is_attack_move', False) move_span.set_attribute("target_location", target_location) move_span.set_attribute("is_attack_move", is_attack_move) if target_location not in self.location_info["connections"]: move_span.set_status(trace.StatusCode.ERROR, f"Cannot move to {target_location}") return jsonify({ "success": False, "message": f"Cannot move to {target_location}. 
@self.app.route('/all_out_attack', methods=['POST'])
def all_out_attack():
    """Launch an all-out attack from this capital towards the enemy capital.

    Validates, in order: this location is a capital, it has an army, an
    enemy capital exists on the active map, and an attack path with at
    least one hop exists. Only after every check passes is the local
    garrison set to 0 and handed to ``_continue_army_movement``.

    Returns:
        JSON ``{"success": True, "message", "path", "army_size"}`` on
        success; otherwise ``{"success": False, "message"}`` with HTTP 403
        (not a capital) or 400 (any other validation or dispatch failure).

    Raises:
        Any exception from the helpers is recorded on the span and
        re-raised.
    """
    context = extract(request.headers)
    with self.tracer.start_as_current_span(
        "all_out_attack",
        context=context,
        kind=SpanKind.SERVER,
        attributes={
            "location_name": self.location_info["name"],
            "location_type": self.location_info["type"]
        }
    ) as attack_span:
        try:
            if self.location_info["type"] != "capital":
                attack_span.set_status(trace.StatusCode.ERROR, "Only capitals can launch all-out attacks")
                return jsonify({
                    "success": False,
                    "message": "Only capitals can launch all-out attacks"
                }), 403

            location_state = self._get_location_state(self.location_id)
            army_size = location_state["army"]
            faction = location_state["faction"]

            if army_size <= 0:
                attack_span.set_status(trace.StatusCode.ERROR, "No army available for attack")
                return jsonify({
                    "success": False,
                    "message": "No army available for attack"
                }), 400

            # Determine enemy capital based on the active map's config.
            target_capital = self._find_enemy_capital(faction)
            if not target_capital:
                attack_span.set_status(trace.StatusCode.ERROR, "No enemy capital on this map")
                return jsonify({
                    "success": False,
                    "message": "No enemy capital to attack on this map"
                }), 400

            attack_span.set_attribute("target_capital", target_capital)

            attack_path = self._find_path(target_capital, PathType.ATTACK)
            if not attack_path:
                attack_span.set_status(trace.StatusCode.ERROR, "No valid path to enemy capital")
                return jsonify({
                    "success": False,
                    "message": "No valid path to enemy capital"
                }), 400

            attack_span.set_attribute("attack_path", str(attack_path))
            attack_span.set_attribute("initial_army_size", army_size)

            # Reject a path without a next hop BEFORE touching state.
            # Previously the garrison was zeroed first, so this branch
            # returned an error while silently destroying the army.
            if len(attack_path) <= 1:
                attack_span.set_status(trace.StatusCode.ERROR, "Invalid attack path")
                return jsonify({
                    "success": False,
                    "message": "Invalid attack path"
                }), 400

            # Set army to 0 before starting the attack
            self._update_location_state(self.location_id, army=0)

            next_loc = attack_path[1]
            result = self._continue_army_movement(
                army_size,
                faction,
                self.location_id,
                next_loc,
                attack_path[1:],
                is_attack_move=True
            )

            if not result.get("success", False):
                # If movement fails, restore the army
                self._update_location_state(self.location_id, army=army_size)
                attack_span.set_status(trace.StatusCode.ERROR, "Failed to start attack")
                return jsonify({
                    "success": False,
                    "message": f"Failed to start attack: {result.get('message', 'Unknown error')}"
                }), 400

            return jsonify({
                "success": True,
                "message": f"All-out attack started with {army_size} troops",
                "path": attack_path,
                "army_size": army_size
            })

        except Exception as e:
            attack_span.record_exception(e)
            attack_span.set_status(trace.StatusCode.ERROR, str(e))
            raise
jsonify({"success": False, "message": "Invalid army data"}), 400 context = extract(request.headers) with self.tracer.start_as_current_span( "receive_army", context=context, kind=SpanKind.SERVER, attributes={ "location_name": self.location_info["name"], "location_type": self.location_info["type"] } ) as battle_span: attacking_army = data['army_size'] attacking_faction = data['faction'] source_location = data.get('source_location', 'unknown') remaining_path = data.get('remaining_path', []) is_attack_move = data.get('is_attack_move', False) location_state = self._get_location_state(self.location_id) defending_army = location_state["army"] defending_faction = location_state["faction"] battle_span.set_attribute("source_location", source_location) battle_span.set_attribute("attacking_army", attacking_army) battle_span.set_attribute("defending_army", defending_army) battle_span.set_attribute("remaining_path", str(remaining_path)) battle_span.set_attribute("is_attack_move", is_attack_move) self.logger.info(f"Received army at {self.location_id}: {data}") self.logger.info(f"Remaining path: {remaining_path}, is_attack_move: {is_attack_move}") if attacking_faction == defending_faction: # For all-out attacks, combine armies with friendly villages if is_attack_move and self.location_info["type"] == "village": # Add village's army to the attacking force attacking_army += defending_army # Set village's army to 0 self._update_location_state(self.location_id, army=0) battle_span.set_attribute("combined_army_size", attacking_army) self.logger.info(f"Combined armies at {self.location_id}: {attacking_army} (village army was {defending_army})") # Continue movement if there's a path remaining if is_attack_move and remaining_path: next_location = remaining_path[0] new_remaining_path = remaining_path[1:] if len(remaining_path) > 1 else [] self.logger.info(f"Continuing attack from {self.location_id} to {next_location}, new path: {new_remaining_path}") result = self._continue_army_movement( 
attacking_army, # Use the potentially increased army size attacking_faction, self.location_id, next_location, new_remaining_path, is_attack_move ) battle_span.set_attribute("result", "friendly_passage") self.logger.info(f"Friendly passage result: {result}") # Force metric collection after friendly passage self.telemetry.collect_metrics() return jsonify(result) elif not is_attack_move: # Normal army movement - combine armies new_army = defending_army + attacking_army self._update_location_state(self.location_id, army=new_army) battle_span.set_attribute("result", "armies_combined") self.logger.info(f"Armies combined at {self.location_info['name']}: {new_army}") # Force metric collection after combining armies self.telemetry.collect_metrics() return jsonify({ "success": True, "message": f"Armies combined at {self.location_info['name']}", "current_army": new_army, "faction": defending_faction }) else: # All-out attack reached friendly location with no remaining path # This shouldn't normally happen, but handle it gracefully if self.location_info["type"] == "capital": # If it's our own capital, stop here self._update_location_state(self.location_id, army=attacking_army) battle_span.set_attribute("result", "returned_to_capital") self.logger.warning(f"All-out attack returned to own capital with {attacking_army} troops") else: # For villages, the army should already be zeroed out above battle_span.set_attribute("result", "attack_ended_at_village") self.logger.warning(f"All-out attack ended at friendly village {self.location_id}") self.telemetry.collect_metrics() return jsonify({ "success": True, "message": f"Army movement ended at {self.location_info['name']}", "current_army": self._get_location_state(self.location_id)["army"], "faction": defending_faction }) battle_result, remaining_army, new_faction = self._handle_battle( attacking_army, attacking_faction, defending_army, defending_faction, location_type=self.location_info["type"], ) # Corpse harvesting: the White 
Walkers reap from any battle # they win (either as attacker or defender). Corpses equal # the total physical units that died on both sides. if new_faction == "white_walkers": dead = max(0, attacking_army + defending_army - remaining_army) if dead > 0: self._add_corpses(dead, "white_walkers") battle_span.set_attribute("game.corpses.harvested", dead) battle_span.set_attribute("corpse.source", "battle") self._update_location_state( self.location_id, army=remaining_army, faction=new_faction ) battle_span.set_attribute("result", battle_result) battle_span.set_attribute("remaining_army", remaining_army) battle_span.set_attribute("game.map.id", self.map_id) if self.location_info["type"] == "wall": battle_span.set_attribute("game.wall.held", new_faction != "neutral") battle_span.set_attribute("span.wall.battle", True) if battle_result == "attacker_victory" and is_attack_move and remaining_path: self.logger.info(f"Continuing army movement at {self.location_id}: {remaining_army}") self.logger.info(f"Battle victory - continuing to {remaining_path[0]}, path: {remaining_path[1:]}") result = self._continue_army_movement( remaining_army, attacking_faction, self.location_id, remaining_path[0], remaining_path[1:] if len(remaining_path) > 1 else [], is_attack_move ) return jsonify(result) if battle_result != "attacker_victory": self.logger.warning(f"Battle result: {battle_result}") battle_span.add_event("battle_result", attributes={ "outcome": battle_result, "attacker_faction": attacking_faction, "defender_faction": defending_faction, "remaining_army": remaining_army, }) # Force metric collection after battle resolution self.telemetry.collect_metrics() return jsonify({ "success": battle_result == "attacker_victory", "message": f"Battle at {self.location_info['name']}: {battle_result}", "current_army": remaining_army, "faction": new_faction }) except Exception as e: self.logger.error(f"Error in receive_army: {str(e)}") return jsonify({"success": False, "message": f"Error: 
@self.app.route('/send_resources_to_capital', methods=['POST'])
def send_resources_to_capital():
    """Send this village's current resources towards its faction's capital.

    The request is rejected when this location is not a village (403),
    when the owning faction is not one of the resource-using factions
    (403), when no friendly capital or no path to it exists (400), or
    when the transfer helper reports failure (500). Any exception is
    recorded on the span and reported as a 500 JSON error.
    """
    # Continue the caller's trace using the incoming request headers.
    parent_ctx = extract(request.headers)
    span_attrs = {
        "location_name": self.location_info["name"],
        "location_type": self.location_info["type"]
    }
    with self.tracer.start_as_current_span(
        "send_resources_to_capital",
        context=parent_ctx,
        kind=SpanKind.SERVER,
        attributes=span_attrs
    ) as send_span:

        def reject(status_text, client_text, http_code, log_text=None):
            # Mark the span failed, optionally log, then build the error reply.
            send_span.set_status(trace.StatusCode.ERROR, status_text)
            if log_text is not None:
                self.logger.error(log_text)
            return jsonify({
                "success": False,
                "message": client_text
            }), http_code

        try:
            state = self._get_location_state(self.location_id)
            amount = state["resources"]
            faction = state["faction"]

            send_span.set_attribute("resources_amount", amount)
            send_span.set_attribute("faction", faction)

            if self.location_info["type"] != "village":
                return reject(
                    "Only villages can send resources",
                    "Only villages can send resources to capital",
                    403,
                    log_text="Only villages can send resources to capital",
                )

            if faction not in {"southern", "northern", "nights_watch"}:
                return reject(
                    "Faction has no resource economy",
                    "This faction does not send resources",
                    403,
                    log_text=f"Faction {faction!r} has no resource economy; cannot send to capital",
                )

            # Target this faction's capital on the active map.
            capital = self._find_capital(faction)
            if not capital:
                return reject(
                    "No friendly capital on this map",
                    "No friendly capital to send resources to",
                    400,
                )

            route = self._find_path(capital, PathType.RESOURCE)
            if not route:
                return reject(
                    "No valid path to capital",
                    "No valid path to capital found",
                    400,
                    log_text="No valid path to capital found",
                )

            send_span.set_attribute("path_to_capital", str(route))

            if not self._transfer_resources_along_path(amount, route):
                return reject(
                    "Failed to start resource transfer",
                    "Failed to start resource transfer",
                    500,
                    log_text="Failed to start resource transfer",
                )

            self._start_resource_cooldown()
            self.logger.info(f"Resources sent to capital via {route}")
            # Force metric collection after initiating resource transfer
            self.telemetry.collect_metrics()
            return jsonify({
                "success": True,
                "message": f"Sending {amount} resources to capital via {' -> '.join(route)}",
                "path": route,
                "amount": amount
            })

        except Exception as e:
            send_span.record_exception(e)
            return reject(
                str(e),
                f"Error: {str(e)}",
                500,
                log_text=f"Error in send_resources_to_capital: {str(e)}",
            )
data or 'faction' not in data: return jsonify({"success": False, "message": "Invalid resource data"}), 400 context = extract(request.headers) with self.tracer.start_as_current_span( "receive_resources", context=context, attributes={ "location": self.location_id, "location_type": self.location_info["type"], "sending_faction": data['faction'], "receiving_faction": self._get_location_state(self.location_id)["faction"], "resources_amount": data['resources'] } ) as transfer_span: incoming_resources = data['resources'] source_location = data.get('source_location', 'unknown') remaining_path = data.get('remaining_path', []) faction = data['faction'] transfer_span.set_attribute("source_location", source_location) location_state = self._get_location_state(self.location_id) current_resources = location_state["resources"] current_faction = location_state["faction"] if current_faction != faction: transfer_span.set_status(trace.Status(trace.StatusCode.ERROR, f"Resources captured by {current_faction}")) self._update_location_state(self.location_id, resources=current_resources + incoming_resources) # Force metric collection after resource capture self.telemetry.collect_metrics() self.logger.error(f"Resources captured by {current_faction}") return jsonify({ "success": False, "message": f"Resources captured by {current_faction}!", "current_resources": current_resources + incoming_resources }) new_resources = current_resources + incoming_resources self._update_location_state(self.location_id, resources=new_resources) # Force metric collection after receiving resources self.telemetry.collect_metrics() self.logger.info(f"Resources updated to {new_resources}") if len(remaining_path) > 1: next_loc = remaining_path[1] def continue_transfer(): with self._start_movement_trace( "resource_movement", self.location_id, next_loc, resources=incoming_resources ) as movement_span: try: time.sleep(5) target_url = f"{self.get_location_url(next_loc)}/receive_resources" self.logger.info(f"Sending 
def run(self):
    """Serve this location's Flask app on its configured port.

    Reads the port from ``self.location_info["port"]``, logs it, and then
    starts the server bound to all interfaces. The call does not return
    until the server stops.
    """
    port = self.location_info["port"]
    # Flask's app.run() runs the server in the foreground, so the startup
    # message must be logged first; logged afterwards it would only appear
    # once the server had already shut down.
    self.logger.info(f"Location server running on port {port}")
    self.app.run(host='0.0.0.0', port=port)
LocationServer().run() ================================================ FILE: game-of-tracing/app/requirements.txt ================================================ flask==3.1.3 requests==2.33.1 opentelemetry-api==1.41.1 opentelemetry-sdk==1.41.1 opentelemetry-exporter-otlp==1.41.1 pyroscope-io==1.0.6 pyroscope-otel==1.0.0 ================================================ FILE: game-of-tracing/app/run_game.py ================================================ import os import sys import json import sqlite3 import argparse import multiprocessing from game_config import LOCATIONS, DATABASE_FILE from location_server import LocationServer def reset_game(): """Reset the database to initial state""" db_path = os.environ.get('DATABASE_FILE', DATABASE_FILE) if os.path.exists(db_path): # Connect to database and reset it conn = sqlite3.connect(db_path) cursor = conn.cursor() # Delete all data cursor.execute("DELETE FROM locations") # Reinitialize locations for loc_id, loc_info in LOCATIONS.items(): cursor.execute( "INSERT INTO locations VALUES (?, ?, ?, ?)", ( loc_id, loc_info["initial_resources"], loc_info["initial_army"], loc_info["faction"] ) ) conn.commit() conn.close() print(f"Game reset successfully. Database {db_path} reset to initial state.") else: print("Database not found. 
def show_game_state():
    """Print the current per-location game state stored in the SQLite database.

    The database path comes from the ``DATABASE_FILE`` environment variable,
    falling back to the ``DATABASE_FILE`` constant. Prints a notice and
    returns when the file is missing, the table is empty, or SQLite reports
    an error. The connection is always closed before returning.
    """
    db_path = os.environ.get('DATABASE_FILE', DATABASE_FILE)

    if not os.path.exists(db_path):
        print("Database not found. Starting a new game...")
        return

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM locations")
        rows = cursor.fetchall()

        if not rows:
            print("No game state found. Starting a new game...")
            return

        print("Current Game State:")
        for row in rows:
            loc_id = row['id']
            # The DB may hold ids that are not in LOCATIONS; fall back to
            # the raw id instead of raising KeyError.
            loc_name = LOCATIONS.get(loc_id, {}).get('name', loc_id)
            print(f"{loc_name} ({loc_id}): Faction={row['faction']}, Army={row['army']}, Resources={row['resources']}")
    except sqlite3.Error as e:
        print(f"Error accessing database: {e}")
        print("Starting a new game...")
    finally:
        # Close on every exit path (early return, error, or normal
        # completion) so the connection is never leaked.
        if conn is not None:
            conn.close()
class GameTelemetry:
    """OpenTelemetry logging, tracing and metrics plus Pyroscope profiling for one service.

    Constructing an instance configures all four signals and registers the
    logger, tracer and meter providers globally.

    The gauge callbacks read game state through attributes that are not
    defined in this class (``_get_location_state``, ``resource_cooldown``,
    ``_location_id``, ``_location_type``); each callback checks for them
    with ``hasattr``/``getattr`` and degrades gracefully when absent.
    """

    def __init__(self, service_name, logging_endpoint="http://alloy:4318", tracing_endpoint="http://alloy:4317", metrics_endpoint="http://alloy:4318"):
        """Configure logging, tracing, metrics and profiling for ``service_name``.

        Args:
            service_name: Value used for the OTel ``service.name`` resource
                attribute, the logger name, the Pyroscope application name
                and the ``location`` metric attribute.
            logging_endpoint: Base URL for the OTLP log exporter
                (``/v1/logs`` is appended).
            tracing_endpoint: Base URL for the OTLP span exporter
                (``/v1/traces`` is appended).
            metrics_endpoint: Base URL for the OTLP metric exporter
                (``/v1/metrics`` is appended).
        """
        self.service_name = service_name
        self.logging_endpoint = logging_endpoint
        self.tracing_endpoint = tracing_endpoint
        self.metrics_endpoint = metrics_endpoint
        self.resource = Resource.create(attributes={
            SERVICE_NAME: service_name
        })

        # Logging goes first: the metrics setup logs through self.logger.
        self._setup_logging()
        self._setup_tracing()
        self._setup_metrics()
        # Profiling adds a span processor to the tracer provider installed above.
        self._setup_profiling()

    def _setup_logging(self):
        """Configure OpenTelemetry logging and attach it to the root logger."""
        self.logger_provider = LoggerProvider(resource=self.resource)
        set_logger_provider(self.logger_provider)

        log_exporter = OTLPLogExporter(
            endpoint=f"{self.logging_endpoint}/v1/logs"
        )
        self.logger_provider.add_log_record_processor(
            BatchLogRecordProcessor(
                exporter=log_exporter,
                max_queue_size=30,
                max_export_batch_size=5
            )
        )

        # Setup root logger
        handler = LoggingHandler(
            level=logging.NOTSET,
            logger_provider=self.logger_provider
        )
        logging.getLogger().addHandler(handler)
        logging.getLogger().setLevel(logging.INFO)

        self.logger = logging.getLogger(self.service_name)

    def _setup_tracing(self):
        """Configure OpenTelemetry tracing and store a tracer on ``self.tracer``."""
        trace.set_tracer_provider(TracerProvider(resource=self.resource))

        # NOTE(review): this is the gRPC exporter, yet the endpoint carries
        # the OTLP/HTTP-style "/v1/traces" path — confirm the exporter
        # ignores the path component.
        otlp_exporter = OTLPSpanExporter(
            endpoint=f"{self.tracing_endpoint}/v1/traces",
            insecure=True
        )
        span_processor = BatchSpanProcessor(
            span_exporter=otlp_exporter,
            max_export_batch_size=1
        )
        trace.get_tracer_provider().add_span_processor(span_processor)

        self.tracer = trace.get_tracer(__name__)

    def _setup_profiling(self):
        """Configure Pyroscope profiling + OTel span-profile linkage.

        Pyroscope collects CPU samples from this process and pushes pprof to
        the configured server. ``PyroscopeSpanProcessor`` attaches the current
        profile id to every span so the trace view in Grafana can link back
        to the flamegraph captured while each span was active.

        The server address is read from ``PYROSCOPE_SERVER_ADDRESS`` and
        defaults to ``http://alloy:9999``.
        """
        pyroscope.configure(
            application_name=self.service_name,
            server_address=os.getenv("PYROSCOPE_SERVER_ADDRESS", "http://alloy:9999"),
            tags={"service_name": self.service_name},
            oncpu=True,
            gil_only=True,
        )
        trace.get_tracer_provider().add_span_processor(PyroscopeSpanProcessor())

    def _setup_metrics(self):
        """Configure OpenTelemetry metrics and create the game instruments."""
        # Create the metrics exporter
        self.metric_exporter = OTLPMetricExporter(
            endpoint=f"{self.metrics_endpoint}/v1/metrics"
        )

        # Set up periodic metric reader with manual collection capability
        self.metric_reader = PeriodicExportingMetricReader(
            self.metric_exporter,
            export_interval_millis=10000  # Export every 10 seconds
        )

        # Create and set meter provider with exemplar support
        self.meter_provider = MeterProvider(
            metric_readers=[self.metric_reader],
            resource=self.resource,
            exemplar_filter=TraceBasedExemplarFilter()
        )
        metrics.set_meter_provider(self.meter_provider)

        # Get meter for creating metrics
        self.meter = metrics.get_meter(__name__)

        # Create observable gauges for game metrics
        self._setup_game_gauges()

    def _setup_game_gauges(self):
        """Create the observable gauges and the battle counter."""
        # Resource gauge
        self.resource_gauge = self.meter.create_observable_gauge(
            name="game.resources",
            description="Current resources at location",
            callbacks=[self._observe_resources],
            unit="1"
        )

        # Army size gauge
        self.army_gauge = self.meter.create_observable_gauge(
            name="game.army_size",
            description="Current army size at location",
            callbacks=[self._observe_army_size],
            unit="1"
        )

        # Battle count counter
        self.battle_counter = self.meter.create_counter(
            name="game.battles",
            description="Number of battles fought",
            unit="1"
        )

        # Resource transfer cooldown gauge (seconds remaining)
        self.cooldown_gauge = self.meter.create_observable_gauge(
            name="game.resource_transfer_cooldown",
            description="Resource transfer cooldown status",
            callbacks=[self._observe_resource_cooldown],
            unit="s"
        )

        # Location control gauge
        self.control_gauge = self.meter.create_observable_gauge(
            name="game.location_control",
            description="Current faction controlling the location",
            callbacks=[self._observe_location_control],
            unit="1"
        )

        # Log that metrics have been set up
        self.logger.info("Game metrics initialized")

    # Faction → numeric value for the ``game.location_control`` gauge.
    # Existing WoK values (0/1/2) preserved for dashboard backward compat;
    # new factions appended with fresh values.
    _FACTION_VALUE = {
        "neutral": 0,
        "northern": 1,
        "southern": 2,
        "nights_watch": 3,
        "white_walkers": 4,
        "barbarian": 5,
    }

    def _active_location_id(self):
        """Return the currently served logical location id.

        ``LocationServer`` sets ``self._location_id`` on the telemetry
        instance at boot and refreshes it on ``/reload``. Fall back to the
        legacy ``service_name.replace('-', '_')`` pattern for non-slot
        deployments.
        """
        return getattr(self, "_location_id", None) or self.service_name.replace("-", "_")

    def _active_location_type(self):
        """Return ``self._location_type`` when set and truthy, else ``"village"``."""
        return getattr(self, "_location_type", None) or "village"

    def _observe_resources(self, options: CallbackOptions) -> Iterable[Observation]:
        """Gauge callback: yield the active location's current resources.

        Yields nothing when no ``_get_location_state`` is attached or it
        returns a falsy state. Exceptions are logged, never propagated.
        """
        try:
            location_id = self._active_location_id()
            if hasattr(self, '_get_location_state'):
                state = self._get_location_state(location_id)
                if state:
                    self.logger.debug(f"Observing resources for {location_id}: {state['resources']}")
                    yield Observation(
                        value=state["resources"],
                        attributes={
                            "location": self.service_name,
                            "location_type": self._active_location_type(),
                        }
                    )
        except Exception as e:
            self.logger.error(f"Error observing resources: {e}")

    def _observe_army_size(self, options: CallbackOptions) -> Iterable[Observation]:
        """Gauge callback: yield the active location's current army size.

        Yields nothing when no ``_get_location_state`` is attached or it
        returns a falsy state. Exceptions are logged, never propagated.
        """
        try:
            location_id = self._active_location_id()
            if hasattr(self, '_get_location_state'):
                state = self._get_location_state(location_id)
                if state:
                    self.logger.debug(f"Observing army size for {location_id}: {state['army']}")
                    yield Observation(
                        value=state["army"],
                        attributes={
                            "location": self.service_name,
                            "location_type": self._active_location_type(),
                            "faction": state["faction"],
                        }
                    )
        except Exception as e:
            self.logger.error(f"Error observing army size: {e}")

    def _observe_resource_cooldown(self, options: CallbackOptions) -> Iterable[Observation]:
        """Gauge callback: yield the seconds left on the resource-transfer cooldown.

        Always yields exactly one observation labelled with
        ``location=self.service_name``: the remaining seconds while a
        cooldown for the active location lies in the future, otherwise 0.
        Using one label value for both cases keeps the gauge a single time
        series, and reporting 0 lets it drop back once a cooldown ends.
        Exceptions are logged, never propagated.
        """
        try:
            from datetime import datetime
            location_id = self._active_location_id()
            remaining = 0
            if hasattr(self, 'resource_cooldown') and location_id in self.resource_cooldown:
                cooldown = self.resource_cooldown[location_id]
                now = datetime.now()
                if cooldown > now:
                    remaining = (cooldown - now).total_seconds()
                    self.logger.debug(f"Observing cooldown for {location_id}: {remaining}s")
            yield Observation(
                value=remaining,
                attributes={"location": self.service_name}
            )
        except Exception as e:
            self.logger.error(f"Error observing resource cooldown: {e}")

    def _observe_location_control(self, options: CallbackOptions) -> Iterable[Observation]:
        """Gauge callback: yield the numeric code of the controlling faction.

        The code comes from ``_FACTION_VALUE``; unknown factions map to -1.
        Yields nothing when no ``_get_location_state`` is attached or it
        returns a falsy state. Exceptions are logged, never propagated.
        """
        try:
            location_id = self._active_location_id()
            if hasattr(self, '_get_location_state'):
                state = self._get_location_state(location_id)
                if state:
                    faction_value = self._FACTION_VALUE.get(state["faction"], -1)
                    self.logger.debug(
                        f"Observing control for {location_id}: {state['faction']} ({faction_value})"
                    )
                    yield Observation(
                        value=faction_value,
                        attributes={
                            "location": self.service_name,
                            "location_type": self._active_location_type(),
                            "faction": state["faction"],
                        }
                    )
        except Exception as e:
            self.logger.error(f"Error observing location control: {e}")

    def get_tracer(self):
        """Get the configured tracer"""
        return self.tracer

    def get_logger(self):
        """Get the configured logger"""
        return self.logger

    def get_meter(self):
        """Get the configured meter"""
        return self.meter

    def record_battle(self, attacker_faction: str, defender_faction: str, result: str):
        """Increment the battle counter and force a metrics export.

        Args:
            attacker_faction: Faction label of the attacking side.
            defender_faction: Faction label of the defending side.
            result: Outcome label stored in the ``result`` attribute.

        Errors are logged, never propagated.
        """
        try:
            self.battle_counter.add(
                1,
                {
                    "attacker_faction": attacker_faction,
                    "defender_faction": defender_faction,
                    "result": result,
                    "location": self.service_name
                }
            )
            self.logger.info(f"Battle recorded: {attacker_faction} vs {defender_faction} - {result}")

            # Force collection of all metrics
            self.collect_metrics()
        except Exception as e:
            self.logger.error(f"Error recording battle: {e}")

    def collect_metrics(self):
        """Force collection and export of all metrics; errors are logged only."""
        try:
            # Collect metrics immediately
            self.metric_reader.collect()

            # Force flush to ensure metrics are exported
            self.meter_provider.force_flush()

            self.logger.debug("Metrics collected and flushed")
        except Exception as e:
            self.logger.error(f"Error collecting metrics: {e}")

    def shutdown(self):
        """Flush and shutdown all telemetry providers.

        Each provider is shut down independently and failures are
        deliberately ignored, so one failing provider does not stop the
        others from being shut down.
        """
        try:
            trace.get_tracer_provider().shutdown()
        except Exception:
            pass  # best-effort: keep shutting down the remaining providers
        try:
            self.meter_provider.shutdown()
        except Exception:
            pass  # best-effort: keep shutting down the remaining providers
        try:
            self.logger_provider.shutdown()
        except Exception:
            pass  # best-effort: nothing left to report to at this point
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: batch: {} exporters: otlp/tempo: endpoint: tempo:4317 tls: insecure: true otlphttp/logs: endpoint: http://loki:3100/otlp otlphttp/metrics: endpoint: http://prometheus:9090/api/v1/otlp service: pipelines: traces: receivers: [otlp] processors: [batch] exporters: [otlp/tempo] logs: receivers: [otlp] processors: [batch] exporters: [otlphttp/logs] metrics: receivers: [otlp] processors: [batch] exporters: [otlphttp/metrics] ================================================ FILE: game-of-tracing/config.alloy ================================================ /* * Alloy Configuration for OpenTelemetry Trace, Log, and Metric Collection (batched and forwarded unsampled) */ // Receive OpenTelemetry traces, logs, and metrics over OTLP (HTTP and gRPC) otelcol.receiver.otlp "default" { http {} grpc {} output { metrics = [otelcol.processor.batch.default.input] logs = [otelcol.processor.batch.default.input] traces = [otelcol.processor.batch.default.input] } } // Batch processor to improve performance otelcol.processor.batch "default" { output { traces = [otelcol.exporter.otlp.tempo.input] logs = [otelcol.exporter.otlphttp.logs.input] metrics = [otelcol.exporter.otlphttp.metrics.input] } } // Send all batched traces to Tempo (no sampling is applied in this pipeline) otelcol.exporter.otlp "tempo" { client { endpoint = "tempo:4317" tls { insecure = true } } } otelcol.exporter.otlphttp "logs" { client { endpoint = "http://loki:3100/otlp" } } otelcol.exporter.otlphttp "metrics" { client { endpoint = "http://prometheus:9090/api/v1/otlp" } } // Receive pprof profiles from the Python services and forward to Pyroscope.
pyroscope.receive_http "default" { http { listen_address = "0.0.0.0" listen_port = 9999 } forward_to = [pyroscope.write.default.receiver] } pyroscope.write "default" { endpoint { url = "http://pyroscope:4040" } } livedebugging { enabled = true } ================================================ FILE: game-of-tracing/docker-compose-otel.yml ================================================ # OTel Engine Override # # Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config # instead of the River/HCL config.alloy file. # # Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # Stop: docker compose -f docker-compose.yml -f docker-compose-otel.yml down # services: alloy: command: otel --config=/etc/alloy/config-otel.yaml volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel Engine HTTP server ================================================ FILE: game-of-tracing/docker-compose.coda.yml ================================================ services: # Southern Capital southern-capital: build: context: ./app dockerfile: Dockerfile ports: - "5001:5001" environment: - LOCATION_ID=southern_capital - FLASK_APP=location_server.py - LOCATION_NAME=southern-capital - DATABASE_FILE=/data/game_state.db volumes: - game-data:/data command: ["python", "-c", "from location_server import LocationServer; server = LocationServer('southern_capital'); server.run()"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s # Northern Capital northern-capital: build: context: ./app dockerfile: Dockerfile ports: - "5002:5002" environment: - LOCATION_ID=northern_capital - FLASK_APP=location_server.py - LOCATION_NAME=northern-capital - DATABASE_FILE=/data/game_state.db volumes: - game-data:/data command: ["python", "-c", "from location_server import LocationServer; server = 
LocationServer('northern_capital'); server.run()"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s # Village 1 village-1: build: context: ./app dockerfile: Dockerfile ports: - "5003:5003" environment: - LOCATION_ID=village_1 - FLASK_APP=location_server.py - LOCATION_NAME=village-1 - DATABASE_FILE=/data/game_state.db volumes: - game-data:/data command: ["python", "-c", "from location_server import LocationServer; server = LocationServer('village_1'); server.run()"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5003/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s # Village 2 village-2: build: context: ./app dockerfile: Dockerfile ports: - "5004:5004" environment: - LOCATION_ID=village_2 - FLASK_APP=location_server.py - LOCATION_NAME=village-2 - DATABASE_FILE=/data/game_state.db volumes: - game-data:/data command: ["python", "-c", "from location_server import LocationServer; server = LocationServer('village_2'); server.run()"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5004/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s # Village 3 village-3: build: context: ./app dockerfile: Dockerfile ports: - "5005:5005" environment: - LOCATION_ID=village_3 - FLASK_APP=location_server.py - LOCATION_NAME=village-3 - DATABASE_FILE=/data/game_state.db volumes: - game-data:/data command: ["python", "-c", "from location_server import LocationServer; server = LocationServer('village_3'); server.run()"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5005/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s # Village 4 village-4: build: context: ./app dockerfile: Dockerfile ports: - "5006:5006" environment: - LOCATION_ID=village_4 
- FLASK_APP=location_server.py - LOCATION_NAME=village-4 - DATABASE_FILE=/data/game_state.db volumes: - game-data:/data command: ["python", "-c", "from location_server import LocationServer; server = LocationServer('village_4'); server.run()"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5006/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s # Village 5 village-5: build: context: ./app dockerfile: Dockerfile ports: - "5007:5007" environment: - LOCATION_ID=village_5 - FLASK_APP=location_server.py - LOCATION_NAME=village-5 - DATABASE_FILE=/data/game_state.db volumes: - game-data:/data command: ["python", "-c", "from location_server import LocationServer; server = LocationServer('village_5'); server.run()"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5007/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s # Village 6 village-6: build: context: ./app dockerfile: Dockerfile ports: - "5008:5008" environment: - LOCATION_ID=village_6 - FLASK_APP=location_server.py - LOCATION_NAME=village-6 - DATABASE_FILE=/data/game_state.db volumes: - game-data:/data command: ["python", "-c", "from location_server import LocationServer; server = LocationServer('village_6'); server.run()"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5008/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s # Web-based war map application war-map: build: context: ./war_map dockerfile: Dockerfile ports: - "8080:8080" environment: - DATABASE_FILE=/data/game_state.db - LOCATION_NAME=war-map - SECRET_KEY=war_of_westeros_secret_key - IN_DOCKER=1 - AI_URL=http://ai-opponent:8081 - TEMPO_URL=http://tempo:3200 volumes: - game-data:/data healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"] interval: 5s timeout: 3s 
retries: 5 start_period: 10s depends_on: southern-capital: condition: service_healthy northern-capital: condition: service_healthy village-1: condition: service_healthy village-2: condition: service_healthy village-3: condition: service_healthy village-4: condition: service_healthy village-5: condition: service_healthy village-6: condition: service_healthy # AI Opponent Service ai-opponent: build: context: ./ai_opponent dockerfile: Dockerfile ports: - "8081:8081" environment: - IN_DOCKER=1 - LOCATION_NAME=ai-opponent healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: southern-capital: condition: service_healthy northern-capital: condition: service_healthy village-1: condition: service_healthy village-2: condition: service_healthy village-3: condition: service_healthy village-4: condition: service_healthy village-5: condition: service_healthy village-6: condition: service_healthy volumes: game-data: ================================================ FILE: game-of-tracing/docker-compose.yml ================================================ version: '3.8' services: loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - "3100:3100" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml # Prometheus for metrics collection prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --web.enable-otlp-receiver - --enable-feature=native-histograms - --enable-feature=exemplar-storage - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml # Tempo for tracing tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200/tcp # tempo volumes: - ./tempo-config.yaml:/etc/tempo.yaml depends_on: - 
prometheus # Pyroscope v2 for continuous profiling pyroscope: image: grafana/pyroscope:${GRAFANA_PYROSCOPE_VERSION:-2.0.1} ports: - "4040:4040" command: - "-config.file=/etc/pyroscope/config.yaml" - "-architecture.storage=v1-v2-dual" volumes: - ./pyroscope-config.yaml:/etc/pyroscope/config.yaml - pyroscope-data:/data # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_SECURITY_ALLOW_EMBEDDING=true - GF_SECURITY_DISABLE_SANITIZE_HTML=true - GF_FEATURE_TOGGLES_ENABLE=dashboardNewLayouts,kubernetesDashboards,provisioning - GF_PATHS_PERMITTED_PROVISIONING_PATHS=grafana/|/etc/grafana/provisioning/dashboards/ - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app volumes: - ./grafana:/etc/grafana/provisioning ports: - 3000:3000/tcp depends_on: - prometheus - tempo - pyroscope # Alloy for telemetry pipeline and tail sampling alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 # Alloy HTTP server - 4317:4317/tcp # OTLP gRPC - 4318:4318/tcp # OTLP HTTP - 9999:9999/tcp # Pyroscope HTTP receiver volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - pyroscope # Game of Kingdoms War Game Services # Southern Capital southern-capital: build: context: ./app dockerfile: Dockerfile ports: - "5001:5001" environment: - SLOT_ID=slot_1 - LOCATION_ID=southern_capital - FLASK_APP=location_server.py - LOCATION_NAME=southern-capital - DATABASE_FILE=/data/game_state.db - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 - IN_DOCKER=1 volumes: - game-data:/data command: ["python", "location_server.py"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: - alloy # 
Northern Capital northern-capital: build: context: ./app dockerfile: Dockerfile ports: - "5002:5002" environment: - SLOT_ID=slot_2 - LOCATION_ID=northern_capital - FLASK_APP=location_server.py - LOCATION_NAME=northern-capital - DATABASE_FILE=/data/game_state.db - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 - IN_DOCKER=1 volumes: - game-data:/data command: ["python", "location_server.py"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: - alloy # Village 1 village-1: build: context: ./app dockerfile: Dockerfile ports: - "5003:5003" environment: - SLOT_ID=slot_3 - LOCATION_ID=village_1 - FLASK_APP=location_server.py - LOCATION_NAME=village-1 - DATABASE_FILE=/data/game_state.db - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 - IN_DOCKER=1 volumes: - game-data:/data command: ["python", "location_server.py"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5003/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: - alloy # Village 2 village-2: build: context: ./app dockerfile: Dockerfile ports: - "5004:5004" environment: - SLOT_ID=slot_4 - LOCATION_ID=village_2 - FLASK_APP=location_server.py - LOCATION_NAME=village-2 - DATABASE_FILE=/data/game_state.db - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 - IN_DOCKER=1 volumes: - game-data:/data command: ["python", "location_server.py"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5004/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: - alloy # Village 3 village-3: build: context: ./app dockerfile: Dockerfile ports: - "5005:5005" environment: - SLOT_ID=slot_5 - LOCATION_ID=village_3 - FLASK_APP=location_server.py - LOCATION_NAME=village-3 - DATABASE_FILE=/data/game_state.db - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 - 
IN_DOCKER=1 volumes: - game-data:/data command: ["python", "location_server.py"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5005/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: - alloy # Village 4 village-4: build: context: ./app dockerfile: Dockerfile ports: - "5006:5006" environment: - SLOT_ID=slot_6 - LOCATION_ID=village_4 - FLASK_APP=location_server.py - LOCATION_NAME=village-4 - DATABASE_FILE=/data/game_state.db - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 - IN_DOCKER=1 volumes: - game-data:/data command: ["python", "location_server.py"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5006/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: - alloy # Village 5 village-5: build: context: ./app dockerfile: Dockerfile ports: - "5007:5007" environment: - SLOT_ID=slot_7 - LOCATION_ID=village_5 - FLASK_APP=location_server.py - LOCATION_NAME=village-5 - DATABASE_FILE=/data/game_state.db - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 - IN_DOCKER=1 volumes: - game-data:/data command: ["python", "location_server.py"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5007/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: - alloy # Village 6 village-6: build: context: ./app dockerfile: Dockerfile ports: - "5008:5008" environment: - SLOT_ID=slot_8 - LOCATION_ID=village_6 - FLASK_APP=location_server.py - LOCATION_NAME=village-6 - DATABASE_FILE=/data/game_state.db - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 - IN_DOCKER=1 volumes: - game-data:/data command: ["python", "location_server.py"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5008/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: - alloy # Web-based war map application 
war-map: build: context: ./war_map dockerfile: Dockerfile ports: - "8080:8080" environment: - DATABASE_FILE=/data/game_state.db - GAME_SESSIONS_DB=/data/game_sessions.db - LOCATION_NAME=war-map - SECRET_KEY=war_of_westeros_secret_key - IN_DOCKER=1 - AI_URL=http://ai-opponent:8081 - TEMPO_URL=http://tempo:3200 - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 volumes: - game-data:/data healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: southern-capital: condition: service_healthy northern-capital: condition: service_healthy village-1: condition: service_healthy village-2: condition: service_healthy village-3: condition: service_healthy village-4: condition: service_healthy village-5: condition: service_healthy village-6: condition: service_healthy tempo: condition: service_started # AI Opponent Service ai-opponent: build: context: ./ai_opponent dockerfile: Dockerfile ports: - "8081:8081" environment: - IN_DOCKER=1 - LOCATION_NAME=ai-opponent - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999 healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')"] interval: 5s timeout: 3s retries: 5 start_period: 10s depends_on: southern-capital: condition: service_healthy northern-capital: condition: service_healthy village-1: condition: service_healthy village-2: condition: service_healthy village-3: condition: service_healthy village-4: condition: service_healthy village-5: condition: service_healthy village-6: condition: service_healthy alloy: condition: service_started volumes: game-data: pyroscope-data: ================================================ FILE: game-of-tracing/grafana/dashboards/War of Kingdoms-1747821967780.json ================================================ { "apiVersion": "dashboard.grafana.app/v2beta1", "kind": "Dashboard", "metadata": { "name": "game-dashboard" }, 
"spec": { "annotations": [ { "kind": "AnnotationQuery", "spec": { "builtIn": true, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "query": { "datasource": { "name": "-- Grafana --" }, "group": "grafana", "kind": "DataQuery", "spec": {}, "version": "v0" } } } ], "cursorSync": "Off", "description": "", "editable": true, "elements": { "panel-1": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "filters": [ { "id": "f905accb", "operator": "=", "scope": "span" }, { "id": "service-name", "operator": "=", "scope": "resource", "tag": "service.name", "value": [ "war_map" ], "valueType": "string" } ], "limit": 20, "metricsQueryType": "range", "queryType": "traceqlSearch", "tableType": "traces" }, "group": "tempo", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 1, "links": [], "title": "Player Decisions", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true }, "pluginVersion": "12.0.0" }, "group": "table", "version": "12.4.0" } } }, "panel-10": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "builder", "expr": "game_location_control_ratio{location=\"northern-capital\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, 
"useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 10, "links": [], "title": "Location Allegiance", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 2, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#370a4d", "value": 0 }, { "color": "super-light-blue", "value": 1 }, { "color": "dark-red", "value": 2 } ] } }, "overrides": [] }, "options": { "displayMode": "gradient", "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "pluginVersion": "12.0.0" }, "group": "bargauge", "version": "12.4.0" } } }, "panel-11": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "direction": "backward", "editorMode": "code", "expr": "{service_name=\"northern-capital\"} | code_function_name != \"_log\"", "queryType": "range" }, "group": "loki", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 11, "links": [], "title": "Location History", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": {}, "overrides": [] }, "options": { "dedupStrategy": "none", "enableInfiniteScrolling": false, "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": false, "showTime": false, "sortOrder": "Descending", "wrapLogMessage": false }, "pluginVersion": "12.0.0" }, "group": "logs", "version": "12.4.0" } } }, "panel-12": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", 
"spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "code", "expr": "sum(game_army_size_ratio{job=\"$villages\"}) without (faction)", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 12, "links": [], "title": "Current Army", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 }, { "color": "red", "value": 80 } ] }, "unit": "short" }, "overrides": [] }, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "12.0.0" }, "group": "stat", "version": "12.4.0" } } }, "panel-13": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "builder", "expr": "game_resources_ratio{job=\"$villages\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 13, "links": [], "title": "Current Resources", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 200, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "dark-red", "value": 0 }, { "color": "red", 
"value": 30 }, { "color": "#EAB839", "value": 50 }, { "color": "dark-green", "value": 100 } ] } }, "overrides": [] }, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "12.0.0" }, "group": "gauge", "version": "12.4.0" } } }, "panel-14": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "code", "expr": "sum(game_location_control_ratio{job=\"$villages\"}) without (faction)", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 14, "links": [], "title": "Location Allegiance", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 2, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#370a4d", "value": 0 }, { "color": "super-light-blue", "value": 1 }, { "color": "dark-red", "value": 2 } ] } }, "overrides": [] }, "options": { "displayMode": "gradient", "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "pluginVersion": "12.0.0" }, "group": "bargauge", "version": "12.4.0" } } }, "panel-15": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, 
"query": { "kind": "DataQuery", "spec": { "direction": "backward", "editorMode": "code", "expr": "{service_name=\"$villages\"} | code_function_name !=\"_log\"", "queryType": "range" }, "group": "loki", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 15, "links": [], "title": "Location History", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": {}, "overrides": [] }, "options": { "dedupStrategy": "none", "enableInfiniteScrolling": false, "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": false, "showTime": false, "sortOrder": "Descending", "wrapLogMessage": false }, "pluginVersion": "12.0.0" }, "group": "logs", "version": "12.4.0" } } }, "panel-16": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "filters": [ { "id": "e020e714", "operator": "=", "scope": "span" } ], "limit": 20, "metricsQueryType": "range", "queryType": "serviceMap", "serviceMapQuery": "{}", "tableType": "traces" }, "group": "tempo", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 16, "links": [], "title": "War Map", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": {}, "overrides": [] }, "options": { "edges": {}, "layoutAlgorithm": "layered", "nodes": {}, "zoomMode": "cooperative" }, "pluginVersion": "12.0.0" }, "group": "nodeGraph", "version": "12.4.0" } } }, "panel-17": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "filters": [ { "id": "9aa2da84", "operator": ">", "scope": "span", "tag": "army_size", "value": [ "3" ] } ], "limit": 20, "metricsQueryType": "range", "query": "{span.army_size>3}", "queryType": "traceqlSearch", 
"tableType": "traces" }, "group": "tempo", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 17, "links": [], "title": "Army Size Greater than 3", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true }, "pluginVersion": "12.0.0" }, "group": "table", "version": "12.4.0" } } }, "panel-18": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "builder", "expr": "game_army_size_ratio{location=\"southern-capital\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 18, "links": [], "title": "Current Army", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 10, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "dark-red", "value": 0 }, { "color": "red", "value": 3 }, { "color": "#EAB839", "value": 5 }, { "color": "dark-green", "value": 15 } ] } }, "overrides": [] }, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "12.0.0" }, "group": "gauge", "version": "12.4.0" } } }, 
"panel-19": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "builder", "exemplar": true, "expr": "game_battles_total", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 19, "links": [], "title": "Battles", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 17, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineStyle": { "fill": "solid" }, "lineWidth": 1, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "12.0.0" }, "group": "timeseries", "version": "12.4.0" } } }, "panel-2": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "code", "exemplar": true, "expr": "sum by (faction) 
(game_army_size_ratio)", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 2, "links": [], "title": "Current Army by Faction", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 29, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "12.0.0" }, "group": "timeseries", "version": "12.4.0" } } }, "panel-20": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "filters": [ { "id": "f905accb", "operator": "=", "scope": "span" }, { "id": "service-name", "operator": "=", "scope": "resource", "tag": "service.name", "value": [ "ai-opponent" ], "valueType": "string" } ], "limit": 20, "metricsQueryType": "range", "queryType": "traceqlSearch", "tableType": "traces" }, "group": "tempo", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, 
"transformations": [] } }, "description": "", "id": 20, "links": [], "title": "AI Decisions", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true }, "pluginVersion": "12.0.0" }, "group": "table", "version": "12.4.0" } } }, "panel-3": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "code", "exemplar": true, "expr": "game_resources_ratio{location_type=\"capital\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 3, "links": [], "title": "Current Resources (By Capital)", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "palette-classic-by-name" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 26, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps":
[ { "color": "green" }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "12.0.0" }, "group": "timeseries", "version": "12.4.0" } } }, "panel-4": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "direction": "backward", "editorMode": "code", "expr": "{service_name=\"southern-capital\"} | code_function_name != \"_log\"", "queryType": "range" }, "group": "loki", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 4, "links": [], "title": "Location History", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": {}, "overrides": [] }, "options": { "dedupStrategy": "none", "enableInfiniteScrolling": false, "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": false, "showTime": false, "sortOrder": "Descending", "wrapLogMessage": false }, "pluginVersion": "12.0.0" }, "group": "logs", "version": "12.4.0" } } }, "panel-5": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "builder", "exemplar": true, "expr": "game_army_size_ratio{location=\"southern-capital\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 5, "links": [], "title": "Current Army", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { 
"fixedColor": "dark-red", "mode": "fixed" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "scheme", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 3, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "options": { "legend": { "calcs": [], "displayMode": "hidden", "placement": "right", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "12.0.0" }, "group": "timeseries", "version": "12.4.0" } } }, "panel-6": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "builder", "expr": "game_resources_ratio{location=\"southern-capital\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 6, "links": [], "title": "Current Resources", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 200, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "dark-red", "value": 0 }, { "color": "red", "value": 30 }, { "color": "#EAB839", "value": 50 }, { "color": "dark-green", "value": 100 } ] } }, "overrides": [] }, "options": { 
"minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "12.0.0" }, "group": "gauge", "version": "12.4.0" } } }, "panel-7": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "builder", "expr": "game_location_control_ratio{location=\"southern-capital\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 7, "links": [], "title": "Location Allegiance", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 2, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#370a4d", "value": 0 }, { "color": "super-light-blue", "value": 1 }, { "color": "dark-red", "value": 2 } ] } }, "overrides": [] }, "options": { "displayMode": "gradient", "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "pluginVersion": "12.0.0" }, "group": "bargauge", "version": "12.4.0" } } }, "panel-8": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "builder", "expr": 
"game_army_size_ratio{location=\"northern-capital\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 8, "links": [], "title": "Current Army", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 10, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "dark-red", "value": 0 }, { "color": "red", "value": 3 }, { "color": "#EAB839", "value": 5 }, { "color": "dark-green", "value": 15 } ] } }, "overrides": [] }, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "12.0.0" }, "group": "gauge", "version": "12.4.0" } } }, "panel-9": { "kind": "Panel", "spec": { "data": { "kind": "QueryGroup", "spec": { "queries": [ { "kind": "PanelQuery", "spec": { "hidden": false, "query": { "kind": "DataQuery", "spec": { "disableTextWrap": false, "editorMode": "builder", "expr": "game_resources_ratio{location=\"northern-capital\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "useBackend": false }, "group": "prometheus", "version": "v0" }, "refId": "A" } } ], "queryOptions": {}, "transformations": [] } }, "description": "", "id": 9, "links": [], "title": "Current Resources", "vizConfig": { "kind": "VizConfig", "spec": { "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "max": 200, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "dark-red", "value": 0 }, { "color": "red", "value": 30 }, { "color": "#EAB839", "value": 50 }, { "color": "dark-green", "value": 100 } ] } }, "overrides": [] }, 
"options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "12.0.0" }, "group": "gauge", "version": "12.4.0" } } } }, "layout": { "kind": "TabsLayout", "spec": { "tabs": [ { "kind": "TabsLayoutTab", "spec": { "layout": { "kind": "RowsLayout", "spec": { "rows": [ { "kind": "RowsLayoutRow", "spec": { "collapse": false, "layout": { "kind": "GridLayout", "spec": { "items": [ { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-19" }, "height": 7, "width": 24, "x": 0, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" }, "height": 11, "width": 24, "x": 0, "y": 7 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" }, "height": 6, "width": 12, "x": 0, "y": 18 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" }, "height": 6, "width": 12, "x": 12, "y": 18 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-20" }, "height": 11, "width": 24, "x": 0, "y": 24 } } ] } }, "title": "Current Overview" } }, { "kind": "RowsLayoutRow", "spec": { "collapse": false, "layout": { "kind": "GridLayout", "spec": { "items": [ { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-16" }, "height": 11, "width": 24, "x": 0, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-17" }, "height": 10, "width": 24, "x": 0, "y": 11 } } ] } }, "title": "Trace Analytics" } } ] } }, "title": "War Map" } }, { "kind": "TabsLayoutTab", "spec": { "layout": { "kind": "GridLayout", "spec": { "items": [ { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": 
"panel-18" }, "height": 6, "width": 9, "x": 0, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" }, "height": 6, "width": 8, "x": 9, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" }, "height": 6, "width": 7, "x": 17, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" }, "height": 10, "width": 24, "x": 0, "y": 6 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" }, "height": 6, "width": 24, "x": 0, "y": 16 } } ] } }, "title": "Southern Kingdom" } }, { "kind": "TabsLayoutTab", "spec": { "layout": { "kind": "GridLayout", "spec": { "items": [ { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" }, "height": 6, "width": 9, "x": 0, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" }, "height": 6, "width": 8, "x": 9, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" }, "height": 6, "width": 7, "x": 17, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-11" }, "height": 10, "width": 24, "x": 0, "y": 6 } } ] } }, "title": "Northern Kingdom " } }, { "kind": "TabsLayoutTab", "spec": { "layout": { "kind": "GridLayout", "spec": { "items": [ { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-12" }, "height": 6, "width": 9, "x": 0, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-13" }, "height": 6, "width": 8, "x": 9, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-14" }, "height": 6, "width": 7, "x": 17, "y": 0 } }, { "kind": "GridLayoutItem", "spec": { "element": { "kind": 
"ElementReference", "name": "panel-15" }, "height": 10, "width": 24, "x": 0, "y": 6 } } ] } }, "title": "Villages" } } ] } }, "links": [], "liveNow": false, "preload": false, "tags": [], "timeSettings": { "autoRefresh": "", "autoRefreshIntervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "fiscalYearStartMonth": 0, "from": "now-30m", "hideTimepicker": false, "timezone": "browser", "to": "now" }, "title": "Game Dashboard", "variables": [ { "kind": "QueryVariable", "spec": { "hide": "dontHide", "includeAll": false, "label": "Villages", "multi": false, "name": "villages", "query": { "datasource": { "name": "prometheus" }, "group": "prometheus", "kind": "DataQuery", "spec": { "qryType": 1, "query": "label_values(game_resources_ratio,job)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "version": "v0" }, "refresh": "onDashboardLoad", "regex": "", "skipUrlSync": false, "sort": "alphabeticalAsc" } } ] } } ================================================ FILE: game-of-tracing/grafana/dashboards/dashboards.yaml ================================================ apiVersion: 1 providers: - name: 'game-of-tracing' orgId: 1 folder: '' type: file disableDeletion: true updateIntervalSeconds: 10 allowUiUpdates: false options: path: /etc/grafana/provisioning/dashboards foldersFromFilesStructure: false ================================================ FILE: game-of-tracing/grafana/datasources/defaults.yml ================================================ apiVersion: 1 datasources: - name: prometheus uid: prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: false version: 1 editable: false jsonData: exemplarTraceIdDestinations: - datasourceUid: "tempo" name: "trace_id" - name: tempo uid: tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: true version: 1 editable: false jsonData: serviceMap: datasourceUid: 'prometheus' nodeGraph: enabled: true tracesToLogsV2: 
datasourceUid: 'loki' filterBySpanID: true tracesToMetrics: datasourceUid: 'prometheus' tracesToProfilesV2: datasourceUid: 'pyroscope' tags: - key: 'service.name' value: 'service_name' profileTypeId: 'process_cpu:cpu:nanoseconds:cpu:nanoseconds' - name: loki uid: loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false jsonData: derivedFields: - datasourceUid: "tempo" matcherRegex: "trace_id" matcherType: "label" name: "trace_id" targetBlank: true url: "$${__value.raw}" urlDisplayLabel: "" - name: pyroscope uid: pyroscope type: grafana-pyroscope-datasource access: proxy orgId: 1 url: http://pyroscope:4040 basicAuth: false isDefault: false editable: false ================================================ FILE: game-of-tracing/loki-config.yaml ================================================ auth_enabled: false server: http_listen_port: 3100 grpc_listen_port: 9096 log_level: debug grpc_server_max_concurrent_streams: 1000 common: instance_addr: 127.0.0.1 path_prefix: /tmp/storage storage: filesystem: chunks_directory: /tmp/storage/chunks rules_directory: /tmp/storage/rules replication_factor: 1 ring: kvstore: store: inmemory query_range: results_cache: cache: embedded_cache: enabled: true max_size_mb: 100 limits_config: metric_aggregation_enabled: true schema_config: configs: - from: 2020-10-24 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h pattern_ingester: enabled: true metric_aggregation: loki_address: localhost:3100 ruler: alertmanager_url: http://localhost:9093 frontend: encoding: protobuf # By default, Loki will send anonymous, but uniquely-identifiable usage and configuration # analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/ # # Statistics help us better understand how Loki is used, and they show us performance # levels for most users. This helps us prioritize features and documentation. 
# For more information on what's sent, look at # https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go # Refer to the buildReport method to see what goes into a report. # # If you would like to disable reporting, uncomment the following lines: #analytics: # reporting_enabled: false ================================================ FILE: game-of-tracing/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s otlp: keep_identifying_resource_attributes: true ================================================ FILE: game-of-tracing/pyroscope-config.yaml ================================================ --- # Minimal Pyroscope v2 config for local single-binary demo. # v2 defaults (filesystem backend, v1-v2-dual storage) handle the rest. server: http_listen_port: 4040 storage: backend: filesystem filesystem: dir: /data ================================================ FILE: game-of-tracing/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info cache: background: writeback_goroutines: 5 caches: - roles: - frontend-search memcached: addresses: dns+memcached:11211 query_frontend: search: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 metadata_slo: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 trace_by_id: duration_slo: 100ms metrics: max_duration: 200h # maximum duration of a metrics query, increase for local setups query_backend_after: 5m duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 distributor: receivers: # this configuration will listen on all ports and protocols that tempo is capable of. jaeger: # the receives all come from the OpenTelemetry collector. 
more configuration information can protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver thrift_http: # endpoint: "tempo:14268" # for a production deployment you should only enable the receivers you need! grpc: endpoint: "tempo:14250" thrift_binary: endpoint: "tempo:6832" thrift_compact: endpoint: "tempo:6831" zipkin: endpoint: "tempo:9411" otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" opencensus: endpoint: "tempo:55678" ingester: max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally compactor: compaction: block_retention: 720h # overall Tempo trace retention. set for demo purposes metrics_generator: registry: external_labels: source: tempo cluster: docker-compose storage: path: /var/tempo/generator/wal remote_write: - url: http://prometheus:9090/api/v1/write send_exemplars: true traces_storage: path: /var/tempo/generator/traces processor: local_blocks: filter_server_spans: false flush_to_storage: true storage: trace: backend: local # backend configuration to use wal: path: /var/tempo/wal # where to store the wal locally local: path: /var/tempo/blocks overrides: defaults: metrics_generator: processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator generate_native_histograms: both ================================================ FILE: game-of-tracing/war_map/CLAUDE.md ================================================ # war_map/ — UI + Span-Link Broker > Flask web UI on port 8080, game session orchestrator, and **owner of the span-link reconstruction logic that drives game replay**. This doc is read by any AI coding agent. For scenario-wide context read [`../AGENTS.md`](../AGENTS.md) first. 
## Purpose `war-map` is the human-facing surface of the game and the coordination point for everything the player touches: - Hosts the **map picker** (`/map_picker` + `/select_map`) that lets the user choose between `war_of_kingdoms` and `white_walkers_attack`, then renders the faction selection (or single-player auto-start) for the chosen map. - Renders the interactive game map (territory ownership, army sizes, supply routes, wall-hold HUD for WWA). - Manages faction selection, sessions, and the human player's identity. - Is the **sole writer** of the `game_actions` SQLite table — the record of every action's trace/span IDs that makes span-link replay possible (rows carry a `map_id` column). - Activates / deactivates the AI opponent on behalf of the player (auto-activates as `white_walkers` when the chosen map is WWA). - Proxies trace-replay queries to Tempo and falls back to local SQLite when Tempo is unavailable. - Instruments player actions as `SERVER` spans with `trace.Link`s chaining each action to the previous one in the session. - Runs the **wall-hold tick thread** (`_wall_tick_thread`, 30 s cadence) that increments `wall_hold` when one faction owns every wall keep, and declares the WWA winner at 5 consecutive ticks. ## File map | File | Size | Purpose | |---|---|---| | `app.py` | ~64 KB | Flask app, session/player management, span-link broker, Tempo proxy for replay, AI activation control. | | `telemetry.py` | ~3 KB | `GameTelemetry` — traces + logs (no custom metrics), plus Pyroscope profiling with OTel span-profile linkage. | | `templates/index.html` | ~7 KB | Faction selection screen. | | `templates/map.html` | ~50 KB | Main SVG-based game map with real-time updates. | | `templates/layout.html` | ~4 KB | Shared layout chrome. | | `templates/replay.html` | ~6 KB | Replay session picker. | | `templates/replay_session.html` | ~28 KB | Per-session trace-replay UI — the consumer of the span-link chain. | | `static/css/style.css` | — | UI styling. 
| | `Dockerfile` | small | `python:3.11-slim`, runs `flask run --host=0.0.0.0 --port=8080`. | | `requirements.txt` | small | Flask 3.1.3, requests 2.33.1, python-dotenv 1.2.2, OpenTelemetry SDK/API + exporters, `pyroscope-io` + `pyroscope-otel` for profiling. | ## The span-link broker (the critical bit) ### Two SQLite databases — do not confuse | File | Owner | Purpose | |---|---|---| | `game_state.db` | All 8 location services (WAL mode, shared) | Canonical game state | | `game_sessions.db` | `war_map` **only** | `game_actions` table: `(game_session_id, action_sequence, action_type, player_name, faction, trace_id, span_id, location_id, target_location_id, timestamp, game_state_after, map_id)` | `game_actions` schema is defined in `init_game_session_tracking()` at `app.py:60-96`. It carries a `UNIQUE(game_session_id, action_sequence)` constraint — the sequence is what lets "next action" look up "previous action" deterministically. ### Storing an action — `store_game_action()` at `app.py:101-128` Called at the tail of every action handler. Reads the current max `action_sequence` for the session, inserts a new row with `next_sequence = max + 1`, returns the sequence number. Persists the active `map_id` (defaults to `get_active_map_id()` when callers don't pass one) so the replay UI can render the correct map layout for each session. ### Resolving a session's map — `get_session_map_id()` Used by `replay_session_page` to pick the right layout. Reads the first non-NULL `map_id` from the session's actions (cheap — sessions don't switch maps mid-play), falls back to the active map, then to `DEFAULT_MAP_ID`. Without this, the replay template renders the WoK layout regardless of which map was actually played. ### Reconstructing a previous span context — `get_previous_action_context()` at `app.py:130-170` Looks up `(trace_id, span_id)` for `(game_session_id, target_sequence)` in SQLite.
Converts the hex strings to integers with `int(result[0], 16)` / `int(result[1], 16)` (this step has bitten agents in the past — the IDs are stored as hex strings, not raw bytes). Constructs a `trace.SpanContext(trace_id=..., span_id=..., is_remote=True, trace_flags=trace.TraceFlags.SAMPLED)` and returns it. The `SAMPLED` flag is required — without it, downstream processors may drop the link. ### Creating a link — `create_span_link_from_context()` at `app.py:172-189` Wraps the reconstructed context in a `trace.Link(span_context, attributes={...})` with: - `link.type` — caller-supplied (default `"game_sequence"`; AI opponent uses `"ai_decision_trigger"` in its own code). - `link.relation` — always `"follows"`. - `game.sequence` — always `"true"` (enables Tempo tag search). ### Per-action flow inside a player-action handler ```python previous_span_context = get_previous_action_context(game_session_id, current_sequence) links = [create_span_link_from_context(previous_span_context, "game_sequence")] if previous_span_context else [] with tracer.start_as_current_span( "move_army", kind=SpanKind.SERVER, links=links, attributes={ "game.session.id": game_session_id, "game.action.sequence": current_sequence + 1, "span.player.action": True, "player.name": ..., "player.faction": ..., }, ) as span: # ... do the work, call location_api_request, etc. store_game_action( game_session_id, "move_army", ..., trace_id=format(span.get_span_context().trace_id, '032x'), span_id=format(span.get_span_context().span_id, '016x'), ... ) ``` The `format(..., '032x')` / `'016x'` pair is the inverse of the `int(..., 16)` step in `get_previous_action_context()` — always keep the two in sync. ## Replay endpoints The replay UI (`replay_session.html`) is backed by Tempo. `app.py` serves as the proxy and cleans up the responses. 
**Primary (Tempo):** - Discover sessions — `GET {TEMPO_URL}/api/v2/search/tag/game.session.id/values` - Pull a session's traces — `GET {TEMPO_URL}/api/search?q={game.session.id="<session_id>"}&limit=100` - Pull a specific trace — `GET {TEMPO_URL}/api/traces/<trace_id>` **Fallback (SQLite):** If Tempo returns an error or is unreachable, read the `game_actions` table directly. Replay renders a reduced view (without span payloads) but the session narrative is preserved. ## Environment | Var | Default | Purpose | |---|---|---| | `SECRET_KEY` | `war_of_westeros_secret_key` | Flask session secret | | `AI_URL` / `AI_SERVICE_URL` | `http://localhost:8081` | AI opponent base URL. Docker sets `http://ai-opponent:8081` | | `DATABASE_FILE` | `../app/game_state.db` | Shared game-state DB (read-only access from war_map) | | `GAME_SESSIONS_DB` | `game_sessions.db` | `game_actions` DB. Docker sets `/data/game_sessions.db` | | `API_BASE_URL` | `http://localhost` | Base URL for location server calls (host portion only; port comes from `LOCATION_PORTS`) | | `TEMPO_URL` | `http://tempo:3200` | Replay-query target | | `IN_DOCKER` | unset | Switches location URLs between `localhost:500X` and container DNS | Location ports are hard-coded in `LOCATION_PORTS` at `app.py:201-210`; mirror any change here in `app/game_config.py`. ## `X-Frame-Options` stripped — intentional `@app.after_request` at `app.py:191-194` removes `X-Frame-Options` from every response: ```python @app.after_request def remove_frame_options(response): response.headers.pop('X-Frame-Options', None) return response ``` This is deliberate — it lets the UI be embedded in Grafana iframes for the replay experience. Grafana's `GF_SECURITY_ALLOW_EMBEDDING=true` is the other half of this configuration. **Do not remove** unless you are also disabling Grafana embedding. ## Common edits **Add a new action type to the span-link chain.** 1.
Add the Flask handler in `app.py`, following the `move_army` / `create_army` pattern: look up previous context, build link, start a SERVER span with link + attributes, call `store_game_action()` at the tail. 2. Add a renderer case in `templates/replay_session.html` so the replay UI can visualize the new action. 3. Update the action-types table in [`../SPAN_LINKS.md`](../SPAN_LINKS.md). 4. Update this doc and [`../AGENTS.md`](../AGENTS.md) if the new action surfaces new span attributes. **Tune the replay query.** Edit the TraceQL strings in the replay endpoints (`app.py`). The `game.session.id` tag is required — Tempo uses it to group the session's traces. **Add attributes to every player-action link.** Edit `create_span_link_from_context()` at `app.py:172-189`. The current three (`link.type`, `link.relation`, `game.sequence`) are load-bearing — the replay UI reads them. **Change session-tracking schema.** Edit `init_game_session_tracking()` at `app.py:60-96`. Because the DB lives on a persistent Docker volume, a schema change requires either `docker compose down -v` before restart **or** a migration script. Flag to the user which one you recommend before changing columns. ## Keep this doc current Per the sub-agent rule, any change to span-link fields, replay endpoints, env vars, action types, or the line-number anchors above must land in the same work unit. Before returning a response that touched `war_map/`, grep this file for references to anything you changed. 
Particularly sensitive references: - `app.py:130-170` — `get_previous_action_context` - `app.py:172-189` — `create_span_link_from_context` - `app.py:60-96` — `init_game_session_tracking` - `app.py:101-128` — `store_game_action` - `app.py:191-194` — `X-Frame-Options` strip - `app.py:201-210` — `LOCATION_PORTS` dict ## Cross-references - [`../AGENTS.md`](../AGENTS.md) — scenario-wide architecture and patterns - [`../SPAN_LINKS.md`](../SPAN_LINKS.md) — full span-link design spec and replay flow - [`../app/CLAUDE.md`](../app/CLAUDE.md) — location-server HTTP API this service calls - [`../ai_opponent/CLAUDE.md`](../ai_opponent/CLAUDE.md) — AI service this one activates/deactivates ================================================ FILE: game-of-tracing/war_map/Dockerfile ================================================ FROM python:3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt COPY . . 
# Per-map metadata keyed by map id. The whole dict is handed to the
# map_picker.html template by the /map_picker route, and the entry for the
# session's map is handed to index.html / map.html as ``map_meta``.
#
# Fields, as consumed by the code in this module:
#   display_name / description  Not read in Python here; presumably rendered on
#                               the picker card — TODO confirm in the template.
#   single_player   True makes index() and select_faction() take the
#                   single-player branch (preset faction, AI auto-activation).
#   player_faction  Faction the human is offered on a single-player map
#                   (None on two-player maps).
#   ai_faction      Faction posted to the AI service's /activate endpoint by
#                   select_faction() on single-player maps (None otherwise).
#   factions        Whitelist that select_faction() validates the submitted
#                   ``faction`` form field against.
#   tick_interval_s Not read anywhere in the visible part of this module;
#                   presumably the hold-to-win tick period in seconds — verify
#                   against the tick loop.
#   win_hold_ticks  Threshold used by check_wall_hold_win(); a value <= 0
#                   disables the wall-hold win check for that map.
#   icon            Not read in Python here; looks like a Font Awesome class
#                   name for the picker UI — TODO confirm.
MAPS_META = {
    "war_of_kingdoms": {
        "display_name": "War of Kingdoms",
        "description": (
            "Northern and Southern kingdoms clash for dominance. "
            "Capture the enemy capital to win."
        ),
        "single_player": False,
        "player_faction": None,
        "ai_faction": None,
        "factions": ["northern", "southern"],
        # Both zero: this map is decided by capital capture, not by holding walls.
        "tick_interval_s": 0,
        "win_hold_ticks": 0,
        "icon": "fa-chess-knight",
    },
    "white_walkers_attack": {
        "display_name": "White Walkers Attack",
        "description": (
            "The Long Night has come. As the Night's Watch, hold every Wall "
            "keep for 5 ticks before the White Walkers do. Single-player."
        ),
        "single_player": True,
        "player_faction": "nights_watch",
        "ai_faction": "white_walkers",
        "factions": ["nights_watch", "white_walkers", "barbarian"],
        "tick_interval_s": 30,
        # Keep in step with the "5 ticks" wording in the description above.
        "win_hold_ticks": 5,
        "icon": "fa-icicles",
    },
}
"type": "village", "name": "Free Folk Camp (West)"}, "barbarian_village_east": {"x": 90, "y": 72, "type": "village", "name": "Free Folk Camp (East)"}, }, } LOCATION_CONNECTIONS_BY_MAP = { "war_of_kingdoms": [ ["southern_capital", "village_1"], ["southern_capital", "village_3"], ["northern_capital", "village_2"], ["northern_capital", "village_6"], ["village_1", "village_2"], ["village_1", "village_4"], ["village_2", "village_5"], ["village_3", "village_5"], ["village_3", "village_6"], ["village_4", "village_5"], ["village_5", "village_6"], ], "white_walkers_attack": [ ["nights_watch_fortress", "wall_west"], ["nights_watch_fortress", "wall_center_west"], ["nights_watch_fortress", "wall_center_east"], ["nights_watch_fortress", "wall_east"], ["white_walker_fortress", "wall_west"], ["white_walker_fortress", "wall_center_west"], ["white_walker_fortress", "wall_center_east"], ["white_walker_fortress", "wall_east"], ["wall_west", "wall_center_west"], ["wall_center_west", "wall_center_east"], ["wall_center_east", "wall_east"], ["wall_west", "barbarian_village_west"], ["wall_east", "barbarian_village_east"], ], } # Per-map list of wall-type locations for the hold-to-win check. WALL_LOCATIONS_BY_MAP = { map_id: [ loc_id for loc_id, meta in positions.items() if meta.get("type") == "wall" ] for map_id, positions in LOCATION_POSITIONS_BY_MAP.items() } # Kept for legacy call sites that still reference the module-level names. # These stay pointing at the WoK defaults — call sites that need per-map # behaviour should call _current_positions() / _current_connections() instead. 
def _current_positions():
    """Return the canvas layout dict of whichever map is active right now.

    The active map id is looked up through ``get_active_map_id()`` on every
    call; an id with no layout entry falls back to the default map's layout.
    """
    active_map = get_active_map_id()
    fallback_layout = LOCATION_POSITIONS_BY_MAP[DEFAULT_MAP_ID]
    return LOCATION_POSITIONS_BY_MAP.get(active_map, fallback_layout)


def _current_connections():
    """Return the edge list of whichever map is active right now.

    Falls back to the default map's edges when the active id has no entry.
    """
    active_map = get_active_map_id()
    fallback_edges = LOCATION_CONNECTIONS_BY_MAP[DEFAULT_MAP_ID]
    return LOCATION_CONNECTIONS_BY_MAP.get(active_map, fallback_edges)


def _current_walls():
    """Return the wall-type location ids of the active map ([] if unknown)."""
    active_map = get_active_map_id()
    if active_map in WALL_LOCATIONS_BY_MAP:
        return WALL_LOCATIONS_BY_MAP[active_map]
    return []
def store_game_action(game_session_id, action_type, player_name, faction, trace_id, span_id,
                      location_id=None, target_location_id=None, game_state=None, map_id=None):
    """Append one game action (with its trace/span ids) to ``game_actions``.

    The action receives the next ``action_sequence`` for its session
    (``MAX(action_sequence) + 1``, starting at 1).

    ``map_id`` is recorded so the replay page can render the correct map
    layout (positions/connections) for sessions played on non-default maps.
    When omitted it defaults to the currently active map, or to
    ``DEFAULT_MAP_ID`` if that lookup raises.

    Args:
        game_session_id: Session the action belongs to.
        action_type: Action name, e.g. ``"collect_resources"``.
        player_name: Acting player's display name.
        faction: Acting player's faction.
        trace_id: Trace id of the span that performed the action.
        span_id: Span id of the span that performed the action.
        location_id: Optional source location id.
        target_location_id: Optional target location id.
        game_state: Optional JSON-serialisable snapshot; a falsy value is
            stored as NULL.
        map_id: Optional map id override.

    Returns:
        The sequence number assigned to the stored action.

    Raises:
        sqlite3.Error: If the database cannot be read or written. The
            connection is closed before the error propagates.
    """
    if map_id is None:
        try:
            map_id = get_active_map_id()
        except Exception:
            map_id = DEFAULT_MAP_ID

    conn = sqlite3.connect(GAME_SESSIONS_DB)
    try:
        cursor = conn.cursor()

        # Get next sequence number
        cursor.execute(
            "SELECT MAX(action_sequence) FROM game_actions WHERE game_session_id = ?",
            (game_session_id,),
        )
        result = cursor.fetchone()
        next_sequence = (result[0] or 0) + 1

        # Debug logging
        logger.info(f"Storing action: session={game_session_id}, sequence={next_sequence}, action={action_type}, trace_id={trace_id}, span_id={span_id}, map_id={map_id}")

        cursor.execute('''
            INSERT INTO game_actions
            (game_session_id, action_sequence, action_type, player_name, faction,
             trace_id, span_id, location_id, target_location_id, timestamp,
             game_state_after, map_id)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (game_session_id, next_sequence, action_type, player_name, faction,
              trace_id, span_id, location_id, target_location_id, int(time.time()),
              json.dumps(game_state) if game_state else None, map_id))

        conn.commit()
        return next_sequence
    finally:
        # Always release the handle: a failed INSERT (e.g. a UNIQUE or NOT NULL
        # violation) previously left the connection open until garbage collection.
        conn.close()
def create_span_link_from_context(span_context, link_type="game_sequence"):
    """Wrap ``span_context`` in a ``trace.Link`` tagged as a game-sequence link.

    Args:
        span_context: Span context to link to, or None.
        link_type: Value stored under the ``link.type`` attribute.

    Returns:
        The new link, or None when ``span_context`` is None or the link
        could not be constructed (the failure is logged).
    """
    if span_context is not None:
        link_attributes = {
            "link.type": link_type,
            "link.relation": "follows",
            "game.sequence": "true",
        }
        try:
            return trace.Link(span_context, attributes=link_attributes)
        except Exception as exc:
            logger.error(f"Failed to create span link: {exc}")
    return None
# Container hostname per logical location id, used by get_location_url() when
# IN_DOCKER is set. WWA reuses the same 8 slot containers, so its location ids
# resolve to the WoK container names. Without this aliasing,
# ``location_id.replace('_', '-')`` produces hostnames like
# ``nights-watch-fortress`` that don't exist in the docker network and the
# /map render returns an empty locations dict (blank map).
#
# Each WWA alias must point at the container whose port LOCATION_PORTS assigns
# to the same id (e.g. nights_watch_fortress -> southern-capital, both 5001);
# keep the two tables in step when adding a map.
CONTAINER_FOR_LOCATION_ID = {
    # War of Kingdoms: id maps to its own container (underscores -> dashes).
    "southern_capital": "southern-capital",
    "northern_capital": "northern-capital",
    "village_1": "village-1",
    "village_2": "village-2",
    "village_3": "village-3",
    "village_4": "village-4",
    "village_5": "village-5",
    "village_6": "village-6",
    # White Walkers Attack: aliases onto the WoK containers, in slot order.
    "nights_watch_fortress": "southern-capital",
    "white_walker_fortress": "northern-capital",
    "wall_west": "village-1",
    "wall_center_west": "village-2",
    "wall_center_east": "village-3",
    "wall_east": "village-4",
    "barbarian_village_west": "village-5",
    "barbarian_village_east": "village-6",
}
def _ensure_game_config_tables():
    """Create the shared-state tables in game_state.db if they are missing.

    Idempotent; safe to call on every request. It:

    * creates ``game_config`` and seeds ``active_map_id`` with
      ``DEFAULT_MAP_ID`` (an existing value is left untouched),
    * creates ``faction_economy`` and ``wall_hold``,
    * best-effort adds a ``map_id`` column to ``war_map``.

    Note: this does NOT alter the ``faction UNIQUE`` constraint on
    ``war_map``; only the additive ``map_id`` column is attempted.

    Errors are logged rather than raised so a broken database does not take
    the page down, and the connection is always closed.
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS game_config (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL
            )
        ''')
        cursor.execute(
            "INSERT OR IGNORE INTO game_config (key, value) VALUES ('active_map_id', ?)",
            (DEFAULT_MAP_ID,),
        )
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS faction_economy (
                faction TEXT PRIMARY KEY,
                corpses INTEGER NOT NULL DEFAULT 0
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS wall_hold (
                map_id TEXT NOT NULL,
                faction TEXT NOT NULL,
                ticks INTEGER NOT NULL DEFAULT 0,
                last_update INTEGER NOT NULL,
                PRIMARY KEY (map_id, faction)
            )
        ''')
        # war_map table: additive map_id column for session-level bookkeeping.
        try:
            cursor.execute("ALTER TABLE war_map ADD COLUMN map_id TEXT")
        except sqlite3.OperationalError:
            # Expected when the column already exists, or when war_map has not
            # been created yet (check_faction_availability creates it lazily).
            pass
        conn.commit()
    except sqlite3.Error as e:
        logger.error(f"Failed to ensure game_config tables: {e}")
    finally:
        # Previously the connection stayed open whenever a statement failed.
        if conn is not None:
            conn.close()
def bump_wall_hold(map_id, faction, reset_others=True):
    """Increment ``faction``'s wall-hold tick count on ``map_id``.

    Args:
        map_id: Map whose counters are updated.
        faction: Faction credited with one more tick (its row is created with
            ``ticks = 1`` if it does not exist yet).
        reset_others: When true, every other faction's counter on this map
            is set back to 0 first, in the same transaction.

    Returns:
        The faction's new tick count, or 0 if the database operation failed
        (the failure is logged, not raised).
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        now = int(time.time())
        if reset_others:
            cursor.execute(
                "UPDATE wall_hold SET ticks = 0 WHERE map_id = ? AND faction != ?",
                (map_id, faction),
            )
        cursor.execute(
            "INSERT INTO wall_hold (map_id, faction, ticks, last_update) "
            "VALUES (?, ?, 1, ?) "
            "ON CONFLICT(map_id, faction) DO UPDATE SET "
            "ticks = ticks + 1, last_update = excluded.last_update",
            (map_id, faction, now),
        )
        cursor.execute(
            "SELECT ticks FROM wall_hold WHERE map_id = ? AND faction = ?",
            (map_id, faction),
        )
        row = cursor.fetchone()
        conn.commit()
        # Positional access works for both sqlite3.Row and plain tuples.
        return int(row[0]) if row else 0
    except sqlite3.Error as e:
        logger.error(f"Failed to bump wall_hold: {e}")
        return 0
    finally:
        # Closing without a commit discards the partial update on failure and
        # releases the handle, which used to leak on the error path.
        if conn is not None:
            conn.close()
def get_player_faction(session_id):
    """Look up the faction registered for ``session_id`` in ``war_map``.

    Args:
        session_id: The player's session id as stored by register_faction().

    Returns:
        The faction name, or None when the session has no registration or
        the database query fails (the failure is logged, not raised).
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute("SELECT faction FROM war_map WHERE session_id = ?", (session_id,))
        row = cursor.fetchone()
        # Positional access works for both sqlite3.Row and plain tuples.
        faction = row[0] if row else None
        logger.info(f"Player faction retrieved: {faction}")
        return faction
    except sqlite3.Error as e:
        logger.error(f"Database error: {e}")
        return None
    finally:
        # The connection used to stay open whenever the query raised.
        if conn is not None:
            conn.close()
def _send_location_request(url, method, data, headers, timeout):
    """Issue the HTTP call: GET when ``method`` is 'GET', otherwise a JSON POST."""
    if method == 'GET':
        return requests.get(url, headers=headers, timeout=timeout)
    return requests.post(url, json=data, headers=headers, timeout=timeout)


def make_api_request(location_id, endpoint, method='GET', data=None, timeout=5):
    """Call a location server and return its decoded JSON response.

    State-changing endpoints (army moves, resource transfers, ...) are
    wrapped in a CLIENT span with the trace context injected into the
    request headers; status polls and other endpoints are sent untraced.

    Args:
        location_id: Logical location id, resolved via get_location_url().
        endpoint: Path below the location's base URL ('' for the status root).
        method: 'GET' sends a GET; any other value sends a POST with ``data``
            as the JSON body.
        data: JSON-serialisable POST payload (ignored for GET).
        timeout: Seconds to wait for the location server. Without a timeout
            ``requests`` waits indefinitely, so one stuck container would
            hang the calling request.

    Returns:
        The decoded JSON body on success, or ``{"error": "<reason>"}`` when
        the location id is unknown, the request fails, the server answers
        with an HTTP error status, or the body is not valid JSON.
    """
    try:
        base_url = get_location_url(location_id)
    except KeyError:
        # location_id can come straight from a client request body; report an
        # unknown id through the same error contract instead of a 500.
        return {"error": f"Unknown location: {location_id}"}
    url = f"{base_url}/{endpoint}"

    # Only create spans for important operations, not for status checks
    important_endpoints = {'move_army', 'all_out_attack', 'send_resources_to_capital',
                           'receive_army', 'receive_resources', 'collect_resources',
                           'create_army'}

    headers = {"Content-Type": "application/json"}

    if endpoint not in important_endpoints:
        # Status checks and other non-important operations: no tracing.
        try:
            response = _send_location_request(url, method, data, headers, timeout)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers non-JSON bodies on requests versions whose
            # decode error is not a RequestException subclass.
            return {"error": str(e)}

    with tracer.start_as_current_span(
        "location_api_request",
        kind=SpanKind.CLIENT,
        attributes={
            "location.id": location_id,
            "location.endpoint": endpoint,
            "http.method": method
        }
    ) as span:
        inject(headers)  # Inject trace context into headers
        try:
            response = _send_location_request(url, method, data, headers, timeout)
            span.set_attribute("http.status_code", response.status_code)
            response.raise_for_status()
            result = response.json()

            if not result.get("success", True):
                span.set_status(trace.StatusCode.ERROR, result.get("message", "Unknown error"))

            return result
        except (requests.RequestException, ValueError) as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return {"error": str(e)}
def check_capital_capture_win(locations_data):
    """Decide the classic War of Kingdoms win: a capital held by the enemy.

    Args:
        locations_data: Mapping of location id to a dict with a ``faction``
            key. Missing capitals are treated as not captured.

    Returns:
        True when a capital has fallen; the module-level GAME_OVER, WINNER
        and VICTORY_MESSAGE are then updated. False otherwise.
    """
    global GAME_OVER, WINNER, VICTORY_MESSAGE

    # (capital id, conquering faction, announcement), checked in this order, so
    # a fallen Southern Capital takes precedence when both have changed hands.
    capture_rules = (
        ('southern_capital', 'northern',
         "The Northern Kingdom has conquered the Southern Capital! Victory through unity!"),
        ('northern_capital', 'southern',
         "The Southern Kingdom has conquered the Northern Capital! Glory to the South!"),
    )

    for capital_id, conqueror, announcement in capture_rules:
        current_holder = locations_data.get(capital_id, {}).get('faction')
        if current_holder == conqueror:
            GAME_OVER = True
            WINNER = conqueror
            VICTORY_MESSAGE = announcement
            return True

    logger.info("Game is not over")
    return False
def reset_game_data():
    """Reset the whole game: local flags, AI, factions, replay log, locations.

    Steps, each best-effort except the last:

    1. Clear the in-process GAME_OVER / WINNER / VICTORY_MESSAGE flags.
    2. Ask the AI service to deactivate.
    3. Release every faction claim.
    4. Delete all rows from the game session tracking database.
    5. Zero the wall-hold counters of every known map.
    6. POST ``/reset`` to one location server (they share one database).

    Returns:
        True only when the location ``/reset`` call succeeded. Because
        make_api_request() reports failures as an ``{"error": ...}`` dict
        instead of raising, the result is inspected explicitly; the function
        used to return True even when no location server was reachable.
    """
    # First, reset our local game state
    reset_game_state()

    # Deactivate AI if it's running
    try:
        requests.post(f"{AI_SERVICE_URL}/deactivate", timeout=5)
        logger.info("AI deactivated during game reset")
    except Exception as e:
        logger.warning(f"Failed to deactivate AI during reset: {e}")

    # Next, reset all faction assignments
    release_all_factions()

    # Clear the game session tracking database
    conn = None
    try:
        conn = sqlite3.connect(GAME_SESSIONS_DB)
        conn.execute("DELETE FROM game_actions")
        conn.commit()
        logger.info("Game session tracking database cleared")
    except Exception as e:
        logger.warning(f"Failed to clear game session database: {e}")
    finally:
        if conn is not None:
            conn.close()

    # Drop wall-hold counters so a finished hold-the-walls game is not declared
    # won again on the next render, mirroring what select_map does.
    # NOTE(review): whether the location /reset endpoint also clears wall_hold
    # is not visible from this module; the delete is idempotent either way.
    for known_map_id in MAPS_META:
        reset_wall_hold(known_map_id)

    # Finally, reset one location to trigger a database reset
    # (Since they all share the same database, we only need to reset one)
    try:
        result = make_api_request('southern_capital', 'reset', method='POST')
    except Exception as e:
        logger.error(f"Error resetting game data: {e}")
        return False

    if isinstance(result, dict) and ('error' in result or not result.get('success', True)):
        reason = result.get('error') or result.get('message') or 'reset rejected'
        logger.error(f"Error resetting game data: {reason}")
        return False

    logger.info("Game data reset")
    return True
@app.route('/select_map', methods=['POST'])
def select_map():
    """Persist the chosen map as active and reload every location service.

    Steps:
      1. Write ``active_map_id`` to game_config; abort back to the picker if
         that fails, so the session never points at a map the location
         services were not switched to. Then clear every map's wall-hold
         counters to avoid stale wins after switching.
      2. Reset the locations table via one location's ``/reset`` (shared DB —
         the call is sent to slot 1 only).
      3. POST ``/reload`` to every slot so each service rebinds its identity
         without a container restart.
      4. Deactivate any AI left over from the previous game, release all
         faction claims and clear the player's session keys, then store the
         new ``map_id`` in the session and reset the game-over flags.
      5. Redirect to the entry UI (map-aware from the session).

    Faction registration and AI activation are NOT done here: on
    single-player maps they happen in ``select_faction`` once the player
    explicitly starts the game.
    """
    map_id = request.form.get('map_id') or DEFAULT_MAP_ID
    if map_id not in MAPS_META:
        logger.error(f"Unknown map_id: {map_id}")
        return redirect(url_for('map_picker'))

    in_docker = bool(os.environ.get('IN_DOCKER'))

    with tracer.start_as_current_span(
        "select_map",
        kind=SpanKind.SERVER,
        attributes={"game.map.id": map_id},
    ) as span:
        span.set_attribute("single_player", MAPS_META[map_id]["single_player"])

        # 1. Persist the new active map. set_active_map_id() logs and returns
        #    False on database errors; carrying on would reset/reload every
        #    location against the OLD map while the session claims the new one.
        if not set_active_map_id(map_id):
            logger.error(f"Aborting map switch: could not persist active map {map_id}")
            span.set_status(trace.StatusCode.ERROR, "Failed to persist active map id")
            return redirect(url_for('map_picker'))

        # Clear all maps' old counters to avoid stale wins after switching
        # (this includes the newly selected map).
        for known_map_id in MAPS_META:
            reset_wall_hold(known_map_id)

        # 2. Reset locations rows to match the new map. Any one container will
        #    do — the DB is shared. Use slot 1, whose service name and port are
        #    stable across maps.
        reset_host = _container_for_slot("slot_1") if in_docker else "localhost"
        try:
            requests.post(
                f"http://{reset_host}:{SLOT_PORTS['slot_1']}/reset",
                timeout=5,
            )
        except Exception as e:
            logger.warning(f"Failed to reset location rows during map switch: {e}")

        # 3. Kick every slot to reload identity.
        for slot_id, port in _slot_port_pairs():
            try:
                host = _container_for_slot(slot_id) if in_docker else "localhost"
                requests.post(f"http://{host}:{port}/reload", timeout=5)
            except Exception as e:
                logger.warning(f"Failed to /reload {slot_id}: {e}")

        # 4. Stop an AI opponent still bound to the previous map's faction
        #    (same best-effort call reset_game_data makes), then clear faction
        #    claims + session data so the new map starts clean.
        try:
            requests.post(f"{AI_SERVICE_URL}/deactivate", timeout=5)
        except Exception as e:
            logger.warning(f"Failed to deactivate AI during map switch: {e}")

        release_all_factions()
        for stale_key in ('faction', 'player_name', 'game_session_id',
                          'action_sequence', 'session_id'):
            session.pop(stale_key, None)
        session['map_id'] = map_id

        # AI activation is deferred until the player starts the game from the
        # index page, but the game-over flags are pre-reset here.
        reset_game_state()

    # 5. Back to the entry UI.
    return redirect(url_for('index'))
False # Human player by default logger.info(f"Player {player_name} selected faction {faction} on map {map_id}") # On single-player maps, auto-activate the AI as the preset enemy # the moment the human commits to playing. if meta["single_player"] and meta.get("ai_faction"): try: requests.post( f"{AI_SERVICE_URL}/activate", json={"faction": meta["ai_faction"], "map_id": map_id}, timeout=5, ) logger.info(f"Auto-activated AI as {meta['ai_faction']} for single-player map {map_id}") except Exception as e: logger.warning(f"Auto-activation of AI failed: {e}") return redirect(url_for('game_map')) else: logger.error(f"Failed to register faction {faction}") return render_template( 'index.html', map_id=map_id, map_meta=meta, single_player=meta["single_player"], player_faction=meta.get("player_faction"), southern_available=check_faction_availability('southern'), northern_available=check_faction_availability('northern'), player_available=False, error=f"Failed to register {faction.replace('_', ' ').title()} faction", ) @app.route('/logout') def logout(): """Log out and release faction""" if 'session_id' in session: release_faction(session['session_id']) logger.info(f"Faction released for session ID: {session['session_id']}") # Clear the session session.clear() return redirect(url_for('index')) @app.route('/restart-game') def restart_game(): """Reset the game and redirect all players to faction selection""" logger.info("Game restart initiated") # Store current session info for logging current_player = session.get('player_name', 'Unknown') current_faction = session.get('faction', 'Unknown') # Reset the entire game state success = reset_game_data() # Clear current user's session completely session.clear() # Log the restart if success: logger.info(f"Game successfully restarted by {current_player} ({current_faction})") else: logger.error(f"Game restart failed, initiated by {current_player} ({current_faction})") # Redirect to the home page with a reset status if success: return 
@app.route('/map')
def game_map():
    """Render the map page for the map stored in the session (or the active one).

    Visitors without a faction in their session are redirected to the index
    page.  Locations whose lookup returns an ``error`` payload are left out
    of the rendered data.
    """
    if 'faction' not in session:
        return redirect(url_for('index'))

    map_id = session.get('map_id') or get_active_map_id()
    positions = LOCATION_POSITIONS_BY_MAP.get(map_id, LOCATION_POSITIONS_BY_MAP[DEFAULT_MAP_ID])
    connections = LOCATION_CONNECTIONS_BY_MAP.get(map_id, LOCATION_CONNECTIONS_BY_MAP[DEFAULT_MAP_ID])
    meta = MAPS_META.get(map_id, MAPS_META[DEFAULT_MAP_ID])

    # Merge each location's static position with its live state; only the
    # ids that belong to this map are queried.
    locations_data = {}
    for loc_id, position in positions.items():
        live = make_api_request(loc_id, '')
        if 'error' in live:
            continue
        locations_data[loc_id] = dict(
            position,
            faction=live['faction'],
            resources=live['resources'],
            army=live['army'],
        )

    # Map-aware game-over check; the result is read back from the module
    # globals passed to the template below.
    check_game_over(locations_data, map_id=map_id)

    # Wall-hold HUD payload, only built for the white_walkers_attack map.
    wall_hold_state = (
        {
            "threshold": meta.get("win_hold_ticks", 0),
            "holds": get_wall_hold(map_id),
            "walls": WALL_LOCATIONS_BY_MAP.get(map_id, []),
        }
        if map_id == "white_walkers_attack"
        else None
    )

    return render_template(
        'map.html',
        player_name=session.get('player_name', 'Unknown Player'),
        faction=session['faction'],
        map_id=map_id,
        map_meta=meta,
        locations=locations_data,
        connections=connections,
        wall_hold=wall_hold_state,
        game_over=GAME_OVER,
        winner=WINNER,
        victory_message=VICTORY_MESSAGE,
    )
@app.route('/api/collect_resources', methods=['POST'])
def collect_resources():
    """Collect resources at a location and record the action for span linking.

    Expects a JSON body with ``location_id``.  The handler span is linked to
    the previous stored action of the same game session, when there is one.
    Returns the location server's response as JSON, or a 400 error when
    ``location_id`` is missing.

    The action is only stored (and the session's sequence advanced) when the
    location server did not answer with an ``error`` payload.  Without that
    check a failed request would be chained as if it had succeeded, because
    ``success`` is assumed true when absent.
    """
    game_session_id = session.get('game_session_id')
    current_sequence = session.get('action_sequence', 0)

    # Link this span to the previous action of the session, if any.
    links = []
    if game_session_id and current_sequence > 0:
        previous_span_context = get_previous_action_context(game_session_id, current_sequence)
        if previous_span_context:
            link = create_span_link_from_context(previous_span_context, "game_sequence")
            if link:
                links.append(link)

    with tracer.start_as_current_span(
        "collect_resources",
        kind=SpanKind.SERVER,
        links=links,
        attributes={
            "player.name": session.get('player_name', 'Unknown'),
            "player.faction": session.get('faction', 'Unknown'),
            "game.session.id": game_session_id,
            "game.action.type": "collect_resources",
            "game.action.sequence": current_sequence + 1
        }
    ) as span:
        location_id = request.json.get('location_id')

        if not location_id:
            logger.error("Location ID required")
            # Mark the span as failed, as the other action handlers do.
            span.set_status(trace.StatusCode.ERROR, "Location ID required")
            return jsonify({"error": "Location ID required"}), 400

        span.set_attribute("location_id", location_id)

        result = make_api_request(location_id, 'collect_resources', method='POST')
        logger.info(f"Collect resources result: {result}")

        # Other handlers in this file treat an 'error' key as a failed call.
        # A missing 'success' key is still taken to mean success.
        action_succeeded = 'error' not in result and result.get('success', True)

        if game_session_id and action_succeeded:
            try:
                next_sequence = store_game_action(
                    game_session_id=game_session_id,
                    action_type="collect_resources",
                    player_name=session.get('player_name'),
                    faction=session.get('faction'),
                    trace_id=format(span.get_span_context().trace_id, '032x'),
                    span_id=format(span.get_span_context().span_id, '016x'),
                    location_id=location_id
                )
                session['action_sequence'] = next_sequence
                logger.info(f"Stored game action {next_sequence} for session {game_session_id}")
            except Exception as e:
                # Best effort: a bookkeeping failure must not fail the action.
                logger.error(f"Failed to store game action: {e}")

        return jsonify(result)
@app.route('/api/create_army', methods=['POST'])
def create_army():
    """Create an army at a location and record the action for span linking.

    Expects a JSON body with ``location_id``.  The handler span is linked to
    the previous stored action of the same game session, when there is one.
    Returns the location server's response as JSON, or a 400 error when
    ``location_id`` is missing.

    The action is only stored (and the session's sequence advanced) when the
    location server did not answer with an ``error`` payload.  Without that
    check a failed request would be chained as if it had succeeded, because
    ``success`` is assumed true when absent.
    """
    game_session_id = session.get('game_session_id')
    current_sequence = session.get('action_sequence', 0)

    # Link this span to the previous action of the session, if any.
    links = []
    if game_session_id and current_sequence > 0:
        previous_span_context = get_previous_action_context(game_session_id, current_sequence)
        if previous_span_context:
            link = create_span_link_from_context(previous_span_context, "game_sequence")
            if link:
                links.append(link)

    with tracer.start_as_current_span(
        "create_army",
        kind=SpanKind.SERVER,
        links=links,
        attributes={
            "player.name": session.get('player_name', 'Unknown'),
            "player.faction": session.get('faction', 'Unknown'),
            "game.session.id": game_session_id,
            "game.action.type": "create_army",
            "game.action.sequence": current_sequence + 1
        }
    ) as span:
        location_id = request.json.get('location_id')

        if not location_id:
            logger.error("Location ID required")
            # Mark the span as failed, as the other action handlers do.
            span.set_status(trace.StatusCode.ERROR, "Location ID required")
            return jsonify({"error": "Location ID required"}), 400

        span.set_attribute("location_id", location_id)

        result = make_api_request(location_id, 'create_army', method='POST')
        logger.info(f"Create army result: {result}")

        # Other handlers in this file treat an 'error' key as a failed call.
        # A missing 'success' key is still taken to mean success.
        action_succeeded = 'error' not in result and result.get('success', True)

        if game_session_id and action_succeeded:
            try:
                next_sequence = store_game_action(
                    game_session_id=game_session_id,
                    action_type="create_army",
                    player_name=session.get('player_name'),
                    faction=session.get('faction'),
                    trace_id=format(span.get_span_context().trace_id, '032x'),
                    span_id=format(span.get_span_context().span_id, '016x'),
                    location_id=location_id
                )
                session['action_sequence'] = next_sequence
                logger.info(f"Stored game action {next_sequence} for session {game_session_id}")
            except Exception as e:
                # Best effort: a bookkeeping failure must not fail the action.
                logger.error(f"Failed to store game action: {e}")

        return jsonify(result)
@app.route('/api/move_army', methods=['POST'])
def move_army():
    """Move an army between two locations and record the action for span linking.

    Expects a JSON body with ``source_id`` and ``target_id``.

    Responses:
        400 - either id is missing.
        502 - the source location could not be looked up.  Previously this
              case fell through to the ownership comparison, producing a
              misleading "belongs to None" 403, or passing the check
              outright when the session had no faction (``None == None``).
        403 - the source location belongs to another faction.
        200 - the location server's move result, extended with
              ``game_over`` / ``winner`` / ``victory_message`` when a
              successful move onto a capital ended the game.
    """
    game_session_id = session.get('game_session_id')
    current_sequence = session.get('action_sequence', 0)

    logger.info(f"move_army: session={game_session_id}, current_sequence={current_sequence}")

    # current_sequence is the last stored sequence number, so that is the
    # action this span links back to.
    links = []
    if game_session_id and current_sequence > 0:
        previous_span_context = get_previous_action_context(game_session_id, current_sequence)
        if previous_span_context:
            link = create_span_link_from_context(previous_span_context, "game_sequence")
            if link:
                links.append(link)
                logger.info(f"Created span link to previous action (sequence {current_sequence})")

    with tracer.start_as_current_span(
        "move_army",
        kind=SpanKind.SERVER,
        links=links,
        attributes={
            "player.name": session.get('player_name', 'Unknown'),
            "player.faction": session.get('faction', 'Unknown'),
            "game.session.id": game_session_id,
            "game.action.type": "move_army",
            "game.action.sequence": current_sequence + 1
        }
    ) as span:
        current_trace_id = format(span.get_span_context().trace_id, '032x')
        current_span_id = format(span.get_span_context().span_id, '016x')
        logger.info(f"Current span: trace_id={current_trace_id}, span_id={current_span_id}")

        source_id = request.json.get('source_id')
        target_id = request.json.get('target_id')

        if not source_id or not target_id:
            span.set_status(trace.StatusCode.ERROR, "Missing location IDs")
            return jsonify({"error": "Source and target location IDs required"}), 400

        span.set_attribute("source_location", source_id)
        span.set_attribute("target_location", target_id)

        # Ownership check.  A failed lookup is reported as an upstream
        # failure and never compared against the player's faction.
        source_info = make_api_request(source_id, '')
        if 'error' in source_info:
            span.set_status(trace.StatusCode.ERROR, "Source location lookup failed")
            return jsonify({
                "error": f"Could not verify ownership of {source_id}: {source_info['error']}"
            }), 502

        player_faction = session.get('faction')
        if source_info.get('faction') != player_faction:
            span.set_status(trace.StatusCode.ERROR, "Not player's location")
            return jsonify({
                "error": f"You cannot move armies from {source_id} because it belongs to {source_info.get('faction')}"
            }), 403

        result = make_api_request(
            source_id,
            'move_army',
            method='POST',
            data={"target_location": target_id}
        )

        # A successful move onto a capital may have ended the game.
        if target_id in ['southern_capital', 'northern_capital'] and result.get('success'):
            locations_data = {}
            for loc_id in _current_positions().keys():
                data = make_api_request(loc_id, '')
                if 'error' not in data:
                    locations_data[loc_id] = {
                        'faction': data['faction']
                    }

            if check_game_over(locations_data):
                result['game_over'] = True
                result['winner'] = WINNER
                result['victory_message'] = VICTORY_MESSAGE
                span.set_attribute("game_over", True)
                span.set_attribute("winner", WINNER)

        # Unlike the other action handlers, the move is stored whatever its
        # outcome.
        if game_session_id:
            try:
                next_sequence = store_game_action(
                    game_session_id=game_session_id,
                    action_type="move_army",
                    player_name=session.get('player_name'),
                    faction=session.get('faction'),
                    trace_id=current_trace_id,
                    span_id=current_span_id,
                    location_id=source_id,
                    target_location_id=target_id
                )
                session['action_sequence'] = next_sequence
                logger.info(f"Stored game action {next_sequence} for session {game_session_id}, updated session sequence to {next_sequence}")
            except Exception as e:
                # Best effort: a bookkeeping failure must not fail the move.
                logger.error(f"Failed to store game action: {e}")

        return jsonify(result)
@app.route('/api/location_info/<location_id>', methods=['GET'])
def location_info(location_id):
    """Return the live state of one location of the current map.

    The route declares the ``<location_id>`` URL variable that this view's
    parameter needs; without it Flask would call the view with no argument.
    Responds with 400 when the id is not part of the current map.
    """
    if location_id not in _current_positions():
        return jsonify({"error": "Invalid location ID"}), 400

    result = make_api_request(location_id, '')
    logger.info(f"Location info result: {result}")
    return jsonify(result)


@app.route('/api/map_data', methods=['GET'])
def map_data():
    """Return every location's live state plus game-over info for UI polling.

    Locations whose lookup returns an ``error`` payload are omitted.  A
    ``wall_hold`` section is added when the active map's metadata has a
    positive ``win_hold_ticks``.
    """
    map_id = get_active_map_id()
    meta = MAPS_META.get(map_id, MAPS_META[DEFAULT_MAP_ID])

    # Fetch the position table once instead of on every loop iteration.
    positions = _current_positions()

    locations_data = {}
    for loc_id, position in positions.items():
        data = make_api_request(loc_id, '')
        if 'error' not in data:
            locations_data[loc_id] = {
                **position,
                'faction': data['faction'],
                'resources': data['resources'],
                'army': data['army'],
                'type': position['type']
            }

    check_game_over(locations_data, map_id=map_id)

    response = {
        "locations": locations_data,
        "connections": _current_connections(),
        "game_over": GAME_OVER,
        "winner": WINNER,
        "victory_message": VICTORY_MESSAGE,
        "map_id": map_id,
    }

    # Include wall-hold state when the active map uses the tick mechanic.
    if meta.get("win_hold_ticks", 0) > 0:
        response["wall_hold"] = {
            "threshold": meta["win_hold_ticks"],
            "holds": get_wall_hold(map_id),
            "walls": WALL_LOCATIONS_BY_MAP.get(map_id, []),
        }
    return jsonify(response)


@app.route('/api/game_status', methods=['GET'])
def game_status():
    """Re-evaluate the game-over condition from fresh data and report it.

    Ownership is re-read on every call so that victories not caused by this
    client (for example by the AI) are noticed.
    """
    locations_data = {}
    for loc_id in _current_positions().keys():
        data = make_api_request(loc_id, '')
        if 'error' not in data:
            locations_data[loc_id] = {
                'faction': data['faction']
            }

    check_game_over(locations_data)

    return jsonify({
        "game_over": GAME_OVER,
        "winner": WINNER,
        "victory_message": VICTORY_MESSAGE
    })


@app.route('/api/reset_game', methods=['POST'])
def reset_game():
    """Reset the game state (for testing).

    The ``message`` now matches the outcome; it used to say the game had
    been reset even when ``success`` was false.  The HTTP status is 200 in
    both cases, as before.
    """
    success = reset_game_data()
    message = "Game has been reset" if success else "Game reset failed"
    return jsonify({"success": success, "message": message})


@app.route('/api/send_resources_to_capital', methods=['POST'])
def send_resources_to_capital():
    """Forward a send-resources-to-capital request to the location server.

    Expects a JSON body with ``location_id``; responds with 400 when it is
    missing, otherwise with the location server's response.
    """
    with tracer.start_as_current_span(
        "send_resources_to_capital",
        kind=SpanKind.SERVER,
        attributes={
            "player.name": session.get('player_name', 'Unknown'),
            "player.faction": session.get('faction', 'Unknown')
        }
    ) as span:
        location_id = request.json.get('location_id')

        if not location_id:
            span.set_status(trace.StatusCode.ERROR, "Missing location ID")
            return jsonify({"error": "Location ID required"}), 400

        span.set_attribute("source_location", location_id)

        result = make_api_request(location_id, 'send_resources_to_capital', method='POST')
        return jsonify(result)
@app.route('/api/all_out_attack', methods=['POST'])
def all_out_attack():
    """Launch an all-out attack from a location and record it for span linking.

    Expects a JSON body with ``location_id``; the whole body is forwarded to
    the location server.  Responds with 400 when the id is missing and with
    500 when the location server reports an error or the request raises.
    A successful attack triggers a game-over check and is stored as the next
    action of the game session.
    """
    game_session_id = session.get('game_session_id')
    current_sequence = session.get('action_sequence', 0)

    # Link this span to the previous action of the session, if any.
    links = []
    if game_session_id and current_sequence > 0:
        prior_context = get_previous_action_context(game_session_id, current_sequence)
        prior_link = (
            create_span_link_from_context(prior_context, "game_sequence")
            if prior_context
            else None
        )
        if prior_link:
            links.append(prior_link)

    span_attributes = {
        "player.name": session.get('player_name', 'Unknown'),
        "player.faction": session.get('faction', 'Unknown'),
        "game.session.id": game_session_id,
        "game.action.type": "all_out_attack",
        "game.action.sequence": current_sequence + 1
    }

    with tracer.start_as_current_span(
        "all_out_attack",
        kind=SpanKind.SERVER,
        links=links,
        attributes=span_attributes,
    ) as span:
        location_id = request.json.get('location_id')
        if not location_id:
            span.set_status(trace.StatusCode.ERROR, "Location ID required")
            return jsonify({"error": "Location ID required"}), 400

        span.set_attribute("location_id", location_id)

        try:
            result = make_api_request(location_id, 'all_out_attack', method='POST', data=request.json)

            if 'error' in result:
                failure = f"Error from location server: {result['error']}"
                span.set_status(trace.StatusCode.ERROR, failure)
                return jsonify({"success": False, "message": failure}), 500

            if result.get('success'):
                # Re-read ownership to see whether the attack ended the game.
                ownership = {}
                for loc_id in _current_positions().keys():
                    snapshot = make_api_request(loc_id, '')
                    if 'error' not in snapshot:
                        ownership[loc_id] = {'faction': snapshot['faction']}

                if check_game_over(ownership):
                    result['game_over'] = True
                    result['winner'] = WINNER
                    result['victory_message'] = VICTORY_MESSAGE
                    span.set_attribute("game_over", True)
                    span.set_attribute("winner", WINNER)

                # Only successful attacks become part of the action chain.
                if game_session_id:
                    try:
                        span_context = span.get_span_context()
                        next_sequence = store_game_action(
                            game_session_id=game_session_id,
                            action_type="all_out_attack",
                            player_name=session.get('player_name'),
                            faction=session.get('faction'),
                            trace_id=format(span_context.trace_id, '032x'),
                            span_id=format(span_context.span_id, '016x'),
                            location_id=location_id
                        )
                        session['action_sequence'] = next_sequence
                        logger.info(f"Stored game action {next_sequence} for session {game_session_id}")
                    except Exception as e:
                        logger.error(f"Failed to store game action: {e}")

            return jsonify(result)
        except Exception as e:
            failure = f"Request failed: {str(e)}"
            span.set_status(trace.StatusCode.ERROR, failure)
            logger.error(f"All out attack failed: {e}")
            return jsonify({"success": False, "message": failure}), 500
@app.route('/api/ai_toggle', methods=['POST'])
def toggle_ai():
    """Switch the AI opponent on or off according to the JSON ``enable`` flag.

    When enabling, the AI is given the faction opposite to the player's
    (``northern`` if the player is ``southern``, otherwise ``southern``).
    Responds with 400 when no player faction is in the session, 500 when the
    AI service rejects the request, and 503 when it cannot be reached.
    """
    payload = request.get_json()
    wants_ai = payload.get('enable', False)

    if not wants_ai:
        # Deactivation path.
        try:
            response = requests.post(f"{AI_SERVICE_URL}/deactivate", timeout=5)
            if response.status_code == 200:
                logger.info("AI deactivated")
                return jsonify({
                    "success": True,
                    "message": "AI opponent deactivated"
                })
            return jsonify({
                "success": False,
                "message": "Failed to deactivate AI"
            }), 500
        except requests.RequestException as e:
            logger.error(f"Error communicating with AI service: {e}")
            return jsonify({
                "success": False,
                "message": "AI service unavailable"
            }), 503

    # Activation path: the AI needs to know which side the human is not on.
    player_faction = session.get('faction')
    if not player_faction:
        return jsonify({"success": False, "message": "No player faction selected"}), 400

    ai_faction = 'southern' if player_faction != 'southern' else 'northern'

    try:
        response = requests.post(
            f"{AI_SERVICE_URL}/activate",
            json={"faction": ai_faction},
            timeout=5
        )
        activated = response.status_code == 200 and response.json().get('success')
        if activated:
            logger.info(f"AI activated for {ai_faction} faction")
            return jsonify({
                "success": True,
                "message": f"AI opponent activated for {ai_faction} faction"
            })
        return jsonify({
            "success": False,
            "message": "Failed to activate AI"
        }), 500
    except requests.RequestException as e:
        logger.error(f"Error communicating with AI service: {e}")
        return jsonify({
            "success": False,
            "message": "AI service unavailable"
        }), 503
@app.route('/api/ai_status', methods=['GET'])
def get_ai_status():
    """Proxy the AI service's status, reporting "inactive" on any failure.

    A non-200 answer, a transport error, or a body that is not valid JSON
    all yield ``{"active": False, "faction": None}``.
    """
    inactive = {"active": False, "faction": None}
    try:
        response = requests.get(f"{AI_SERVICE_URL}/status", timeout=5)
        if response.status_code == 200:
            return jsonify(response.json())
        return jsonify(inactive)
    except (requests.RequestException, ValueError):
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException.
        return jsonify(inactive)


@app.route('/api/replay/sessions', methods=['GET'])
def get_replay_sessions():
    """List game sessions seen in Tempo over the last 24 hours.

    Session ids are discovered through Tempo's tag-values API for
    ``.game.session.id``.  Only the ids are known at this point; the other
    fields of each entry are placeholders that are filled in when a session
    is opened.  A non-200 answer from Tempo is passed through with the same
    status code; any other failure yields a 500.
    """
    tempo_url = os.environ.get('TEMPO_URL', 'http://localhost:3200')

    try:
        from datetime import datetime, timedelta

        window_end = datetime.now()
        window_start = window_end - timedelta(hours=24)

        response = requests.get(
            f"{tempo_url}/api/v2/search/tag/.game.session.id/values",
            params={
                'start': int(window_start.timestamp()),
                'end': int(window_end.timestamp()),
                'limit': 50
            },
            timeout=15
        )

        if response.status_code != 200:
            logger.error(f"Tag values API failed with status {response.status_code}")
            return jsonify({
                'success': False,
                'error': f'Tag values API failed with status {response.status_code}',
                'sessions': [],
                'total_sessions': 0
            }), response.status_code

        # Keep only non-empty string-typed tag values.
        session_ids = [
            tag_value.get('value', '')
            for tag_value in response.json().get('tagValues', [])
            if tag_value.get('type') == 'string' and tag_value.get('value', '')
        ]
        logger.info(f"Found {len(session_ids)} game sessions: {session_ids}")

        session_list = [
            {
                'session_id': found_id,
                'player_name': 'Unknown',
                'faction': 'Unknown',
                'start_time': 0,
                'action_count': 0,
                'last_action': 'Unknown'
            }
            for found_id in session_ids
        ]
        # Descending id order, purely for a stable listing.
        session_list.sort(key=lambda entry: entry.get('session_id', ''), reverse=True)

        return jsonify({
            'success': True,
            'sessions': session_list,
            'total_sessions': len(session_list),
            'data_source': 'tempo_tag_values',
            'discovered_session_ids': session_ids
        })

    except Exception as e:
        logger.error(f"Error getting replay sessions: {e}")
        return jsonify({
            'success': False,
            'error': str(e),
            'sessions': [],
            'total_sessions': 0
        }), 500


@app.route('/replay')
def replay_page():
    """Replay page to view game sessions"""
    return render_template('replay.html')


@app.route('/replay/<session_id>')
def replay_session_page(session_id):
    """Render the replay page for one game session.

    The page uses the layout of the map the session was played on, not the
    currently active map.  The route declares the ``<session_id>`` URL
    variable that this view's parameter needs.  Layout lookups fall back to
    the default map, as the other views in this file do, so an unknown map
    id does not raise ``KeyError``.
    """
    map_id = get_session_map_id(session_id)
    return render_template(
        'replay_session.html',
        session_id=session_id,
        map_id=map_id,
        location_positions=LOCATION_POSITIONS_BY_MAP.get(
            map_id, LOCATION_POSITIONS_BY_MAP[DEFAULT_MAP_ID]
        ),
        location_connections=LOCATION_CONNECTIONS_BY_MAP.get(
            map_id, LOCATION_CONNECTIONS_BY_MAP[DEFAULT_MAP_ID]
        ),
    )
get_session_map_id(session_id) return render_template( 'replay_session.html', session_id=session_id, map_id=map_id, location_positions=LOCATION_POSITIONS_BY_MAP[map_id], location_connections=LOCATION_CONNECTIONS_BY_MAP[map_id], ) """Debug endpoint to verify restart cleared all data properly""" verification_results = { 'game_state_reset': False, 'span_links_cleared': False, 'faction_assignments_cleared': False, 'ai_deactivated': False, 'database_reset': False } try: # Check game state verification_results['game_state_reset'] = not GAME_OVER and WINNER is None and VICTORY_MESSAGE is None # Check span links database conn = sqlite3.connect(GAME_SESSIONS_DB) cursor = conn.cursor() cursor.execute("SELECT COUNT(*) FROM game_actions") span_links_count = cursor.fetchone()[0] conn.close() verification_results['span_links_cleared'] = span_links_count == 0 # Check faction assignments db_conn = get_db_connection() cursor = db_conn.cursor() cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='war_map'") table_exists = cursor.fetchone() is not None if table_exists: cursor.execute("SELECT COUNT(*) FROM war_map") faction_count = cursor.fetchone()[0] verification_results['faction_assignments_cleared'] = faction_count == 0 else: verification_results['faction_assignments_cleared'] = True db_conn.close() # Check AI status try: response = requests.get(f"{AI_SERVICE_URL}/status", timeout=5) if response.status_code == 200: ai_status = response.json() verification_results['ai_deactivated'] = not ai_status.get('active', False) else: verification_results['ai_deactivated'] = True # Assume deactivated if can't reach except: verification_results['ai_deactivated'] = True # Assume deactivated if can't reach # Check if location database reset to initial state try: locations_data = {} for loc_id in _current_positions().keys(): data = make_api_request(loc_id, '') if 'error' not in data: locations_data[loc_id] = data # Verify initial state from game_config import LOCATIONS 
@app.route('/api/replay/session/<session_id>', methods=['GET'])
def get_replay_session(session_id):
    """Return the ordered list of actions recorded in Tempo for one session.

    Searches Tempo for traces tagged with the session id over the last
    24 hours.  For each trace the full trace is fetched and parsed; if that
    fails, the spans embedded in the search result are used instead.  Spans
    are de-duplicated by span id and sorted by (sequence, start time).

    Changes from the previous version:
      * the route declares the ``<session_id>`` URL variable that this
        view's parameter needs;
      * the session id comes from the URL, so backslashes and double quotes
        are escaped before it is embedded in the TraceQL string literal;
      * the per-trace loop variable no longer reuses the name ``trace``,
        which other handlers in this file use for ``trace.StatusCode``.

    Any unexpected failure yields a 500 response with ``success: False``.
    """
    tempo_url = os.environ.get('TEMPO_URL', 'http://localhost:3200')

    try:
        logger.info(f"Getting replay data for session: {session_id}")

        from datetime import datetime, timedelta
        end_time = datetime.now()
        start_time = end_time - timedelta(hours=24)  # 24-hour window

        # Escape characters that would terminate or alter the quoted literal.
        quoted_id = session_id.replace('\\', '\\\\').replace('"', '\\"')
        params = {
            'q': f'{{.game.session.id="{quoted_id}"}}',
            'start': int(start_time.timestamp()),
            'end': int(end_time.timestamp()),
            'limit': 100
        }

        logger.info(f"Querying Tempo with: {params}")

        response = requests.get(
            f"{tempo_url}/api/search",
            params=params,
            timeout=15
        )

        logger.info(f"Tempo response status: {response.status_code}")

        actions = []
        seen_spans = set()  # span ids already collected

        def _keep(action):
            # Append the action unless its span id is empty or already seen.
            span_id = action.get('span_id')
            if span_id and span_id not in seen_spans:
                seen_spans.add(span_id)
                actions.append(action)

        if response.status_code == 200:
            traces = response.json().get('traces', [])
            logger.info(f"Found {len(traces)} traces for session {session_id}")

            for trace_summary in traces:
                trace_id = trace_summary.get('traceID')
                root_trace_name = trace_summary.get('rootTraceName', '')
                trace_details_success = False

                # Preferred source: the full trace, which carries all attributes.
                try:
                    trace_response = requests.get(
                        f"{tempo_url}/api/traces/{trace_id}",
                        params={
                            'start': int(start_time.timestamp()),
                            'end': int(end_time.timestamp())
                        },
                        timeout=10
                    )

                    if trace_response.status_code == 200:
                        trace_detail = trace_response.json()

                        # Structure: batches -> scopeSpans -> spans
                        for batch in trace_detail.get('batches', []):
                            for scope_span in batch.get('scopeSpans', []):
                                for span in scope_span.get('spans', []):
                                    action = parse_span_to_action_from_detail(span, trace_id, root_trace_name)
                                    if action and action.get('session_id') == session_id:
                                        _keep(action)

                        trace_details_success = True
                    else:
                        logger.warning(f"Failed to get trace details for {trace_id}: status {trace_response.status_code}")

                except Exception as e:
                    logger.warning(f"Error getting trace details for {trace_id}: {e}")

                # Fallback: the spans embedded in the search result.
                if not trace_details_success:
                    logger.info(f"Using search results fallback for trace {trace_id}")
                    for span_set in trace_summary.get('spanSets', []):
                        for span in span_set.get('spans', []):
                            action = parse_span_to_action_from_search(span, trace_id, root_trace_name, session_id)
                            if action:
                                _keep(action)
        else:
            logger.warning(f"Tempo search failed with status {response.status_code}")

        # Sort by sequence number, then start time.
        actions.sort(key=lambda x: (x.get('sequence', 0), x.get('start_time', 0)))

        logger.info(f"Returning {len(actions)} actions for session {session_id}")

        session_metadata = {
            'player_name': 'Unknown',
            'faction': 'Unknown',
            'start_time': 0,
            'end_time': 0
        }

        if actions:
            first_action = actions[0]
            session_metadata['player_name'] = first_action.get('player_name', 'Unknown')
            session_metadata['faction'] = first_action.get('faction', 'Unknown')
            session_metadata['start_time'] = first_action.get('start_time', 0)
            # The session's end is taken as the start of its last action.
            session_metadata['end_time'] = actions[-1].get('start_time', 0)

        span_link_chain = verify_action_links(actions)

        return jsonify({
            'success': True,
            'session_id': session_id,
            'session_metadata': session_metadata,
            'actions': actions,
            'span_link_chain': span_link_chain,
            'total_actions': len(actions),
            'data_source': 'tempo'
        })

    except Exception as e:
        logger.error(f"Error getting replay session {session_id}: {e}")
        return jsonify({
            'success': False,
            'error': str(e),
            'session_id': session_id,
            'actions': [],
            'total_actions': 0
        }), 500
def parse_span_to_action_from_detail(span, trace_id, root_trace_name):
    """Convert one span of a full trace into a replay action dict.

    Args:
        span: span dict from the trace-detail response (``spanId``, ``name``,
            ``startTimeUnixNano``, ``endTimeUnixNano``, ``attributes``,
            ``links``).
        trace_id: id of the trace the span belongs to; copied to the result.
        root_trace_name: used as ``action_type`` unless the span carries a
            ``game.action.type`` attribute.

    Returns:
        A dict with ``trace_id``, ``span_id`` (hex), ``operation``,
        ``action_type``, ``start_time``, ``duration``, ``attributes``,
        ``span_links`` (hex ids) and ``data_source``.  ``sequence``,
        ``player_name``, ``faction``, ``session_id``, ``location_id``,
        ``source_location`` and ``target_location`` are added when the
        matching attribute is present.

    Span ids are normally base64 and are converted to hex.  A 16-character
    hex string is already an 8-byte id and is kept as is (lower-cased),
    since base64-decoding it would yield a wrong 12-byte value.  Anything
    that cannot be decoded is passed through unchanged.  ``attributes`` or
    ``links`` being ``None`` is treated as empty.
    """
    import base64

    hex_digits = set('0123456789abcdefABCDEF')

    def _span_id_to_hex(raw_id):
        # Already hex: 8 bytes == 16 hex digits.
        if isinstance(raw_id, str) and len(raw_id) == 16 and set(raw_id) <= hex_digits:
            return raw_id.lower()
        try:
            return base64.b64decode(raw_id).hex()
        except (ValueError, TypeError):
            # binascii.Error is a ValueError; fall back to the raw value.
            return raw_id

    raw_span_id = span.get('spanId', '')

    action = {
        'trace_id': trace_id,
        'span_id': _span_id_to_hex(raw_span_id) if raw_span_id else '',
        'operation': span.get('name', ''),
        'action_type': root_trace_name,
        'start_time': 0,
        'duration': 0,
        'attributes': {},
        'span_links': [],
        'data_source': 'tempo_detail'
    }

    start_time_raw = span.get('startTimeUnixNano', 0)
    end_time_raw = span.get('endTimeUnixNano', 0)

    try:
        action['start_time'] = int(start_time_raw) if start_time_raw else 0
    except (ValueError, TypeError):
        action['start_time'] = 0

    # Duration needs both timestamps; they may arrive as strings.
    if start_time_raw and end_time_raw:
        try:
            start_time_int = int(start_time_raw) if isinstance(start_time_raw, str) else start_time_raw
            end_time_int = int(end_time_raw) if isinstance(end_time_raw, str) else end_time_raw
            action['duration'] = end_time_int - start_time_int
        except (ValueError, TypeError):
            action['duration'] = 0

    # Keep string / int / bool attribute values in their raw form.
    for attr in span.get('attributes') or []:
        key = attr.get('key', '')
        value = attr.get('value') or {}
        for value_kind in ('stringValue', 'intValue', 'boolValue'):
            if value_kind in value:
                action['attributes'][key] = value[value_kind]
                break

    for link in span.get('links') or []:
        linked_span_id = link.get('spanId', '')
        if linked_span_id:
            action['span_links'].append(_span_id_to_hex(linked_span_id))

    attrs = action['attributes']

    if 'game.action.sequence' in attrs:
        seq_val = attrs['game.action.sequence']
        if isinstance(seq_val, int):
            action['sequence'] = seq_val
        else:
            try:
                action['sequence'] = int(seq_val)
            except (ValueError, TypeError):
                action['sequence'] = 0

    # Promote well-known attributes to top-level string fields.
    promoted_fields = (
        ('game.action.type', 'action_type'),
        ('player.name', 'player_name'),
        ('player.faction', 'faction'),
        ('game.session.id', 'session_id'),
        ('location_id', 'location_id'),
        ('source_location', 'source_location'),
        ('target_location', 'target_location'),
    )
    for attr_key, action_key in promoted_fields:
        if attr_key in attrs:
            action[action_key] = str(attrs[attr_key])

    return action
def parse_span_to_action_from_search(span, trace_id, root_trace_name, session_id):
    """Convert one span of a search result into a replay action dict.

    Args:
        span: span dict from a search result (``spanID``,
            ``startTimeUnixNano``, ``durationNanos``, ``attributes``).
        trace_id: id of the trace the span belongs to; copied to the result.
        root_trace_name: used as ``action_type`` unless the span carries a
            ``game.action.type`` attribute.
        session_id: the session being replayed.  It is the default for the
            action's ``session_id``; a span whose own ``game.session.id``
            attribute differs is rejected.

    Returns:
        The action dict, or ``None`` when the span belongs to another session.

    Malformed numeric values (timestamps, duration, sequence) become ``0``
    rather than raising; the sequence taken from ``intValue`` used to be
    converted without any guard.  ``attributes`` being ``None`` is treated
    as empty.
    """
    def _to_int(raw):
        # Lenient integer conversion: falsy or unparsable input gives 0.
        try:
            return int(raw) if raw else 0
        except (ValueError, TypeError):
            return 0

    action = {
        'trace_id': trace_id,
        'span_id': span.get('spanID', ''),
        'start_time': _to_int(span.get('startTimeUnixNano', 0)),
        'duration': _to_int(span.get('durationNanos', 0)),
        'action_type': root_trace_name,
        'session_id': session_id,
        'span_links': [],
        'data_source': 'tempo_search',
        'attributes': {}
    }

    # Attributes copied to top-level string fields ('' when not a string).
    string_fields = {
        'player.name': 'player_name',
        'player.faction': 'faction',
        'game.session.id': 'session_id',
        'location_id': 'location_id',
        'source_location': 'source_location',
        'target_location': 'target_location',
    }

    for attr in span.get('attributes') or []:
        key = attr.get('key', '')
        value = attr.get('value') or {}

        # Keep the raw string / int / bool value.
        for value_kind in ('stringValue', 'intValue', 'boolValue'):
            if value_kind in value:
                action['attributes'][key] = value[value_kind]
                break

        if key == 'game.action.sequence':
            if 'intValue' in value:
                action['sequence'] = _to_int(value['intValue'])
            elif 'stringValue' in value:
                action['sequence'] = _to_int(value['stringValue'])
        elif key == 'game.action.type':
            action['action_type'] = value.get('stringValue', root_trace_name)
        elif key in string_fields:
            action[string_fields[key]] = value.get('stringValue', '')

    # Only return spans that belong to the requested session.
    if action.get('session_id') == session_id:
        return action

    return None
def verify_action_links(actions):
    """Describe, for each action, which earlier actions its span links point to.

    Args:
        actions: action dicts in replay order.  Each may carry ``span_id``,
            ``span_links`` (list of span ids; ``None`` is treated as empty),
            ``sequence``, ``action_type`` and ``data_source``.

    Returns:
        One dict per action with ``sequence``, ``action_type``, ``span_id``,
        ``has_links``, ``links_to`` (the earlier actions it links to, in
        replay order), ``valid_chain``, ``data_source`` and ``note``.

    ``valid_chain`` is always ``True``: the first action needs no link, and
    a missing link on a later action is tolerated and only mentioned in
    ``note``.
    """
    chain_verification = []

    for index, action in enumerate(actions):
        # Read once per action; a null value counts as "no links".
        span_links = action.get('span_links') or []

        verification = {
            'sequence': action.get('sequence', index + 1),
            'action_type': action.get('action_type', 'unknown'),
            'span_id': action.get('span_id', ''),
            'has_links': len(span_links) > 0,
            'links_to': [],
            'valid_chain': True,
            'data_source': action.get('data_source', 'unknown')
        }

        if index == 0:
            verification['note'] = 'First action (no links expected)'
        else:
            # Any earlier action counts, not just the immediately preceding one.
            for prev_action in actions[:index]:
                prev_span_id = prev_action.get('span_id', '')
                if prev_span_id and prev_span_id in span_links:
                    verification['links_to'].append({
                        'sequence': prev_action.get('sequence', 0),
                        'action_type': prev_action.get('action_type', 'unknown'),
                        'span_id': prev_span_id
                    })

            if verification['links_to']:
                verification['note'] = 'Links to previous action(s)'
            else:
                verification['note'] = 'Missing link to previous action (may be due to data source limitations)'

        chain_verification.append(verification)

    return chain_verification
# ----------------------------------------------------------------
# Wall-hold tick thread — WWA win condition.
# Runs every tick_interval_s, reads every wall-type location's faction from
# game_state.db, increments the hold counter for whoever owns them all, and
# resets the counter otherwise. When a faction's count reaches win_hold_ticks
# the global game-over flags flip and the map.html poll picks up the winner.
# ----------------------------------------------------------------
def _wall_tick_thread():
    """Endless background loop that tracks who holds every wall location.

    Per iteration:
      * look up the active map and its metadata; maps without a positive
        ``tick_interval_s`` are skipped and re-checked after 5 s;
      * read the faction of every wall location of that map from the database;
      * if all walls exist and belong to one single non-neutral faction, bump
        that faction's hold counter, otherwise reset the counters;
      * record the outcome on a ``wall_tick`` span and sleep for the interval.

    The function never returns. Any exception raised inside an iteration is
    logged and the loop retries after 5 s.
    """
    _ensure_game_config_tables()
    logger.info("Wall-hold tick thread started")

    while True:
        try:
            map_id = get_active_map_id()
            meta = MAPS_META.get(map_id)
            interval = meta.get("tick_interval_s", 0) if meta else 0
            if not meta or interval <= 0:
                # WoK or any map that doesn't use the hold-to-win mechanic:
                # sleep in short slices so a map switch to WWA picks up
                # within 5 s rather than waiting out a long interval.
                time.sleep(5)
                continue

            # Measure wall ownership from game_state.db directly (faster and
            # more consistent than round-tripping through the HTTP API, and
            # avoids producing tracing noise every 30 s).
            wall_ids = WALL_LOCATIONS_BY_MAP.get(map_id, [])
            if not wall_ids:
                time.sleep(interval)
                continue

            conn = get_db_connection()
            try:
                cursor = conn.cursor()
                # Only "?" markers are interpolated into the SQL text; the ids
                # themselves are passed as bound parameters.
                placeholders = ",".join("?" for _ in wall_ids)
                cursor.execute(
                    f"SELECT id, faction FROM locations WHERE id IN ({placeholders})",
                    wall_ids,
                )
                rows = cursor.fetchall()
            finally:
                # Close on every path: this loop retries forever, so a failing
                # query would otherwise leak one connection per retry.
                conn.close()

            factions = {r['faction'] for r in rows}
            playable = factions - {"neutral"}
            threshold = meta.get("win_hold_ticks", 0)

            with tracer.start_as_current_span(
                "wall_tick",
                kind=SpanKind.INTERNAL,
                attributes={
                    "game.map.id": map_id,
                    "wall.count": len(wall_ids),
                    "wall.factions": ",".join(sorted(factions)),
                },
            ) as tick_span:
                # Held only when every wall row was found and all of them
                # belong to the same non-neutral faction.
                if len(rows) == len(wall_ids) and len(playable) == 1 and "neutral" not in factions:
                    holder = playable.pop()
                    ticks = bump_wall_hold(map_id, holder, reset_others=True)
                    tick_span.set_attribute("wall.holder", holder)
                    tick_span.set_attribute("game.wall.hold_counter", ticks)
                    if threshold > 0 and ticks >= threshold:
                        tick_span.add_event(
                            "game.wall.hold_win",
                            attributes={"faction": holder, "ticks": ticks},
                        )
                        logger.info(f"Wall-hold win detected for {holder} on {map_id}")
                else:
                    reset_wall_hold(map_id)
                    tick_span.set_attribute("wall.holder", "contested")

            # Sleep after the span has ended so it covers only the tick work.
            time.sleep(interval)
        except Exception as e:
            logger.error(f"Wall-tick thread error: {e}")
            time.sleep(5)


# Kick off the wall-tick thread once per process.
threading.Thread(target=_wall_tick_thread, daemon=True, name="wall-tick").start() if __name__ == '__main__': port = int(os.environ.get('PORT', 8080)) app.run(host='0.0.0.0', port=port, debug=True) ================================================ FILE: game-of-tracing/war_map/requirements.txt ================================================ flask==3.1.3 requests==2.33.1 python-dotenv==1.2.2 opentelemetry-api==1.41.1 opentelemetry-sdk==1.41.1 opentelemetry-exporter-otlp==1.41.1 pyroscope-io==1.0.6 pyroscope-otel==1.0.0 ================================================ FILE: game-of-tracing/war_map/static/css/style.css ================================================ /* ======================================== Game of Traces - Dark Fantasy Theme ======================================== */ /* --- CSS Custom Properties --- */ :root { /* Background */ --bg-primary: #0d1117; --bg-secondary: #161b22; --bg-card: rgba(22, 27, 34, 0.85); --bg-card-hover: rgba(30, 37, 48, 0.9); --bg-glass: rgba(13, 17, 23, 0.7); /* Southern Faction */ --southern-gold: #FFD700; --southern-crimson: #DC143C; --southern-glow: rgba(255, 215, 0, 0.4); --southern-bg: linear-gradient(135deg, #8B0000, #DC143C); /* Northern Faction */ --northern-blue: #4FC3F7; --northern-steel: #B0BEC5; --northern-glow: rgba(79, 195, 247, 0.4); --northern-bg: linear-gradient(135deg, #1a237e, #4FC3F7); /* Neutral */ --neutral-silver: #78909C; --neutral-glow: rgba(120, 144, 156, 0.3); /* White Walkers Attack — Night's Watch (player on WWA) */ --nights-watch-black: #141824; --nights-watch-accent: #d7e4f1; --nights-watch-glow: rgba(215, 228, 241, 0.45); --nights-watch-bg: linear-gradient(135deg, #0a0f1d, #2a3246); /* White Walkers (AI on WWA) */ --white-walkers-blue: #88c4e6; --white-walkers-ice: #d6f1ff; --white-walkers-glow: rgba(136, 196, 230, 0.55); --white-walkers-bg: linear-gradient(135deg, #0f2d3f, #88c4e6); /* Barbarians (passive NPCs on WWA) */ --barbarian-orange: #c1442e; --barbarian-glow: rgba(193, 68, 46, 0.4); 
--barbarian-bg: linear-gradient(135deg, #5a1a0d, #c1442e); /* Wall keeps (new settlement type) */ --wall-stone: #8a8a95; --wall-stone-light: #b9b9c2; --wall-rune: rgba(200, 225, 255, 0.5); /* Text */ --text-primary: #e6edf3; --text-secondary: #8b949e; --text-muted: #6e7681; /* Accents */ --border-subtle: rgba(240, 246, 252, 0.1); --border-glow: rgba(255, 215, 0, 0.3); /* Misc */ --glass-blur: 12px; --transition-speed: 0.3s; } /* --- General Styles --- */ body { background-color: var(--bg-primary); background-image: radial-gradient(ellipse at 20% 80%, rgba(220, 20, 60, 0.05) 0%, transparent 50%), radial-gradient(ellipse at 80% 20%, rgba(79, 195, 247, 0.05) 0%, transparent 50%); min-height: 100vh; display: flex; flex-direction: column; color: var(--text-primary); font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; } h1, h2, h3, h4, h5, h6, .navbar-brand { font-family: 'Cinzel', 'Georgia', serif; } .container-fluid { flex: 1; } /* --- Dark Card / Glass Effect --- */ .card { background: var(--bg-card); border: 1px solid var(--border-subtle); backdrop-filter: blur(var(--glass-blur)); -webkit-backdrop-filter: blur(var(--glass-blur)); color: var(--text-primary); border-radius: 12px; overflow: hidden; } .card-header { background: rgba(0, 0, 0, 0.3) !important; border-bottom: 1px solid var(--border-subtle); color: var(--text-primary) !important; } .card-body { color: var(--text-primary); } /* --- Navbar --- */ .navbar { background: rgba(13, 17, 23, 0.95) !important; backdrop-filter: blur(10px); border-bottom: 1px solid var(--border-subtle); box-shadow: 0 2px 20px rgba(0, 0, 0, 0.5); } .navbar-brand { font-size: 1.4rem; letter-spacing: 1px; background: linear-gradient(135deg, var(--southern-gold), var(--northern-blue)); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; font-weight: 700; } .navbar .nav-link { color: var(--text-secondary) !important; transition: color var(--transition-speed); font-family: 'Inter', 
sans-serif; font-size: 0.9rem; } .navbar .nav-link:hover { color: var(--text-primary) !important; } /* --- Footer --- */ footer { background: rgba(13, 17, 23, 0.95) !important; border-top: 1px solid var(--border-subtle); color: var(--text-secondary) !important; font-family: 'Cinzel', serif; letter-spacing: 1px; font-size: 0.85rem; } footer p { color: var(--text-secondary) !important; } /* --- Custom Scrollbar (dark) --- */ ::-webkit-scrollbar { width: 8px; } ::-webkit-scrollbar-track { background: var(--bg-secondary); } ::-webkit-scrollbar-thumb { background: #30363d; border-radius: 4px; } ::-webkit-scrollbar-thumb:hover { background: #484f58; } /* --- Faction Text Colors --- */ .southern-text { color: var(--southern-gold) !important; } .northern-text { color: var(--northern-blue) !important; } .neutral-text { color: var(--neutral-silver) !important; } /* --- Buttons --- */ .btn { border-radius: 8px; font-weight: 500; transition: all var(--transition-speed); font-family: 'Inter', sans-serif; } .btn-primary { background: linear-gradient(135deg, #6366f1, #8b5cf6); border: none; box-shadow: 0 0 15px rgba(99, 102, 241, 0.3); } .btn-primary:hover { box-shadow: 0 0 25px rgba(99, 102, 241, 0.5); transform: translateY(-1px); } .btn-warning { background: linear-gradient(135deg, #f59e0b, #d97706); border: none; color: #000; } .btn-warning:hover { box-shadow: 0 0 20px rgba(245, 158, 11, 0.4); transform: translateY(-1px); } .btn-danger { background: linear-gradient(135deg, #ef4444, #dc2626); border: none; } .btn-danger:hover { box-shadow: 0 0 20px rgba(239, 68, 68, 0.4); transform: translateY(-1px); } .btn-success { background: linear-gradient(135deg, #22c55e, #16a34a); border: none; } .btn-success:hover { box-shadow: 0 0 20px rgba(34, 197, 94, 0.4); transform: translateY(-1px); } .btn-info { background: linear-gradient(135deg, var(--northern-blue), #0288d1); border: none; color: #fff; } .btn-info:hover { box-shadow: 0 0 20px rgba(79, 195, 247, 0.4); transform: 
translateY(-1px); color: #fff; } .btn-outline-light { border-color: var(--border-subtle); color: var(--text-secondary); } .btn-outline-light:hover { background: rgba(255, 255, 255, 0.1); border-color: var(--text-secondary); } .btn-outline-info { border-color: var(--northern-blue); color: var(--northern-blue); } .btn-outline-info:hover { background: rgba(79, 195, 247, 0.15); color: var(--northern-blue); border-color: var(--northern-blue); } /* --- Progress Bars --- */ .progress { background: rgba(255, 255, 255, 0.08); border-radius: 6px; height: 10px; overflow: hidden; } .progress-bar.bg-warning { background: linear-gradient(90deg, #f59e0b, var(--southern-gold)) !important; } .progress-bar.bg-danger { background: linear-gradient(90deg, #ef4444, #f87171) !important; } .progress-bar.bg-success { background: linear-gradient(90deg, #22c55e, #4ade80) !important; } /* --- Alerts --- */ .alert { background: var(--bg-card); border: 1px solid var(--border-subtle); color: var(--text-primary); border-radius: 8px; } .alert-success { border-left: 4px solid #22c55e; background: rgba(34, 197, 94, 0.1); } .alert-danger { border-left: 4px solid #ef4444; background: rgba(239, 68, 68, 0.1); } .alert-warning { border-left: 4px solid #f59e0b; background: rgba(245, 158, 11, 0.1); } .alert-info { border-left: 4px solid var(--northern-blue); background: rgba(79, 195, 247, 0.1); } /* --- Form Controls (dark) --- */ .form-control, .form-select { background: rgba(255, 255, 255, 0.05); border: 1px solid var(--border-subtle); color: var(--text-primary); border-radius: 8px; } .form-control:focus, .form-select:focus { background: rgba(255, 255, 255, 0.08); border-color: var(--southern-gold); color: var(--text-primary); box-shadow: 0 0 0 3px rgba(255, 215, 0, 0.15); } .form-control::placeholder { color: var(--text-muted); } .form-label { color: var(--text-secondary); font-size: 0.9rem; } .form-check-input { background-color: rgba(255, 255, 255, 0.1); border-color: var(--border-subtle); } 
.form-check-input:checked { background-color: var(--southern-gold); border-color: var(--southern-gold); } .form-check-label { color: var(--text-secondary); } /* --- Tables (dark) --- */ .table { color: var(--text-primary); --bs-table-bg: transparent; --bs-table-striped-bg: rgba(255, 255, 255, 0.03); } .table thead th { border-bottom-color: var(--border-subtle); color: var(--text-secondary); font-family: 'Cinzel', serif; font-size: 0.85rem; text-transform: uppercase; letter-spacing: 0.5px; } .table td { border-bottom-color: var(--border-subtle); } /* --- Badges --- */ .badge { font-family: 'Inter', sans-serif; font-weight: 600; } .badge.southern, .badge.bg-southern { background: var(--southern-bg) !important; color: #fff; } .badge.northern, .badge.bg-northern { background: var(--northern-bg) !important; color: #fff; } .badge.neutral, .badge.bg-neutral { background-color: var(--neutral-silver) !important; } /* --- Code blocks --- */ code { background: rgba(255, 255, 255, 0.06); padding: 2px 6px; border-radius: 4px; color: var(--northern-blue); font-size: 0.85em; } /* --- Map Styles --- */ .map-background { background-color: #1a1f2e; } /* --- Location Marker Styles --- */ .location-marker { position: absolute; transform: translate(-50%, -50%); width: 44px; height: 44px; border-radius: 50%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; cursor: pointer; border: 2px solid rgba(255, 255, 255, 0.6); transition: all 0.25s ease; z-index: 10; } .location-marker:hover { transform: translate(-50%, -50%) scale(1.15); z-index: 15; } .location-marker.selected { border-color: #fff; } .location-marker.capital { width: 56px; height: 56px; border-width: 3px; } .location-marker.southern { background: linear-gradient(135deg, #8B0000, #DC143C); box-shadow: 0 0 12px var(--southern-glow), 0 0 24px rgba(220, 20, 60, 0.2); } .location-marker.southern.selected { box-shadow: 0 0 20px var(--southern-gold), 0 0 40px rgba(255, 215, 0, 0.3); 
border-color: var(--southern-gold); } .location-marker.northern { background: linear-gradient(135deg, #1a237e, #4FC3F7); box-shadow: 0 0 12px var(--northern-glow), 0 0 24px rgba(79, 195, 247, 0.2); } .location-marker.northern.selected { box-shadow: 0 0 20px var(--northern-blue), 0 0 40px rgba(79, 195, 247, 0.3); border-color: var(--northern-blue); } .location-marker.neutral { background: linear-gradient(135deg, #455a64, #78909C); box-shadow: 0 0 8px var(--neutral-glow); } .location-marker.neutral.selected { box-shadow: 0 0 15px rgba(176, 190, 197, 0.4); border-color: var(--neutral-silver); } /* --- White Walkers Attack faction markers --- */ .location-marker.nights_watch { background: var(--nights-watch-bg); box-shadow: 0 0 12px var(--nights-watch-glow), 0 0 24px rgba(215, 228, 241, 0.2); border-color: var(--nights-watch-accent); } .location-marker.nights_watch.selected { box-shadow: 0 0 20px var(--nights-watch-accent), 0 0 40px rgba(215, 228, 241, 0.35); } .location-marker.white_walkers { background: var(--white-walkers-bg); box-shadow: 0 0 14px var(--white-walkers-glow), 0 0 30px rgba(136, 196, 230, 0.25); border-color: var(--white-walkers-ice); } .location-marker.white_walkers.selected { box-shadow: 0 0 22px var(--white-walkers-ice), 0 0 44px rgba(214, 241, 255, 0.45); } .location-marker.barbarian { background: var(--barbarian-bg); box-shadow: 0 0 12px var(--barbarian-glow), 0 0 24px rgba(193, 68, 46, 0.22); border-color: var(--barbarian-orange); } .location-marker.barbarian.selected { box-shadow: 0 0 18px var(--barbarian-orange), 0 0 36px rgba(193, 68, 46, 0.35); } /* --- Wall settlement type: rounded rectangle, stonework styling --- */ .location-marker.wall { border-radius: 6px !important; background: linear-gradient(135deg, #4a4a55, var(--wall-stone)); box-shadow: 0 0 10px rgba(138, 138, 149, 0.35); border-color: var(--wall-stone-light); } .location-marker.wall.nights_watch { background: linear-gradient(135deg, var(--nights-watch-black), #3a4055); box-shadow: 
0 0 16px var(--nights-watch-glow); } .location-marker.wall.white_walkers { background: linear-gradient(135deg, #0f2d3f, var(--white-walkers-blue)); box-shadow: 0 0 16px var(--white-walkers-glow); } /* Capital crown effect */ .location-marker.capital::before { content: ''; position: absolute; top: -8px; left: 50%; transform: translateX(-50%); width: 0; height: 0; border-left: 6px solid transparent; border-right: 6px solid transparent; border-bottom: 8px solid var(--southern-gold); opacity: 0.8; } .location-marker.capital.northern::before { border-bottom-color: var(--northern-blue); } .location-marker.capital.neutral::before { border-bottom-color: var(--neutral-silver); } .location-marker.capital.nights_watch::before { border-bottom-color: var(--nights-watch-accent); } .location-marker.capital.white_walkers::before { border-bottom-color: var(--white-walkers-ice); } /* Location label */ .location-label { position: absolute; bottom: -22px; left: 50%; transform: translateX(-50%); white-space: nowrap; font-size: 0.65rem; font-family: 'Inter', sans-serif; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px; color: var(--text-secondary); text-shadow: 0 1px 3px rgba(0, 0, 0, 0.8); pointer-events: none; } /* Pulsing animation */ @keyframes pulse { 0% { box-shadow: 0 0 0 0 rgba(255, 255, 255, 0.5); } 70% { box-shadow: 0 0 0 12px rgba(255, 255, 255, 0); } 100% { box-shadow: 0 0 0 0 rgba(255, 255, 255, 0); } } .location-marker.pulsing { animation: pulse 1.5s infinite; } /* --- Faction Selection Page --- */ .faction-hero { min-height: calc(100vh - 76px); display: flex; align-items: center; justify-content: center; padding: 2rem 0; } .faction-hero-title { font-family: 'Cinzel', serif; font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, var(--southern-gold), var(--northern-blue)); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; margin-bottom: 0.5rem; } .faction-hero-subtitle { color: 
var(--text-secondary); font-size: 1.1rem; margin-bottom: 2.5rem; } .faction-card { transition: all var(--transition-speed) ease; height: 100%; border: 2px solid var(--border-subtle); cursor: pointer; position: relative; overflow: visible; } .faction-card:not(.faction-unavailable):hover { transform: translateY(-8px); } .faction-card.faction-selected { transform: translateY(-5px) scale(1.02); } .faction-card.faction-selected.faction-southern { border-color: var(--southern-gold); box-shadow: 0 0 30px var(--southern-glow), 0 0 60px rgba(255, 215, 0, 0.15); } .faction-card.faction-selected.faction-northern { border-color: var(--northern-blue); box-shadow: 0 0 30px var(--northern-glow), 0 0 60px rgba(79, 195, 247, 0.15); } .faction-card.faction-selected.faction-nights-watch { border-color: var(--nights-watch-accent); box-shadow: 0 0 30px var(--nights-watch-glow), 0 0 60px rgba(215, 228, 241, 0.2); } .faction-card.faction-selected.faction-white-walkers { border-color: var(--white-walkers-ice); box-shadow: 0 0 30px var(--white-walkers-glow), 0 0 60px rgba(214, 241, 255, 0.2); } .faction-card.faction-selected.faction-barbarian { border-color: var(--barbarian-orange); box-shadow: 0 0 30px var(--barbarian-glow), 0 0 60px rgba(193, 68, 46, 0.18); } .faction-card.faction-selected.map-card { border-color: var(--northern-blue); box-shadow: 0 0 30px var(--northern-glow), 0 0 60px rgba(79, 195, 247, 0.15); } .faction-unavailable { opacity: 0.4; cursor: not-allowed; } .faction-icon { display: inline-flex; align-items: center; justify-content: center; font-size: 2.5rem; width: 90px; height: 90px; border-radius: 50%; margin-bottom: 1rem; transition: all var(--transition-speed); } .southern-icon { background: radial-gradient(circle, rgba(255, 215, 0, 0.2), transparent 70%); color: var(--southern-gold); border: 2px solid rgba(255, 215, 0, 0.3); } .faction-card:hover .southern-icon, .faction-card.faction-selected .southern-icon { box-shadow: 0 0 30px var(--southern-glow); border-color: 
var(--southern-gold); } .northern-icon { background: radial-gradient(circle, rgba(79, 195, 247, 0.2), transparent 70%); color: var(--northern-blue); border: 2px solid rgba(79, 195, 247, 0.3); } .faction-card:hover .northern-icon, .faction-card.faction-selected .northern-icon { box-shadow: 0 0 30px var(--northern-glow); border-color: var(--northern-blue); } /* --- WWA faction icons --- */ .nights-watch-icon { background: radial-gradient(circle, rgba(215, 228, 241, 0.18), transparent 70%); color: var(--nights-watch-accent); border: 2px solid rgba(215, 228, 241, 0.3); } .faction-card:hover .nights-watch-icon, .faction-card.faction-selected .nights-watch-icon { box-shadow: 0 0 30px var(--nights-watch-glow); border-color: var(--nights-watch-accent); } .white-walkers-icon { background: radial-gradient(circle, rgba(136, 196, 230, 0.22), transparent 70%); color: var(--white-walkers-ice); border: 2px solid rgba(136, 196, 230, 0.35); } .faction-card:hover .white-walkers-icon, .faction-card.faction-selected .white-walkers-icon { box-shadow: 0 0 30px var(--white-walkers-glow); border-color: var(--white-walkers-ice); } .barbarian-icon { background: radial-gradient(circle, rgba(193, 68, 46, 0.2), transparent 70%); color: var(--barbarian-orange); border: 2px solid rgba(193, 68, 46, 0.32); } /* --- Wall-hold HUD overlay for WWA --- */ .wall-hold-hud { position: absolute; top: 20px; right: 20px; z-index: 10; background: var(--bg-glass); backdrop-filter: blur(var(--glass-blur)); border: 1px solid var(--border-subtle); border-radius: 10px; padding: 0.75rem 1rem; color: var(--text-primary); min-width: 220px; font-size: 0.9rem; } .wall-hold-hud h6 { margin: 0 0 0.35rem 0; color: var(--nights-watch-accent); font-size: 0.85rem; text-transform: uppercase; letter-spacing: 0.05em; } .wall-hold-hud .hold-row { display: flex; justify-content: space-between; margin: 0.1rem 0; } .wall-hold-hud .hold-row .ticks { font-family: monospace; font-weight: 600; } .wall-hold-hud .hold-row.nights_watch 
.ticks { color: var(--nights-watch-accent); } .wall-hold-hud .hold-row.white_walkers .ticks { color: var(--white-walkers-ice); } @keyframes iconFloat { 0%, 100% { transform: translateY(0); } 50% { transform: translateY(-6px); } } .faction-card:hover .faction-icon, .faction-card.faction-selected .faction-icon { animation: iconFloat 2s ease-in-out infinite; } .faction-card .card-body { text-align: center; padding: 2rem 1.5rem; } .faction-card h4 { font-family: 'Cinzel', serif; font-weight: 700; margin-bottom: 0.75rem; } .faction-card .faction-motto { color: var(--text-secondary); font-style: italic; font-size: 0.95rem; } .faction-card .faction-start { color: var(--text-muted); font-size: 0.85rem; } /* --- Game HUD --- */ .game-hud { background: var(--bg-card); backdrop-filter: blur(var(--glass-blur)); border: 1px solid var(--border-subtle); border-radius: 10px; padding: 0.6rem 1.2rem; display: flex; align-items: center; gap: 1.5rem; font-size: 0.85rem; margin-bottom: 0.75rem; } .hud-item { display: flex; align-items: center; gap: 0.4rem; color: var(--text-secondary); } .hud-item i { font-size: 0.9rem; } .hud-value { font-weight: 700; color: var(--text-primary); font-family: 'Inter', sans-serif; } .hud-item.southern .hud-value { color: var(--southern-gold); } .hud-item.northern .hud-value { color: var(--northern-blue); } .hud-divider { width: 1px; height: 20px; background: var(--border-subtle); } /* --- Event Feed --- */ .event-feed { background: var(--bg-card); backdrop-filter: blur(var(--glass-blur)); border: 1px solid var(--border-subtle); border-radius: 10px; max-height: 160px; overflow-y: auto; } .event-feed-header { padding: 0.5rem 1rem; border-bottom: 1px solid var(--border-subtle); font-family: 'Cinzel', serif; font-size: 0.8rem; color: var(--text-secondary); text-transform: uppercase; letter-spacing: 1px; position: sticky; top: 0; background: var(--bg-card); z-index: 1; } .event-item { padding: 0.4rem 1rem; border-bottom: 1px solid rgba(240, 246, 252, 0.04); 
display: flex; align-items: flex-start; gap: 0.6rem; font-size: 0.78rem; line-height: 1.4; } .event-item:last-child { border-bottom: none; } .event-time { color: var(--text-muted); font-size: 0.7rem; white-space: nowrap; min-width: 48px; font-family: 'Inter', sans-serif; } .event-icon { font-size: 0.75rem; min-width: 16px; text-align: center; } .event-icon.southern { color: var(--southern-gold); } .event-icon.northern { color: var(--northern-blue); } .event-icon.neutral { color: var(--neutral-silver); } .event-message { color: var(--text-secondary); } /* --- Map Container --- */ #mapContainer { background: #1a1f2e !important; border: 1px solid var(--border-subtle) !important; border-radius: 10px !important; overflow: hidden; } /* --- Action Panel (right sidebar) --- */ .action-panel .card { border: 1px solid var(--border-subtle); } .action-panel .card-header { font-family: 'Cinzel', serif; font-size: 0.95rem; } .action-panel .btn { font-size: 0.85rem; padding: 0.5rem 0.75rem; } /* --- AI Toggle --- */ .ai-toggle-card .form-check-input:checked { background-color: #22c55e; border-color: #22c55e; } .ai-status-dot { display: inline-block; width: 8px; height: 8px; border-radius: 50%; margin-right: 6px; } .ai-status-dot.active { background: #22c55e; box-shadow: 0 0 8px rgba(34, 197, 94, 0.6); animation: statusPulse 2s infinite; } .ai-status-dot.inactive { background: var(--text-muted); } @keyframes statusPulse { 0%, 100% { opacity: 1; } 50% { opacity: 0.4; } } /* --- Game Over Overlay --- */ #gameOverOverlay { background: rgba(13, 17, 23, 0.92); z-index: 100; color: white; backdrop-filter: blur(8px); } .victory-text { font-family: 'Cinzel', serif; text-shadow: 0 0 20px var(--southern-gold), 0 0 40px rgba(255, 215, 0, 0.3); animation: victoryPulse 2s infinite; } .defeat-text { font-family: 'Cinzel', serif; text-shadow: 0 0 20px #ef4444, 0 0 40px rgba(239, 68, 68, 0.3); animation: defeatPulse 2s infinite; } .victory-icon { animation: bounce 2s infinite; } .defeat-icon { 
animation: shake 2s infinite; } @keyframes victoryPulse { 0% { text-shadow: 0 0 20px var(--southern-gold); } 50% { text-shadow: 0 0 40px var(--southern-gold), 0 0 60px rgba(255, 215, 0, 0.4); } 100% { text-shadow: 0 0 20px var(--southern-gold); } } @keyframes defeatPulse { 0% { text-shadow: 0 0 20px #ef4444; } 50% { text-shadow: 0 0 40px #ef4444, 0 0 60px rgba(239, 68, 68, 0.4); } 100% { text-shadow: 0 0 20px #ef4444; } } @keyframes bounce { 0%, 20%, 50%, 80%, 100% { transform: translateY(0); } 40% { transform: translateY(-30px); } 60% { transform: translateY(-15px); } } @keyframes shake { 0%, 100% { transform: translateX(0); } 10%, 30%, 50%, 70%, 90% { transform: translateX(-10px); } 20%, 40%, 60%, 80% { transform: translateX(10px); } } /* --- Transfer Indicators --- */ .transfer-indicator { position: absolute; transform-origin: 50% 50%; z-index: 15; opacity: 0.9; pointer-events: none; display: flex; align-items: center; justify-content: center; font-size: 24px; color: white; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); background: rgba(13, 17, 23, 0.7); width: 40px; height: 40px; border-radius: 50%; backdrop-filter: blur(4px); } .transfer-indicator.southern { border: 2px solid var(--southern-gold); box-shadow: 0 0 12px var(--southern-glow); } .transfer-indicator.northern { border: 2px solid var(--northern-blue); box-shadow: 0 0 12px var(--northern-glow); } .transfer-indicator.attack i { color: #ff4444; } .transfer-indicator.resources i { color: var(--southern-gold); } /* --- Move Army Modal (dark) --- */ .modal-content { background: var(--bg-secondary); border: 1px solid var(--border-subtle); color: var(--text-primary); border-radius: 12px; } .modal-header { border-bottom-color: var(--border-subtle); } .modal-header .modal-title { font-family: 'Cinzel', serif; } .modal-footer { border-top-color: var(--border-subtle); } .modal-header .btn-close { filter: invert(1); } .list-group-item { background: rgba(255, 255, 255, 0.03); border-color: var(--border-subtle); 
color: var(--text-primary); } .list-group-item:hover, .list-group-item-action:hover { background: rgba(255, 255, 255, 0.08); color: var(--text-primary); } .destination-item { cursor: pointer; transition: all var(--transition-speed); } .destination-item:hover { background: rgba(255, 255, 255, 0.08) !important; border-color: var(--southern-gold); } /* --- Replay Page Styles --- */ .attribute-item { border-left: 3px solid var(--northern-blue); padding-left: 8px; margin-bottom: 8px; } #span-attributes { max-height: 200px; overflow-y: auto; } /* Movement arrow (replay) */ .movement-arrow { position: absolute; z-index: 15; pointer-events: none; color: #fff; font-size: 24px; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.7); animation: moveAlongPath 2s ease-in-out; } @keyframes moveAlongPath { 0% { opacity: 0; transform: scale(0.5); } 50% { opacity: 1; transform: scale(1); } 100% { opacity: 0; transform: scale(0.5); } } /* Action highlight (replay) */ .location-marker.active { animation: markerPulse 1.5s infinite; } .location-marker.action-highlight { animation: actionPulse 1.5s ease-in-out; } @keyframes markerPulse { 0% { transform: translate(-50%, -50%) scale(1); } 50% { transform: translate(-50%, -50%) scale(1.2); } 100% { transform: translate(-50%, -50%) scale(1); } } @keyframes actionPulse { 0% { transform: translate(-50%, -50%) scale(1); } 50% { transform: translate(-50%, -50%) scale(1.3); box-shadow: 0 0 30px rgba(255, 255, 0, 0.6); } 100% { transform: translate(-50%, -50%) scale(1); } } /* --- Spinner (dark) --- */ .spinner-border { color: var(--northern-blue) !important; } /* --- Small text helpers --- */ .text-muted { color: var(--text-muted) !important; } small.text-muted { color: var(--text-muted) !important; } /* --- Traveling Unit Animations --- */ .traveling-unit { position: absolute; z-index: 20; pointer-events: none; display: flex; align-items: center; justify-content: center; border-radius: 50%; font-size: 18px; transition: none; } .traveling-unit.army { 
width: 42px; height: 42px; background: rgba(13, 17, 23, 0.9); border: 2px solid; backdrop-filter: blur(4px); animation: unitBob 0.6s ease-in-out infinite; } .traveling-unit.army.southern { border-color: var(--southern-gold); box-shadow: 0 0 16px var(--southern-glow), 0 0 32px rgba(255, 215, 0, 0.2); color: var(--southern-gold); } .traveling-unit.army.northern { border-color: var(--northern-blue); box-shadow: 0 0 16px var(--northern-glow), 0 0 32px rgba(79, 195, 247, 0.2); color: var(--northern-blue); } .traveling-unit.cart { width: 36px; height: 36px; background: rgba(13, 17, 23, 0.9); border: 2px solid var(--southern-gold); box-shadow: 0 0 14px rgba(255, 215, 0, 0.35); color: var(--southern-gold); font-size: 15px; animation: unitBob 0.8s ease-in-out infinite; } .traveling-unit.cart.northern { border-color: var(--northern-blue); box-shadow: 0 0 14px rgba(79, 195, 247, 0.35); color: var(--northern-blue); } /* Army count badge on traveling unit */ .traveling-unit .army-count { position: absolute; top: -8px; right: -8px; min-width: 18px; height: 18px; border-radius: 9px; background: var(--southern-crimson); color: #fff; font-size: 10px; font-weight: 700; font-family: 'Inter', sans-serif; display: flex; align-items: center; justify-content: center; padding: 0 4px; line-height: 1; } @keyframes unitBob { 0%, 100% { margin-top: 0; } 50% { margin-top: -4px; } } /* Trail particles left behind by traveling units */ .trail-particle { position: absolute; z-index: 18; pointer-events: none; width: 6px; height: 6px; border-radius: 50%; animation: trailFade 1.5s ease-out forwards; } .trail-particle.southern { background: var(--southern-gold); box-shadow: 0 0 8px var(--southern-glow), 0 0 16px rgba(255, 215, 0, 0.15); } .trail-particle.northern { background: var(--northern-blue); box-shadow: 0 0 8px var(--northern-glow), 0 0 16px rgba(79, 195, 247, 0.15); } .trail-particle.resource { background: var(--southern-gold); box-shadow: 0 0 6px rgba(255, 215, 0, 0.5); width: 5px; height: 
5px; } /* Persistent glowing trail line segment */ .trail-line-segment { position: absolute; z-index: 17; pointer-events: none; height: 2px; transform-origin: 0 50%; animation: trailLineFade 2.5s ease-out forwards; } .trail-line-segment.southern { background: linear-gradient(90deg, transparent, var(--southern-gold), transparent); box-shadow: 0 0 6px var(--southern-glow); } .trail-line-segment.northern { background: linear-gradient(90deg, transparent, var(--northern-blue), transparent); box-shadow: 0 0 6px var(--northern-glow); } .trail-line-segment.resource { background: linear-gradient(90deg, transparent, var(--southern-gold), transparent); box-shadow: 0 0 4px rgba(255, 215, 0, 0.3); height: 1.5px; } @keyframes trailFade { 0% { opacity: 0.9; transform: scale(1.2); } 50% { opacity: 0.4; transform: scale(0.8); } 100% { opacity: 0; transform: scale(0.1); } } @keyframes trailLineFade { 0% { opacity: 0.7; } 60% { opacity: 0.3; } 100% { opacity: 0; } } /* Clash/explosion effect at destination */ .clash-burst { position: absolute; z-index: 25; pointer-events: none; width: 80px; height: 80px; border-radius: 50%; transform: translate(-50%, -50%); animation: clashExpand 1.2s ease-out forwards; } .clash-burst.attack { background: radial-gradient(circle, rgba(239, 68, 68, 0.7), rgba(255, 165, 0, 0.3) 50%, rgba(239, 68, 68, 0) 70%); box-shadow: 0 0 40px rgba(239, 68, 68, 0.5), 0 0 80px rgba(239, 68, 68, 0.2); } .clash-burst.capture { background: radial-gradient(circle, rgba(34, 197, 94, 0.6), rgba(255, 215, 0, 0.3) 50%, rgba(34, 197, 94, 0) 70%); box-shadow: 0 0 40px rgba(34, 197, 94, 0.4), 0 0 80px rgba(34, 197, 94, 0.15); } .clash-burst.reinforce { background: radial-gradient(circle, rgba(99, 102, 241, 0.5), rgba(99, 102, 241, 0) 70%); box-shadow: 0 0 30px rgba(99, 102, 241, 0.4); } @keyframes clashExpand { 0% { transform: translate(-50%, -50%) scale(0.2); opacity: 1; } 30% { transform: translate(-50%, -50%) scale(1.2); opacity: 0.9; } 60% { transform: translate(-50%, -50%) 
scale(1.8); opacity: 0.5; } 100% { transform: translate(-50%, -50%) scale(2.5); opacity: 0; } } /* Capture sparkle particles */ .capture-sparkle { position: absolute; z-index: 24; pointer-events: none; width: 6px; height: 6px; border-radius: 50%; animation: sparkleFloat 1.2s ease-out forwards; } @keyframes sparkleFloat { 0% { opacity: 1; transform: translate(-50%, -50%) scale(1); } 100% { opacity: 0; transform: translate(-50%, -50%) translateY(-40px) scale(0); } } /* Marker captured flash */ .location-marker.just-captured { animation: capturedFlash 0.6s ease-out; } @keyframes capturedFlash { 0% { filter: brightness(1); } 30% { filter: brightness(2.5); } 100% { filter: brightness(1); } } /* Glowing connection line pulse during movement */ .connection-pulse { position: absolute; z-index: 16; pointer-events: none; width: 10px; height: 10px; border-radius: 50%; animation: connectionGlow 1s ease-in-out infinite; } .connection-pulse.southern { background: var(--southern-gold); box-shadow: 0 0 12px var(--southern-glow), 0 0 24px rgba(255, 215, 0, 0.15); } .connection-pulse.northern { background: var(--northern-blue); box-shadow: 0 0 12px var(--northern-glow), 0 0 24px rgba(79, 195, 247, 0.15); } @keyframes connectionGlow { 0%, 100% { opacity: 0.3; transform: translate(-50%, -50%) scale(0.6); } 50% { opacity: 1; transform: translate(-50%, -50%) scale(1.4); } } /* --- Responsive --- */ @media (max-width: 768px) { .location-marker { width: 32px !important; height: 32px !important; } .location-marker.capital { width: 42px !important; height: 42px !important; } .faction-hero-title { font-size: 1.8rem; } .game-hud { flex-wrap: wrap; gap: 0.75rem; } .location-label { font-size: 0.55rem; } } ================================================ FILE: game-of-tracing/war_map/telemetry.py ================================================ import os from opentelemetry.sdk.resources import SERVICE_NAME, Resource from opentelemetry.sdk.trace import TracerProvider from 
opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry import trace # Logging setup import logging from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry._logs import set_logger_provider # Profiling setup (Pyroscope v2 + OTel span-profile linking) import pyroscope from pyroscope.otel import PyroscopeSpanProcessor class GameTelemetry: def __init__(self, service_name, logging_endpoint="http://alloy:4318", tracing_endpoint="http://alloy:4317"): self.service_name = service_name self.logging_endpoint = logging_endpoint self.tracing_endpoint = tracing_endpoint self.resource = Resource.create(attributes={ SERVICE_NAME: service_name }) self._setup_logging() self._setup_tracing() self._setup_profiling() def _setup_logging(self): """Configure OpenTelemetry logging""" self.logger_provider = LoggerProvider(resource=self.resource) set_logger_provider(self.logger_provider) log_exporter = OTLPLogExporter( endpoint=f"{self.logging_endpoint}/v1/logs" ) self.logger_provider.add_log_record_processor( BatchLogRecordProcessor( exporter=log_exporter, max_queue_size=30, max_export_batch_size=5 ) ) # Setup root logger handler = LoggingHandler( level=logging.NOTSET, logger_provider=self.logger_provider ) logging.getLogger().addHandler(handler) logging.getLogger().setLevel(logging.INFO) self.logger = logging.getLogger(self.service_name) def _setup_tracing(self): """Configure OpenTelemetry tracing""" trace.set_tracer_provider(TracerProvider(resource=self.resource)) otlp_exporter = OTLPSpanExporter( endpoint=f"{self.tracing_endpoint}/v1/traces", insecure=True ) span_processor = BatchSpanProcessor( span_exporter=otlp_exporter, max_export_batch_size=1 ) trace.get_tracer_provider().add_span_processor(span_processor) 
self.tracer = trace.get_tracer(__name__) def _setup_profiling(self): """Configure Pyroscope profiling + OTel span-profile linkage.""" pyroscope.configure( application_name=self.service_name, server_address=os.getenv("PYROSCOPE_SERVER_ADDRESS", "http://alloy:9999"), tags={"service_name": self.service_name}, oncpu=True, gil_only=True, ) trace.get_tracer_provider().add_span_processor(PyroscopeSpanProcessor()) def get_tracer(self): """Get the configured tracer""" return self.tracer def get_logger(self): """Get the configured logger""" return self.logger def shutdown(self): """Flush and shutdown all telemetry providers.""" try: trace.get_tracer_provider().shutdown() except Exception: pass try: self.logger_provider.shutdown() except Exception: pass ================================================ FILE: game-of-tracing/war_map/templates/index.html ================================================ {% extends "layout.html" %} {% block title %} {% if single_player %}Take the Black — {{ map_meta.display_name }}{% else %}Choose Your Faction{% endif %} {% endblock %} {% block content %}

{% if single_player %}{{ map_meta.display_name }}{% else %}A Game of Traces{% endif %}

{% if single_player %}{{ map_meta.description }} {% else %}Choose your kingdom. Command your armies. Master distributed tracing. {% endif %}

Pick a different map

{% if request.args.get('reset') %}
{{ request.args.get('message', 'Game reset status unknown') }}
{% endif %} {% if error %}
{{ error }}
{% endif %}
{% if single_player %}

The Night's Watch

"Night gathers, and now my watch begins."

Castle Black

{% if not player_available %}
Already taken — reset the game
{% endif %}
{% else %}

Southern Kingdom

"Glory and Honor!"

Start at Southern Capital

{% if not southern_available %}
Already taken
{% endif %}

Northern Kingdom

"Strength and Unity"

Start at Northern Capital

{% if not northern_available %}
Already taken
{% endif %}
{% endif %}
{% endblock %} {% block scripts %} {% endblock %} ================================================ FILE: game-of-tracing/war_map/templates/layout.html ================================================ A Game of Traces - {% block title %}Game Map{% endblock %} {% block extra_css %}{% endblock %}
{% block content %}{% endblock %}

A Game of Traces

{% block scripts %}{% endblock %} ================================================ FILE: game-of-tracing/war_map/templates/map.html ================================================ {% extends "layout.html" %} {% block title %}Game Map{% endblock %} {% block content %}
{% if faction == 'southern' %} {% else %} {% endif %} {{ faction|capitalize }}
Resources: 0
Armies: 0
Territory: 0/8
{% if wall_hold %}
Wall Hold
Hold every keep for {{ wall_hold.threshold }} ticks to win.
Night's Watch {{ wall_hold.holds.get('nights_watch', 0) }}/{{ wall_hold.threshold }}
White Walkers {{ wall_hold.holds.get('white_walkers', 0) }}/{{ wall_hold.threshold }}
{% endif %}
An error occurred
Battle Log
--:-- Game started. Select a location on the map to begin.
AI Opponent
AI is inactive
Select a Location

Click on a location on the map

{% endblock %} {% block extra_css %} {% endblock %} {% block scripts %} {% endblock %} ================================================ FILE: game-of-tracing/war_map/templates/map_picker.html ================================================ {% extends "layout.html" %} {% block title %}Pick a Map{% endblock %} {% block content %}

A Game of Traces

Pick a battlefield. Each map has its own factions, economy, and win conditions.

{% for map_id, meta in maps.items() %}

{{ meta.display_name }}

{% if meta.single_player %}Single-player · Hold to win {% else %}Two-player · Capture to win {% endif %}

{{ meta.description }}

Factions: {{ meta.factions | join(', ') | replace('_', ' ') | title }}

{% endfor %}
{% endblock %} {% block scripts %} {% endblock %} ================================================ FILE: game-of-tracing/war_map/templates/replay.html ================================================ {% extends "layout.html" %} {% block title %}Game Replay{% endblock %} {% block content %}

Game Session Replay

Replay previous game sessions using span links and distributed tracing. Each session shows the complete chain of actions linked together through OpenTelemetry spans.

Loading...

Loading game sessions from Tempo...

How It Works
Span Links & Replay

Each game action creates a span link to the previous action, forming a chain across different traces.

What You'll See
  • Action Sequence - Chronological order of game moves
  • Span Links - How actions connect to each other
  • Trace Context - Full distributed tracing information
  • Game Narrative - Complete story of how the game unfolded
Educational Value
  • Cross-trace relationships
  • Tempo API integration
  • TraceQL queries
  • Game state reconstruction
Try TraceQL Queries

Use these queries in Grafana Tempo:

{game.session.id!=""} Find all game sessions
{link.type="game_sequence"} Find spans with links
{game.action.type="move_army"} Find specific actions
{% endblock %} {% block scripts %} {% endblock %} ================================================ FILE: game-of-tracing/war_map/templates/replay_session.html ================================================ {% extends "layout.html" %} {% block title %}Session Replay{% endblock %} {% block content %}

Visual Game Replay

Back
Loading...

Loading session data from Tempo...

Current Action

Click play to start replay

Span Attributes

No action selected

Session Info

Player: Loading...

Faction: Loading...

Total Actions: 0

Data Source: Tempo

Span Links
{% endblock %} {% block extra_css %} {% endblock %} {% block scripts %} {% endblock %} ================================================ FILE: gelf-log-ingestion/README.md ================================================ # GELF Log Ingestion Scenario This scenario demonstrates how to ingest GELF (Graylog Extended Log Format) logs using Grafana Alloy's `loki.source.gelf` component. A Python application sends structured GELF messages over UDP to Alloy, which relabels GELF metadata (host, level, facility) into Loki labels before forwarding to Loki for storage and querying in Grafana. ## Architecture ``` gelf-logger (Python/pygelf) --UDP:12201--> Alloy (loki.source.gelf) --> Loki --> Grafana ``` ## Running the Demo ### Step 1: Clone the repository ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` ### Step 2: Deploy the monitoring stack ```bash cd alloy-scenarios/gelf-log-ingestion docker-compose up -d ``` ### Step 3: Access Grafana Alloy UI Open your browser and go to `http://localhost:12345` to inspect the Alloy pipeline and live debugging output. ### Step 4: Access Grafana UI Open your browser and go to `http://localhost:3000`. Navigate to **Explore** and select the **Loki** datasource. Query logs using `{host="gelf-logger"}` or filter by label (e.g., `{level="6"}` for INFO). 
## GELF Level Mapping | GELF Level | Syslog Severity | |------------|-----------------| | 0 | Emergency | | 1 | Alert | | 2 | Critical | | 3 | Error | | 4 | Warning | | 5 | Notice | | 6 | Informational | | 7 | Debug | ================================================ FILE: gelf-log-ingestion/app/main.py ================================================ import logging import time import random from pygelf import GelfUdpHandler logger = logging.getLogger("gelf-demo") logger.setLevel(logging.DEBUG) handler = GelfUdpHandler(host="alloy", port=12201, compress=False) logger.addHandler(handler) messages = [ (logging.INFO, "User authentication successful", {"user_id": "42", "method": "oauth2"}), (logging.WARNING, "Slow database query detected", {"query_time_ms": "2500", "table": "orders"}), (logging.ERROR, "Failed to connect to payment gateway", {"gateway": "stripe", "retry_count": "3"}), (logging.INFO, "Order processed successfully", {"order_id": "ORD-12345", "total": "99.99"}), (logging.DEBUG, "Cache lookup completed", {"cache_hit": "true", "key": "user:42:profile"}), (logging.CRITICAL, "Disk space critically low", {"mount": "/data", "available_pct": "2"}), (logging.INFO, "Health check passed", {"service": "api", "response_ms": "12"}), (logging.WARNING, "Rate limit approaching threshold", {"client_ip": "10.0.1.50", "requests": "980"}), ] print("Starting GELF log generator...") while True: level, msg, extra = random.choice(messages) logger.log(level, msg, extra=extra) time.sleep(random.uniform(1, 3)) ================================================ FILE: gelf-log-ingestion/config.alloy ================================================ livedebugging { enabled = true } // Receive GELF logs over UDP loki.source.gelf "default" { forward_to = [loki.relabel.gelf.receiver] } // Relabel GELF metadata into useful labels loki.relabel "gelf" { forward_to = [loki.write.local.receiver] rule { source_labels = ["__gelf_message_host"] target_label = "host" } rule { source_labels = 
["__gelf_message_level"] target_label = "level" } rule { source_labels = ["__gelf_message_facility"] target_label = "facility" } } loki.write "local" { endpoint { url = "http://loki:3100/loki/api/v1/push" } } ================================================ FILE: gelf-log-ingestion/docker-compose.coda.yml ================================================ services: gelf-logger: image: python:${PYTHON_VERSION:-3.11-slim} container_name: gelf-logger volumes: - ./app/main.py:/app/main.py command: ["sh", "-c", "pip install pygelf && python3 /app/main.py"] ================================================ FILE: gelf-log-ingestion/docker-compose.yml ================================================ services: # GELF log generator using pygelf gelf-logger: image: python:${PYTHON_VERSION:-3.11-slim} container_name: gelf-logger volumes: - ./app/main.py:/app/main.py command: ["sh", "-c", "pip install pygelf && python3 /app/main.py"] depends_on: - alloy alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 - 12201:12201/udp volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - loki loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - "3100:3100" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false version: 1 editable: false EOF /run.sh ================================================ FILE: 
gelf-log-ingestion/loki-config.yaml ================================================ # This is a complete configuration to deploy Loki backed by the filesystem. # The index will be shipped to the storage via tsdb-shipper. auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true ================================================ FILE: image-versions.env ================================================ # Centralized Docker image versions for all examples. # # Renovate tracks each variable below — the `# renovate:` annotation # tells the bot which docker image the version refers to. Bumps to # this file land via renovate PRs that also bump the matching # `${VAR:-default}` fallback in every docker-compose file. Both sides # are driven by customManagers in renovate.json: one for this file, # one generic rule that captures the depName from the `image:` line # in compose files. Keep them in lockstep — the check-image-versions # workflow will fail PRs where they drift. # # Adding a new image: declare `# renovate: datasource=docker depName=<image>` # + `<NAME>_VERSION=<version>` here, and reference it in compose as # `image: <image>:${<NAME>_VERSION:-<version>}`. No renovate.json edit needed. 
# Grafana images # renovate: datasource=docker depName=grafana/loki GRAFANA_LOKI_VERSION=3.6.10 # renovate: datasource=docker depName=grafana/grafana GRAFANA_VERSION=13.0.1 # renovate: datasource=docker depName=grafana/alloy GRAFANA_ALLOY_VERSION=v1.16.1 # renovate: datasource=docker depName=grafana/tempo GRAFANA_TEMPO_VERSION=2.10.4 # renovate: datasource=docker depName=grafana/pyroscope GRAFANA_PYROSCOPE_VERSION=2.0.1 # Prometheus images # renovate: datasource=docker depName=prom/prometheus PROMETHEUS_VERSION=v3.11.3 # Other images # renovate: datasource=docker depName=python PYTHON_VERSION=3.11-slim # nginx-monitoring scenario # renovate: datasource=docker depName=nginx NGINX_VERSION=1.30-alpine # renovate: datasource=docker depName=nginx/nginx-prometheus-exporter NGINX_EXPORTER_VERSION=1.5.1 # renovate: datasource=docker depName=curlimages/curl CURL_VERSION=8.20.0 # rabbitmq-monitoring scenario # renovate: datasource=docker depName=rabbitmq RABBITMQ_VERSION=4.3.0-management # renovate: datasource=docker depName=pivotalrabbitmq/perf-test RABBITMQ_PERF_TEST_VERSION=2.24.0 # vault-secrets scenario # renovate: datasource=docker depName=hashicorp/vault VAULT_VERSION=2.0.0 # cloudwatch-metrics scenario # renovate: datasource=docker depName=localstack/localstack LOCALSTACK_VERSION=4.4.0 ================================================ FILE: k8s/README.md ================================================ # Monitor Kubernetes with Grafana Alloy > Note: the scenarios listed below use the K8s Monitoring Helm chart. The chart abstracts away the need to configure Alloy by hand and deploys best practices for monitoring Kubernetes clusters. The chart supports metrics, logs, profiling, and tracing. In this directory you will find a series of scenarios that demonstrate how to set up Alloy via the Kubernetes Monitoring Helm chart. Examples specific to each telemetry source are provided in the respective directories. 
| Scenario | Description | | --- | --- | | [Logs](./logs) | Monitor Kubernetes logs with Grafana Alloy and Loki | | [Metrics](./metrics) | Monitor Kubernetes metrics with Grafana Alloy and Prometheus | | [Profiling](./profiling) | Monitor Kubernetes profiling with Grafana Alloy and Pyroscope | | [Tracing](./tracing) | Monitor Kubernetes tracing with Grafana Alloy and Tempo | ================================================ FILE: k8s/events/README.md ================================================ # Kubernetes events to Loki — without the k8s-monitoring Helm chart A focused scenario showing how `loki.source.kubernetes_events` works under the hood: Alloy is deployed as a plain `Deployment` with explicit RBAC and an Alloy `ConfigMap`, instead of being abstracted behind the [`k8s-monitoring` Helm chart](https://github.com/grafana/k8s-monitoring-helm) used in [`k8s/logs/`](../logs/). ## How this differs from `k8s/logs/` | Aspect | `k8s/logs/` (existing) | `k8s/events/` (this) | |---|---|---| | Alloy deployment | `k8s-monitoring` Helm chart (collector preset) | Plain `kubectl apply` of ConfigMap + RBAC + Deployment | | `loki.source.kubernetes_events` | Hidden inside the chart | **Visible directly in `alloy-config.yaml`** | | Scope | Pod logs + cluster events (mixed) | **Cluster events only** with `type` / `reason` / `namespace` / `kind` labels | | Demo intent | "ship everything for K8s monitoring" | "show how events ingestion actually works" | If you want production-grade Kubernetes observability, use `k8s/logs/`. If you're learning the component or want to extend it (custom filtering, namespace scoping, alerting on event reasons), this scenario is the minimal moving-parts version. 
## Prerequisites - [Kind](https://kind.sigs.k8s.io/docs/user/quick-start/) - [Helm](https://helm.sh/docs/intro/install/) - The Grafana Helm repo: `helm repo add grafana https://grafana.github.io/helm-charts` ## Step 1 — Create the cluster ```bash git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios/k8s/events kind create cluster --config kind.yml ``` ## Step 2 — Create the `meta` namespace and install Loki + Grafana ```bash kubectl create namespace meta helm install --values loki-values.yml loki grafana/loki -n meta helm install --values grafana-values.yml grafana grafana/grafana -n meta ``` Wait for them to be ready: ```bash kubectl get pods -n meta -w ``` ## Step 3 — Apply Alloy ```bash kubectl apply -f alloy-rbac.yaml kubectl apply -f alloy-config.yaml kubectl apply -f alloy-deployment.yaml ``` The RBAC grants cluster-wide `get/list/watch` on `events` (and only that). The ConfigMap holds the Alloy pipeline. The Deployment is **single-replica on purpose** — events are cluster-scoped, so multiple Alloy replicas would produce duplicate log lines. ## Step 4 — Open Grafana ```bash kubectl port-forward -n meta svc/grafana 3000:80 ``` Username `admin`, password `adminadminadmin` (it's a dev scenario — see `grafana-values.yml`). ## Step 5 — Generate some events ```bash # Trigger Created/Started/Pulled events kubectl run events-test --image=nginx --restart=Never # Trigger BackOff/Failed events kubectl run events-fail --image=does-not-exist --restart=Never # Wait, then trigger Killing sleep 30 kubectl delete pod events-test events-fail ``` ## Step 6 — Query in Loki ```logql # All events {job="kubernetes-events"} # Just warnings {job="kubernetes-events", type="Warning"} # Pod events in default namespace {job="kubernetes-events", namespace="default", kind="Pod"} # Pull failures {job="kubernetes-events", reason="Failed"} # Backoff loops {job="kubernetes-events", reason="BackOff"} ``` The promoted labels are `type`, `reason`, `namespace`, and `kind`. 
The involved-object name (`name`) is kept as **structured metadata** — high cardinality, but searchable via `| json` filters. ## Inspecting the Alloy pipeline ```bash kubectl port-forward -n meta svc/alloy 12345:12345 ``` Open http://localhost:12345 to see the component graph and use **livedebugging** to inspect events flowing through each stage. ## Tear down ```bash kind delete cluster ``` ## Customization ideas - **Namespace scoping**: add `namespaces = ["prod", "default"]` to the `loki.source.kubernetes_events` block to filter at the source rather than at query time. - **Drop noisy reasons**: add a `stage.match` block dropping `reason=~"Pulled|Pulling|Created"` if you only care about Warnings. - **Alerting**: pair this with a Grafana alert on `count_over_time({type="Warning"}[5m])` for cluster-health monitoring. ================================================ FILE: k8s/events/alloy-config.yaml ================================================ # Alloy pipeline as a ConfigMap. Mounted into the alloy Deployment at # /etc/alloy/config.alloy. # # Pipeline: # loki.source.kubernetes_events (cluster-wide events feed) # → loki.process (parse JSON, promote labels) # → loki.write (push to Loki in this cluster) apiVersion: v1 kind: ConfigMap metadata: name: alloy-config namespace: meta data: config.alloy: | livedebugging {} loki.source.kubernetes_events "cluster" { job_name = "kubernetes-events" log_format = "json" forward_to = [loki.process.events.receiver] } loki.process "events" { // The component emits a flat JSON envelope (top-level fields: // type, reason, kind, name, count, msg, sourcecomponent, etc). // The `namespace` label is already attached by the source component // itself, so we don't need to extract it here. stage.json { expressions = { type = "type", reason = "reason", kind = "kind", name = "name", } } // Indexed labels — fast filtering for "show all Warnings in // namespace X with reason Y on a Pod". 
stage.labels { values = { type = "", reason = "", kind = "", } } // High-cardinality fields kept out of the label index but still // queryable via `| json` filters. stage.structured_metadata { values = { name = "", } } forward_to = [loki.write.loki.receiver] } loki.write "loki" { endpoint { url = "http://loki-gateway.meta.svc.cluster.local/loki/api/v1/push" } } ================================================ FILE: k8s/events/alloy-deployment.yaml ================================================ # A single-replica Deployment is the right shape for this scenario: # `loki.source.kubernetes_events` watches a cluster-scoped resource, so # more than one replica would just produce duplicate log lines for every # event. (A DaemonSet would be wrong for the same reason.) apiVersion: apps/v1 kind: Deployment metadata: name: alloy namespace: meta labels: app.kubernetes.io/name: alloy app.kubernetes.io/part-of: alloy-events spec: replicas: 1 selector: matchLabels: app.kubernetes.io/name: alloy template: metadata: labels: app.kubernetes.io/name: alloy spec: serviceAccountName: alloy containers: - name: alloy image: grafana/alloy:v1.16.0 args: - run - /etc/alloy/config.alloy - --server.http.listen-addr=0.0.0.0:12345 - --storage.path=/var/lib/alloy/data ports: - name: http containerPort: 12345 volumeMounts: - name: config mountPath: /etc/alloy - name: storage mountPath: /var/lib/alloy/data volumes: - name: config configMap: name: alloy-config - name: storage emptyDir: {} --- # Lightweight Service so the Alloy UI can be port-forwarded easily. apiVersion: v1 kind: Service metadata: name: alloy namespace: meta spec: selector: app.kubernetes.io/name: alloy ports: - name: http port: 12345 targetPort: 12345 ================================================ FILE: k8s/events/alloy-rbac.yaml ================================================ # Minimal RBAC for `loki.source.kubernetes_events`. # It needs cluster-wide read/list/watch on events. Nothing else. 
apiVersion: v1 kind: ServiceAccount metadata: name: alloy namespace: meta --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: alloy-events-reader rules: - apiGroups: [""] resources: ["events"] verbs: ["get", "list", "watch"] - apiGroups: ["events.k8s.io"] resources: ["events"] verbs: ["get", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: alloy-events-reader roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: alloy-events-reader subjects: - kind: ServiceAccount name: alloy namespace: meta ================================================ FILE: k8s/events/grafana-values.yml ================================================ --- persistence: type: pvc enabled: true # DO NOT DO THIS IN PRODUCTION USECASES adminUser: admin adminPassword: adminadminadmin # CONSIDER USING AN EXISTING SECRET # Use an existing secret for the admin user. # admin: ## Name of the secret. Can be templated. # existingSecret: "" # userKey: admin-user # passwordKey: admin-password service: enabled: true type: ClusterIP datasources: datasources.yaml: apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki-gateway.meta.svc.cluster.local:80 basicAuth: false isDefault: false version: 1 editable: false ================================================ FILE: k8s/events/kind.yml ================================================ # 1 control-plane + 2 workers — matches the other k8s/ scenarios. 
kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: control-plane - role: worker - role: worker ================================================ FILE: k8s/events/loki-values.yml ================================================ --- loki: auth_enabled: false commonConfig: replication_factor: 1 schemaConfig: configs: - from: 2024-04-01 store: tsdb object_store: s3 schema: v13 index: prefix: loki_index_ period: 24h ingester: chunk_encoding: snappy tracing: enabled: true pattern_ingester: enabled: true limits_config: allow_structured_metadata: true volume_enabled: true ruler: enable_api: true querier: # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing max_concurrent: 4 minio: enabled: true deploymentMode: SingleBinary singleBinary: replicas: 1 resources: limits: cpu: 4 memory: 4Gi requests: cpu: 2 memory: 2Gi extraEnv: # Keep a little bit lower than memory limits - name: GOMEMLIMIT value: 3750MiB chunksCache: # default is 500MB, with limited memory keep this smaller writebackSizeLimit: 10MB # Zero out replica counts of other deployment modes backend: replicas: 0 read: replicas: 0 write: replicas: 0 ingester: replicas: 0 querier: replicas: 0 queryFrontend: replicas: 0 queryScheduler: replicas: 0 distributor: replicas: 0 compactor: replicas: 0 indexGateway: replicas: 0 bloomCompactor: replicas: 0 bloomGateway: replicas: 0 ================================================ FILE: k8s/logs/README.md ================================================ # Monitor Kubernetes Logs with Grafana Alloy and Loki > Note this scenario works using the K8s Monitoring Helm chart. This abstracts the need to configure Alloy and deploys best practices for monitoring Kubernetes clusters. The chart supports; metrics, logs, profiling, and tracing. For this scenario, we will use the K8s Monitoring Helm chart to monitor Kubernetes logs. This scenario demonstrates how to setup the Kubernetes monitoring helm and Loki. 
This scenario will install three Helm charts: Loki, Grafana, and k8s-monitoring-helm. Loki will be used to store the logs, Grafana will be used to visualize the logs, and Alloy (k8s-monitoring-helm) will be used to collect two different log sources: * Pod Logs * Kubernetes Events ## Prerequisites Clone the repository: ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` Change to the directory: ```bash cd alloy-scenarios/k8s/logs ``` Next, you will need a Kubernetes cluster (in this example, we will configure a local Kubernetes cluster using [Kind](https://kind.sigs.k8s.io/docs/user/quick-start/)). An example kind cluster configuration is provided in the `kind.yml` file. To create a kind cluster using this configuration, run the following command: ```bash kind create cluster --config kind.yml ``` Lastly, you will need to make sure you install Helm on your local machine. You can install Helm by following the instructions [here](https://helm.sh/docs/intro/install/). You will also need to add the Grafana Helm repository: ```bash helm repo add grafana https://grafana.github.io/helm-charts ``` ## Create the `meta` and `prod` namespaces The first step is to create the `meta` and `prod` namespaces. To create the namespaces, run the following commands: ```bash kubectl create namespace meta && \ kubectl create namespace prod ``` ## Install the Loki Helm Chart The next step is to install the Loki Helm chart. This will install Loki in the `meta` namespace. The `loki-values.yml` file contains the configuration for the Loki Helm chart. To install Loki, run the following command: ```bash helm install --values loki-values.yml loki grafana/loki -n meta ``` This installs Loki in monolithic mode. For more information on Loki modes, see the [Loki documentation](https://grafana.com/docs/loki/latest/get-started/deployment-modes/). ## Install the Grafana Helm Chart The next step is to install the Grafana Helm chart. This will install Grafana in the `meta` namespace. 
The `grafana-values.yml` file contains the configuration for the Grafana Helm chart. To install Grafana, run the following command: ```bash helm install --values grafana-values.yml grafana grafana/grafana --namespace meta ``` Note that within the `grafana-values.yml` file, the `grafana.ini` configuration is set to use the Loki data source. This is done by setting the `datasources.datasources.yaml` field to the Loki data source configuration. ## Install the K8s Monitoring Helm Chart The final step is to install the K8s monitoring Helm chart. This will install Alloy in the `meta` namespace. The `k8s-monitoring-values.yml` file contains the configuration for the K8s monitoring Helm chart. This scenario requires `grafana/k8s-monitoring` chart v4 or later. To install the K8s monitoring Helm chart, run the following command: ```bash helm install --values ./k8s-monitoring-values.yml k8s grafana/k8s-monitoring --version "^4.0.0" -n meta --create-namespace ``` Within the `k8s-monitoring-values.yml` file we declare the Alloy configuration. This configuration specifies the log sources that Alloy will collect logs from. In this scenario, we are collecting logs from two different sources: Pod Logs and Kubernetes Events. ## Accessing the Grafana UI To access the Grafana UI, you will need to port-forward the Grafana pod to your local machine. First, get the name of the Grafana pod: ```bash export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}") ``` Next, port-forward the Grafana pod to your local machine: ```bash kubectl --namespace meta port-forward $POD_NAME 3000 ``` Open your browser and go to [http://localhost:3000](http://localhost:3000). You can log in with the default username `admin` and password `adminadminadmin`. ## Accessing the Alloy UI To access the Alloy UI, you will need to port-forward the Alloy pod to your local machine. 
First, get the name of the Alloy pod: ```bash export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=alloy-logs,app.kubernetes.io/instance=k8s" -o jsonpath="{.items[0].metadata.name}") ``` Next, port-forward the Alloy pod to your local machine: ```bash kubectl --namespace meta port-forward $POD_NAME 12345 ``` ## View the logs using Explore Logs in Grafana Explore Logs is a new feature in Grafana which provides a queryless way to explore logs. To access Explore Logs, open a browser and go to [http://localhost:3000/a/grafana-lokiexplore-app](http://localhost:3000/a/grafana-lokiexplore-app). ## Adding a demo prod app The k8s monitoring app is configured to collect logs from two namespaces: `meta` and `prod`. To add a demo prod app, run the following command: ```bash helm install tempo grafana/tempo-distributed -n prod ``` This will install the Tempo distributed tracing system in the `prod` namespace. ================================================ FILE: k8s/logs/grafana-values.yml ================================================ --- persistence: type: pvc enabled: true # DO NOT DO THIS IN PRODUCTION USECASES adminUser: admin adminPassword: adminadminadmin # CONSIDER USING AN EXISTING SECRET # Use an existing secret for the admin user. # admin: ## Name of the secret. Can be templated. 
# existingSecret: "" # userKey: admin-user # passwordKey: admin-password service: enabled: true type: ClusterIP datasources: datasources.yaml: apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki-gateway.meta.svc.cluster.local:80 basicAuth: false isDefault: false version: 1 editable: false ================================================ FILE: k8s/logs/k8s-monitoring-values.yml ================================================ --- cluster: name: meta-monitoring-tutorial destinations: loki: type: loki url: http://loki-gateway.meta.svc.cluster.local/loki/api/v1/push clusterEvents: enabled: true collector: alloy-singleton namespaces: - meta - prod podLogsViaKubernetesApi: enabled: true collector: alloy-logs namespaces: - meta - prod structuredMetadata: pod: pod collectors: alloy-singleton: presets: [singleton] alloy-logs: presets: [clustered] ================================================ FILE: k8s/logs/killercoda/loki-values.yml ================================================ --- loki: auth_enabled: false commonConfig: replication_factor: 1 schemaConfig: configs: - from: 2024-04-01 store: tsdb object_store: s3 schema: v13 index: prefix: loki_index_ period: 24h ingester: chunk_encoding: snappy tracing: enabled: true pattern_ingester: enabled: true limits_config: allow_structured_metadata: true volume_enabled: true ruler: enable_api: true querier: # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing max_concurrent: 4 minio: enabled: true deploymentMode: SingleBinary singleBinary: replicas: 1 resources: limits: cpu: 0.5 memory: 1Gi requests: cpu: 0.5 memory: 1Gi extraEnv: # Keep a little bit lower than memory limits - name: GOMEMLIMIT value: 750MiB tolerations: - key: "node-role.kubernetes.io/control-plane" operator: "Exists" effect: "NoSchedule" chunksCache: # default is 500MB, with limited memory keep this smaller writebackSizeLimit: 10MB enabled: false resultsCache: writebackSizeLimit: 10MB 
enabled: false test: enabled: false lokiCanary: enabled: false # Zero out replica counts of other deployment modes backend: replicas: 0 read: replicas: 0 write: replicas: 0 ingester: replicas: 0 querier: replicas: 0 queryFrontend: replicas: 0 queryScheduler: replicas: 0 distributor: replicas: 0 compactor: replicas: 0 indexGateway: replicas: 0 bloomCompactor: replicas: 0 bloomGateway: replicas: 0 ================================================ FILE: k8s/logs/kind.yml ================================================ # a cluster with 1 control-plane node and 2 workers kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: control-plane - role: worker - role: worker ================================================ FILE: k8s/logs/loki-values.yml ================================================ --- loki: auth_enabled: false commonConfig: replication_factor: 1 schemaConfig: configs: - from: 2024-04-01 store: tsdb object_store: s3 schema: v13 index: prefix: loki_index_ period: 24h ingester: chunk_encoding: snappy tracing: enabled: true pattern_ingester: enabled: true limits_config: allow_structured_metadata: true volume_enabled: true ruler: enable_api: true querier: # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing max_concurrent: 4 minio: enabled: true deploymentMode: SingleBinary singleBinary: replicas: 1 resources: limits: cpu: 4 memory: 4Gi requests: cpu: 2 memory: 2Gi extraEnv: # Keep a little bit lower than memory limits - name: GOMEMLIMIT value: 3750MiB chunksCache: # default is 500MB, with limited memory keep this smaller writebackSizeLimit: 10MB # Zero out replica counts of other deployment modes backend: replicas: 0 read: replicas: 0 write: replicas: 0 ingester: replicas: 0 querier: replicas: 0 queryFrontend: replicas: 0 queryScheduler: replicas: 0 distributor: replicas: 0 compactor: replicas: 0 indexGateway: replicas: 0 bloomCompactor: replicas: 0 bloomGateway: replicas: 0 
================================================ FILE: k8s/metrics/README.md ================================================ # Monitor Kubernetes Metrics with Grafana Alloy and Prometheus > Note this scenario works using the K8s Monitoring Helm chart. This abstracts the need to configure Alloy and deploys best practices for monitoring Kubernetes clusters. This scenario demonstrates how to set up the Kubernetes monitoring Helm chart with Prometheus. This scenario will install three Helm charts: Prometheus, Grafana, and k8s-monitoring. Prometheus will be used to store the metrics, Grafana will be used to visualize the metrics, and Alloy (k8s-monitoring) will be used to collect: * Cluster Metrics (kube-state-metrics, node-exporter, kubelet, cadvisor) * Annotation-based autodiscovery (Prometheus-style annotations on pods) ## Prerequisites Clone the repository: ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` Change to the directory: ```bash cd alloy-scenarios/k8s/metrics ``` Next you will need a Kubernetes cluster. An example Kind cluster configuration is provided in the `kind.yml` file: ```bash kind create cluster --config kind.yml ``` Install Helm and add required repositories: ```bash helm repo add grafana https://grafana.github.io/helm-charts helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo update ``` ## Create the `meta` namespace ```bash kubectl create namespace meta ``` ## Install Prometheus ```bash helm install --values prometheus-values.yml prometheus prometheus-community/prometheus -n meta ``` ## Install Grafana ```bash helm install --values grafana-values.yml grafana grafana/grafana -n meta ``` ## Install the K8s Monitoring Helm Chart This scenario requires `grafana/k8s-monitoring` chart v4 or later. 
```bash helm install --values k8s-monitoring-values.yml k8s grafana/k8s-monitoring --version "^4.0.0" -n meta ``` ## Accessing the Grafana UI ```bash export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}") kubectl --namespace meta port-forward $POD_NAME 3000 ``` Open [http://localhost:3000](http://localhost:3000) and log in with `admin` / `adminadminadmin`. ## Accessing the Alloy UI ```bash export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=alloy-metrics,app.kubernetes.io/instance=k8s" -o jsonpath="{.items[0].metadata.name}") kubectl --namespace meta port-forward $POD_NAME 12345 ``` ## Explore Metrics In Grafana, go to **Explore** and select the **Prometheus** datasource. Try these queries: * `up` - See all targets being scraped * `container_cpu_usage_seconds_total` - Container CPU usage * `container_memory_working_set_bytes` - Container memory usage * `kube_pod_info` - Pod metadata from kube-state-metrics ================================================ FILE: k8s/metrics/grafana-values.yml ================================================ --- persistence: type: pvc enabled: true adminUser: admin adminPassword: adminadminadmin service: enabled: true type: ClusterIP datasources: datasources.yaml: apiVersion: 1 datasources: - name: Prometheus type: prometheus access: proxy orgId: 1 url: http://prometheus-server.meta.svc.cluster.local:80 basicAuth: false isDefault: true version: 1 editable: false ================================================ FILE: k8s/metrics/k8s-monitoring-values.yml ================================================ --- cluster: name: meta-monitoring-tutorial destinations: prometheus: type: prometheus url: http://prometheus-server.meta.svc.cluster.local:80/api/v1/write clusterMetrics: enabled: true annotationAutodiscovery: enabled: true collector: alloy-metrics collectors: alloy-metrics: presets: [clustered, 
statefulset] telemetryServices: kube-state-metrics: deploy: true ================================================ FILE: k8s/metrics/kind.yml ================================================ # a cluster with 1 control-plane node and 2 workers kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: control-plane - role: worker - role: worker ================================================ FILE: k8s/metrics/prometheus-values.yml ================================================ server: persistentVolume: enabled: false extraFlags: - web.enable-remote-write-receiver - enable-feature=native-histograms - enable-feature=exemplar-storage alertmanager: enabled: false kube-state-metrics: enabled: false prometheus-node-exporter: enabled: false prometheus-pushgateway: enabled: false ================================================ FILE: k8s/profiling/README.md ================================================ # Monitor Kubernetes Profiles with Grafana Alloy and Pyroscope > Note this scenario works using the K8s Monitoring Helm chart. This abstracts the need to configure Alloy and deploys best practices for monitoring Kubernetes clusters. This scenario demonstrates how to set up the Kubernetes monitoring Helm chart with Pyroscope for continuous profiling. This scenario will install three Helm charts: Pyroscope, Grafana, and k8s-monitoring. Pyroscope will store the profiles, Grafana will visualize them, and Alloy (k8s-monitoring) will scrape pprof endpoints from pods. Alloy discovers pods with profiling annotations and scrapes their pprof endpoints (CPU, memory, goroutine, etc.). ## Prerequisites Clone the repository: ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` Change to the directory: ```bash cd alloy-scenarios/k8s/profiling ``` Next you will need a Kubernetes cluster. 
An example Kind cluster configuration is provided in the `kind.yml` file: ```bash kind create cluster --config kind.yml ``` Install Helm and add the Grafana Helm repository: ```bash helm repo add grafana https://grafana.github.io/helm-charts helm repo update ``` ## Create the `meta` namespace ```bash kubectl create namespace meta ``` ## Install Pyroscope ```bash helm install --values pyroscope-values.yml pyroscope grafana/pyroscope -n meta ``` ## Install Grafana ```bash helm install --values grafana-values.yml grafana grafana/grafana -n meta ``` ## Install the K8s Monitoring Helm Chart This scenario requires `grafana/k8s-monitoring` chart v4 or later. ```bash helm install --values k8s-monitoring-values.yml k8s grafana/k8s-monitoring --version "^4.0.0" -n meta ``` ## Accessing the Grafana UI ```bash export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}") kubectl --namespace meta port-forward $POD_NAME 3000 ``` Open [http://localhost:3000](http://localhost:3000) and log in with `admin` / `adminadminadmin`. 
## Accessing the Alloy UI ```bash export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=alloy-profiles,app.kubernetes.io/instance=k8s" -o jsonpath="{.items[0].metadata.name}") kubectl --namespace meta port-forward $POD_NAME 12345 ``` ## Enabling Profiling on Your Pods To profile a Go application, ensure it exposes a pprof endpoint (typically at `:6060/debug/pprof/`) and add these annotations to the pod: ```yaml metadata: annotations: profiles.grafana.com/memory.scrape: "true" profiles.grafana.com/memory.port_name: "http-metrics" profiles.grafana.com/cpu.scrape: "true" profiles.grafana.com/cpu.port_name: "http-metrics" profiles.grafana.com/goroutine.scrape: "true" profiles.grafana.com/goroutine.port_name: "http-metrics" ``` ## Adding a Demo App Deploy Pyroscope's demo Ride Share app to generate profiles: ```bash kubectl apply -n meta -f - < Note this scenario works using the K8s Monitoring Helm chart. This abstracts the need to configure Alloy and deploys best practices for monitoring Kubernetes clusters. This scenario demonstrates how to set up the Kubernetes monitoring Helm chart with Tempo for distributed trace collection. This scenario will install three Helm charts: Tempo, Grafana, and k8s-monitoring. Tempo will store the traces, Grafana will visualize them, and Alloy (k8s-monitoring) will receive traces via OTLP and forward them to Tempo. Applications send traces to Alloy's OTLP endpoint, which then forwards them to Tempo. ## Prerequisites Clone the repository: ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` Change to the directory: ```bash cd alloy-scenarios/k8s/tracing ``` Next you will need a Kubernetes cluster. 
An example Kind cluster configuration is provided in the `kind.yml` file: ```bash kind create cluster --config kind.yml ``` Install Helm and add the Grafana Helm repository: ```bash helm repo add grafana https://grafana.github.io/helm-charts helm repo update ``` ## Create the `meta` and `prod` namespaces ```bash kubectl create namespace meta && \ kubectl create namespace prod ``` ## Install Tempo ```bash helm install --values tempo-values.yml tempo grafana/tempo -n meta ``` ## Install Grafana ```bash helm install --values grafana-values.yml grafana grafana/grafana -n meta ``` ## Install the K8s Monitoring Helm Chart This scenario requires `grafana/k8s-monitoring` chart v4 or later. ```bash helm install --values k8s-monitoring-values.yml k8s grafana/k8s-monitoring --version "^4.0.0" -n meta ``` This configures Alloy to receive OTLP traces on ports 4317 (gRPC) and 4318 (HTTP), then forward them to Tempo. ## Accessing the Grafana UI ```bash export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}") kubectl --namespace meta port-forward $POD_NAME 3000 ``` Open [http://localhost:3000](http://localhost:3000) and log in with `admin` / `adminadminadmin`. 
## Accessing the Alloy UI ```bash export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=alloy-receiver,app.kubernetes.io/instance=k8s" -o jsonpath="{.items[0].metadata.name}") kubectl --namespace meta port-forward $POD_NAME 12345 ``` ## Sending Traces Applications in your cluster should set their OTLP exporter endpoint to the Alloy receiver service: ``` OTEL_EXPORTER_OTLP_ENDPOINT=http://k8s-alloy-receiver.meta.svc.cluster.local:4317 ``` ## Adding a Demo App Deploy a sample instrumented application in the `prod` namespace to generate traces: ```bash helm install tempo-distributed grafana/tempo-distributed -n prod ``` Or deploy any application instrumented with OpenTelemetry SDK pointing to the Alloy OTLP endpoint above. ## Explore Traces In Grafana, go to **Explore** and select the **Tempo** datasource. Use TraceQL to search for traces: * `{}` - View all traces * `{resource.service.name="my-service"}` - Filter by service name * `{status=error}` - Find error traces ================================================ FILE: k8s/tracing/grafana-values.yml ================================================ --- persistence: type: pvc enabled: true adminUser: admin adminPassword: adminadminadmin service: enabled: true type: ClusterIP datasources: datasources.yaml: apiVersion: 1 datasources: - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo.meta.svc.cluster.local:3200 basicAuth: false isDefault: true version: 1 editable: false ================================================ FILE: k8s/tracing/k8s-monitoring-values.yml ================================================ --- cluster: name: meta-monitoring-tutorial destinations: tempo: type: otlp url: http://tempo.meta.svc.cluster.local:4317 metrics: enabled: false logs: enabled: false traces: enabled: true applicationObservability: enabled: true collector: alloy-receiver receivers: otlp: grpc: enabled: true http: enabled: true metrics: enabled: false logs: enabled: false collectors: 
alloy-receiver: presets: [deployment] ================================================ FILE: k8s/tracing/kind.yml ================================================ # a cluster with 1 control-plane node and 2 workers kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: control-plane - role: worker - role: worker ================================================ FILE: k8s/tracing/tempo-values.yml ================================================ tempo: storage: trace: backend: local local: path: /var/tempo/traces wal: path: /var/tempo/wal receivers: otlp: protocols: grpc: endpoint: "0.0.0.0:4317" http: endpoint: "0.0.0.0:4318" server: http_listen_port: 3200 ================================================ FILE: kafka/README.md ================================================ # Kafka Scenarios Learn how to use Grafana Alloy to monitor logs from Kafka. ## Overview This demo showcases how to: - Collect logs from a Kafka topic - Process and transform JSON log data with Alloy - Forward processed logs to Loki - Visualize the logs in Grafana ## Components - **Kafka**: Message broker storing logs - **Kafka Producer**: Generates sample logs and sends them to Kafka - **Grafana Alloy**: Observability pipeline that processes logs - **Loki**: Log aggregation system - **Grafana**: Visualization platform ## Running the Demo ### Step 1: Clone the repository ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` ### Step 2: Deploy the monitoring stack ```bash cd alloy-scenarios/kafka docker-compose up -d ``` ### Step 3: Access Grafana Alloy UI Open your browser and go to `http://localhost:12345`. ### Step 4: Access Grafana UI Open your browser and go to `http://localhost:3000`. Click `drilldown` to see the logs in Grafana. ## How It Works 1. The `gen_log.sh` script generates random JSON logs with different log levels, applications, and messages 2. These logs are sent to the Kafka topic `alloy-logs` 3. 
Alloy reads from this Kafka topic, processes the JSON data, and forwards it to Loki 4. Grafana connects to Loki to display and query the processed logs Try creating dashboards in Grafana to visualize log frequencies by application or error levels! ================================================ FILE: kafka/config.alloy ================================================ livedebugging { enabled = true } loki.source.kafka "kafka" { brokers = ["kafka:9092"] topics = ["alloy-logs"] labels = { source = "kafka", component = "loki.source.kafka", } version = "3.8.0" forward_to = [loki.process.log_data.receiver] } loki.process "log_data" { forward_to = [loki.write.local.receiver] stage.json { drop_malformed = true expressions = { level = "", msg = "", app = "app", } } stage.json { source = "app" expressions = { app_name = "name", app_version = "version", } } stage.template { source = "new_json" template = "{\"level\":\"{{ .level }}\",\"msg\":\"{{ .msg }}\",\"app_name\":\"{{ .app_name }}\",\"app_version\":\"{{ .app_version }}\"}" } stage.output { source = "new_json" } } loki.write "local" { endpoint { url = "http://loki:3100/loki/api/v1/push" } } ================================================ FILE: kafka/docker-compose.coda.yml ================================================ services: kafka: image: 'bitnami/kafka:3.8' ports: - "9092:9092" volumes: - kafka_data:/bitnami/kafka environment: #KRaft must - KAFKA_CFG_NODE_ID=0 - KAFKA_CFG_PROCESS_ROLES=controller,broker - KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093 - KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093 - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092 - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT - KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER healthcheck: test: ["CMD", "kafka-topics.sh", "--bootstrap-server", "localhost:9092", "--version"] interval: 10s timeout: 10s retries: 5 kafka-producer: image: 'bitnami/kafka:3.8' volumes: - ./gen_log.sh:/bin/gen_log.sh 
- kafka_data:/bitnami/kafka entrypoint: ["sh", "-c", "/bin/gen_log.sh"] volumes: kafka_data: ================================================ FILE: kafka/docker-compose.yml ================================================ version: '3.8' services: # kafka server instance kafka: image: 'bitnami/kafka:3.8' ports: - "9092:9092" volumes: - kafka_data:/bitnami/kafka environment: #KRaft must - KAFKA_CFG_NODE_ID=0 - KAFKA_CFG_PROCESS_ROLES=controller,broker - KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093 - KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093 - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092 - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT - KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER healthcheck: test: ["CMD", "kafka-topics.sh", "--bootstrap-server", "localhost:9092", "--version"] interval: 10s timeout: 10s retries: 5 kafka-producer: image: 'bitnami/kafka:3.8' volumes: - ./gen_log.sh:/bin/gen_log.sh - kafka_data:/bitnami/kafka # change cmd entrypoint: ["sh", "-c", "/bin/gen_log.sh"] alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 - 4318:4318 volumes: - ./config.alloy:/etc/alloy/config.alloy - ./logs:/temp/logs command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - loki loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - "3100:3100" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: 
false isDefault: false version: 1 editable: false EOF /run.sh volumes: kafka_data: ================================================ FILE: kafka/gen_log.sh ================================================ #!/usr/bin/env bash set -euo pipefail LEVELS=(info warn error debug) APPS=(test auth payment order catalog) MSGS=( "Hello World from Grafana Alloy integration – log pipeline initialized successfully." "User authentication succeeded: user_id=42, ip=192.168.1.100, method=OAuth2." "Order created: order_id=12345, items=[{\"sku\":\"ABC\",\"qty\":2},{\"sku\":\"XYZ\",\"qty\":1}], total=USD 299.99." "Payment processing failed: transaction_id=67890, error_code=PMT-402, reason=Insufficient funds." "Cache miss on key user_profile_42; fetching from primary DB and repopulating cache." "Background job completed: task=metrics-aggregation, duration=12.34s, processed=2500 records." "High memory usage detected on host host-01: usage=87.5%, threshold=80% — consider scaling up." "Debug info: received payload with 15 fields, sample_field=\"some long detailed info here\", parsing succeeded." ) # Always running, sending logs to kafka every two seconds. Double quotes embedded in a message are backslash-escaped first so every emitted line is valid JSON. while true; do level=${LEVELS[RANDOM % ${#LEVELS[@]}]} msg=${MSGS[RANDOM % ${#MSGS[@]}]} msg=${msg//\"/\\\"} app=${APPS[RANDOM % ${#APPS[@]}]} version="0.$((RANDOM % 10)).$((RANDOM % 100))" printf '{"level":"%s","msg":"%s","app":{"name":"%s","version":"%s"}}\n' \ "$level" "$msg" "$app" "$version" sleep 2 done | kafka-console-producer.sh \ --bootstrap-server kafka:9092 \ --topic alloy-logs ================================================ FILE: kafka/loki-config.yaml ================================================ # This is a complete configuration to deploy Loki backed by the filesystem. # The index will be shipped to the storage via tsdb-shipper. 
auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true # Note: We are setting the max chunk age far lower than the default expected value # This is due to the fact this scenario is used within the LogCLI demo and we need a short flush time. # To show how logcli stats --since 24h '{service_name="Delivery World", package_size="Large"}' works. ingester: max_chunk_age: 5m # Should be 2 hours ================================================ FILE: linux/README.md ================================================ # Monitoring Linux with Alloy Grafana Alloy can be used to monitor Linux servers and containers. In this guide, we will show you how to deploy Grafana Alloy in a Docker environment to monitor Linux system metrics and logs. The setup consists of: * Node Exporter metrics for system performance monitoring * System logs collection with Loki ## Prerequisites * Git - You will need Git to clone the repository. * Docker and Docker Compose - This tutorial uses Docker to host Grafana, Loki, Prometheus, and Alloy. * Linux environment - Either a Linux host running Docker or a Linux VM. ## About this Demo This demo runs Alloy in a container alongside Grafana, Prometheus, and Loki, creating a self-contained monitoring stack. The Alloy container acts as a "fake Linux server" to demonstrate monitoring capabilities out of the box. In a production environment, you would typically install Alloy directly on each Linux server you want to monitor. 
## Step 1: Clone the Repository Clone the repository to your machine: ```bash git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios/linux ``` ## Step 2: Deploy the Monitoring Stack Use Docker Compose to deploy Grafana, Loki, Prometheus, and Alloy: ```bash docker-compose up -d ``` You can check the status of the containers: ```bash docker ps ``` Grafana should be running on [http://localhost:3000](http://localhost:3000). ## Step 3: Explore the Monitoring Data Once the stack is running, you can explore the collected metrics and logs: 1. Access Grafana at [http://localhost:3000](http://localhost:3000) (anonymous access is enabled with the Admin role, so no login is required) 2. Import the Node Exporter dashboard to visualize system metrics: - Go to Dashboards → Import - Upload the JSON file from [here](https://grafana.com/api/dashboards/1860/revisions/37/download) - Select the Prometheus data source and click Import This community dashboard provides comprehensive system metrics including CPU, memory, disk, and network usage. ## Step 4: Viewing Logs Open your browser and go to [http://localhost:3000/a/grafana-lokiexplore-app](http://localhost:3000/a/grafana-lokiexplore-app). This will take you to the Loki explorer in Grafana. ## Deploying on Bare Metal To monitor actual Linux servers in production, you would: 1. Install Alloy directly on each Linux server 2. Modify the `config.alloy` file to point to your Prometheus and Loki instances: ``` prometheus.remote_write "local" { endpoint { url = "http://localhost:9090/api/v1/write" } } loki.write "local" { endpoint { url = "http://localhost:3100/loki/api/v1/push" } } ``` 3. Run Alloy as a service: ```bash sudo alloy run /path/to/config.alloy ``` ## Configuration Customization The included `config.alloy` file sets up: 1. Node Exporter integration to collect system metrics 2. Log collection from system logs and journal 3. Relabeling rules to organize metrics and logs 4. 
Remote write endpoints for Prometheus and Loki You can customize which collectors are enabled/disabled and adjust scrape intervals in the configuration file. ## Troubleshooting If you encounter issues: * Check container logs: `docker-compose logs` * Verify Alloy is running: `docker-compose ps` * Ensure ports are not conflicting with existing services * Review the Alloy configuration in `config.alloy` ================================================ FILE: linux/config.alloy ================================================ // This block relabels metrics coming from node_exporter to add standard labels discovery.relabel "integrations_node_exporter" { targets = prometheus.exporter.unix.integrations_node_exporter.targets rule { // Set the instance label to the hostname of the machine target_label = "instance" replacement = constants.hostname } rule { // Set a standard job name for all node_exporter metrics target_label = "job" replacement = "integrations/node_exporter" } } // Configure the node_exporter integration to collect system metrics prometheus.exporter.unix "integrations_node_exporter" { // Disable unnecessary collectors to reduce overhead disable_collectors = ["ipvs", "btrfs", "infiniband", "xfs", "zfs"] enable_collectors = ["meminfo"] filesystem { // Exclude filesystem types that aren't relevant for monitoring fs_types_exclude = "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" // Exclude mount points that aren't relevant for monitoring mount_points_exclude = "^/(dev|proc|run/credentials/.+|sys|var/lib/docker/.+)($|/)" // Timeout for filesystem operations mount_timeout = "5s" } netclass { // Ignore virtual and container network interfaces ignored_devices = "^(veth.*|cali.*|[a-f0-9]{15})$" } netdev { // Exclude virtual and container network interfaces from device metrics device_exclude = 
"^(veth.*|cali.*|[a-f0-9]{15})$" } } // Define how to scrape metrics from the node_exporter prometheus.scrape "integrations_node_exporter" { scrape_interval = "15s" // Use the targets with labels from the discovery.relabel component targets = discovery.relabel.integrations_node_exporter.output // Send the scraped metrics directly to the remote_write component forward_to = [prometheus.remote_write.local.receiver] } // Define where to send the metrics for storage prometheus.remote_write "local" { endpoint { // Send metrics to a locally running Prometheus instance url = "http://prometheus:9090/api/v1/write" } } // Collect logs from systemd journal for node_exporter integration loki.source.journal "logs_integrations_integrations_node_exporter_journal_scrape" { // Only collect logs from the last 24 hours max_age = "24h0m0s" // Apply relabeling rules to the logs relabel_rules = discovery.relabel.logs_integrations_integrations_node_exporter_journal_scrape.rules // Send logs to the local Loki instance forward_to = [loki.write.local.receiver] } // Define which log files to collect for node_exporter local.file_match "logs_integrations_integrations_node_exporter_direct_scrape" { path_targets = [{ // Target localhost for log collection __address__ = "localhost", // Collect standard system logs __path__ = "/var/log/{syslog,messages,*.log}", // Add instance label with hostname instance = constants.hostname, // Add job label for logs job = "integrations/node_exporter", }] } // Define relabeling rules for systemd journal logs discovery.relabel "logs_integrations_integrations_node_exporter_journal_scrape" { targets = [] rule { // Extract systemd unit information into a label source_labels = ["__journal__systemd_unit"] target_label = "unit" } rule { // Extract boot ID information into a label source_labels = ["__journal__boot_id"] target_label = "boot_id" } rule { // Extract the hostname of the machine into a label source_labels = ["__journal__hostname"] target_label = "instance" } rule { // 
Extract the machine id into a label source_labels = ["__journal__machine_id"] target_label = "machine_id" } rule { // Extract transport information into a label source_labels = ["__journal__transport"] target_label = "transport" } rule { // Extract log priority into a level label source_labels = ["__journal_priority_keyword"] target_label = "level" } } // Collect logs from files for node_exporter loki.source.file "logs_integrations_integrations_node_exporter_direct_scrape" { // Use targets defined in local.file_match targets = local.file_match.logs_integrations_integrations_node_exporter_direct_scrape.targets // Send logs to the local Loki instance forward_to = [loki.write.local.receiver] } // Define where to send logs for storage loki.write "local" { endpoint { // Send logs to a locally running Loki instance url ="http://loki:3100/loki/api/v1/push" } } // Enable live debugging features (empty config means use defaults) livedebugging{} ================================================ FILE: linux/docker-compose.yml ================================================ version: '3.8' services: loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - 3100:3100/tcp volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat < /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false version: 1 
editable: false - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy ================================================ FILE: linux/loki-config.yaml ================================================ # This is a complete configuration to deploy Loki backed by the filesystem. # The index will be shipped to the storage via tsdb-shipper. auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true # Note: We are setting the max chunk age far lower than the default expected value # This is due to the fact this scenario is used within the LogCLI demo and we need a short flush time. # To show how logcli stats --since 24h '{service_name="Delivery World", package_size="Large"}' works. ingester: max_chunk_age: 5m # Should be 2 hours ================================================ FILE: linux/prom-config.yaml ================================================ # my global config global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s). 
================================================ FILE: log-api-gateway/README.md ================================================ # Log API Gateway This scenario demonstrates using **Grafana Alloy** as a centralized log gateway via the `loki.source.api` component. Instead of scraping logs from files or containers, Alloy exposes a Loki-compatible push API endpoint that applications can send logs to directly. ## Architecture ``` ┌─────────────────┐ ┌───────────────────────┐ ┌──────┐ ┌─────────┐ │ log-producer │──POST──▶│ Alloy (loki.source. │──push──▶│ Loki │◀─query──│ Grafana │ │ (Python script) │ │ api on :3500) │ │ │ │ │ └─────────────────┘ └───────────────────────┘ └──────┘ └─────────┘ ``` 1. **log-producer** - A Python script that simulates multiple microservices (auth, order, notification) pushing structured logs to Alloy's Loki push API endpoint. 2. **Alloy** - Receives logs via `loki.source.api` on port 3500, enriches them with a `gateway=alloy` label, and forwards to Loki. 3. **Loki** - Stores and indexes the logs. 4. **Grafana** - Pre-configured with the Loki datasource for querying logs. ## Running ```bash # From the repo root (uses centralized image versions) ./run-example.sh log-api-gateway # Or directly cd log-api-gateway && docker compose up -d ``` ## Exploring - **Grafana**: [http://localhost:3000](http://localhost:3000) - Query logs in the Explore view using the Loki datasource - **Alloy UI**: [http://localhost:12345](http://localhost:12345) - Inspect the pipeline graph and component health ### Example LogQL Queries ```logql # All logs from a specific service {service_name="auth-service"} # All logs passing through the gateway {gateway="alloy"} # Filter by environment {environment="demo"} ``` ## How It Works The `loki.source.api` component in Alloy exposes a Loki-compatible HTTP endpoint (`/loki/api/v1/push`) that any application can push logs to. 
"""Simulated multi-service log producer.

Picks a random service and message, wraps them in a Loki push-API payload
and POSTs it to Alloy's ``loki.source.api`` endpoint, forever.
"""
import requests
import time
import random
import json

ALLOY_URL = "http://alloy:3500/loki/api/v1/push"

# Upper bound (seconds) for a single push. Without it, requests waits
# indefinitely and one stalled connection would freeze the producer.
REQUEST_TIMEOUT_SECONDS = 5

# Each simulated service has a name (sent as the service_name stream label)
# and a pool of messages to pick from.
services = [
    {"name": "auth-service", "messages": [
        "User login attempt from IP 10.0.1.50",
        "Token refresh completed for user_id=42",
        "Failed login: invalid credentials for user@example.com",
        "Session expired for session_id=abc123",
    ]},
    {"name": "order-service", "messages": [
        "New order created: ORD-98765",
        "Payment processed for order ORD-98765",
        "Order shipped: tracking_id=TRACK123",
        "Inventory check: item SKU-001 has 5 units remaining",
    ]},
    {"name": "notification-service", "messages": [
        "Email sent to user@example.com",
        "SMS notification queued for +1234567890",
        "Push notification delivered to device_id=xyz",
        "Notification batch completed: 150 messages sent",
    ]},
]

print("Starting log producer...")

while True:
    service = random.choice(services)
    message = random.choice(service["messages"])

    payload = {
        "streams": [{
            "stream": {
                "service_name": service["name"],
                "environment": "demo",
            },
            "values": [
                # Timestamp is a string of nanoseconds since the epoch;
                # time_ns() yields it as an exact integer (no float rounding).
                [str(time.time_ns()), message]
            ]
        }]
    }

    try:
        resp = requests.post(
            ALLOY_URL,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        # The script treats anything other than 204 as worth reporting.
        if resp.status_code != 204:
            print(f"Unexpected status: {resp.status_code}")
    except Exception as e:
        # Best effort: report the failure (including timeouts) and keep producing.
        print(f"Error sending log: {e}")

    time.sleep(random.uniform(0.5, 2.0))
GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat < /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh ================================================ FILE: log-api-gateway/loki-config.yaml ================================================ auth_enabled: false server: http_listen_port: 3100 grpc_listen_port: 9096 log_level: debug grpc_server_max_concurrent_streams: 1000 common: instance_addr: 127.0.0.1 path_prefix: /tmp/storage storage: filesystem: chunks_directory: /tmp/storage/chunks rules_directory: /tmp/storage/rules replication_factor: 1 ring: kvstore: store: inmemory query_range: results_cache: cache: embedded_cache: enabled: true max_size_mb: 100 limits_config: metric_aggregation_enabled: true schema_config: configs: - from: 2020-10-24 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h pattern_ingester: enabled: true metric_aggregation: loki_address: localhost:3100 ruler: alertmanager_url: http://localhost:9093 frontend: encoding: protobuf # By default, Loki will send anonymous, but uniquely-identifiable usage and configuration # analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/ # # Statistics help us better understand how Loki is used, and they show us performance # levels for most users. This helps us prioritize features and documentation. # For more information on what's sent, look at # https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go # Refer to the buildReport method to see what goes into a report. 
# # If you would like to disable reporting, uncomment the following lines: #analytics: # reporting_enabled: false ================================================ FILE: log-secret-filtering/README.md ================================================ # Log Secret Filtering Demonstrates how Grafana Alloy's `loki.secretfilter` component automatically redacts secrets from log lines before they reach Loki. ## Overview A Python application continuously writes log lines -- some containing fake secrets (AWS keys, database connection strings, GitHub tokens, JWTs, Slack webhooks) -- to a shared log file. Alloy tails the file, passes every line through `loki.secretfilter` using built-in Gitleaks patterns, and forwards the sanitized output to Loki. By the time logs appear in Grafana, sensitive values have been replaced with ``. The example includes: - **secret-logger** -- Python app that emits a mix of normal and secret-containing log lines every 2 seconds. - **Alloy** -- Tails the log file, applies `loki.secretfilter`, and pushes to Loki. Runs with `--stability.level=experimental` because `loki.secretfilter` is an experimental component. - **Loki** -- Stores the redacted logs. - **Grafana** -- Visualize and query logs to verify secrets have been removed. ## Running the Demo 1. Clone the repository: ``` git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios ``` 2. Navigate to this example directory: ``` cd log-secret-filtering ``` 3. Run using Docker Compose: ``` docker compose up -d ``` Or use the centralized image management: ``` cd .. ./run-example.sh log-secret-filtering ``` 4. Access Grafana at [http://localhost:3000](http://localhost:3000) ## What to Expect 1. Open Grafana and navigate to **Explore**. 2. Select the **Loki** datasource. 3. Run the query `{job="secret-app"}`. 4. 
You should see log lines where secrets have been replaced, for example: - `Found config: with secret` - `Database connection: ` - Normal log lines (health checks, request timings) pass through unchanged. ## Architecture ``` ┌─────────────────┐ ┌───────────────────────────────────────┐ ┌──────┐ ┌─────────┐ │ secret-logger │─────▶│ Alloy │─────▶│ Loki │─────▶│ Grafana │ │ (writes logs) │ file │ local.file_match ─▶ loki.source.file │ push │ │ query│ │ └─────────────────┘ │ ─▶ loki.secretfilter ─▶ loki.write │ │ │ │ └───────────────────────────────────────┘ └──────┘ └─────────┘ ``` ## Alloy Pipeline The `config.alloy` pipeline: 1. `local.file_match` -- discovers log files at `/tmp/logs/*.log`. 2. `loki.source.file` -- tails matched files and forwards log entries. 3. `loki.secretfilter` -- scans each log line against Gitleaks secret patterns and replaces matches with ``. 4. `loki.write` -- pushes sanitized logs to Loki. Visit the Alloy UI at [http://localhost:12345](http://localhost:12345) to inspect the running pipeline and use the live debugging view. 
"""Demo log writer: appends a random log line to /logs/app.log every 2 seconds.

Some lines contain fake secrets so the downstream pipeline has something to redact.
"""
import time
import random
import datetime

LOG_PATH = "/logs/app.log"

# Lines that embed fake credentials
secrets = [
    'Found config: AKIAIOSFODNN7EXAMPLE with secret',
    'Database connection: postgresql://admin:SuperSecret123@db:5432/prod',
    'Setting API_KEY=ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdef12',
    'Bearer token: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U',
    'Slack webhook: https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX',
]

# Harmless lines with nothing to redact
normal = [
    'Processing request from 192.168.1.100',
    'User login successful for user_id=42',
    'Health check passed: all systems operational',
    'Cache hit ratio: 94.2%',
    'Request completed in 23ms',
]

# Truncate (or create) the log file so every run starts empty
with open(LOG_PATH, "w"):
    pass

# Listing the normal lines twice gives a 2:1 normal:secret ratio
candidates = secrets + normal + normal

while True:
    chosen = random.choice(candidates)
    stamp = datetime.datetime.now().isoformat()
    # Reopen in append mode each time so the line is flushed on close
    with open(LOG_PATH, "a") as log_file:
        log_file.write("%s %s\n" % (stamp, chosen))
    time.sleep(2)
services:
  # Python app that periodically logs fake secrets (API keys, passwords, tokens)
  secret-logger:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./app/main.py:/app/main.py
      - ./logs:/logs
    command: python3 /app/main.py

  # Alloy telemetry pipeline — scrapes log files and redacts secrets before forwarding to Loki
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      # Same host directory the secret-logger writes to
      - ./logs:/tmp/logs
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data --stability.level=experimental /etc/alloy/config.alloy
    depends_on:
      - loki

  # Loki for log aggregation
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - 3100:3100/tcp
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Grafana for visualization
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # Write the datasource provisioning file via a here-document, then start Grafana.
    # The `<<EOF >` operator is required: a plain `cat < file` would try to READ the
    # not-yet-existing ds.yaml and, under `sh -e`, abort before /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh
    depends_on:
      - loki
# Loki configuration for the log-secret-filtering scenario.
# All state (index, index cache, chunks) is kept on the local filesystem under /tmp/loki.
auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

server:
  # Port used by the push URL in config.alloy and by the Grafana datasource URL
  http_listen_port: 3100

common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

ingester:
  max_chunk_age: 2h
version: '3.8'

services:
  # Log generator: Python script that writes application logs to /logs/app.log
  logs-file:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: logs-file
    volumes:
      - ./main.py:/main.py # Log generator script
      - ./logs:/logs # Directory to store the logs
    depends_on:
      - alloy
    command: ["python3", "/main.py"]

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345
      - 4318:4318
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      # Same host directory the generator writes to; config.alloy tails /temp/logs/*.log
      - ./logs:/temp/logs
    command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - loki

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # Write the datasource provisioning file via a here-document, then start Grafana.
    # The `<<EOF >` operator is required: a plain `cat < file` would try to READ the
    # not-yet-existing ds.yaml and, under `sh -e`, abort before /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh
"""Example application that writes randomly chosen log lines to /logs/app.log."""
import logging
import time
import random
import os

# Directory and file the application logs to
log_directory = "/logs"
log_file = os.path.join(log_directory, "app.log")

# Create the directory if needed. exist_ok=True replaces the previous
# "check, then create" sequence, which could fail if the directory
# appeared between the check and the makedirs call.
os.makedirs(log_directory, exist_ok=True)

# Configure logging: every record goes both to the log file and to the console
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger("ExampleApp")


def simulate_process():
    """Simulate a process that emits one random log record every 5 seconds.

    Runs forever; it only stops if an exception propagates out of the loop.
    """
    # Action name -> (logging level, message) emitted for that action
    events = {
        "start": (logging.INFO, "Process started successfully."),
        "process": (logging.DEBUG, "Processing data..."),
        "error": (logging.ERROR, "An error occurred during processing."),
        "complete": (logging.WARNING, "Process completed with minor warnings."),
    }
    actions = ["start", "process", "error", "complete"]

    while True:  # Infinite loop
        level, message = events[random.choice(actions)]
        logger.log(level, message)
        time.sleep(5)  # Generate a log every 5 seconds


if __name__ == "__main__":
    logger.info("Application started.")
    try:
        simulate_process()
    except Exception as e:
        logger.critical(f"Unhandled exception: {e}")
    # Only reached if simulate_process() raised, since it otherwise never returns
    logger.info("Application finished.")
version: '3.8'

services:
  # Log simulator: Python script that sends JSON log lines to Alloy over a TCP connection
  simulator:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: simulator
    volumes:
      - ./simulator.py:/simulator.py # Log simulator script
    environment:
      # Must match the loki.source.api listener in config.alloy
      - TARGET_HOST=alloy
      - TARGET_PORT=9999
    command: ["python3", "/simulator.py"]
    depends_on:
      - alloy

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345
      - 4318:4318
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - ./logs:/tmp/app-logs/
    command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - loki

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # Write the datasource provisioning file via a here-document, then start Grafana.
    # The `<<EOF >` operator is required: a plain `cat < file` would try to READ the
    # not-yet-existing ds.yaml and, under `sh -e`, abort before /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh

# NOTE(review): no service in this file mounts rsyslog_data — confirm whether it can be removed
volumes:
  rsyslog_data:
"""JSON log simulator.

Opens one TCP connection to TARGET_HOST:TARGET_PORT and, every 3-8 seconds,
sends a hand-built HTTP/1.1 POST carrying a random JSON log record.
"""
import socket
import time
import os
import random
import json
from datetime import datetime, timezone

# Get the target host and port from environment variables
target_host = os.getenv('TARGET_HOST', 'localhost')
target_port = int(os.getenv('TARGET_PORT', 5140))

# Define the endpoint path
endpoint_path = "/loki/api/v1/raw"

# Upper bound (seconds) for any single send/receive once connected
SOCKET_TIMEOUT_SECONDS = 5

# Create a TCP socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    sock.connect((target_host, target_port))
except socket.error as e:
    print(f"Failed to connect to {target_host}:{target_port} - {e}")
    sock.close()
    exit(1)
sock.settimeout(SOCKET_TIMEOUT_SECONDS)

# Define log levels and messages
log_levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
messages = [
    "System started successfully",
    "User login successful",
    "Configuration loaded",
    "Connection to database failed",
    "Data processed successfully",
    "Invalid API request received",
    "Memory usage high",
    "Disk space low",
    "Unknown error occurred",
    "Service restarted",
]

# Define extra fields for the log payload
service_names = ["AuthService", "DataService", "PaymentService", "NotificationService"]
regions = ["us-east-1", "eu-west-1", "ap-south-1", "sa-east-1"]
server_ids = ["srv-101", "srv-202", "srv-303", "srv-404"]

# Generate and send JSON log messages every few seconds
try:
    while True:
        try:
            # UTC timestamp, second precision (timezone-aware replacement for
            # the deprecated datetime.utcnow(); the formatted text is the same)
            timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
            log_level = random.choice(log_levels)
            message_text = random.choice(messages)
            service_name = random.choice(service_names)
            region = random.choice(regions)
            server_id = random.choice(server_ids)
            code_line = random.randint(20, 120)  # Simulate random code line numbers

            # Create the JSON log payload
            log_payload = {
                "timestamp": timestamp,
                "severity": log_level,
                "body": message_text,
                "service_name": service_name,
                "code_line": code_line,
                "region": region,
                "server_id": server_id
            }

            # Convert the log payload to a JSON string, then to the exact bytes sent
            log_json = json.dumps(log_payload)
            body = log_json.encode("utf-8")

            # Build the HTTP POST request. Content-Length is taken from the
            # encoded body so it is a byte count, not a character count.
            http_headers = (
                f"POST {endpoint_path} HTTP/1.1\r\n"
                f"Host: {target_host}\r\n"
                "Content-Type: application/json\r\n"
                f"Content-Length: {len(body)}\r\n"
                "Connection: keep-alive\r\n"
                "\r\n"
            ).encode("ascii")

            # Send the HTTP request over TCP
            sock.sendall(http_headers + body)
            print(f"Sent JSON log message to {target_host}:{target_port} - {log_json}")

            # Read the server's reply so unread responses do not pile up on the
            # keep-alive connection and so rejected requests become visible.
            try:
                response = sock.recv(4096)
            except socket.timeout:
                print("No response received within the timeout")
            else:
                if not response:
                    print("Connection closed by server")
                    break
                status_line = response.split(b"\r\n", 1)[0].decode("ascii", "replace")
                parts = status_line.split(" ", 2)
                if len(parts) >= 2 and parts[0].startswith("HTTP/") and not parts[1].startswith("2"):
                    print(f"Unexpected response: {status_line}")
        except socket.error as e:
            print(f"Failed to send log message - {e}")
            break

        # Wait for a few seconds before sending the next message
        time.sleep(random.randint(3, 8))  # Send a message every 3-8 seconds
finally:
    # Release the connection however the loop ends
    sock.close()
version: '3.8'

# Mail House scenario: three log-producing depots -> Alloy -> Loki, with
# Grafana provisioned against Loki.
services:
  mail-house-01:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./main.py:/main.py
    depends_on:
      - alloy
    command: ["python3", "/main.py"]
    environment:
      - MAIL_HOUSE_ID=DEPOT-01
    restart: unless-stopped

  mail-house-02:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./main.py:/main.py
    depends_on:
      - alloy
    command: ["python3", "/main.py"]
    environment:
      - MAIL_HOUSE_ID=DEPOT-02
    restart: unless-stopped

  mail-house-03:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./main.py:/main.py
    depends_on:
      - alloy
    command: ["python3", "/main.py"]
    environment:
      - MAIL_HOUSE_ID=DEPOT-03
    restart: unless-stopped

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345
      - 4318:4318
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - loki

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # Write the datasource provisioning file with a here-document, then start
    # Grafana. The script runs under `sh -e`, so a plain `cat < file` on a
    # file that does not exist yet would abort the container.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh
"""Mail-house log generator.

Builds a random package-tracking record once per second and pushes it as JSON
in a hand-written HTTP/1.1 POST over a single keep-alive TCP connection to
``TARGET_HOST:TARGET_PORT``.
"""
import random
import json
import time
import socket
from datetime import datetime, timezone
import os

# Get the target host and port from environment variables
target_host = os.getenv('TARGET_HOST', 'alloy')
target_port = int(os.getenv('TARGET_PORT', 9999))

# Get the mail house ID from environment variables
mail_house_id = os.getenv('MAIL_HOUSE_ID', 'MAIL-01')

# Define the endpoint path
endpoint_path = "/loki/api/v1/raw"

# List of states and cities in America (abbreviated version)
STATES_CITIES = {
    "California": ["Los Angeles", "San Francisco", "San Diego"],
    "Texas": ["Houston", "Dallas", "Austin"],
    "New York": ["New York City", "Buffalo", "Rochester"],
    "Florida": ["Miami", "Orlando", "Tampa"],
    "Illinois": ["Chicago", "Springfield", "Naperville"],
}

# Package statuses and metadata
PACKAGE_SIZES = ["Small", "Medium", "Large"]
PACKAGE_TYPES = ["Documents", "Electronics", "Clothing", "Food", "Furniture"]
PACKAGE_STATUS_LEVELS = ["info", "warning", "critical", "error"]
PACKAGE_NOTES = [
    "In transit",
    "Out for delivery",
    "Delivered successfully",
    "Delayed due to weather",
    "Address not found",
    "Returned to sender",
    "Damaged during transit",
]


def generate_log_entry():
    """Return one random package-tracking record as a JSON-serializable dict.

    The ``timestamp`` is the current UTC time with microseconds and a ``Z``
    suffix. The sender lives in the record's city; the receiver lives in a
    random city of the same state.
    """
    state = random.choice(list(STATES_CITIES.keys()))
    city = random.choice(STATES_CITIES[state])
    log_entry = {
        # The Z suffix claims UTC, so take the time in UTC rather than local time
        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f") + "Z",
        "state": state,
        "city": city,
        "package_id": f"PKG{random.randint(10000, 99999)}",
        "package_type": random.choice(PACKAGE_TYPES),
        "package_size": random.choice(PACKAGE_SIZES),
        "package_status": random.choice(PACKAGE_STATUS_LEVELS),
        "note": random.choice(PACKAGE_NOTES),
        "sender": {
            "name": f"Sender{random.randint(1, 100)}",
            "address": f"{random.randint(100, 999)} {random.choice(['Main St', 'Broadway', 'Elm St', 'Maple Ave'])}, {city}, {state}",
        },
        "receiver": {
            "name": f"Receiver{random.randint(1, 100)}",
            "address": f"{random.randint(100, 999)} {random.choice(['Oak St', 'Pine Rd', 'Cedar Blvd', 'Willow Ln'])}, {random.choice(STATES_CITIES[state])}, {state}",
        },
        "mail_house_id": mail_house_id,
    }
    return log_entry


def _build_http_request(body):
    """Wrap the already-encoded JSON ``body`` (bytes) in an HTTP/1.1 POST."""
    head = (
        f"POST {endpoint_path} HTTP/1.1\r\n"
        f"Host: {target_host}\r\n"
        "Content-Type: application/json\r\n"
        # Content-Length counts bytes on the wire, not characters
        f"Content-Length: {len(body)}\r\n"
        "Connection: keep-alive\r\n"
        "\r\n"
    )
    return head.encode() + body


def _connect_with_retry():
    """Return a connected TCP socket, retrying once per second until it works.

    A loop replaces recursion so that a long outage cannot exhaust the
    recursion limit, and each failed socket is closed before the next attempt.
    """
    while True:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.connect((target_host, target_port))
            return sock
        except socket.error as e:
            print(f"Failed to connect to {target_host}:{target_port} - {e}")
            sock.close()
            time.sleep(1)


def main():
    """Connect to the target and send one generated log record per second.

    Exits with status 1 on any socket error after the connection is up.
    """
    sock = _connect_with_retry()

    while True:
        try:
            log_entry = generate_log_entry()
            log_entry_json = json.dumps(log_entry)

            # Send the HTTP request over TCP
            sock.sendall(_build_http_request(log_entry_json.encode()))
            print(f"Sent JSON log message to {target_host}:{target_port} - {log_entry_json}")

            # Drain the response: at one request per second, unread responses
            # on a keep-alive connection accumulate in the receive buffer.
            # An empty read means the peer closed the connection.
            if not sock.recv(4096):
                raise ConnectionError("connection closed by server")

            # Wait for a few seconds before sending the next log
            time.sleep(1)

        except socket.error as e:
            print(f"Failed to send log message - {e}")
            # Close the socket and exit
            sock.close()
            exit(1)


if __name__ == "__main__":
    main()
================================================ FILE: memcached-monitoring/README.md ================================================ # Memcached Monitoring with Grafana Alloy This scenario demonstrates how to monitor a Memcached instance using Grafana Alloy's built-in `prometheus.exporter.memcached` component. ## Architecture - **Memcached** - The monitored Memcached instance - **Grafana Alloy** - Collects Memcached metrics via `prometheus.exporter.memcached` and remote writes them to Prometheus - **Prometheus** - Stores the scraped metrics - **Grafana** - Visualizes Memcached metrics (auto-provisioned with Prometheus datasource) ## Running ```bash # From this directory docker compose up -d # Or from the repo root using centralized image versions ./run-example.sh memcached-monitoring ``` ## Accessing - **Grafana**: http://localhost:3000 (no login required) - **Alloy UI**: http://localhost:12345 - **Prometheus**: http://localhost:9090 ## Key Metrics Once running, you can query Memcached metrics in Grafana or Prometheus. Some useful metrics include: - `memcached_up` - Whether Memcached is reachable - `memcached_current_connections` - Number of current connections - `memcached_current_bytes` - Current number of bytes stored - `memcached_current_items` - Current number of items stored - `memcached_commands_total` - Total commands by command type (get, set, etc.) - `memcached_items_evicted_total` - Total number of items evicted - `memcached_read_bytes_total` / `memcached_written_bytes_total` - Network throughput ## Stopping ```bash docker compose down ``` ================================================ FILE: memcached-monitoring/config.alloy ================================================ // Memcached Monitoring with Grafana Alloy // This configuration scrapes Memcached metrics using the built-in prometheus.exporter.memcached component // and remote writes them to Prometheus. 
# Memcached monitoring scenario: Memcached -> Alloy -> Prometheus, with
# Grafana provisioned against Prometheus.
services:
  memcached:
    image: memcached:1.6@sha256:277e0c4f249b118e95ab10e535bae2fa1af772271d9152f3468e58d59348db56
    ports:
      - "11211:11211"

  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # Write the datasource provisioning file with a here-document, then start
    # Grafana. The script runs under `sh -e`, so a plain `cat < file` on a
    # file that does not exist yet would abort the container.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - memcached
      - prometheus
memcached-monitoring/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s ================================================ FILE: mysql-monitoring/README.md ================================================ # MySQL Monitoring with Grafana Alloy This scenario demonstrates how to monitor a MySQL database using Grafana Alloy's `prometheus.exporter.mysql` component. Alloy scrapes MySQL metrics and remote-writes them to Prometheus, which Grafana queries for visualization. ## Prerequisites - Docker and Docker Compose installed ## Getting Started ```bash git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios/mysql-monitoring docker compose up -d ``` ## Access Points | Service | URL | |------------|------------------------------| | Grafana | http://localhost:3000 | | Alloy UI | http://localhost:12345 | | Prometheus | http://localhost:9090 | ## What to Expect Once the stack is running, Alloy connects to the MySQL instance and exposes metrics via the `prometheus.exporter.mysql` component. These metrics are scraped every 15 seconds and forwarded to Prometheus using remote write. Open Grafana at http://localhost:3000, navigate to **Explore**, select the **Prometheus** datasource, and query for `mysql_` prefixed metrics (e.g., `mysql_up`, `mysql_global_status_connections`, `mysql_global_status_threads_connected`). You can also inspect the Alloy pipeline at http://localhost:12345 to verify that the exporter, scrape, and remote write components are healthy. Live debugging is enabled for real-time pipeline inspection. ## Stopping the Scenario ```bash docker compose down ``` ================================================ FILE: mysql-monitoring/config.alloy ================================================ // ############################### // #### Metrics Configuration #### // ############################### // Enable live debugging for the Alloy UI. 
# MySQL monitoring scenario: MySQL -> Alloy -> Prometheus, with Grafana
# provisioned against Prometheus.
services:
  mysql:
    image: mysql:9.7@sha256:f0ef1d92fa650fcfa5b85f1d82bb1a56a6dd579bf256b8f8f2a5a0b1b61c8b0b
    environment:
      - MYSQL_ROOT_PASSWORD=alloy
      - MYSQL_DATABASE=alloy
    ports:
      - "3306:3306"
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root", "-palloy"]
      interval: 10s
      timeout: 5s
      retries: 5

  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # Write the datasource provisioning file with a here-document, then start
    # Grafana. The script runs under `sh -e`, so a plain `cat < file` on a
    # file that does not exist yet would abort the container.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    # Start only after the MySQL healthcheck defined above passes
    depends_on:
      mysql:
        condition: service_healthy
## Architecture - **NGINX** — the monitored web server (`/nginx_status` enabled, access/error logs written to a shared volume) - **nginx-prometheus-exporter** — translates `stub_status` into Prometheus metrics on `:9113` - **loadgen** — small `curl` loop that hits NGINX once per second so the demo has visible activity (200s and 404s) - **Grafana Alloy** — the pipeline above, exposed at `:12345` - **Loki / Prometheus / Grafana** — backends and visualization, with Loki and Prometheus datasources auto-provisioned ## Running ```bash # From this directory docker compose up -d # Or from the repo root using centralized image versions ./run-example.sh nginx-monitoring ``` ## Accessing - **Grafana**: http://localhost:3000 (no login required) - **Alloy UI**: http://localhost:12345 — verify components are healthy and inspect the live data flow - **Prometheus**: http://localhost:9090 - **NGINX**: http://localhost:8080 — `/` returns "ok", `/nginx_status` returns connection counters ## Trying it out The `loadgen` container hits NGINX once per second (alternating a 200 response and a 404). Within ~30 seconds you should see: ### Logs (Loki) ```logql # All access logs {job="nginx", log_type="access"} # Just 4xx {job="nginx", log_type="access", status=~"4.."} # Error log {job="nginx", log_type="error"} ``` The combined-log regex extracts `remote_addr`, `time_local`, `method`, `path`, `status`, and `bytes_sent`. Of those, `method` and `status` are promoted to Loki labels for fast filtering; the rest stay in the line text. ### Metrics (Prometheus) ```promql # Active connections nginx_connections_active # Accepted-since-start counter (per second) rate(nginx_connections_accepted[1m]) # Total HTTP requests nginx_http_requests_total ``` ## Customization - **Different log format**: edit the regex in `config.alloy` under `loki.process.nginx`. The default expects NGINX's built-in `combined` format. 
// NGINX Monitoring with Grafana Alloy.
// Logs: tail access.log + error.log via loki.source.file, parse the access log
// with a combined-format regex, and ship to Loki with method/status labels.
// Metrics: scrape nginx-prometheus-exporter and remote_write to Prometheus.

livedebugging {
  enabled = true
}

// --- logs pipeline ---

// Both log files carry job="nginx"; log_type tells access from error lines.
local.file_match "nginx" {
  path_targets = [
    {
      __path__ = "/var/log/nginx-data/access.log",
      job      = "nginx",
      log_type = "access",
    },
    {
      __path__ = "/var/log/nginx-data/error.log",
      job      = "nginx",
      log_type = "error",
    },
  ]
  sync_period = "5s"
}

loki.source.file "nginx" {
  targets       = local.file_match.nginx.targets
  forward_to    = [loki.process.nginx.receiver]
  tail_from_end = true
}

loki.process "nginx" {
  // Extract `method` and `status` from access logs (combined format).
  // Error logs pass through unchanged.
  stage.match {
    selector = "{log_type=\"access\"}"

    // Every capture group must be named: stage.labels below looks values up
    // by name, and a bare `(?P` without `<name>` is not a valid expression.
    stage.regex {
      expression = `^(?P<remote_addr>\S+) - (?P<remote_user>\S+) \[(?P<time_local>[^\]]+)\] "(?P<method>\S+) (?P<path>\S+) [^"]+" (?P<status>\d+) (?P<bytes_sent>\d+)`
    }

    // Only method and status become labels; the other captures are not
    // referenced by any later stage.
    stage.labels {
      values = {
        method = "",
        status = "",
      }
    }
  }

  forward_to = [loki.write.local.receiver]
}

loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

// --- metrics pipeline ---

prometheus.scrape "nginx" {
  targets = [{
    __address__ = "nginx-exporter:9113",
    job         = "nginx",
  }]
  forward_to      = [prometheus.remote_write.local.receiver]
  scrape_interval = "15s"
}

prometheus.remote_write "local" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}
worker_processes 1;

events {
    worker_connections 1024;
}

http {
    # Use the nginx built-in "combined" log format:
    #   '$remote_addr - $remote_user [$time_local] "$request" '
    #   '$status $body_bytes_sent "$http_referer" "$http_user_agent"'
    #
    # Write to a fresh path outside /var/log/nginx — that directory
    # has access.log/error.log pre-symlinked to /dev/stdout in the
    # nginx image, which Alloy's tailer cannot follow across containers.
    access_log /var/log/nginx-data/access.log combined;
    error_log  /var/log/nginx-data/error.log warn;

    server {
        listen 80;
        server_name _;

        # Plain-text 200 for the load generator's "good" request.
        location = / {
            # Set the type with default_type: add_header would append a
            # second Content-Type header next to the one nginx already emits.
            default_type text/plain;
            return 200 "ok\n";
        }

        # Deterministic 404 for the load generator's "bad" request.
        location = /missing-page {
            return 404;
        }

        # stub_status endpoint; access logging is off for this location.
        location /nginx_status {
            stub_status on;
            access_log off;
        }
    }
}
Access Prometheus at http://localhost:9090 ## What to Expect The demo application provides several endpoints that generate different types of traces: - **/simple**: Generates a simple trace with a single span - **/nested**: Generates a trace with nested spans (parent-child relationships) - **/error**: Generates a trace that includes an error - **/chain**: Simulates a chain of service calls to demonstrate distributed tracing After accessing these endpoints, you can view the traces in Grafana by: 1. Opening http://localhost:3000 2. Navigating to Explore 3. Selecting the Tempo data source 4. Using the Search tab to find and visualize traces ## Service Graphs This example includes service graph visualization capabilities. As you generate traces with the demo app (especially with the `/chain` endpoint), Tempo will generate service graph metrics that are sent to Prometheus. To view the service graph: 1. Open Grafana (http://localhost:3000) 2. Navigate to Explore 3. Select the Tempo data source 4. Click on the "Service Graph" tab 5. You should see a visual representation of the relationships between services ## Architecture ``` ┌────────────┐ ┌──────────┐ ┌───────┐ ┌─────────┐ │ Demo App │────▶│ Alloy │─────▶│ Tempo │─────▶│ Grafana │ │ (OTel SDK) │ │ │ │ │ │ │ └────────────┘ └──────────┘ └───┬───┘ └─────────┘ │ ▲ ▼ │ ┌─────────┐ │ │Prometheus│───────────┘ └─────────┘ ``` The Demo App generates traces using the OpenTelemetry SDK and sends them to Alloy, which processes and forwards them to Tempo. Tempo generates service graph metrics and sends them to Prometheus. Grafana queries both Tempo and Prometheus to visualize traces and service graphs. ## Customizing The Alloy configuration is a simple placeholder. You can modify `config.alloy` to add processors, filters, or additional exporters to demonstrate more complex telemetry pipelines. 
================================================ FILE: otel-basic-tracing/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . CMD ["python", "app.py"] ================================================ FILE: otel-basic-tracing/app/app.py ================================================ import os import random import time from flask import Flask, request from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.resources import SERVICE_NAME, Resource from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.instrumentation.requests import RequestsInstrumentor import requests # Configure the tracer resource = Resource.create(attributes={ SERVICE_NAME: "trace-demo" }) trace.set_tracer_provider(TracerProvider(resource=resource)) # Configure the OTLP exporter using environment variables # OTEL_EXPORTER_OTLP_ENDPOINT will be used automatically otlp_exporter = OTLPSpanExporter(endpoint="http://alloy:4317/v1/traces", insecure=True) span_processor = BatchSpanProcessor(span_exporter=otlp_exporter, max_export_batch_size=1) trace.get_tracer_provider().add_span_processor(span_processor) # Create a tracer tracer = trace.get_tracer(__name__) # Create a Flask application app = Flask(__name__) # Instrument Flask FlaskInstrumentor().instrument_app(app) # Instrument requests RequestsInstrumentor().instrument() @app.route('/') def home(): return """

OpenTelemetry Demo

This app demonstrates OpenTelemetry tracing with Grafana Alloy.

""" @app.route('/simple') def simple_trace(): with tracer.start_as_current_span("simple-operation") as span: span.set_attribute("operation.type", "simple") span.set_attribute("operation.value", random.randint(1, 100)) time.sleep(0.1) # Simulate work return {"status": "ok", "message": "Simple trace generated"} @app.route('/nested') def nested_trace(): with tracer.start_as_current_span("parent-operation") as parent: parent.set_attribute("operation.type", "parent") time.sleep(0.05) # Simulate work with tracer.start_as_current_span("child-operation-1") as child1: child1.set_attribute("operation.type", "child") child1.set_attribute("child.number", 1) time.sleep(0.05) # Simulate work with tracer.start_as_current_span("child-operation-2") as child2: child2.set_attribute("operation.type", "child") child2.set_attribute("child.number", 2) time.sleep(0.05) # Simulate work with tracer.start_as_current_span("grandchild-operation") as grandchild: grandchild.set_attribute("operation.type", "grandchild") time.sleep(0.05) # Simulate work return {"status": "ok", "message": "Nested trace generated"} @app.route('/error') def error_trace(): with tracer.start_as_current_span("error-operation") as span: span.set_attribute("operation.type", "error") try: # Simulate an error result = 1 / 0 except Exception as e: span.record_exception(e) span.set_status(trace.StatusCode.ERROR, str(e)) return {"status": "error", "message": "Error trace generated"} @app.route('/chain') def chain_trace(): with tracer.start_as_current_span("chain-root") as span: span.set_attribute("operation.step", "start") # Simulate a chain of service calls try: # Call ourselves to simulate microservice calls # In a real world example these would be different services service_b_url = f"http://localhost:8080/service/b?id={random.randint(1000, 9999)}" response = requests.get(service_b_url) return {"status": "ok", "message": "Chain trace generated", "data": response.json()} except Exception as e: span.record_exception(e) 
@app.route('/service/b')
def service_b():
    """Simulated "service B": do a little work, then call ``/service/c``.

    Reads the ``id`` query parameter (default ``'unknown'``), records it on a
    ``service-b-handler`` span, and returns a dict wrapping service C's JSON
    response. Any ``requests`` error propagates to the caller.
    """
    req_id = request.args.get('id', 'unknown')

    with tracer.start_as_current_span("service-b-handler") as span:
        span.set_attribute("service", "B")
        span.set_attribute("request.id", req_id)
        time.sleep(0.1)  # Simulate work

        # Call service C. The id comes straight from the query string, so let
        # requests URL-encode it instead of splicing it into the URL, and
        # bound the wait so a stuck downstream call cannot hang this handler.
        response = requests.get(
            "http://localhost:8080/service/c",
            params={"id": req_id},
            timeout=5,
        )

        return {"status": "ok", "message": "Service B completed", "data": response.json()}
span.set_attribute("service", "A") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") time.sleep(0.1) # Normal latency # Call service B service_b_url = f"http://localhost:8080/delayed/service-b?id={req_id}" response = requests.get(service_b_url) return {"status": "ok", "message": "Service A completed", "data": response.json()} @app.route('/delayed/service-b') def delayed_service_b(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-b-handler") as span: span.set_attribute("service", "B") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") time.sleep(0.15) # Normal latency # Call service C service_c_url = f"http://localhost:8080/delayed/service-c?id={req_id}" response = requests.get(service_c_url) return {"status": "ok", "message": "Service B completed", "data": response.json()} @app.route('/delayed/service-c') def delayed_service_c(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-c-handler") as span: span.set_attribute("service", "C") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") time.sleep(0.2) # Normal latency # Call the slow service D service_d_url = f"http://localhost:8080/delayed/service-d?id={req_id}" response = requests.get(service_d_url) return {"status": "ok", "message": "Service C completed", "data": response.json()} @app.route('/delayed/service-d') def delayed_service_d(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-d-handler") as span: span.set_attribute("service", "D") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "high") span.set_attribute("latency.category", "bottleneck") # This service consistently has high latency (3-4 seconds) delay = random.uniform(3.0, 4.0) span.set_attribute("latency.seconds", delay) time.sleep(delay) # High latency # Call final service E service_e_url = 
f"http://localhost:8080/delayed/service-e?id={req_id}" response = requests.get(service_e_url) return {"status": "ok", "message": "Service D completed (with delay)", "data": response.json()} @app.route('/delayed/service-e') def delayed_service_e(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-e-handler") as span: span.set_attribute("service", "E") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") time.sleep(0.1) # Normal latency return {"status": "ok", "message": "Service E completed (chain end)"} if __name__ == '__main__': app.run(host='0.0.0.0', port=8080) ================================================ FILE: otel-basic-tracing/app/requirements.txt ================================================ flask requests opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests ================================================ FILE: otel-basic-tracing/config-otel.yaml ================================================ # # OTel Collector YAML Configuration for OpenTelemetry Trace Collection # # This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine. 
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: batch: {} exporters: otlp/tempo: endpoint: tempo:4317 tls: insecure: true service: pipelines: traces: receivers: [otlp] processors: [batch] exporters: [otlp/tempo] ================================================ FILE: otel-basic-tracing/config.alloy ================================================ /* * Alloy Configuration for OpenTelemetry Trace Collection */ otelcol.receiver.otlp "default" { http {} grpc {} output { traces = [otelcol.processor.batch.default.input] } } otelcol.processor.batch "default" { output { traces = [otelcol.exporter.otlp.tempo.input] } } otelcol.exporter.otlp "tempo" { client { endpoint = "tempo:4317" tls { insecure = true } } } ================================================ FILE: otel-basic-tracing/docker-compose-otel.yml ================================================ # OTel Engine Override # # Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config # instead of the River/HCL config.alloy file. 
# # Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # Stop: docker compose -f docker-compose.yml -f docker-compose-otel.yml down # services: alloy: command: otel --config=/etc/alloy/config-otel.yaml volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel Engine HTTP server - 4317:4317 # OTLP gRPC - 4318:4318 # OTLP HTTP # Override demo-app endpoint to use standard OTLP gRPC port demo-app: environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=demo-service - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo ================================================ FILE: otel-basic-tracing/docker-compose.coda.yml ================================================ services: demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:12345 - OTEL_SERVICE_NAME=demo-service - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo ================================================ FILE: otel-basic-tracing/docker-compose.yml ================================================ version: '3.8' services: # Prometheus for metrics collection prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --enable-feature=native-histograms - --enable-feature=exemplar-storage - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml # Tempo for tracing tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200/tcp # tempo volumes: - ./tempo-config.yaml:/etc/tempo.yaml depends_on: - prometheus memcached: image: 
memcached:1.6.40@sha256:572b011ce33954ee809066d8cecbeb3ec98912109ee3be3663a3197425fd81ac container_name: memcached ports: - "11211:11211" environment: - MEMCACHED_MAX_MEMORY=64m # Set the maximum memory usage - MEMCACHED_THREADS=4 # Number of threads to use # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: false version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: true version: 1 editable: false jsonData: serviceMap: datasourceUid: 'Prometheus' nodeGraph: enabled: true EOF /run.sh depends_on: - prometheus - tempo # Alloy for telemetry pipeline alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 # Alloy HTTP server volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy # Demo app that generates OpenTelemetry traces demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:12345 - OTEL_SERVICE_NAME=demo-service - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo ================================================ FILE: otel-basic-tracing/prom-config.yaml ================================================ global: 
scrape_interval: 15s evaluation_interval: 15s ================================================ FILE: otel-basic-tracing/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info cache: background: writeback_goroutines: 5 caches: - roles: - frontend-search memcached: addresses: dns+memcached:11211 query_frontend: search: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 metadata_slo: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 trace_by_id: duration_slo: 100ms metrics: max_duration: 200h # maximum duration of a metrics query, increase for local setups query_backend_after: 5m duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 distributor: receivers: # this configuration will listen on all ports and protocols that tempo is capable of. jaeger: # the receives all come from the OpenTelemetry collector. more configuration information can protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver thrift_http: # endpoint: "tempo:14268" # for a production deployment you should only enable the receivers you need! grpc: endpoint: "tempo:14250" thrift_binary: endpoint: "tempo:6832" thrift_compact: endpoint: "tempo:6831" zipkin: endpoint: "tempo:9411" otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" opencensus: endpoint: "tempo:55678" ingester: max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally compactor: compaction: block_retention: 720h # overall Tempo trace retention. 
set for demo purposes metrics_generator: registry: external_labels: source: tempo cluster: docker-compose storage: path: /var/tempo/generator/wal remote_write: - url: http://prometheus:9090/api/v1/write send_exemplars: true traces_storage: path: /var/tempo/generator/traces processor: local_blocks: filter_server_spans: false flush_to_storage: true storage: trace: backend: local # backend configuration to use wal: path: /var/tempo/wal # where to store the wal locally local: path: /var/tempo/blocks overrides: defaults: metrics_generator: processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator generate_native_histograms: both ================================================ FILE: otel-examples/README.md ================================================ # OTel Engine Examples These scenarios use the **Alloy OTel Engine** -- an experimental feature introduced in Alloy v1.14 that lets you run standard [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) YAML configurations directly inside Alloy. Instead of writing Alloy's River/HCL syntax, you use the exact same YAML format that the upstream OTel Collector uses. ## What is the Alloy OTel Engine? Grafana Alloy has traditionally used its own **River** configuration language (HCL-like syntax in `config.alloy` files). Starting with v1.14, Alloy ships an experimental **OTel Engine** that accepts standard OTel Collector YAML. 
This means: - **No new language to learn** -- if you already know OTel Collector config, you can use Alloy directly - **Copy-paste from upstream docs** -- OTel Collector examples work as-is - **Migration path** -- move from vanilla OTel Collector to Alloy without rewriting configs - **Best of both worlds** -- Alloy's single-binary distribution with OTel Collector's YAML config The OTel Engine is started with: ```bash alloy otel --config=<CONFIG_FILE> ``` You can validate configs before running: ```bash alloy otel validate --config=<CONFIG_FILE> ``` ## Running These Examples Each scenario has a `docker-compose.yml` with the full stack: ```bash cd <scenario> && docker compose up -d ``` Or from the repo root with centralized image versions: ```bash cd otel-examples/<scenario> && docker compose --env-file ../../image-versions.env up -d ``` ### Access the stack - **Grafana**: [http://localhost:3000](http://localhost:3000) (no login required) - **Alloy UI**: [http://localhost:12345](http://localhost:12345) (pipeline debugging UI, enabled via the `alloyengine` extension) ### Stop ```bash docker compose down ``` ## Scenarios | Scenario | Description | Key OTel Components | |----------|-------------|-------------------| | [filelog-processing](filelog-processing/) | Collect and parse mixed-format log files (JSON + plaintext) using the filelog receiver's operator chain | `filelog` receiver, `json_parser`, `regex_parser`, `severity_parser` operators | | [pii-redaction](pii-redaction/) | Scrub credit cards, emails, and IP addresses from traces and logs using OTTL `replace_pattern` | `transform` processor (OTTL) | | [routing-multi-tenant](routing-multi-tenant/) | Route logs to different Loki tenants based on resource attributes using fan-out + filter | `forward` connector, `filter` processor, `resource` processor | | [cost-control](cost-control/) | Drop health checks, filter debug logs, and apply head-based sampling to reduce telemetry volume | `filter` processor, `probabilistic_sampler` processor | 
[resource-enrichment](resource-enrichment/) | Auto-discover and attach host/OS/Docker metadata to all telemetry signals | `resourcedetection` processor (env, system, docker) | | [count-connector](count-connector/) | Derive count metrics (request rate, error rate) from traces and logs | `count` connector | | [ottl-transform](ottl-transform/) | A cookbook of OTTL patterns: JSON parsing, severity mapping, attribute promotion, truncation | `transform` processor (OTTL) | | [host-metrics](host-metrics/) | Collect CPU, memory, disk, network metrics -- an OTel-native replacement for node_exporter | `hostmetrics` receiver | | [multi-pipeline-fanout](multi-pipeline-fanout/) | Send traces to two backends with different processing per destination (full vs. sampled) | `forward` connector, `probabilistic_sampler` processor | | [kafka-buffer](kafka-buffer/) | Buffer traces through Kafka for durability and backpressure handling | `kafka` receiver/exporter | ## Alloy UI and the `alloyengine` Extension Each scenario includes an `alloyengine` extension in `config-otel.yaml` that starts the Alloy River UI alongside the OTel pipeline. This gives you the visual pipeline debugging UI at [http://localhost:12345](http://localhost:12345). If you prefer a pure OTel Collector config without the Alloy UI, you can remove the `alloyengine` extension block and the `extensions: [alloyengine]` line from the `service:` section in `config-otel.yaml`, and drop the `config.alloy` volume mount from `docker-compose.yml`. The OTel pipeline will work identically -- you just won't have the UI. ## OTel Engine vs. River Configs For comparison, the parent repo's existing scenarios (e.g., `otel-basic-tracing/`, `otel-span-metrics/`) also have OTel YAML alternatives alongside their River configs. 
Run those with: ```bash docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d ``` ## Available Connectors The Alloy OTel Engine supports these connectors: `count`, `grafanacloud`, `servicegraph`, `spanmetrics`, `forward`. ## Further Reading - [Alloy OTel Engine Documentation](https://grafana.com/docs/alloy/latest/set-up/otel_engine/) - [OpenTelemetry Collector Configuration](https://opentelemetry.io/docs/collector/configuration/) - [OTTL (OpenTelemetry Transformation Language)](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/pkg/ottl) ================================================ FILE: otel-examples/cost-control/README.md ================================================ # Telemetry Cost Control Reduce observability costs by filtering noisy telemetry and applying probabilistic sampling in the Alloy OTel pipeline, before data reaches your backends. ## What This Demonstrates - **Filter processor** to drop unwanted spans (health checks, readiness probes, metrics endpoints) - **Filter processor** to drop low-severity logs (DEBUG level) - **Probabilistic sampler** for head-based trace sampling (keeps 25% of remaining traces) - **Transform processor** to strip high-cardinality attributes (`http.user_agent`, cookies) that inflate storage ## Prerequisites - Docker and Docker Compose ## Run ```bash docker compose up -d ``` ## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore Open Grafana at [http://localhost:3000](http://localhost:3000). ### Verify filtering is working 1. **Traces (Tempo):** Go to Explore > Tempo. Search for traces from `cost-control-demo`. 
You should see `/api/order` and `/api/error` spans but **no** `/health`, `/ready`, or `/metrics` spans -- those are dropped by the filter processor. 2. **Logs (Loki):** Go to Explore > Loki. Query `{service_name="cost-control-demo"}`. You should see INFO and ERROR logs but **no** DEBUG logs. 3. **Sampling:** Only ~25% of the remaining (non-filtered) traces make it through. Compare the demo app's request rate with the trace count in Tempo to see the reduction. ### Sample Loki query ```logql {service_name="cost-control-demo"} | json ``` ### Check the Alloy OTel pipeline Visit the Alloy OTel HTTP server at [http://localhost:8888](http://localhost:8888). ## Key Configuration The `config-otel.yaml` pipeline applies four cost-control stages: 1. **`filter/traces`** -- Drops spans where `http.target` or `http.route` is `/health` or `/ready`, or where `http.target` is `/metrics`. These high-frequency probes generate enormous trace volume with no diagnostic value. 2. **`filter/logs`** -- Drops log records with `severity_number < 9` (below INFO). DEBUG logs are useful in development but costly at scale. 3. **`probabilistic_sampler`** -- Keeps 25% of remaining traces via consistent head-based sampling. Adjust `sampling_percentage` to trade off between cost and visibility. 4. **`transform/strip`** -- Removes `http.user_agent` and `http.request.header.cookie` attributes from spans. These high-cardinality fields consume significant index and storage space. ## Stop ```bash docker compose down ``` ================================================ FILE: otel-examples/cost-control/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . 
CMD ["python", "app.py"] ================================================ FILE: otel-examples/cost-control/app/app.py ================================================ """ Demo Flask app for the cost-control scenario. Generates a noisy mix of telemetry: frequent health/ready checks, DEBUG logs, and occasional real business traces. The Alloy OTel pipeline filters out the noise using filter processors and probabilistic sampling. """ import logging import random import threading import time from flask import Flask, jsonify from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.sdk.resources import Resource from opentelemetry.trace import StatusCode # --- OTel Setup --- resource = Resource.create({ "service.name": "cost-control-demo", "service.version": "1.0.0", }) # Traces tracer_provider = TracerProvider(resource=resource) tracer_provider.add_span_processor( BatchSpanProcessor(OTLPSpanExporter(endpoint="alloy:4317", insecure=True)) ) trace.set_tracer_provider(tracer_provider) tracer = trace.get_tracer(__name__) # Logs via OTel logger_provider = LoggerProvider(resource=resource) logger_provider.add_log_record_processor( BatchLogRecordProcessor(OTLPLogExporter(endpoint="alloy:4317", insecure=True)) ) handler = LoggingHandler(level=logging.DEBUG, logger_provider=logger_provider) logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("cost-control-demo") logger.addHandler(handler) # --- Flask App --- app = Flask(__name__) FlaskInstrumentor().instrument_app(app) @app.route("/health") def 
health(): """Noisy health check endpoint - called very frequently.""" logger.debug("Health check OK") return jsonify({"status": "healthy"}) @app.route("/ready") def ready(): """Noisy readiness probe endpoint.""" logger.debug("Readiness check OK") return jsonify({"status": "ready"}) @app.route("/api/order") def order(): """Real business endpoint that produces useful traces.""" with tracer.start_as_current_span("process-order") as span: order_id = f"ORD-{random.randint(1000, 9999)}" span.set_attribute("order.id", order_id) span.set_attribute("order.amount", round(random.uniform(10.0, 500.0), 2)) span.set_attribute("customer.tier", random.choice(["gold", "silver", "bronze"])) # Simulate processing time time.sleep(random.uniform(0.05, 0.2)) logger.info("Order %s processed successfully", order_id) return jsonify({"order_id": order_id, "status": "completed"}) @app.route("/api/error") def error(): """Endpoint that occasionally generates errors.""" with tracer.start_as_current_span("handle-error") as span: error_code = random.choice(["TIMEOUT", "INVALID_INPUT", "DB_ERROR"]) span.set_attribute("error.code", error_code) span.set_status(StatusCode.ERROR, f"Simulated error: {error_code}") span.record_exception(Exception(f"Simulated {error_code}")) logger.error("Request failed with error: %s", error_code) return jsonify({"error": error_code}), 500 def load_generator(): """Background thread that generates traffic with a noisy distribution.""" import requests base_url = "http://localhost:8080" # Wait for Flask to start time.sleep(5) while True: r = random.random() try: if r < 0.70: requests.get(f"{base_url}/health", timeout=2) elif r < 0.80: requests.get(f"{base_url}/ready", timeout=2) elif r < 0.95: requests.get(f"{base_url}/api/order", timeout=2) else: requests.get(f"{base_url}/api/error", timeout=2) except Exception: pass # Also emit frequent DEBUG logs (noise) logger.debug("Background tick at %s", time.time()) time.sleep(random.uniform(0.2, 1.0)) if __name__ == "__main__": 
thread = threading.Thread(target=load_generator, daemon=True) thread.start() app.run(host="0.0.0.0", port=8080) ================================================ FILE: otel-examples/cost-control/app/requirements.txt ================================================ flask opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask ================================================ FILE: otel-examples/cost-control/config-otel.yaml ================================================ # # OTel Collector YAML: Telemetry Cost Control # # Demonstrates using filter and probabilistic_sampler processors # to drop noisy telemetry (health checks, debug logs) and apply # head-based sampling to reduce observability costs. # extensions: alloyengine: config: file: /etc/alloy/config.alloy flags: server.http.listen-addr: 0.0.0.0:12345 receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: # Drop health check and readiness probe spans filter/traces: error_mode: ignore traces: span: - attributes["http.target"] == "/health" - attributes["http.target"] == "/ready" - attributes["http.target"] == "/metrics" - attributes["http.route"] == "/health" - attributes["http.route"] == "/ready" # Drop DEBUG-level logs filter/logs: error_mode: ignore logs: log_record: - severity_number < 9 # Head-based probabilistic sampling: keep 25% of remaining traces probabilistic_sampler: sampling_percentage: 25 # Strip high-cardinality attributes to reduce storage transform/strip: error_mode: ignore trace_statements: - context: span statements: - delete_key(attributes, "http.user_agent") - delete_key(attributes, "http.request.header.cookie") batch: {} exporters: otlp/tempo: endpoint: tempo:4317 tls: insecure: true otlphttp/loki: endpoint: http://loki:3100/otlp debug: verbosity: basic service: extensions: [alloyengine] pipelines: traces: receivers: [otlp] processors: [filter/traces, probabilistic_sampler, transform/strip, batch] 
exporters: [otlp/tempo] logs: receivers: [otlp] processors: [filter/logs, batch] exporters: [otlphttp/loki, debug] ================================================ FILE: otel-examples/cost-control/config.alloy ================================================ // Minimal Alloy config to enable the Alloy UI alongside the OTel Engine. // The OTel pipeline is defined in config-otel.yaml. ================================================ FILE: otel-examples/cost-control/docker-compose.coda.yml ================================================ services: demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 - OTEL_SERVICE_NAME=cost-control-demo ================================================ FILE: otel-examples/cost-control/docker-compose.yml ================================================ version: '3.8' services: # Loki for log storage loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} command: -config.file=/etc/loki/local-config.yaml ports: - 3100:3100/tcp volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml # Tempo for trace storage tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200/tcp volumes: - ./tempo-config.yaml:/etc/tempo.yaml # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: true 
version: 1 editable: false EOF /run.sh depends_on: - loki - tempo # Alloy in OTel engine mode alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} command: otel --config=/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel engine HTTP server - 4317:4317 # OTLP gRPC - 4318:4318 # OTLP HTTP - 12345:12345 # Alloy UI volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml - ./config.alloy:/etc/alloy/config.alloy depends_on: - loki - tempo # Demo app that generates noisy telemetry demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=cost-control-demo depends_on: - alloy ================================================ FILE: otel-examples/cost-control/loki-config.yaml ================================================ auth_enabled: false server: http_listen_port: 3100 grpc_listen_port: 9096 log_level: info common: instance_addr: 127.0.0.1 path_prefix: /tmp/storage storage: filesystem: chunks_directory: /tmp/storage/chunks rules_directory: /tmp/storage/rules replication_factor: 1 ring: kvstore: store: inmemory query_range: results_cache: cache: embedded_cache: enabled: true max_size_mb: 100 limits_config: metric_aggregation_enabled: true schema_config: configs: - from: 2020-10-24 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h pattern_ingester: enabled: true metric_aggregation: loki_address: localhost:3100 ruler: alertmanager_url: http://localhost:9093 frontend: encoding: protobuf ================================================ FILE: otel-examples/cost-control/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info distributor: receivers: otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" ingester: max_block_duration: 5m compactor: compaction: block_retention: 720h 
storage: trace: backend: local wal: path: /var/tempo/wal local: path: /var/tempo/blocks overrides: defaults: {} ================================================ FILE: otel-examples/count-connector/README.md ================================================ # Count Connector (Derive Metrics from Signals) Use the OTel count connector to automatically derive count metrics from traces and logs -- the "metrics from signals" pattern -- without additional instrumentation. ## What This Demonstrates - **Count connector** deriving metrics from trace spans and log records - Generating error rate metrics (`span.error.count`, `log.error.count`) from signal status codes - Generating volume metrics (`span.count`, `log.count`) for throughput monitoring - Routing derived metrics to Prometheus while original signals go to Tempo and Loki ## Prerequisites - Docker and Docker Compose ## Run ```bash docker compose up -d ``` ## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore Open Grafana at [http://localhost:3000](http://localhost:3000). ### View derived metrics in Prometheus Go to Explore > Prometheus and query the following metrics: ```promql # Total span count (rate per second) rate(span_count_total[5m]) # Error span count (rate per second) rate(span_error_count_total[5m]) # Error rate as a percentage rate(span_error_count_total[5m]) / rate(span_count_total[5m]) * 100 # Total log record count rate(log_count_total[5m]) # Error log count rate(log_error_count_total[5m]) ``` ### View original traces in Tempo Go to Explore > Tempo and search for `count-connector-demo` traces. You will see both successful (OK) and error traces. 
### View original logs in Loki Go to Explore > Loki and query: ```logql {service_name="count-connector-demo"} | json ``` ### Check the Alloy OTel pipeline Visit the Alloy OTel HTTP server at [http://localhost:8888](http://localhost:8888). ## Key Configuration The `config-otel.yaml` pipeline uses the **count connector** to bridge signals: 1. **`connectors/count`** -- Defines four derived metrics: - `span.count` -- Total number of spans received - `span.error.count` -- Spans where `status.code == 2` (ERROR) - `log.count` -- Total number of log records received - `log.error.count` -- Logs where `severity_number >= 17` (ERROR and above) 2. **Pipeline wiring:** - `traces` pipeline: receives OTLP, exports to both `count` connector and `otlp/tempo` - `logs` pipeline: receives OTLP, exports to both `count` connector and `otlphttp/loki` - `metrics` pipeline: receives from `count` connector, exports to `otlphttp/prometheus` The count connector acts as both an exporter (in the traces/logs pipelines) and a receiver (in the metrics pipeline), bridging signals without any application changes. ## Stop ```bash docker compose down ``` ================================================ FILE: otel-examples/count-connector/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . CMD ["python", "app.py"] ================================================ FILE: otel-examples/count-connector/app/app.py ================================================ """ Demo Flask app for the count-connector scenario. Generates a mix of successful and error traces plus log records at various severity levels. The Alloy OTel pipeline uses the count connector to derive metrics (span.count, span.error.count, log.count, log.error.count) from these signals. 
""" import logging import random import threading import time from flask import Flask, jsonify from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.sdk.resources import Resource from opentelemetry.trace import StatusCode # --- OTel Setup --- resource = Resource.create({ "service.name": "count-connector-demo", "service.version": "1.0.0", }) # Traces tracer_provider = TracerProvider(resource=resource) tracer_provider.add_span_processor( BatchSpanProcessor(OTLPSpanExporter(endpoint="alloy:4317", insecure=True)) ) trace.set_tracer_provider(tracer_provider) tracer = trace.get_tracer(__name__) # Logs via OTel logger_provider = LoggerProvider(resource=resource) logger_provider.add_log_record_processor( BatchLogRecordProcessor(OTLPLogExporter(endpoint="alloy:4317", insecure=True)) ) handler = LoggingHandler(level=logging.DEBUG, logger_provider=logger_provider) logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("count-connector-demo") logger.addHandler(handler) # --- Flask App --- app = Flask(__name__) FlaskInstrumentor().instrument_app(app) @app.route("/api/process") def process(): """Simulates a processing request. 
~80% success, ~20% error.""" with tracer.start_as_current_span("process-request") as span: request_id = f"REQ-{random.randint(1000, 9999)}" span.set_attribute("request.id", request_id) time.sleep(random.uniform(0.02, 0.15)) if random.random() < 0.20: error_type = random.choice(["ValidationError", "TimeoutError", "DatabaseError"]) span.set_status(StatusCode.ERROR, f"Simulated {error_type}") span.set_attribute("error.type", error_type) span.record_exception(Exception(f"Simulated {error_type}")) logger.error("Request %s failed: %s", request_id, error_type) return jsonify({"request_id": request_id, "error": error_type}), 500 logger.info("Request %s processed successfully", request_id) return jsonify({"request_id": request_id, "status": "ok"}) @app.route("/api/notify") def notify(): """Simulates sending a notification.""" with tracer.start_as_current_span("send-notification") as span: channel = random.choice(["email", "sms", "push"]) span.set_attribute("notification.channel", channel) time.sleep(random.uniform(0.01, 0.1)) if random.random() < 0.10: span.set_status(StatusCode.ERROR, "Notification delivery failed") logger.error("Notification via %s failed", channel) return jsonify({"channel": channel, "status": "failed"}), 500 logger.info("Notification sent via %s", channel) return jsonify({"channel": channel, "status": "sent"}) @app.route("/health") def health(): return jsonify({"status": "healthy"}) def load_generator(): """Background thread generating continuous traffic every 2 seconds.""" import requests base_url = "http://localhost:8080" time.sleep(5) while True: try: endpoint = random.choice(["/api/process", "/api/process", "/api/notify"]) requests.get(f"{base_url}{endpoint}", timeout=5) except Exception: pass # Also emit some standalone log records severity = random.choices( ["info", "warn", "error"], weights=[60, 25, 15], k=1, )[0] if severity == "info": logger.info("Background task check - all systems normal") elif severity == "warn": logger.warning("Background 
task check - queue depth elevated") else: logger.error("Background task check - connectivity issue detected") time.sleep(2) if __name__ == "__main__": thread = threading.Thread(target=load_generator, daemon=True) thread.start() app.run(host="0.0.0.0", port=8080) ================================================ FILE: otel-examples/count-connector/app/requirements.txt ================================================ flask opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask ================================================ FILE: otel-examples/count-connector/config-otel.yaml ================================================ # # OTel Collector YAML: Count Connector (Derive Metrics from Signals) # # Demonstrates using the count connector to derive count metrics # from traces and logs: error rates, request counts per service, # log volume by severity -- "metrics from signals" pattern. # extensions: alloyengine: config: file: /etc/alloy/config.alloy flags: server.http.listen-addr: 0.0.0.0:12345 receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 connectors: count: spans: span.count: description: Total number of spans received span.error.count: description: Number of error spans conditions: - status.code == 2 logs: log.count: description: Total number of log records log.error.count: description: Number of error log records conditions: - severity_number >= 17 processors: batch: {} # Convert delta temporality (from count connector) to cumulative for Prometheus deltatocumulative: {} exporters: otlp/tempo: endpoint: tempo:4317 tls: insecure: true otlphttp/loki: endpoint: http://loki:3100/otlp otlphttp/prometheus: endpoint: http://prometheus:9090/api/v1/otlp tls: insecure: true service: extensions: [alloyengine] pipelines: # Ingest traces and forward to count connector + Tempo traces: receivers: [otlp] processors: [batch] exporters: [count, otlp/tempo] # Ingest logs and forward to count connector + 
Loki logs: receivers: [otlp] processors: [batch] exporters: [count, otlphttp/loki] # Export derived count metrics to Prometheus metrics: receivers: [count] processors: [deltatocumulative, batch] exporters: [otlphttp/prometheus] ================================================ FILE: otel-examples/count-connector/config.alloy ================================================ // Minimal Alloy config to enable the Alloy UI alongside the OTel Engine. // The OTel pipeline is defined in config-otel.yaml. ================================================ FILE: otel-examples/count-connector/docker-compose.coda.yml ================================================ services: demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 - OTEL_SERVICE_NAME=count-connector-demo ================================================ FILE: otel-examples/count-connector/docker-compose.yml ================================================ version: '3.8' services: # Loki for log storage loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} command: -config.file=/etc/loki/local-config.yaml ports: - 3100:3100/tcp volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml # Prometheus for metrics storage (receives derived count metrics via OTLP) prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --web.enable-otlp-receiver - --enable-feature=native-histograms - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml # Tempo for trace storage tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200/tcp volumes: - ./tempo-config.yaml:/etc/tempo.yaml # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: 
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false version: 1 editable: false - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: false version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: true version: 1 editable: false jsonData: serviceMap: datasourceUid: 'Prometheus' nodeGraph: enabled: true EOF /run.sh depends_on: - loki - prometheus - tempo # Alloy in OTel engine mode alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} command: otel --config=/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel engine HTTP server - 4317:4317 # OTLP gRPC - 4318:4318 # OTLP HTTP - 12345:12345 # Alloy UI volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml - ./config.alloy:/etc/alloy/config.alloy depends_on: - loki - prometheus - tempo # Demo app that generates traces and logs for count connector demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=count-connector-demo depends_on: - alloy ================================================ FILE: otel-examples/count-connector/loki-config.yaml ================================================ auth_enabled: false server: http_listen_port: 3100 grpc_listen_port: 9096 log_level: info common: instance_addr: 127.0.0.1 path_prefix: /tmp/storage storage: filesystem: chunks_directory: /tmp/storage/chunks rules_directory: /tmp/storage/rules replication_factor: 1 ring: kvstore: store: inmemory query_range: results_cache: cache:
embedded_cache: enabled: true max_size_mb: 100 limits_config: metric_aggregation_enabled: true schema_config: configs: - from: 2020-10-24 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h pattern_ingester: enabled: true metric_aggregation: loki_address: localhost:3100 ruler: alertmanager_url: http://localhost:9093 frontend: encoding: protobuf ================================================ FILE: otel-examples/count-connector/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s otlp: promote_resource_attributes: - service.instance.id - service.name - service.namespace - service.version - deployment.environment storage: tsdb: out_of_order_time_window: 30m ================================================ FILE: otel-examples/count-connector/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info distributor: receivers: otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" ingester: max_block_duration: 5m compactor: compaction: block_retention: 720h metrics_generator: registry: external_labels: source: tempo cluster: docker-compose storage: path: /var/tempo/generator/wal remote_write: - url: http://prometheus:9090/api/v1/write send_exemplars: true traces_storage: path: /var/tempo/generator/traces processor: local_blocks: filter_server_spans: false flush_to_storage: true storage: trace: backend: local wal: path: /var/tempo/wal local: path: /var/tempo/blocks overrides: defaults: metrics_generator: processors: [service-graphs, span-metrics, local-blocks] generate_native_histograms: both ================================================ FILE: otel-examples/filelog-processing/README.md ================================================ # Filelog Processing Demonstrates the OTel Collector **filelog receiver** with operator chains to parse mixed-format log files. 
A log generator writes both JSON and plaintext log lines to a shared volume, and Alloy (running the OTel engine) reads, parses, and ships them to Loki. ## What This Demonstrates - **Filelog receiver** reading log files from disk using glob patterns - **Conditional operator chains** that detect log format and apply the correct parser (JSON vs regex) - **Severity parsing** to map log levels to OTel severity - **Resource attribute injection** to tag all logs with a service name - Exporting parsed logs to **Loki via OTLP/HTTP** ## Prerequisites - Docker and Docker Compose ## Run ```bash docker compose up -d ``` ## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore 1. Open Grafana at [http://localhost:3000](http://localhost:3000) (no login required). 2. Go to **Explore** and select the **Loki** datasource. 3. Try these LogQL queries: ```logql {service_name="log-demo"} ``` ```logql {service_name="log-demo"} | json ``` ```logql {service_name="log-demo"} |= "ERROR" ``` 4. Observe that both JSON and plaintext lines are ingested, with severity levels and timestamps correctly parsed. ## Key Configuration The `config-otel.yaml` defines a filelog receiver with chained operators: - **`json_parser`** (conditional) -- fires when the log line starts with `{`, extracting structured fields and timestamps. - **`regex_parser`** (conditional) -- fires when the log line starts with a date pattern, capturing timestamp, level, and message. - **`severity_parser`** -- maps the parsed `level` attribute to OTel severity. - **`add` operator** -- injects `service.name` as a resource attribute. 
Logs are batched and exported to Loki's OTLP endpoint at `http://loki:3100/otlp`.

## Stop

```bash
docker compose down
```

================================================
FILE: otel-examples/filelog-processing/app/generate_logs.py
================================================
"""
Log generator that writes mixed-format log lines to /var/log/app/demo.log.

Alternates between JSON and plaintext formats with random log levels
to exercise the filelog receiver's operator chains.
"""

import json
import os
import random
import time
from datetime import datetime, timezone

LOG_DIR = "/var/log/app"
LOG_FILE = os.path.join(LOG_DIR, "demo.log")

# INFO appears three times so it is chosen more often than the other levels.
LEVELS = ["DEBUG", "INFO", "INFO", "INFO", "WARN", "ERROR"]

# (message, extra fields merged into the JSON record)
JSON_MESSAGES = [
    ("User logged in", {"user_id": "u123", "region": "us-east"}),
    ("Order placed", {"order_id": "ord-9876", "amount": 49.99}),
    ("Cache hit", {"cache_key": "session:abc", "ttl": 300}),
    ("Payment processed", {"user_id": "u456", "method": "credit_card"}),
    ("Item shipped", {"order_id": "ord-5432", "carrier": "fedex"}),
    ("User signed up", {"user_id": "u789", "plan": "premium"}),
]

PLAIN_MESSAGES = [
    "Failed to process request for user u456",
    "Connection timeout reaching database primary",
    "Rate limit exceeded for API key ak-1234",
    "Scheduled cleanup completed, removed 42 expired sessions",
    "Health check passed for service order-api",
    "Retrying failed webhook delivery attempt 3/5",
    "Disk usage at 78% on volume /data",
]


def write_json_line(f, level):
    """Write one JSON log record with the given level to the open file ``f``.

    The record carries a UTC ``timestamp`` (millisecond precision, ``Z``
    suffix), ``level``, ``message`` and the extra fields of a randomly chosen
    entry of JSON_MESSAGES.
    """
    msg, extra = random.choice(JSON_MESSAGES)
    record = {
        # %f yields microseconds; dropping the last 3 digits leaves milliseconds.
        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z",
        "level": level,
        "message": msg,
        **extra,
    }
    f.write(json.dumps(record) + "\n")


def write_plain_line(f, level):
    """Write one plaintext line ``<timestamp> <level> <message>`` to ``f``.

    The timestamp is UTC in ``YYYY-MM-DD HH:MM:SS,mmm`` form.
    """
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
    msg = random.choice(PLAIN_MESSAGES)
    f.write(f"{ts} {level} {msg}\n")


def main():
    """Append one random JSON or plaintext log line every 2 seconds, forever."""
    os.makedirs(LOG_DIR, exist_ok=True)
    print(f"Writing logs to {LOG_FILE}")

    while True:
        level = random.choice(LEVELS)
        # Re-open per line so each record is flushed to disk on close.
        with open(LOG_FILE, "a") as f:
            if random.random() < 0.5:
                write_json_line(f, level)
            else:
                write_plain_line(f, level)
        time.sleep(2)


if __name__ == "__main__":
    main()

================================================
FILE: otel-examples/filelog-processing/config-otel.yaml
================================================
#
# OTel Collector YAML: File Log Processing
#
# Demonstrates the filelog receiver with operator chains for parsing
# mixed-format log files (JSON and plaintext lines).
#

extensions:
  alloyengine:
    config:
      file: /etc/alloy/config.alloy
    flags:
      server.http.listen-addr: 0.0.0.0:12345

receivers:
  filelog:
    include:
      - /var/log/app/*.log
    operators:
      # Try to parse as JSON first
      - type: json_parser
        if: body matches "^\\{"
        parse_from: body
        parse_to: attributes
        timestamp:
          parse_from: attributes.timestamp
          layout: "%Y-%m-%dT%H:%M:%S.%fZ"

      # For non-JSON lines, extract with regex.
      # The named groups become attributes; `timestamp` and `level` are read
      # by the timestamp block below and by the severity_parser.
      - type: regex_parser
        if: body matches "^\\d{4}-\\d{2}-\\d{2}"
        regex: "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}) (?P<level>\\w+) (?P<message>.*)"
        timestamp:
          parse_from: attributes.timestamp
          layout: "%Y-%m-%d %H:%M:%S,%f"

      # Map severity from parsed level
      - type: severity_parser
        parse_from: attributes.level
        if: attributes.level != nil

      # Add a static resource attribute
      - type: add
        field: resource["service.name"]
        value: log-demo

processors:
  batch:
    timeout: 2s
    send_batch_size: 256

exporters:
  otlphttp/loki:
    endpoint: http://loki:3100/otlp

service:
  extensions: [alloyengine]
  pipelines:
    logs:
      receivers: [filelog]
      processors: [batch]
      exporters: [otlphttp/loki]

================================================
FILE: otel-examples/filelog-processing/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.
================================================ FILE: otel-examples/filelog-processing/docker-compose.coda.yml ================================================ services: log-generator: image: python:${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped volumes: - ./app/generate_logs.py:/app/generate_logs.py - /var/log/alloy-demo:/var/log/app command: ["python3", "/app/generate_logs.py"] ================================================ FILE: otel-examples/filelog-processing/docker-compose.yml ================================================ version: '3.8' services: loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - "3100:3100" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 8888:8888 - 12345:12345 # Alloy UI volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml - ./config.alloy:/etc/alloy/config.alloy - app-logs:/var/log/app command: otel --config=/etc/alloy/config-otel.yaml depends_on: - loki log-generator: image: python:${PYTHON_VERSION:-3.11-slim} volumes: - ./app/generate_logs.py:/app/generate_logs.py - app-logs:/var/log/app command: ["python3", "/app/generate_logs.py"] depends_on: - alloy grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp depends_on: - loki entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh volumes: app-logs: ================================================ FILE: otel-examples/filelog-processing/loki-config.yaml ================================================ auth_enabled: false server:
http_listen_port: 3100 grpc_listen_port: 9096 log_level: info common: instance_addr: 127.0.0.1 path_prefix: /tmp/storage storage: filesystem: chunks_directory: /tmp/storage/chunks rules_directory: /tmp/storage/rules replication_factor: 1 ring: kvstore: store: inmemory query_range: results_cache: cache: embedded_cache: enabled: true max_size_mb: 100 limits_config: metric_aggregation_enabled: true schema_config: configs: - from: 2020-10-24 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h pattern_ingester: enabled: true metric_aggregation: loki_address: localhost:3100 ruler: alertmanager_url: http://localhost:9093 frontend: encoding: protobuf ================================================ FILE: otel-examples/host-metrics/README.md ================================================ # Host Metrics with OTel Hostmetrics Receiver Collect CPU, memory, disk, filesystem, network, and process metrics using the OpenTelemetry `hostmetrics` receiver -- an OTel-native replacement for Prometheus node_exporter. Metrics are exported via OTLP to Prometheus. 
## What This Demonstrates - **Hostmetrics receiver**: Collects system-level metrics without a separate exporter binary - **Scrapers**: CPU (with utilization), memory (with utilization), disk, filesystem, network, load, and process scrapers - **Resource detection**: Automatically adds host metadata (hostname, OS type) to all metrics - **OTLP export to Prometheus**: Metrics are sent via OTLP to Prometheus's native OTLP receiver - **Stress testing**: A stress container generates CPU and memory load to produce interesting metric data ## Metrics Collected | Scraper | Example Metrics | |------------|-------------------------------------------------------------------| | CPU | `system_cpu_time`, `system_cpu_utilization` | | Memory | `system_memory_usage`, `system_memory_utilization` | | Disk | `system_disk_io`, `system_disk_operations` | | Filesystem | `system_filesystem_usage`, `system_filesystem_utilization` | | Network | `system_network_io`, `system_network_packets` | | Load | `system_cpu_load_average_1m`, `system_cpu_load_average_5m` | | Process | `process_cpu_time`, `process_memory_physical_usage` | ## Prerequisites - Docker and Docker Compose - Linux host (hostmetrics requires access to `/proc` and `/sys`) ## Run ```bash docker compose up -d ``` ## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore Open Grafana at [http://localhost:3000](http://localhost:3000) and go to **Explore > Prometheus**. 
### Sample PromQL Queries **CPU utilization:** ```promql system_cpu_utilization{state="user"} ``` **Memory usage (bytes):** ```promql system_memory_usage{state="used"} ``` **Disk I/O rate:** ```promql rate(system_disk_io_total[5m]) ``` **Network bytes transmitted:** ```promql rate(system_network_io_total{direction="transmit"}[5m]) ``` **System load averages:** ```promql system_cpu_load_average_1m ``` **Top processes by CPU:** ```promql topk(10, rate(process_cpu_time_total[5m])) ``` ## Key Configuration The `config-otel.yaml` configures: 1. **`hostmetrics` receiver**: Enables all major scrapers with 15s collection interval. CPU and memory utilization metrics are explicitly enabled. 2. **`resourcedetection` processor**: Uses `env` and `system` detectors to add hostname and OS metadata. 3. **`otlphttp/prometheus` exporter**: Sends metrics via OTLP to Prometheus's native OTLP endpoint. The Alloy container runs with `pid: host` and mounts `/proc`, `/sys`, and `/` from the host to enable full system visibility. ## Stop ```bash docker compose down ``` ================================================ FILE: otel-examples/host-metrics/config-otel.yaml ================================================ # # OTel Collector YAML: Host Metrics Collection # # Demonstrates using the hostmetrics receiver to collect CPU, memory, # disk, filesystem, and network metrics -- an OTel-native replacement # for Prometheus node_exporter. 
# extensions: alloyengine: config: file: /etc/alloy/config.alloy flags: server.http.listen-addr: 0.0.0.0:12345 receivers: hostmetrics: collection_interval: 15s scrapers: cpu: metrics: system.cpu.utilization: enabled: true memory: metrics: system.memory.utilization: enabled: true disk: {} filesystem: {} network: {} load: {} process: include: match_type: regexp names: [".*"] mute_process_exe_error: true mute_process_io_error: true mute_process_user_error: true processors: # Detect host metadata automatically resourcedetection: detectors: [env, system] system: hostname_sources: ["os"] batch: timeout: 10s send_batch_size: 512 exporters: otlphttp/prometheus: endpoint: http://prometheus:9090/api/v1/otlp tls: insecure: true service: extensions: [alloyengine] pipelines: metrics: receivers: [hostmetrics] processors: [resourcedetection, batch] exporters: [otlphttp/prometheus] ================================================ FILE: otel-examples/host-metrics/config.alloy ================================================ // Minimal Alloy config to enable the Alloy UI alongside the OTel Engine. // The OTel pipeline is defined in config-otel.yaml. 
================================================ FILE: otel-examples/host-metrics/docker-compose.coda.yml ================================================ services: stress: image: polinux/stress@sha256:b6144f84f9c15dac80deb48d3a646b55c7043ab1d83ea0a697c09097aaad21aa command: ["stress", "--cpu", "1", "--vm", "1", "--vm-bytes", "64M"] ================================================ FILE: otel-examples/host-metrics/docker-compose.yml ================================================ version: '3.8' services: prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-otlp-receiver - --enable-feature=native-histograms - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090 volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 8888:8888 - 12345:12345 # Alloy UI volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml - ./config.alloy:/etc/alloy/config.alloy - /proc:/hostfs/proc:ro - /sys:/hostfs/sys:ro - /:/hostfs:ro environment: - HOST_PROC=/hostfs/proc - HOST_SYS=/hostfs/sys - HOST_ETC=/hostfs/etc - HOST_VAR=/hostfs/var - HOST_RUN=/hostfs/run pid: host command: otel --config=/etc/alloy/config-otel.yaml depends_on: - prometheus stress: image: polinux/stress@sha256:b6144f84f9c15dac80deb48d3a646b55c7043ab1d83ea0a697c09097aaad21aa command: ["stress", "--cpu", "1", "--vm", "1", "--vm-bytes", "64M"] grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp depends_on: - prometheus entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh ================================================
FILE: otel-examples/host-metrics/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s otlp: promote_resource_attributes: - service.instance.id - service.name - service.namespace - service.version - deployment.environment - host.name - os.type storage: tsdb: out_of_order_time_window: 30m ================================================ FILE: otel-examples/kafka-buffer/README.md ================================================ # Kafka-Buffered Trace Pipeline Demonstrates using Apache Kafka as a durable buffer in an OpenTelemetry trace pipeline. Alloy runs both the agent tier (OTLP receiver to Kafka) and the gateway tier (Kafka to Tempo) in a single collector instance, showcasing the two-tier architecture pattern. ## What This Demonstrates - **Kafka as a durable buffer**: Traces are written to Kafka before being exported to Tempo, providing resilience against backend outages - **Two-tier collector architecture**: The agent tier ingests OTLP and writes to Kafka; the gateway tier reads from Kafka and exports to Tempo - **Single-collector demo**: Both tiers run in one Alloy instance for simplicity, but in production these would be separate deployments - **KRaft mode Kafka**: Uses Bitnami Kafka with KRaft (no ZooKeeper required) - **Auto topic creation**: The `otlp-traces` topic is created automatically on first write ## Architecture ``` App --OTLP--> Alloy (agent tier) --Kafka--> Alloy (gateway tier) --OTLP--> Tempo ``` In this demo, both tiers are the same Alloy instance with two separate pipelines: 1. **`traces/ingest`**: `otlp` receiver -> `kafka` exporter 2. **`traces/export`**: `kafka` receiver -> `batch` processor -> `otlp/tempo` exporter ## Prerequisites - Docker and Docker Compose ## Run ```bash docker compose up -d ``` Wait about 30 seconds for Kafka to initialize before traces start flowing. 
## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore Open Grafana at [http://localhost:3000](http://localhost:3000) and go to **Explore > Tempo**. Search for traces from `kafka-buffer-demo`. You should see traces for HTTP endpoints (`/api/items`, `/api/checkout`, `/api/health`) with database query child spans. ### Demonstrate Resilience The key benefit of the Kafka buffer is resilience. Try this experiment: 1. Let the demo run for a minute to generate some traces 2. Stop Tempo: `docker compose stop tempo` 3. Wait 30 seconds (traces are buffering in Kafka) 4. Restart Tempo: `docker compose start tempo` 5. Check Grafana -- the buffered traces should appear in Tempo This works because Kafka retains messages until the consumer (gateway tier) successfully reads them. ## Key Configuration The `config-otel.yaml` defines: 1. **`kafka` exporter**: Writes OTLP-encoded trace data to the `otlp-traces` Kafka topic 2. **`kafka` receiver**: Reads from the same topic and deserializes traces 3. **Two pipelines**: `traces/ingest` (app -> Kafka) and `traces/export` (Kafka -> Tempo) The Kafka exporter uses `otlp_proto` encoding, which preserves full trace fidelity through the buffer. ## Stop ```bash docker compose down ``` ================================================ FILE: otel-examples/kafka-buffer/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . 
CMD ["python", "app.py"] ================================================ FILE: otel-examples/kafka-buffer/app/app.py ================================================ """ Flask app generating traces for the Kafka buffer demo. Produces varied HTTP traces that flow through the Alloy pipeline: app -> OTLP -> Alloy -> Kafka -> Alloy -> Tempo A background thread generates continuous load against the Flask endpoints. """ import random import time import threading from flask import Flask, jsonify from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.sdk.resources import Resource resource = Resource.create({ "service.name": "kafka-buffer-demo", "service.version": "1.0.0", "deployment.environment": "demo", }) tracer_provider = TracerProvider(resource=resource) tracer_provider.add_span_processor( BatchSpanProcessor(OTLPSpanExporter(endpoint="alloy:4317", insecure=True)) ) trace.set_tracer_provider(tracer_provider) tracer = trace.get_tracer("kafka-demo") app = Flask(__name__) FlaskInstrumentor().instrument_app(app) @app.route("/api/items", methods=["GET"]) def list_items(): with tracer.start_as_current_span("query-items-db") as span: span.set_attribute("db.system", "postgresql") span.set_attribute("db.statement", "SELECT * FROM items LIMIT 20") time.sleep(random.uniform(0.01, 0.04)) return jsonify({"items": [{"id": i, "name": f"item-{i}"} for i in range(5)]}) @app.route("/api/items/", methods=["GET"]) def get_item(item_id): with tracer.start_as_current_span("query-single-item") as span: span.set_attribute("db.system", "postgresql") span.set_attribute("db.statement", f"SELECT * FROM items WHERE id = {item_id}") span.set_attribute("app.item_id", item_id) time.sleep(random.uniform(0.005, 0.02)) return jsonify({"id": item_id, 
"name": f"item-{item_id}", "price": round(random.uniform(5, 100), 2)}) @app.route("/api/checkout", methods=["POST"]) def checkout(): with tracer.start_as_current_span("process-checkout") as span: span.set_attribute("app.cart_size", random.randint(1, 10)) span.set_attribute("app.payment_method", random.choice(["credit_card", "paypal", "apple_pay"])) time.sleep(random.uniform(0.05, 0.15)) # Simulate occasional failures if random.random() < 0.1: span.set_attribute("error", True) span.set_attribute("error.message", "Payment gateway timeout") return jsonify({"error": "Payment failed"}), 500 return jsonify({"order_id": random.randint(10000, 99999), "status": "confirmed"}), 201 @app.route("/api/health") def health(): return jsonify({"status": "ok"}) def generate_load(): """Background thread that sends requests to the Flask app.""" import urllib.request time.sleep(5) # Wait for Flask to start base = "http://localhost:8080" endpoints = [ ("GET", f"{base}/api/items"), ("GET", f"{base}/api/items/1"), ("GET", f"{base}/api/items/2"), ("GET", f"{base}/api/items/3"), ("POST", f"{base}/api/checkout"), ("GET", f"{base}/api/health"), ] while True: method, url = random.choice(endpoints) try: req = urllib.request.Request(url, method=method) if method == "POST": req.add_header("Content-Type", "application/json") req.data = b'{"items": [1, 2, 3]}' urllib.request.urlopen(req) except Exception: pass time.sleep(random.uniform(0.5, 2.0)) if __name__ == "__main__": load_thread = threading.Thread(target=generate_load, daemon=True) load_thread.start() app.run(host="0.0.0.0", port=8080) ================================================ FILE: otel-examples/kafka-buffer/app/requirements.txt ================================================ flask opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests ================================================ FILE: otel-examples/kafka-buffer/config-otel.yaml 
================================================ # # OTel Collector YAML: Kafka-Buffered Pipeline (Gateway) # # Demonstrates a two-tier collector architecture: # Agent: otlp receiver -> kafka exporter (writes to Kafka) # Gateway: kafka receiver -> batch -> otlp exporter (reads from Kafka, writes to backends) # # This config runs BOTH tiers in a single collector for demo purposes, # using Kafka as a durable buffer between ingest and export. # extensions: alloyengine: config: file: /etc/alloy/config.alloy flags: server.http.listen-addr: 0.0.0.0:12345 receivers: # Tier 1: Accept OTLP from applications otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 # Tier 2: Read back from Kafka kafka: brokers: - kafka:9092 protocol_version: "3.0.0" traces: topic: otlp-traces processors: batch: {} exporters: # Tier 1: Write to Kafka buffer kafka: brokers: - kafka:9092 topic: otlp-traces protocol_version: "3.0.0" encoding: otlp_proto # Tier 2: Write to Tempo otlp/tempo: endpoint: tempo:4317 tls: insecure: true service: extensions: [alloyengine] pipelines: # Agent tier: ingest OTLP and buffer to Kafka traces/ingest: receivers: [otlp] exporters: [kafka] # Gateway tier: read from Kafka and export to backend traces/export: receivers: [kafka] processors: [batch] exporters: [otlp/tempo] ================================================ FILE: otel-examples/kafka-buffer/config.alloy ================================================ // Minimal Alloy config to enable the Alloy UI alongside the OTel Engine. // The OTel pipeline is defined in config-otel.yaml. 
================================================ FILE: otel-examples/kafka-buffer/docker-compose.coda.yml ================================================ services: kafka: image: apache/kafka:4.2.0@sha256:9516fb7634bad307d17c33b589fde9023003b0cb761374f500002b980a3149b9 network_mode: host restart: unless-stopped environment: - KAFKA_NODE_ID=0 - KAFKA_PROCESS_ROLES=broker,controller - KAFKA_CONTROLLER_QUORUM_VOTERS=0@localhost:9093 - KAFKA_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093 - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://localhost:9092 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT - KAFKA_CONTROLLER_LISTENER_NAMES=CONTROLLER - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 - CLUSTER_ID=kafka-buffer-demo-cluster-001 demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 ================================================ FILE: otel-examples/kafka-buffer/docker-compose.yml ================================================ version: '3.8' services: kafka: image: apache/kafka:4.2.0@sha256:9516fb7634bad307d17c33b589fde9023003b0cb761374f500002b980a3149b9 ports: - 9092:9092 environment: - KAFKA_NODE_ID=0 - KAFKA_PROCESS_ROLES=broker,controller - KAFKA_CONTROLLER_QUORUM_VOTERS=0@kafka:9093 - KAFKA_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093 - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT - KAFKA_CONTROLLER_LISTENER_NAMES=CONTROLLER - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 - CLUSTER_ID=kafka-buffer-demo-cluster-001 tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200 volumes: - ./tempo-config.yaml:/etc/tempo.yaml alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 8888:8888 - 4317:4317 - 4318:4318 - 12345:12345 # 
Alloy UI volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml - ./config.alloy:/etc/alloy/config.alloy command: otel --config=/etc/alloy/config-otel.yaml depends_on: - kafka - tempo demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 depends_on: - alloy grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp depends_on: - tempo entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh ================================================ FILE: otel-examples/kafka-buffer/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info distributor: receivers: otlp: protocols: grpc: endpoint: "0.0.0.0:4317" http: endpoint: "0.0.0.0:4318" ingester: max_block_duration: 5m compactor: compaction: block_retention: 720h storage: trace: backend: local wal: path: /var/tempo/wal local: path: /var/tempo/blocks overrides: defaults: {} ================================================ FILE: otel-examples/multi-pipeline-fanout/README.md ================================================ # Multi-Pipeline Fan-Out Demonstrates sending the same traces to multiple backends with different processing per destination using the OpenTelemetry forward connector. Full-fidelity traces go to a primary Tempo instance, while sampled and attribute-stripped traces go to a secondary instance. This is a common pattern for migrations and tiered storage strategies.
## What This Demonstrates - **Forward connector**: The `forward/sampled` connector duplicates trace data from one pipeline into another - **Fan-out pattern**: A single intake pipeline fans out to two export pipelines with independent processing - **Probabilistic sampling**: The secondary pipeline only keeps 10% of traces - **Attribute stripping**: The secondary pipeline removes sensitive/large attributes (user agent, cookies, request body) and truncates remaining attributes to 128 characters - **Dual Tempo instances**: Two independent Tempo backends receiving different subsets and fidelity levels of the same trace data ## Prerequisites - Docker and Docker Compose ## Run ```bash docker compose up -d ``` ## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore Open Grafana at [http://localhost:3000](http://localhost:3000). ### Compare Primary vs Secondary 1. Go to **Explore** and select **Tempo Primary** datasource 2. Search for traces from `fanout-demo-app` 3. Pick a trace and note the attributes: full `http.request.header.user_agent`, `http.request.header.cookie`, `http.request.body` values 4. Switch datasource to **Tempo Secondary** 5. Search for the same service -- you will see far fewer traces (only ~10%) 6. 
On traces that do appear, the user agent, cookie, and request body attributes are gone, and remaining attributes are truncated to 128 characters ### What to Look For | Aspect | Tempo Primary | Tempo Secondary | |---------------------|-------------------------------|----------------------------------| | Trace volume | 100% of traces | ~10% of traces | | Attribute fidelity | Full (all attributes present) | Stripped (no UA, cookies, body) | | Attribute length | Unlimited | Truncated to 128 chars | ## Key Configuration The `config-otel.yaml` defines three pipelines: 1. **`traces/intake`**: Receives OTLP, batches, then exports to both `otlp/tempo-primary` and `forward/sampled` 2. **`traces/sampled`**: Receives from the forward connector, applies probabilistic sampling (10%), strips attributes, and exports to `otlp/tempo-secondary` The forward connector (`forward/sampled`) acts as the bridge that duplicates data from the intake pipeline to the sampled pipeline. ## Stop ```bash docker compose down ``` ================================================ FILE: otel-examples/multi-pipeline-fanout/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . CMD ["python", "app.py"] ================================================ FILE: otel-examples/multi-pipeline-fanout/app/app.py ================================================ """ Flask app generating varied traces for the multi-pipeline fan-out demo. Produces traces with large attribute values, user agents, cookies, and request bodies to demonstrate how the secondary pipeline strips these while the primary retains full fidelity. 
""" import random import time import threading from flask import Flask, jsonify, request from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.sdk.resources import Resource resource = Resource.create({ "service.name": "fanout-demo-app", "service.version": "1.0.0", "deployment.environment": "demo", }) tracer_provider = TracerProvider(resource=resource) tracer_provider.add_span_processor( BatchSpanProcessor(OTLPSpanExporter(endpoint="alloy:4317", insecure=True)) ) trace.set_tracer_provider(tracer_provider) tracer = trace.get_tracer("fanout-demo") app = Flask(__name__) FlaskInstrumentor().instrument_app(app) USER_AGENTS = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Safari/605.1.15", "Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 Chrome/120.0.0.0 Mobile Safari/537.36", "curl/8.4.0", ] COOKIES = [ "session=abc123def456; preferences=dark_mode; tracking_id=xx-" + "a" * 200, "session=xyz789; cart=item1,item2,item3; locale=en-US", "", ] @app.route("/api/orders", methods=["GET"]) def list_orders(): with tracer.start_as_current_span("fetch-orders-from-db") as span: span.set_attribute("db.system", "postgresql") span.set_attribute("db.statement", "SELECT * FROM orders WHERE status = 'active'") time.sleep(random.uniform(0.01, 0.05)) return jsonify({"orders": [{"id": i, "status": "active"} for i in range(5)]}) @app.route("/api/orders", methods=["POST"]) def create_order(): with tracer.start_as_current_span("insert-order") as span: span.set_attribute("db.system", "postgresql") span.set_attribute("db.statement", "INSERT INTO orders (product, qty) VALUES ($1, $2)") 
span.set_attribute("http.request.body", '{"product": "widget", "qty": 10, "notes": "' + "x" * 500 + '"}') time.sleep(random.uniform(0.02, 0.08)) return jsonify({"id": random.randint(1000, 9999), "status": "created"}), 201 @app.route("/api/health") def health(): return jsonify({"status": "ok"}) def generate_load(): """Background thread that sends requests to the Flask app.""" import urllib.request time.sleep(5) # Wait for Flask to start base = "http://localhost:8080" endpoints = [ ("GET", f"{base}/api/orders"), ("POST", f"{base}/api/orders"), ("GET", f"{base}/api/health"), ] while True: method, url = random.choice(endpoints) try: req = urllib.request.Request(url, method=method) # Add varied headers that will become span attributes req.add_header("User-Agent", random.choice(USER_AGENTS)) cookie = random.choice(COOKIES) if cookie: req.add_header("Cookie", cookie) if method == "POST": req.add_header("Content-Type", "application/json") req.data = b'{"product": "widget", "qty": 1}' urllib.request.urlopen(req) except Exception: pass time.sleep(random.uniform(0.5, 2.0)) if __name__ == "__main__": load_thread = threading.Thread(target=generate_load, daemon=True) load_thread.start() app.run(host="0.0.0.0", port=8080) ================================================ FILE: otel-examples/multi-pipeline-fanout/app/requirements.txt ================================================ flask opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests ================================================ FILE: otel-examples/multi-pipeline-fanout/config-otel.yaml ================================================ # # OTel Collector YAML: Multi-Pipeline Fan-Out # # Demonstrates sending the same traces to multiple backends with # different processing per destination: full-fidelity traces to Tempo, # and sampled traces with reduced attributes to a second store. # Common for migrations and tiered storage strategies. 
# extensions: alloyengine: config: file: /etc/alloy/config.alloy flags: server.http.listen-addr: 0.0.0.0:12345 receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 connectors: forward/sampled: {} processors: batch: {} # For the sampled pipeline: only keep 10% of traces probabilistic_sampler: sampling_percentage: 10 # Strip detailed attributes for the sampled/cheap store transform/strip: error_mode: ignore trace_statements: - context: span statements: - delete_key(attributes, "http.request.header.user_agent") - delete_key(attributes, "http.request.header.cookie") - delete_key(attributes, "http.request.body") - truncate_all(attributes, 128) exporters: # Primary: full-fidelity traces to Tempo otlp/tempo-primary: endpoint: tempo:4317 tls: insecure: true # Secondary: sampled + stripped traces to second Tempo instance otlp/tempo-secondary: endpoint: tempo-secondary:4317 tls: insecure: true service: extensions: [alloyengine] pipelines: # Intake pipeline: receive and fan out to both destinations traces/intake: receivers: [otlp] processors: [batch] exporters: [otlp/tempo-primary, forward/sampled] # Sampled pipeline: reduced data to secondary store traces/sampled: receivers: [forward/sampled] processors: [probabilistic_sampler, transform/strip, batch] exporters: [otlp/tempo-secondary] ================================================ FILE: otel-examples/multi-pipeline-fanout/config.alloy ================================================ // Minimal Alloy config to enable the Alloy UI alongside the OTel Engine. // The OTel pipeline is defined in config-otel.yaml. 
================================================ FILE: otel-examples/multi-pipeline-fanout/docker-compose.coda.yml ================================================ services: demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 ================================================ FILE: otel-examples/multi-pipeline-fanout/docker-compose.yml ================================================ version: '3.8' services: tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200 volumes: - ./tempo-config.yaml:/etc/tempo.yaml tempo-secondary: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3201:3200 volumes: - ./tempo-config.yaml:/etc/tempo.yaml prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --enable-feature=native-histograms - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090 volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 8888:8888 - 4317:4317 - 4318:4318 - 12345:12345 # Alloy UI volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml - ./config.alloy:/etc/alloy/config.alloy command: otel --config=/etc/alloy/config-otel.yaml depends_on: - tempo - tempo-secondary demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 depends_on: - alloy grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp depends_on: - tempo - tempo-secondary - prometheus entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF >
/etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: false version: 1 editable: false - name: Tempo Primary type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: true version: 1 editable: false jsonData: serviceMap: datasourceUid: 'Prometheus' nodeGraph: enabled: true - name: Tempo Secondary type: tempo access: proxy orgId: 1 url: http://tempo-secondary:3200 basicAuth: false isDefault: false version: 1 editable: false EOF /run.sh ================================================ FILE: otel-examples/multi-pipeline-fanout/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s storage: tsdb: out_of_order_time_window: 30m ================================================ FILE: otel-examples/multi-pipeline-fanout/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info distributor: receivers: otlp: protocols: grpc: endpoint: "0.0.0.0:4317" http: endpoint: "0.0.0.0:4318" ingester: max_block_duration: 5m compactor: compaction: block_retention: 720h metrics_generator: registry: external_labels: source: tempo cluster: docker-compose storage: path: /var/tempo/generator/wal remote_write: - url: http://prometheus:9090/api/v1/write send_exemplars: true traces_storage: path: /var/tempo/generator/traces processor: local_blocks: filter_server_spans: false flush_to_storage: true storage: trace: backend: local wal: path: /var/tempo/wal local: path: /var/tempo/blocks overrides: defaults: metrics_generator: processors: [service-graphs, span-metrics, local-blocks] generate_native_histograms: both ================================================ FILE: otel-examples/ottl-transform/README.md ================================================ # OTTL Transform Cookbook A cookbook of the most 
useful OpenTelemetry Transformation Language (OTTL) patterns running in Grafana Alloy's OTel engine. Demonstrates JSON body parsing, severity mapping, attribute promotion, truncation, pattern replacement, and conditional transforms. ## What This Demonstrates - **JSON body parsing**: Log records arrive with JSON string bodies; OTTL parses them and promotes fields to attributes - **Severity mapping**: String severity levels ("INFO", "WARN", "ERROR") are mapped to proper OTel severity numbers - **Attribute cleanup**: Promoted fields like `level` and `timestamp` are deleted after extraction - **Tier labeling**: Trace spans are automatically tagged with `app.tier=frontend` (when `http.target` is present) or `app.tier=backend` (when `db.system` is present) - **Attribute truncation**: All span attributes are truncated to 256 characters - **Resource enrichment**: A `deployment.environment=demo` attribute is added to all trace resources ## Prerequisites - Docker and Docker Compose ## Run ```bash docker compose up -d ``` ## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore ### Logs in Loki Open Grafana at [http://localhost:3000](http://localhost:3000) and go to **Explore > Loki**. Query to see parsed JSON attributes: ```logql {service_name="ottl-demo-app"} ``` You should see that JSON fields from the log body (`order_id`, `message`, `amount`, `error_code`, etc.) have been promoted to log attributes. The `level` and `timestamp` fields should be removed after promotion. Severity should be correctly set (INFO=9, WARN=13, ERROR=17). ### Traces in Tempo Switch to **Explore > Tempo** and search for traces from `ottl-demo-app`. 
Look for: - `app.tier` label on spans: `frontend` for HTTP spans, `backend` for database spans - Long attribute values (like `http.user_agent` or `db.connection_string`) truncated to 256 characters - `deployment.environment=demo` on trace resources ## Key Configuration The `config-otel.yaml` defines three transform processors: 1. **`transform/parse-logs`**: Parses JSON string bodies with `ParseJSON(body)`, maps severity, and cleans up attributes 2. **`transform/traces`**: Adds tier labels based on attribute presence, truncates all attributes to 256 chars 3. **`transform/resources`**: Adds `deployment.environment=demo` to trace resources These are wired into separate pipelines for traces and logs. ## Stop ```bash docker compose down ``` ================================================ FILE: otel-examples/ottl-transform/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . CMD ["python", "app.py"] ================================================ FILE: otel-examples/ottl-transform/app/app.py ================================================ """ Demo app that sends "messy" telemetry to exercise OTTL transform patterns. 
Sends: - Log records with JSON string bodies (to test JSON parsing + attribute promotion) - Log records with string severity fields but no severity_number set - Traces with varied attributes (http.target, db.system, long values) """ import json import time import random import logging from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter from opentelemetry.sdk.resources import Resource resource = Resource.create({ "service.name": "ottl-demo-app", "service.version": "1.0.0", }) # --- Tracing setup --- tracer_provider = TracerProvider(resource=resource) tracer_provider.add_span_processor( BatchSpanProcessor(OTLPSpanExporter(endpoint="alloy:4317", insecure=True)) ) trace.set_tracer_provider(tracer_provider) tracer = trace.get_tracer("ottl-demo") # --- Logging setup --- logger_provider = LoggerProvider(resource=resource) logger_provider.add_log_record_processor( BatchLogRecordProcessor(OTLPLogExporter(endpoint="alloy:4317", insecure=True)) ) handler = LoggingHandler(logger_provider=logger_provider) logger = logging.getLogger("ottl-demo") logger.addHandler(handler) logger.setLevel(logging.DEBUG) def send_json_log_records(): """Send log records with JSON string bodies for OTTL JSON parsing.""" orders = [ {"timestamp": "2024-01-15T10:30:00Z", "level": "INFO", "message": "Order processed", "order_id": "ORD-123", "amount": 49.99}, {"timestamp": "2024-01-15T10:30:01Z", "level": "ERROR", "message": "Payment failed", "order_id": "ORD-456", "error_code": "INSUFFICIENT_FUNDS"}, {"timestamp": "2024-01-15T10:30:02Z", "level": "WARN", "message": "Inventory low", "product_id": "SKU-789", "remaining": 
3}, {"timestamp": "2024-01-15T10:30:03Z", "level": "INFO", "message": "User login", "user_id": "USR-101", "ip": "192.168.1.42"}, {"timestamp": "2024-01-15T10:30:04Z", "level": "ERROR", "message": "Database timeout", "query": "SELECT * FROM orders", "duration_ms": 30000}, ] record = random.choice(orders) # Send as a JSON string body -- OTTL will parse this logger.info(json.dumps(record)) def send_traces(): """Send traces with varied attributes to exercise OTTL trace transforms.""" # Frontend-style span with http.target with tracer.start_as_current_span("GET /api/orders") as span: span.set_attribute("http.method", "get") span.set_attribute("http.target", "/api/orders?page=1&limit=50") span.set_attribute("http.status_code", 200) span.set_attribute("http.user_agent", "Mozilla/5.0 " + "x" * 300) # Very long value time.sleep(random.uniform(0.01, 0.05)) # Backend-style span with db.system with tracer.start_as_current_span("SELECT orders") as db_span: db_span.set_attribute("db.system", "postgresql") db_span.set_attribute("db.statement", "SELECT id, status, amount FROM orders WHERE user_id = $1 ORDER BY created_at DESC LIMIT 50") db_span.set_attribute("db.name", "shop") db_span.set_attribute("db.operation", "SELECT") # Very long attribute to test truncation db_span.set_attribute("db.connection_string", "host=db.internal port=5432 dbname=shop user=app " + "extra_param=value " * 50) time.sleep(random.uniform(0.02, 0.08)) # Another trace pattern with tracer.start_as_current_span("POST /api/checkout") as span: span.set_attribute("http.method", "post") span.set_attribute("http.target", "/api/checkout") span.set_attribute("http.status_code", random.choice([200, 201, 400, 500])) time.sleep(random.uniform(0.05, 0.15)) def main(): print("OTTL demo app started. 
Sending messy telemetry every 3 seconds...") while True: try: send_json_log_records() send_traces() except Exception as e: print(f"Error sending telemetry: {e}") time.sleep(3) if __name__ == "__main__": main() ================================================ FILE: otel-examples/ottl-transform/app/requirements.txt ================================================ opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-grpc ================================================ FILE: otel-examples/ottl-transform/config-otel.yaml ================================================ # # OTel Collector YAML: OTTL Transform Cookbook # # A "cookbook" of the most useful OTTL transformation patterns: # JSON parsing, attribute promotion, severity mapping, conditional # transforms, pattern replacement, and key deletion. # extensions: alloyengine: config: file: /etc/alloy/config.alloy flags: server.http.listen-addr: 0.0.0.0:12345 receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: # Transform 1: Parse JSON log bodies and promote fields transform/parse-logs: error_mode: ignore log_statements: - context: log statements: # Parse JSON string body into a map - merge_maps(attributes, ParseJSON(body), "upsert") where IsString(body) and IsMatch(body, "^[{]") # Map string severity to proper severity number - set(severity_text, attributes["level"]) where attributes["level"] != nil - set(severity_number, 9) where attributes["level"] == "INFO" - set(severity_number, 13) where attributes["level"] == "WARN" - set(severity_number, 17) where attributes["level"] == "ERROR" # Clean up promoted attributes - delete_key(attributes, "level") - delete_key(attributes, "timestamp") # Transform 2: Enrich and clean trace attributes transform/traces: error_mode: ignore trace_statements: - context: span statements: # Add deployment environment from resource - set(attributes["app.tier"], "frontend") where attributes["http.target"] != nil - 
set(attributes["app.tier"], "backend") where attributes["db.system"] != nil # Truncate overly long attribute values - truncate_all(attributes, 256) # Normalize HTTP method to uppercase - replace_pattern(attributes["http.method"], "^(.*)$", "$$1") # Transform 3: Add computed resource attributes transform/resources: error_mode: ignore trace_statements: - context: resource statements: - set(attributes["deployment.environment"], "demo") batch: {} exporters: otlp/tempo: endpoint: tempo:4317 tls: insecure: true otlphttp/loki: endpoint: http://loki:3100/otlp debug: verbosity: detailed service: extensions: [alloyengine] pipelines: traces: receivers: [otlp] processors: [transform/traces, transform/resources, batch] exporters: [otlp/tempo] logs: receivers: [otlp] processors: [transform/parse-logs, batch] exporters: [otlphttp/loki, debug] ================================================ FILE: otel-examples/ottl-transform/config.alloy ================================================ // Minimal Alloy config to enable the Alloy UI alongside the OTel Engine. // The OTel pipeline is defined in config-otel.yaml. 
================================================ FILE: otel-examples/ottl-transform/docker-compose.coda.yml ================================================ services: demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 ================================================ FILE: otel-examples/ottl-transform/docker-compose.yml ================================================ version: '3.8' services: loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - "3100:3100" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200 volumes: - ./tempo-config.yaml:/etc/tempo.yaml alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 8888:8888 - 4317:4317 - 4318:4318 - 12345:12345 # Alloy UI volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml - ./config.alloy:/etc/alloy/config.alloy command: otel --config=/etc/alloy/config-otel.yaml depends_on: - loki - tempo demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} depends_on: - alloy grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp depends_on: - loki - tempo entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: true version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: false version: 1 editable: false EOF /run.sh
================================================ FILE: otel-examples/ottl-transform/loki-config.yaml ================================================ auth_enabled: false server: http_listen_port: 3100 grpc_listen_port: 9096 log_level: info common: instance_addr: 127.0.0.1 path_prefix: /tmp/storage storage: filesystem: chunks_directory: /tmp/storage/chunks rules_directory: /tmp/storage/rules replication_factor: 1 ring: kvstore: store: inmemory query_range: results_cache: cache: embedded_cache: enabled: true max_size_mb: 100 limits_config: metric_aggregation_enabled: true schema_config: configs: - from: 2020-10-24 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h pattern_ingester: enabled: true metric_aggregation: loki_address: localhost:3100 ruler: alertmanager_url: http://localhost:9093 frontend: encoding: protobuf ================================================ FILE: otel-examples/ottl-transform/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info distributor: receivers: otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" ingester: max_block_duration: 5m compactor: compaction: block_retention: 720h storage: trace: backend: local wal: path: /var/tempo/wal local: path: /var/tempo/blocks overrides: defaults: {} ================================================ FILE: otel-examples/pii-redaction/README.md ================================================ # PII Redaction Demonstrates using the OTel Collector **transform processor** with OTTL `replace_pattern` statements to scrub personally identifiable information (credit card numbers, email addresses, IP addresses) from traces and logs before they reach storage backends. 
## What This Demonstrates - **Transform processor** with OTTL expressions for pattern-based redaction - Scrubbing PII from **trace span attributes** (credit cards, emails, IPs) - Scrubbing PII from **log record bodies** (credit cards, emails) - A Flask demo app that intentionally emits telemetry containing sensitive data - Verifying that redacted data arrives in Tempo and Loki with masked values ## Prerequisites - Docker and Docker Compose ## Run ```bash docker compose up -d ``` The demo app automatically generates traffic every 3 seconds -- no manual interaction needed. ## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore 1. Open Grafana at [http://localhost:3000](http://localhost:3000) (no login required). ### Check Traces (Tempo) 2. Go to **Explore** and select the **Tempo** datasource. 3. Search for traces from `pii-demo-app`. 4. Open a trace and inspect the `process-order` span attributes. You should see: - `user.credit_card` = `****-****-****-****` - `user.email` = `***@***.***` - `client.ip` = `***.***.***.***` ### Check Logs (Loki) 5. Switch to the **Loki** datasource. 6. Run: ```logql {service_name="pii-demo-app"} ``` 7. Log messages should contain masked values like `Payment processed for card ****-****-****-**** by ***@***.***`. ## Key Configuration The `config-otel.yaml` defines two transform processors: - **`transform/traces`** -- applies `replace_pattern` on span attributes to mask credit card numbers, emails, and IP addresses using regex. - **`transform/logs`** -- applies `replace_pattern` on log bodies to mask credit cards and emails. 
Both processors use `error_mode: ignore` so a failed match does not block the pipeline. The pipeline receives OTLP data on ports 4317 (gRPC) and 4318 (HTTP), processes it through the transform stage, then exports traces to Tempo and logs to Loki. ## Stop ```bash docker compose down ``` ================================================ FILE: otel-examples/pii-redaction/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . CMD ["python", "app.py"] ================================================ FILE: otel-examples/pii-redaction/app/app.py ================================================ """ Flask app that generates traces and logs containing PII data. The PII (credit cards, emails, IPs) should be redacted by the Alloy transform processor before reaching Loki and Tempo. 
""" import logging import threading import time import requests from flask import Flask, jsonify from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.sdk.resources import Resource from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.instrumentation.flask import FlaskInstrumentor # --- Resource --- resource = Resource.create({ "service.name": "pii-demo-app", "service.version": "1.0.0", }) # --- Traces --- trace_exporter = OTLPSpanExporter(endpoint="alloy:4317", insecure=True) tracer_provider = TracerProvider(resource=resource) tracer_provider.add_span_processor(BatchSpanProcessor(trace_exporter)) trace.set_tracer_provider(tracer_provider) tracer = trace.get_tracer(__name__) # --- Logs --- log_exporter = OTLPLogExporter(endpoint="alloy:4317", insecure=True) logger_provider = LoggerProvider(resource=resource) logger_provider.add_log_record_processor(BatchLogRecordProcessor(log_exporter)) otel_handler = LoggingHandler(level=logging.INFO, logger_provider=logger_provider) logger = logging.getLogger("pii-demo") logger.setLevel(logging.INFO) logger.addHandler(otel_handler) # --- Flask App --- app = Flask(__name__) FlaskInstrumentor().instrument_app(app) # Sample PII data used in requests ORDERS = [ { "user": "alice", "credit_card": "4532-1234-5678-9012", "email": "alice@example.com", "ip": "192.168.1.100", }, { "user": "bob", "credit_card": "5425-9876-5432-1098", "email": "bob@company.org", "ip": "10.0.42.7", }, { "user": "charlie", "credit_card": "3782-822463-10005", "email": "charlie@startup.io", "ip": "172.16.0.55", }, ] order_index = 0 @app.route("/order", methods=["GET"]) def place_order(): global 
order_index order = ORDERS[order_index % len(ORDERS)] order_index += 1 with tracer.start_as_current_span("process-order") as span: # Set span attributes containing PII span.set_attribute("user.credit_card", order["credit_card"]) span.set_attribute("user.email", order["email"]) span.set_attribute("client.ip", order["ip"]) span.set_attribute("order.user", order["user"]) # Emit a log record containing PII in the body logger.info( f"Payment processed for card {order['credit_card']} " f"by {order['email']} from {order['ip']}" ) return jsonify({"status": "ok", "user": order["user"]}) @app.route("/health", methods=["GET"]) def health(): return jsonify({"status": "healthy"}) def traffic_generator(): """Background thread that calls /order every 3 seconds.""" time.sleep(5) # Wait for Flask to start while True: try: requests.get("http://localhost:5000/order", timeout=5) except Exception: pass time.sleep(3) if __name__ == "__main__": t = threading.Thread(target=traffic_generator, daemon=True) t.start() app.run(host="0.0.0.0", port=5000) ================================================ FILE: otel-examples/pii-redaction/app/requirements.txt ================================================ flask opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests ================================================ FILE: otel-examples/pii-redaction/config-otel.yaml ================================================ # # OTel Collector YAML: PII Redaction Pipeline # # Demonstrates using the transform processor with OTTL to scrub # sensitive data (credit cards, emails, IPs) from trace attributes # and log bodies before export. 
#
extensions:
  alloyengine:
    config:
      file: /etc/alloy/config.alloy
    flags:
      server.http.listen-addr: 0.0.0.0:12345

receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  # Scrub PII from trace span attributes
  transform/traces:
    error_mode: ignore
    trace_statements:
      - context: span
        statements:
          # Mask credit card numbers (16 digits with optional separators)
          - replace_pattern(attributes["user.credit_card"], "\\d{4}[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}", "****-****-****-****")
          # Mask 15-digit card numbers grouped 4-6-5 (American Express), which
          # the 4-4-4-4 pattern above cannot match. Must run after it so a
          # 16-digit number is never partially matched by this pattern.
          - replace_pattern(attributes["user.credit_card"], "\\d{4}[- ]?\\d{6}[- ]?\\d{5}", "****-****-****-****")
          # Mask email addresses
          - replace_pattern(attributes["user.email"], "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "***@***.***")
          # Mask IP addresses
          - replace_pattern(attributes["client.ip"], "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "***.***.***.***")

  # Scrub PII from log bodies
  transform/logs:
    error_mode: ignore
    log_statements:
      - context: log
        statements:
          # Mask credit card numbers in log body
          - replace_pattern(body, "\\d{4}[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}", "****-****-****-****")
          # Mask 15-digit (4-6-5) card numbers in log body; see the note on
          # statement order in transform/traces
          - replace_pattern(body, "\\d{4}[- ]?\\d{6}[- ]?\\d{5}", "****-****-****-****")
          # Mask email addresses in log body
          - replace_pattern(body, "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "***@***.***")
          # Mask IP addresses in log body (the demo app logs "from <client ip>")
          - replace_pattern(body, "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "***.***.***.***")

  batch: {}

exporters:
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  otlphttp/loki:
    endpoint: http://loki:3100/otlp

  debug:
    verbosity: detailed

service:
  extensions: [alloyengine]
  pipelines:
    traces:
      receivers: [otlp]
      processors: [transform/traces, batch]
      exporters: [otlp/tempo]
    logs:
      receivers: [otlp]
      processors: [transform/logs, batch]
      exporters: [otlphttp/loki, debug]


================================================
FILE: otel-examples/pii-redaction/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.
================================================
FILE: otel-examples/pii-redaction/docker-compose.coda.yml
================================================
# Override used on the hosted (coda) environment: the demo app runs on the
# host network and reaches the collector through localhost.
services:
  demo-app:
    build:
      context: ./app
      args:
        PYTHON_VERSION: ${PYTHON_VERSION:-3.11-slim}
    network_mode: host
    restart: unless-stopped
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317

================================================
FILE: otel-examples/pii-redaction/docker-compose.yml
================================================
version: '3.8'

services:
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - 3200:3200/tcp
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 8888:8888
      - 4317:4317/tcp
      - 4318:4318/tcp
      - 12345:12345 # Alloy UI
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
    command: otel --config=/etc/alloy/config-otel.yaml
    depends_on:
      - loki
      - tempo

  demo-app:
    build:
      context: ./app
      args:
        PYTHON_VERSION: ${PYTHON_VERSION:-3.11-slim}
    depends_on:
      - alloy

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    depends_on:
      - loki
      - tempo
    # The entrypoint writes the datasource provisioning file from a here-document
    # (everything between "<<EOF" and "EOF") and then starts Grafana. The script
    # runs under "sh -e", so a failing redirect would stop the container before
    # /run.sh is reached.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh
================================================ FILE: otel-examples/pii-redaction/loki-config.yaml ================================================ auth_enabled: false server: http_listen_port: 3100 grpc_listen_port: 9096 log_level: info common: instance_addr: 127.0.0.1 path_prefix: /tmp/storage storage: filesystem: chunks_directory: /tmp/storage/chunks rules_directory: /tmp/storage/rules replication_factor: 1 ring: kvstore: store: inmemory query_range: results_cache: cache: embedded_cache: enabled: true max_size_mb: 100 limits_config: metric_aggregation_enabled: true schema_config: configs: - from: 2020-10-24 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h pattern_ingester: enabled: true metric_aggregation: loki_address: localhost:3100 ruler: alertmanager_url: http://localhost:9093 frontend: encoding: protobuf ================================================ FILE: otel-examples/pii-redaction/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info distributor: receivers: otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" ingester: max_block_duration: 5m compactor: compaction: block_retention: 720h storage: trace: backend: local wal: path: /var/tempo/wal local: path: /var/tempo/blocks overrides: defaults: {} ================================================ FILE: otel-examples/resource-enrichment/README.md ================================================ # Resource Enrichment Automatically enrich all telemetry signals with host, OS, and container metadata using the Alloy OTel pipeline -- without changing application code. 
## What This Demonstrates - **`resourcedetection` processor** with `env`, `system`, and `docker` detectors to discover environment metadata - **`resource` processor** to add custom attributes (`deployment.environment`, `service.namespace`) - How the collector adds context that apps do not set themselves (hostname, OS type, architecture) - **Debug exporter** with `detailed` verbosity to inspect enriched resource attributes ## Prerequisites - Docker and Docker Compose ## Run ```bash docker compose up -d ``` ## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore Open Grafana at [http://localhost:3000](http://localhost:3000). ### Check enriched traces in Tempo 1. Go to Explore > Tempo. 2. Search for traces from `enrichment-demo`. 3. Click on any trace and expand the resource attributes. You should see attributes the app did **not** set: - `host.name` -- the collector container's hostname - `os.type` -- detected OS - `host.arch` -- CPU architecture - `deployment.environment` = `demo` - `service.namespace` = `otel-examples` ### Check enriched metrics in Prometheus 1. Go to Explore > Prometheus. 2. Query `app_requests_total` -- the metric labels should include `deployment_environment`, `service_namespace`, and other enriched attributes. ### Inspect debug exporter output ```bash docker compose logs alloy ``` Look for the `debug` exporter output showing the full resource with detected attributes attached. ### Check the Alloy OTel pipeline Visit the Alloy OTel HTTP server at [http://localhost:8888](http://localhost:8888). ## Key Configuration The `config-otel.yaml` pipeline uses two processors: 1. 
**`resourcedetection`** -- Auto-detects environment metadata: - `env` detector: reads `OTEL_RESOURCE_ATTRIBUTES` environment variable - `system` detector: discovers `host.name`, `os.type`, `host.arch` - `docker` detector: discovers container metadata (requires Docker socket mount) - `override: false` ensures app-set attributes are not overwritten 2. **`resource`** -- Adds static attributes: - `deployment.environment` = `demo` - `service.namespace` = `otel-examples` - Uses `upsert` action so existing values are updated but new ones are also created Note: The Alloy container mounts `/var/run/docker.sock` read-only to enable the Docker detector. ## Stop ```bash docker compose down ``` ================================================ FILE: otel-examples/resource-enrichment/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . CMD ["python", "app.py"] ================================================ FILE: otel-examples/resource-enrichment/app/app.py ================================================ """ Demo Flask app for the resource-enrichment scenario. A simple app that generates traces and metrics WITHOUT setting host/container metadata. The Alloy OTel pipeline uses resourcedetection + resource processors to automatically enrich all signals with environment attributes. 
""" import random import threading import time from flask import Flask, jsonify from opentelemetry import trace, metrics from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.sdk.resources import Resource # --- OTel Setup (minimal resource - no host/container info) --- resource = Resource.create({ "service.name": "enrichment-demo", "service.version": "1.0.0", }) # Traces tracer_provider = TracerProvider(resource=resource) tracer_provider.add_span_processor( BatchSpanProcessor(OTLPSpanExporter(endpoint="alloy:4317", insecure=True)) ) trace.set_tracer_provider(tracer_provider) tracer = trace.get_tracer(__name__) # Metrics metric_reader = PeriodicExportingMetricReader( OTLPMetricExporter(endpoint="alloy:4317", insecure=True), export_interval_millis=10000, ) meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) metrics.set_meter_provider(meter_provider) meter = metrics.get_meter(__name__) # Custom metrics request_counter = meter.create_counter("app.requests", description="Total requests") request_duration = meter.create_histogram("app.request.duration", unit="ms", description="Request duration") # --- Flask App --- app = Flask(__name__) FlaskInstrumentor().instrument_app(app) @app.route("/api/users") def list_users(): """Returns a list of mock users.""" with tracer.start_as_current_span("fetch-users") as span: start = time.time() user_count = random.randint(1, 50) span.set_attribute("user.count", user_count) time.sleep(random.uniform(0.01, 0.1)) request_counter.add(1, {"endpoint": "/api/users", "method": 
"GET"}) request_duration.record((time.time() - start) * 1000, {"endpoint": "/api/users"}) return jsonify({"users": [f"user-{i}" for i in range(user_count)]}) @app.route("/api/items") def list_items(): """Returns a list of mock items.""" with tracer.start_as_current_span("fetch-items") as span: start = time.time() item_count = random.randint(1, 100) span.set_attribute("item.count", item_count) time.sleep(random.uniform(0.01, 0.15)) request_counter.add(1, {"endpoint": "/api/items", "method": "GET"}) request_duration.record((time.time() - start) * 1000, {"endpoint": "/api/items"}) return jsonify({"items": [f"item-{i}" for i in range(item_count)]}) @app.route("/health") def health(): return jsonify({"status": "healthy"}) def load_generator(): """Background thread that hits endpoints every 2 seconds.""" import requests base_url = "http://localhost:8080" time.sleep(5) while True: try: endpoint = random.choice(["/api/users", "/api/items"]) requests.get(f"{base_url}{endpoint}", timeout=5) except Exception: pass time.sleep(2) if __name__ == "__main__": thread = threading.Thread(target=load_generator, daemon=True) thread.start() app.run(host="0.0.0.0", port=8080) ================================================ FILE: otel-examples/resource-enrichment/app/requirements.txt ================================================ flask requests opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests ================================================ FILE: otel-examples/resource-enrichment/config-otel.yaml ================================================ # # OTel Collector YAML: Resource Enrichment # # Demonstrates using the resourcedetection processor to automatically # discover and attach environment metadata (host, OS, Docker container) # to all telemetry signals without any app-level changes. 
# extensions: alloyengine: config: file: /etc/alloy/config.alloy flags: server.http.listen-addr: 0.0.0.0:12345 receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: # Auto-detect environment metadata resourcedetection: detectors: [env, system, docker] system: hostname_sources: ["os"] resource_attributes: host.name: enabled: true os.type: enabled: true host.arch: enabled: true docker: resource_attributes: host.name: enabled: true os.type: enabled: true timeout: 5s override: false # Add custom resource attributes resource: attributes: - key: deployment.environment value: demo action: upsert - key: service.namespace value: otel-examples action: upsert batch: {} exporters: otlp/tempo: endpoint: tempo:4317 tls: insecure: true otlphttp/prometheus: endpoint: http://prometheus:9090/api/v1/otlp tls: insecure: true debug: verbosity: detailed service: extensions: [alloyengine] pipelines: traces: receivers: [otlp] processors: [resourcedetection, resource, batch] exporters: [otlp/tempo, debug] metrics: receivers: [otlp] processors: [resourcedetection, resource, batch] exporters: [otlphttp/prometheus] ================================================ FILE: otel-examples/resource-enrichment/config.alloy ================================================ // Minimal Alloy config to enable the Alloy UI alongside the OTel Engine. // The OTel pipeline is defined in config-otel.yaml. 
================================================
FILE: otel-examples/resource-enrichment/docker-compose.coda.yml
================================================
# Override used on the hosted (coda) environment: the demo app runs on the
# host network and reaches the collector through localhost.
services:
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    network_mode: host
    restart: unless-stopped
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
      - OTEL_SERVICE_NAME=enrichment-demo

================================================
FILE: otel-examples/resource-enrichment/docker-compose.yml
================================================
version: '3.8'

services:
  # Prometheus for metrics storage (with OTLP receiver)
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --web.enable-otlp-receiver
      - --enable-feature=native-histograms
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - 9090:9090/tcp
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  # Tempo for trace storage
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - 3200:3200/tcp
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml

  # Grafana for visualization
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # The entrypoint writes the datasource provisioning file from a here-document
    # (everything between "<<EOF" and "EOF") and then starts Grafana. The script
    # runs under "sh -e", so a failing redirect would stop the container before
    # /run.sh is reached.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        EOF
        /run.sh
    depends_on:
      - prometheus
      - tempo

  # Alloy in OTel engine mode (with Docker socket for container
detection) alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} command: otel --config=/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel engine HTTP server - 4317:4317 # OTLP gRPC - 4318:4318 # OTLP HTTP - 12345:12345 # Alloy UI volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml - ./config.alloy:/etc/alloy/config.alloy - /var/run/docker.sock:/var/run/docker.sock:ro depends_on: - prometheus - tempo # Demo app with minimal resource attributes (collector enriches them) demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=enrichment-demo depends_on: - alloy ================================================ FILE: otel-examples/resource-enrichment/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s otlp: promote_resource_attributes: - service.instance.id - service.name - service.namespace - service.version - deployment.environment storage: tsdb: out_of_order_time_window: 30m ================================================ FILE: otel-examples/resource-enrichment/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info distributor: receivers: otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" ingester: max_block_duration: 5m compactor: compaction: block_retention: 720h metrics_generator: registry: external_labels: source: tempo cluster: docker-compose storage: path: /var/tempo/generator/wal remote_write: - url: http://prometheus:9090/api/v1/write send_exemplars: true traces_storage: path: /var/tempo/generator/traces processor: local_blocks: filter_server_spans: false flush_to_storage: true storage: trace: backend: local wal: path: /var/tempo/wal local: path: /var/tempo/blocks overrides: defaults: metrics_generator: 
processors: [service-graphs, span-metrics, local-blocks] generate_native_histograms: both ================================================ FILE: otel-examples/routing-multi-tenant/README.md ================================================ # Routing Multi-Tenant Demonstrates using the OTel Collector **forward connector** and **filter processor** to route logs from different tenants into separate Loki organizations. A single OTLP intake pipeline fans out to per-tenant pipelines, each filtering by a `tenant` resource attribute and exporting with the correct `X-Scope-OrgID` header. ## What This Demonstrates - **Forward connector** to fan out logs from one pipeline into multiple downstream pipelines - **Filter processor** to keep only logs matching a specific tenant - **Resource processor** to enrich logs with per-tenant attributes - **Multi-tenant Loki** with `auth_enabled: true` and `X-Scope-OrgID` header routing - Querying isolated tenant data in Grafana using separate datasources ## Prerequisites - Docker and Docker Compose ## Run ```bash docker compose up -d ``` The log generator automatically sends logs for both tenants every 2 seconds. ## Alloy UI The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline. If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`. ## Explore 1. Open Grafana at [http://localhost:3000](http://localhost:3000) (no login required). 2. Go to **Explore**. ### Query team-a logs 3. Select the **Loki (team-a)** datasource and run: ```logql {service_name="frontend-service"} ``` You should only see logs from team-a (frontend-service messages). ### Query team-b logs 4. 
Switch to the **Loki (team-b)** datasource and run: ```logql {service_name="order-service"} ``` You should only see logs from team-b (order-service messages). ### Verify isolation 5. Confirm that team-a's datasource cannot see team-b's logs and vice versa -- this is enforced by Loki's multi-tenant `X-Scope-OrgID` header. ## Key Configuration The `config-otel.yaml` uses three pipelines arranged in two stages: 1. **Intake pipeline** (`logs/intake`) -- receives all OTLP logs and exports to two forward connectors (`forward/team-a` and `forward/team-b`). 2. **Per-tenant pipelines** (`logs/team-a`, `logs/team-b`) -- each receives from its forward connector, applies a filter processor that drops logs not matching the tenant, enriches with a resource processor, and exports to a tenant-specific Loki exporter with the appropriate `X-Scope-OrgID` header. The filter processors use `resource.attributes["tenant"] != "team-a"` (and `team-b`) to drop non-matching logs, effectively routing each tenant's data to its own Loki organization. ## Stop ```bash docker compose down ``` ================================================ FILE: otel-examples/routing-multi-tenant/app/generate_logs.py ================================================ """ Multi-tenant log generator using OTel SDK. Alternates between sending logs with resource attribute tenant="team-a" and tenant="team-b" via OTLP gRPC to alloy:4317.
""" import logging import time import random from opentelemetry.sdk.resources import Resource from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter TEAM_A_MESSAGES = [ "Team A: Deployed frontend v2.3.1 to production", "Team A: User authentication service healthy", "Team A: CDN cache invalidation completed", "Team A: A/B test experiment-42 started for 10% of users", "Team A: Search index rebuild finished in 23s", "Team A: Rate limiter triggered for IP range 10.0.0.0/8", ] TEAM_B_MESSAGES = [ "Team B: Payment gateway latency increased to 450ms", "Team B: Inventory sync completed for warehouse-west", "Team B: Order fulfillment pipeline processed 1,247 orders", "Team B: Database replica lag at 120ms", "Team B: Shipping label API returned 503, retrying", "Team B: Nightly report generation started", ] LEVELS = [logging.DEBUG, logging.INFO, logging.INFO, logging.WARNING, logging.ERROR] def create_logger(tenant: str, service_name: str) -> logging.Logger: """Create an OTel-instrumented logger for a specific tenant.""" resource = Resource.create({ "service.name": service_name, "tenant": tenant, }) exporter = OTLPLogExporter(endpoint="alloy:4317", insecure=True) provider = LoggerProvider(resource=resource) provider.add_log_record_processor(BatchLogRecordProcessor(exporter)) handler = LoggingHandler(level=logging.DEBUG, logger_provider=provider) logger = logging.getLogger(f"tenant-{tenant}") logger.setLevel(logging.DEBUG) logger.addHandler(handler) return logger def main(): print("Starting multi-tenant log generator...") time.sleep(3) # Wait for Alloy to be ready logger_a = create_logger("team-a", "frontend-service") logger_b = create_logger("team-b", "order-service") while True: # Send a team-a log level = random.choice(LEVELS) msg = random.choice(TEAM_A_MESSAGES) logger_a.log(level, msg) time.sleep(1) # Send a team-b 
log level = random.choice(LEVELS) msg = random.choice(TEAM_B_MESSAGES) logger_b.log(level, msg) time.sleep(1) if __name__ == "__main__": main() ================================================ FILE: otel-examples/routing-multi-tenant/app/requirements.txt ================================================ opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-grpc ================================================ FILE: otel-examples/routing-multi-tenant/config-otel.yaml ================================================ # # OTel Collector YAML: Multi-Tenant Routing # # Demonstrates using the forward connector to fan out logs into # multiple pipelines, then filter processors to route by tenant # attribute. Each tenant gets its own processing and Loki org ID. # # Available connectors in Alloy OTel Engine: count, grafanacloud, # servicegraph, spanmetrics, forward. # extensions: alloyengine: config: file: /etc/alloy/config.alloy flags: server.http.listen-addr: 0.0.0.0:12345 receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 connectors: # Fork logs into per-tenant pipelines forward/team-a: {} forward/team-b: {} processors: batch: {} # Keep only team-a logs filter/team-a: error_mode: ignore logs: log_record: - resource.attributes["tenant"] != "team-a" # Keep only team-b logs filter/team-b: error_mode: ignore logs: log_record: - resource.attributes["tenant"] != "team-b" # Add team resource attribute for team-a resource/team-a: attributes: - key: team value: team-a action: upsert # Add team resource attribute for team-b resource/team-b: attributes: - key: team value: team-b action: upsert exporters: otlphttp/loki-team-a: endpoint: http://loki:3100/otlp headers: X-Scope-OrgID: team-a otlphttp/loki-team-b: endpoint: http://loki:3100/otlp headers: X-Scope-OrgID: team-b service: extensions: [alloyengine] pipelines: # Intake: receive and fan out to both tenant pipelines logs/intake: receivers: [otlp] exporters: [forward/team-a, 
forward/team-b] # Team A pipeline: filter + enrich + export logs/team-a: receivers: [forward/team-a] processors: [filter/team-a, resource/team-a, batch] exporters: [otlphttp/loki-team-a] # Team B pipeline: filter + enrich + export logs/team-b: receivers: [forward/team-b] processors: [filter/team-b, resource/team-b, batch] exporters: [otlphttp/loki-team-b] ================================================ FILE: otel-examples/routing-multi-tenant/config.alloy ================================================ // Minimal Alloy config to enable the Alloy UI alongside the OTel Engine. // The OTel pipeline is defined in config-otel.yaml. ================================================ FILE: otel-examples/routing-multi-tenant/docker-compose.coda.yml ================================================ services: log-generator: image: python:${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped volumes: - ./app/generate_logs.py:/app/generate_logs.py - ./app/requirements.txt:/app/requirements.txt command: - sh - -c - "pip install -r /app/requirements.txt && python /app/generate_logs.py" environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 ================================================ FILE: otel-examples/routing-multi-tenant/docker-compose.yml ================================================ version: '3.8' services: loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - "3100:3100" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 8888:8888 - 4317:4317/tcp - 4318:4318/tcp - 12345:12345 # Alloy UI volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml - ./config.alloy:/etc/alloy/config.alloy command: otel --config=/etc/alloy/config-otel.yaml depends_on: - loki log-generator: image: python:${PYTHON_VERSION:-3.11-slim} volumes: - ./app/generate_logs.py:/app/generate_logs.py - 
./app/requirements.txt:/app/requirements.txt command: - sh - -c - "pip install -r /app/requirements.txt && python /app/generate_logs.py" depends_on: - alloy grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp depends_on: - loki entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki (team-a) type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: true version: 1 editable: false jsonData: httpHeaderName1: X-Scope-OrgID secureJsonData: httpHeaderValue1: team-a - name: Loki (team-b) type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false version: 1 editable: false jsonData: httpHeaderName1: X-Scope-OrgID secureJsonData: httpHeaderValue1: team-b EOF /run.sh ================================================ FILE: otel-examples/routing-multi-tenant/loki-config.yaml ================================================ auth_enabled: true server: http_listen_port: 3100 grpc_listen_port: 9096 log_level: info common: instance_addr: 127.0.0.1 path_prefix: /tmp/storage storage: filesystem: chunks_directory: /tmp/storage/chunks rules_directory: /tmp/storage/rules replication_factor: 1 ring: kvstore: store: inmemory query_range: results_cache: cache: embedded_cache: enabled: true max_size_mb: 100 limits_config: metric_aggregation_enabled: true schema_config: configs: - from: 2020-10-24 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h pattern_ingester: enabled: true metric_aggregation: loki_address: localhost:3100 ruler: alertmanager_url: http://localhost:9093 frontend: encoding: protobuf ================================================ FILE: otel-metrics-pipeline/README.md ================================================ # OTel Metrics
Pipeline Demonstrates a full OpenTelemetry metrics pipeline through Grafana Alloy: a Python application generates OTLP metrics which flow through Alloy (with batching and attribute transformation) into Prometheus, and are visualized in Grafana. ## Overview The pipeline includes: - **Python demo app** -- generates counters, histograms, and up-down counters via the OpenTelemetry SDK, sending them as OTLP/gRPC to Alloy. - **Grafana Alloy** -- receives OTLP metrics, batches them, applies a transform processor (adds a `deployment.environment` resource attribute), and exports via OTLP/HTTP to Prometheus. - **Prometheus** -- ingests metrics through its native OTLP receiver with native histogram support enabled. - **Grafana** -- auto-provisioned with a Prometheus datasource for exploring the metrics. ## Running the Demo 1. Clone the repository: ``` git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios ``` 2. Navigate to this example directory: ``` cd otel-metrics-pipeline ``` 3. Run using Docker Compose: ``` docker compose up -d ``` Or use the centralized image management: ``` cd .. ./run-example.sh otel-metrics-pipeline ``` 4. Access the services: - **Grafana**: http://localhost:3000 - **Alloy UI**: http://localhost:12345 - **Prometheus**: http://localhost:9090 ## What to Expect After a few seconds the demo app begins emitting metrics. You can explore them in several ways: - **Prometheus** -- navigate to http://localhost:9090 and query for metrics such as `app_requests_total`, `app_errors_total`, `app_request_duration_milliseconds`, or `app_active_users`. Note that OTLP metric names are translated to Prometheus conventions (dots become underscores, units are appended as suffixes). - **Grafana Explore** -- open http://localhost:3000/explore, select the Prometheus datasource, and build PromQL queries against the ingested metrics. 
- **Alloy pipeline UI** -- visit http://localhost:12345 to inspect the live component graph showing the receiver, batch processor, transform processor, and exporter. ## Metrics Generated | Metric | Type | Description | |---|---|---| | `app.requests.total` | Counter | Total HTTP requests by endpoint, method, and status | | `app.errors.total` | Counter | Total errors by endpoint | | `app.request.duration` | Histogram | Request latency in milliseconds | | `app.active_users` | UpDownCounter | Current active users by region | ## Architecture ``` ┌─────────────┐ OTLP/gRPC ┌───────────────┐ OTLP/HTTP ┌────────────┐ │ Python App │─────────────▶│ Grafana Alloy │────────────▶│ Prometheus │ │ (metrics gen)│ :4317 │ (batch + │ :9090 │ │ └─────────────┘ │ transform) │ └─────┬──────┘ └───────────────┘ │ :12345 │ (Alloy UI) ▼ ┌──────────┐ │ Grafana │ │ :3000 │ └──────────┘ ``` ================================================ FILE: otel-metrics-pipeline/app/main.py ================================================ import time import random from opentelemetry import metrics from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader from opentelemetry.sdk.resources import Resource resource = Resource.create({"service.name": "demo-metrics-app"}) exporter = OTLPMetricExporter() reader = PeriodicExportingMetricReader(exporter, export_interval_millis=5000) provider = MeterProvider(resource=resource, metric_readers=[reader]) metrics.set_meter_provider(provider) meter = metrics.get_meter(__name__) # Create different metric types request_counter = meter.create_counter("app.requests.total", description="Total requests", unit="requests") error_counter = meter.create_counter("app.errors.total", description="Total errors", unit="errors") latency_histogram = meter.create_histogram("app.request.duration", description="Request duration", 
unit="ms") active_users = meter.create_up_down_counter("app.active_users", description="Active users") print("Starting OTLP metrics generator...") while True: # Simulate request metrics endpoint = random.choice(["/api/users", "/api/orders", "/api/products", "/health"]) method = random.choice(["GET", "POST"]) status = random.choice(["200", "200", "200", "200", "404", "500"]) request_counter.add(1, {"endpoint": endpoint, "method": method, "status": status}) if status == "500": error_counter.add(1, {"endpoint": endpoint}) latency = random.uniform(5, 500) if status != "500" else random.uniform(500, 2000) latency_histogram.record(latency, {"endpoint": endpoint, "method": method}) # Simulate active users fluctuation active_users.add(random.choice([-1, 0, 1]), {"region": random.choice(["us-east", "eu-west"])}) time.sleep(1) ================================================ FILE: otel-metrics-pipeline/config-otel.yaml ================================================ # # OTel Collector YAML Configuration for OpenTelemetry Metrics Pipeline # # This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine. 
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: batch: {} transform: error_mode: ignore metric_statements: - context: resource statements: - set(attributes["deployment.environment"], "demo") exporters: otlphttp/prometheus: endpoint: http://prometheus:9090/api/v1/otlp tls: insecure: true service: pipelines: metrics: receivers: [otlp] processors: [batch, transform] exporters: [otlphttp/prometheus] ================================================ FILE: otel-metrics-pipeline/config.alloy ================================================ livedebugging { enabled = true } // Receive OTLP metrics from the demo app otelcol.receiver.otlp "default" { http { } grpc { } output { metrics = [otelcol.processor.batch.default.input] } } // Batch metrics for efficient export otelcol.processor.batch "default" { output { metrics = [otelcol.processor.transform.default.input] } } // Transform metric attributes (example: add a deployment label) otelcol.processor.transform "default" { error_mode = "ignore" metric_statements { context = "resource" statements = [ "set(attributes[\"deployment.environment\"], \"demo\")", ] } output { metrics = [otelcol.exporter.otlphttp.prometheus.input] } } // Export metrics to Prometheus via OTLP otelcol.exporter.otlphttp "prometheus" { client { endpoint = "http://prometheus:9090/api/v1/otlp" tls { insecure = true } } } ================================================ FILE: otel-metrics-pipeline/docker-compose-otel.yml ================================================ # OTel Engine Override # # Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config # instead of the River/HCL config.alloy file. 
# # Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # Stop: docker compose -f docker-compose.yml -f docker-compose-otel.yml down # services: alloy: command: otel --config=/etc/alloy/config-otel.yaml volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel Engine HTTP server ================================================ FILE: otel-metrics-pipeline/docker-compose.coda.yml ================================================ services: app: image: python:${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped volumes: - ./app:/app working_dir: /app command: sh -c "pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-grpc && python3 main.py" environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 ================================================ FILE: otel-metrics-pipeline/docker-compose.yml ================================================ services: # Python app that generates OTLP metrics app: image: python:${PYTHON_VERSION:-3.11-slim} volumes: - ./app:/app working_dir: /app command: sh -c "pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-grpc && python3 main.py" environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 depends_on: - alloy # Alloy for telemetry pipeline alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 # Alloy HTTP server - 4317:4317 # OTLP gRPC - 4318:4318 # OTLP HTTP volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - prometheus # Prometheus for metrics storage prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --web.enable-otlp-receiver - --enable-feature=native-histograms - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - 
./prom-config.yaml:/etc/prometheus/prometheus.yml # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh depends_on: - prometheus ================================================ FILE: otel-metrics-pipeline/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s otlp: promote_resource_attributes: - service.instance.id - service.name - service.namespace - service.version - deployment.environment storage: tsdb: out_of_order_time_window: 30m ================================================ FILE: otel-span-metrics/README.md ================================================ # OTel Span Metrics (RED Metrics from Traces) This scenario demonstrates how to generate **RED metrics** (Request rate, Error rate, Duration) from OpenTelemetry traces using Grafana Alloy's `otelcol.connector.spanmetrics` component. ## Overview Instead of relying on Tempo's built-in metrics generator, this approach uses Alloy's spanmetrics connector to derive metrics directly from trace spans in the telemetry pipeline. This gives you fine-grained control over which dimensions are extracted and how histograms are configured.
### Architecture ``` Flask App ---(OTLP/gRPC)---> Alloy ---> Tempo (traces) | +---> spanmetrics connector ---> Prometheus (RED metrics) ``` ### What Gets Generated The `otelcol.connector.spanmetrics` component produces the following metrics from every span: - **`duration_milliseconds`** - Histogram of span durations (for latency/duration analysis) - **`calls`** - Counter of span calls, with `status_code` label (for request rate and error rate) Additional dimensions extracted: `http.method`, `http.status_code`. ## Running ```bash # From repo root ./run-example.sh otel-span-metrics # Or directly cd otel-span-metrics && docker compose up -d ``` ## Accessing the UIs | Service | URL | |------------|----------------------------| | Grafana | http://localhost:3000 | | Alloy | http://localhost:12345 | | Prometheus | http://localhost:9090 | | Tempo | http://localhost:3200 | | Demo App | http://localhost:5000 | ## Exploring the Metrics Once the scenario is running and the load generator has been active for a minute or so, open Grafana and navigate to the **Explore** page with the **Prometheus** datasource. 
Try these queries: ```promql # Request rate by service and span name rate(duration_milliseconds_count[5m]) # Error rate (spans with error status) rate(calls{status_code="STATUS_CODE_ERROR"}[5m]) # P95 latency by span name histogram_quantile(0.95, rate(duration_milliseconds_bucket[5m])) ``` ## Stopping ```bash cd otel-span-metrics && docker compose down ``` ================================================ FILE: otel-span-metrics/app/load.py ================================================ import requests, time, random endpoints = ["http://app:5000/", "http://app:5000/api/data", "http://app:5000/api/slow"] while True: try: url = random.choice(endpoints[:2]) # mostly hit fast endpoints if random.random() < 0.1: url = endpoints[2] # occasionally hit slow requests.get(url, timeout=5) except: pass time.sleep(random.uniform(0.5, 2.0)) ================================================ FILE: otel-span-metrics/app/main.py ================================================ from flask import Flask, jsonify import random, time from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.sdk.resources import Resource resource = Resource.create({"service.name": "demo-app"}) provider = TracerProvider(resource=resource) exporter = OTLPSpanExporter() provider.add_span_processor(BatchSpanProcessor(exporter)) trace.set_tracer_provider(provider) tracer = trace.get_tracer(__name__) app = Flask(__name__) @app.route("/") def index(): with tracer.start_as_current_span("index"): time.sleep(random.uniform(0.01, 0.05)) return jsonify({"status": "ok"}) @app.route("/api/data") def get_data(): with tracer.start_as_current_span("get-data"): time.sleep(random.uniform(0.02, 0.1)) if random.random() < 0.1: raise Exception("Random error") return jsonify({"data": [1, 2, 3]}) @app.route("/api/slow") def slow(): with 
tracer.start_as_current_span("slow-operation"): time.sleep(random.uniform(0.5, 2.0)) return jsonify({"status": "done"}) if __name__ == "__main__": app.run(host="0.0.0.0", port=5000) ================================================ FILE: otel-span-metrics/app/requirements.txt ================================================ flask opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-grpc ================================================ FILE: otel-span-metrics/config-otel.yaml ================================================ # # OTel Collector YAML Configuration for Span Metrics (RED Metrics from Traces) # # This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine. # Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: batch: {} connectors: spanmetrics: histogram: explicit: {} dimensions: - name: http.method - name: http.status_code metrics_flush_interval: 5s exporters: otlphttp/prometheus: endpoint: http://prometheus:9090/api/v1/otlp tls: insecure: true otlp/tempo: endpoint: tempo:4317 tls: insecure: true service: pipelines: traces: receivers: [otlp] processors: [batch] exporters: [spanmetrics, otlp/tempo] metrics: receivers: [spanmetrics] exporters: [otlphttp/prometheus] ================================================ FILE: otel-span-metrics/config.alloy ================================================ livedebugging { enabled = true } // Receive OTel traces from the demo app otelcol.receiver.otlp "default" { http { } grpc { } output { traces = [otelcol.processor.batch.default.input] } } // Batch traces for efficiency otelcol.processor.batch "default" { output { traces = [ otelcol.connector.spanmetrics.default.input, otelcol.exporter.otlp.tempo.input, ] } } // Generate RED metrics from spans otelcol.connector.spanmetrics "default" { histogram { explicit { } } dimension { name = "http.method" } 
dimension { name = "http.status_code" } metrics_flush_interval = "5s" output { metrics = [otelcol.exporter.otlphttp.prometheus.input] } } // Send RED metrics to Prometheus otelcol.exporter.otlphttp "prometheus" { client { endpoint = "http://prometheus:9090/api/v1/otlp" tls { insecure = true } } } // Send traces to Tempo otelcol.exporter.otlp "tempo" { client { endpoint = "tempo:4317" tls { insecure = true } } } ================================================ FILE: otel-span-metrics/docker-compose-otel.yml ================================================ # OTel Engine Override # # Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config # instead of the River/HCL config.alloy file. # # Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # Stop: docker compose -f docker-compose.yml -f docker-compose-otel.yml down # services: alloy: command: otel --config=/etc/alloy/config-otel.yaml volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel Engine HTTP server ================================================ FILE: otel-span-metrics/docker-compose.coda.yml ================================================ services: app: image: python:${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped volumes: - ./app:/app working_dir: /app command: sh -c "pip install -r requirements.txt && python main.py" environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 load: image: python:${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped volumes: - ./app:/app working_dir: /app command: sh -c "pip install requests && python load.py" ================================================ FILE: otel-span-metrics/docker-compose.yml ================================================ services: # Python Flask app that generates traces app: image: python:${PYTHON_VERSION:-3.11-slim} ports: - 5000:5000/tcp volumes: - ./app:/app working_dir: /app command: sh -c "pip install -r 
requirements.txt && python main.py" environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 depends_on: - alloy # Load generator to continuously hit the app endpoints load: image: python:${PYTHON_VERSION:-3.11-slim} volumes: - ./app:/app working_dir: /app command: sh -c "pip install requests && python load.py" depends_on: - app # Alloy for telemetry pipeline with spanmetrics connector alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 # Alloy HTTP server - 4317:4317 # OTLP gRPC - 4318:4318 # OTLP HTTP volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - prometheus - tempo # Prometheus for metrics collection prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --web.enable-otlp-receiver - --enable-feature=native-histograms - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml # Tempo for trace storage tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200/tcp volumes: - ./tempo-config.yaml:/etc/tempo.yaml depends_on: - tempo-init - memcached # Init container to set up Tempo storage directories tempo-init: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} user: root entrypoint: - "chown" - "10001:10001" - "/var/tempo" volumes: - tempo-data:/var/tempo memcached: image: memcached:1.6@sha256:277e0c4f249b118e95ab10e535bae2fa1af772271d9152f3468e58d59348db56 container_name: memcached ports: - "11211:11211" environment: - MEMCACHED_MAX_MEMORY=64m - MEMCACHED_THREADS=4 # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | 
mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: false version: 1 editable: false jsonData: serviceMap: datasourceUid: 'Prometheus' nodeGraph: enabled: true EOF /run.sh depends_on: - prometheus - tempo volumes: tempo-data: ================================================ FILE: otel-span-metrics/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s otlp: promote_resource_attributes: - service.instance.id - service.name - service.namespace - service.version storage: tsdb: out_of_order_time_window: 30m ================================================ FILE: otel-span-metrics/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info cache: background: writeback_goroutines: 5 caches: - roles: - frontend-search memcached: addresses: dns+memcached:11211 query_frontend: search: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 metadata_slo: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 trace_by_id: duration_slo: 100ms metrics: max_duration: 200h query_backend_after: 5m duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 distributor: receivers: jaeger: protocols: thrift_http: endpoint: "tempo:14268" grpc: endpoint: "tempo:14250" thrift_binary: endpoint: "tempo:6832" thrift_compact: endpoint: "tempo:6831" zipkin: endpoint: "tempo:9411" otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" opencensus: endpoint: "tempo:55678" ingester: max_block_duration: 5m compactor: compaction: block_retention: 720h storage: trace: backend: local wal: path: /var/tempo/wal local: path:
/var/tempo/blocks ================================================ FILE: otel-tail-sampling/README.md ================================================ # OpenTelemetry Tail Sampling with Grafana Alloy This example demonstrates how to implement tail sampling for OpenTelemetry traces using Grafana Alloy, allowing you to intelligently filter and sample traces based on various criteria. ## Overview The example includes: - A Python Flask application that automatically generates different types of traces in the background - Grafana Alloy configured with tail sampling policies and transform processor - Tempo for trace storage and querying - Prometheus for metrics collection - Grafana for visualization - Live debugging for monitoring the sampling process ## Running the Demo 1. Clone the repository: ``` git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios ``` 2. Navigate to this example directory: ``` cd otel-tail-sampling ``` 3. Run using Docker Compose: ``` docker compose up -d ``` Or use the centralized image management: ``` cd .. ./run-example.sh otel-tail-sampling ``` 4. Access the demo application at http://localhost:8080 5. Access Grafana at http://localhost:3000 6. Access Prometheus at http://localhost:9090 7. Access Alloy's live debugging endpoint at http://localhost:12345/debug/livedebugging ## What to Expect The demo application automatically generates various types of traces in the background: - **Simple Traces**: Basic single-span traces - **Nested Traces**: Traces with parent-child relationships - **Error Traces**: Traces containing errors - **High Latency Traces**: Traces with execution times over 5 seconds - **Delayed Chain Traces**: Service chains with Service D consistently having high latency (3-4 seconds) You can also manually trigger trace generation using the web UI. The application will continuously generate a mix of these trace types in the background at random intervals. 
## Processing Pipeline This example demonstrates a more complex trace processing pipeline with the following components: > Note: The tail sampling processor is deliberately placed before the batch processor. This ensures that trace spans are presented to the tail sampler as early as possible, so that a decision period includes all relevant spans for a trace. Batching spans before the sampler can delay them so that they arrive after a sampling decision has already been made (the decision period starts once the first span for a trace has been seen). This can lead to incorrect decisions being made, and makes the pipeline rely on a decision cache being enabled for spans that arrive later. 1. **OTLP Receiver**: Receives traces from the application via gRPC or HTTP 2. **Tail Sampling Processor**: Applies sampling policies based on trace properties 3. **Batch Processor**: Groups spans for efficient processing 4. **OTLP Exporter**: Sends sampled traces to Tempo ## Tail Sampling Configuration This example uses Alloy's `otelcol.processor.tail_sampling` processor, which makes sampling decisions based on the entire trace, not just individual spans. This allows for more intelligent sampling based on trace-wide properties. > Note: Tempo indexes on trace IDs and span IDs, not resource attributes. Make sure you only send one copy of each span to Tempo. If the same spans are sent more than once (for example, under different service names), then when requesting trace IDs or carrying out TraceQL queries, the returned traces will consist of whichever duplicate span is encountered first. This means that subsequent queries will potentially not yield the same result, and that the service names for spans in the same trace could be a mix of `raw-traces` and `trace-demo-tail-sampled`, or a trace could appear to be sampled when it was in fact unsampled, or vice versa. To ensure consistency, only one set of spans with a unique span ID and trace ID should be emitted to Tempo. The tail sampling configuration includes the following policies: 1.
**Attribute-Based Sampling**: Samples traces with a specific attribute value ``` policy { name = "test-attribute-policy" type = "string_attribute" string_attribute { key = "test_attr_key_1" values = ["test_attr_val_1"] } } ``` 2. **Error Sampling**: Always samples traces with ERROR status ``` policy { name = "error-policy" type = "status_code" status_code { status_codes = ["ERROR"] } } ``` 3. **Latency-Based Sampling**: Samples traces that exceed a latency threshold ``` policy { name = "latency-policy" type = "latency" latency { threshold_ms = 5000 // 5 seconds } } ``` 4. **Numerical Range Sampling**: Samples traces with a numeric attribute in a specific range ``` policy { name = "numeric-policy" type = "numeric_attribute" numeric_attribute { key = "key1" min_value = 70 max_value = 100 } } ``` 5. **URL-Based Filtering**: Excludes health check and metrics endpoints ``` policy { name = "url-filter-policy" type = "string_attribute" string_attribute { key = "http.url" values = ["/health", "/metrics"] invert_match = true } } ``` 6. 
**Probabilistic Sampling**: Samples a percentage of remaining traces ``` policy { name = "probabilistic-policy" type = "probabilistic" probabilistic { sampling_percentage = 10 } } ``` ## Live Debugging This example enables Alloy's live debugging feature, which provides real-time insights into the sampling process: ``` livedebugging { enabled = true } ``` Access the live debugging interface at http://localhost:12345 to see: - Current processing pipeline state - Trace sampling decisions in real-time - Policy hit counts and performance metrics - Throughput statistics ## Sampling Implications With tail sampling enabled in this example: - All error traces are preserved for troubleshooting - High latency traces (>5s) are kept for performance analysis - Traces with specific attribute values used for monitoring are retained - Health check and metrics endpoints are filtered out to reduce noise - A small percentage of other traces are kept for baseline monitoring - Traces not matching any criteria are dropped, reducing storage needs - Raw traces are stored with a different service name for comparison ## Viewing Traces in Grafana To view the sampled traces: 1. Open Grafana (http://localhost:3000) 2. Navigate to Explore 3. Select the Tempo data source 4. Use the Search tab to find traces based on various criteria ## Sample Queries Try these queries in Grafana's Tempo Explorer: - Find all traces for the sampled service: ``` {resource.service.name="trace-demo-tail-sampled"} ``` - Find error traces: ``` {status=error} ``` - Find high latency traces: ``` {duration>5s} ``` - Find traces with a specific attribute: ``` {span.test_attr_key_1="test_attr_val_1"} ``` - Find traces with Service D bottleneck: ``` {span.service.latency="high" && span.latency.category="bottleneck"} ``` ## Customizing You can modify the `config.alloy` file to adjust the sampling policies: - Change the decision wait time to balance memory usage vs. 
complete trace visibility - Adjust the sampling thresholds to capture more or fewer traces - Add additional sampling policies based on your specific needs - Modify the existing policies to match your application's attributes - Update the transform processor to add or modify different attributes ## Further Resources - [Grafana Alloy Tail Sampling Documentation](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.tail_sampling/) - [Grafana Alloy Transform Processor Documentation](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.transform/) - [OpenTelemetry Tail Sampling Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor) - [Live Debugging in Grafana Alloy](https://grafana.com/docs/alloy/latest/debug-alloy-flow/) ================================================ FILE: otel-tail-sampling/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . 
CMD ["python", "app.py"] ================================================ FILE: otel-tail-sampling/app/app.py ================================================ import os import random import time import threading import logging import uuid from flask import Flask, request from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.resources import Resource, SERVICE_NAME from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.instrumentation.requests import RequestsInstrumentor import requests from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Configure the tracer resource = Resource.create(attributes={ SERVICE_NAME: "trace-demo-tail-sampled" }) trace.set_tracer_provider(TracerProvider(resource=resource)) # Configure the OTLP exporter using environment variables # OTEL_EXPORTER_OTLP_ENDPOINT will be used automatically otlp_exporter = OTLPSpanExporter(endpoint="http://alloy:4317/v1/traces", insecure=True) span_processor = BatchSpanProcessor(span_exporter=otlp_exporter, max_export_batch_size=1) trace.get_tracer_provider().add_span_processor(span_processor) # Create a tracer tracer = trace.get_tracer(__name__) # Create a propagator for handling trace context propagator = TraceContextTextMapPropagator() # Create a Flask application app = Flask(__name__) # Instrument Flask FlaskInstrumentor().instrument_app(app) # Instrument requests RequestsInstrumentor().instrument() # Background trace generation functions def generate_simple_trace(): with tracer.start_as_current_span("simple-operation") as span: span.set_attribute("operation.type", "simple") 
span.set_attribute("operation.value", random.randint(1, 100)) # Set a sampling-relevant attribute span.set_attribute("test_attr_key_1", "test_attr_val_1" if random.random() < 0.3 else "other_value") time.sleep(0.1) # Simulate work logger.info("Generated simple trace") def generate_nested_trace(): with tracer.start_as_current_span("parent-operation") as parent: parent.set_attribute("operation.type", "parent") parent.set_attribute("key1", random.randint(1, 100)) # For numeric attribute sampling time.sleep(0.05) # Simulate work with tracer.start_as_current_span("child-operation-1") as child1: child1.set_attribute("operation.type", "child") child1.set_attribute("child.number", 1) child1.set_attribute("key2", "value1" if random.random() < 0.5 else "other_value") # For string attribute sampling time.sleep(0.05) # Simulate work with tracer.start_as_current_span("child-operation-2") as child2: child2.set_attribute("operation.type", "child") child2.set_attribute("child.number", 2) time.sleep(0.05) # Simulate work with tracer.start_as_current_span("grandchild-operation") as grandchild: grandchild.set_attribute("operation.type", "grandchild") time.sleep(0.05) # Simulate work logger.info("Generated nested trace") def generate_error_trace(): with tracer.start_as_current_span("error-operation") as span: span.set_attribute("operation.type", "error") try: # Simulate an error result = 1 / 0 except Exception as e: span.record_exception(e) span.set_status(trace.StatusCode.ERROR, str(e)) logger.info("Generated error trace") def generate_high_latency_trace(): with tracer.start_as_current_span("high-latency-operation") as span: span.set_attribute("operation.type", "high-latency") # Add a randomized latency between 3-10 seconds latency = random.uniform(3.0, 10.0) span.set_attribute("latency.seconds", latency) time.sleep(latency) # Simulate high latency work logger.info(f"Generated high latency trace with {latency:.2f}s delay") def generate_delayed_chain_trace(): """Generate a chain of 
service calls with service D having high latency""" try: with tracer.start_as_current_span("delayed-chain-root") as span: span.set_attribute("operation.step", "start") span.set_attribute("operation.type", "delayed-chain") # Start the chain with Service A req_id = random.randint(1000, 9999) # Instead of making HTTP calls in the background, simulate the chain directly with tracer.start_as_current_span("service-a-handler") as span_a: span_a.set_attribute("service", "A") span_a.set_attribute("request.id", str(req_id)) span_a.set_attribute("service.latency", "normal") span_a.set_attribute("http.url", "/delayed/service-a") time.sleep(0.1) # Normal latency with tracer.start_as_current_span("service-b-handler") as span_b: span_b.set_attribute("service", "B") span_b.set_attribute("request.id", str(req_id)) span_b.set_attribute("service.latency", "normal") span_b.set_attribute("http.url", "/delayed/service-b") time.sleep(0.15) # Normal latency with tracer.start_as_current_span("service-c-handler") as span_c: span_c.set_attribute("service", "C") span_c.set_attribute("request.id", str(req_id)) span_c.set_attribute("service.latency", "normal") span_c.set_attribute("http.url", "/delayed/service-c") time.sleep(0.2) # Normal latency with tracer.start_as_current_span("service-d-handler") as span_d: span_d.set_attribute("service", "D") span_d.set_attribute("request.id", str(req_id)) span_d.set_attribute("service.latency", "high") span_d.set_attribute("latency.category", "bottleneck") span_d.set_attribute("http.url", "/delayed/service-d") # This service consistently has high latency (3-4 seconds) delay = random.uniform(3.0, 4.0) span_d.set_attribute("latency.seconds", delay) time.sleep(delay) # High latency with tracer.start_as_current_span("service-e-handler") as span_e: span_e.set_attribute("service", "E") span_e.set_attribute("request.id", str(req_id)) span_e.set_attribute("service.latency", "normal") span_e.set_attribute("http.url", "/delayed/service-e") time.sleep(0.1) # Normal 
latency logger.info("Generated delayed chain trace with high latency in Service D") except Exception as e: logger.error(f"Error generating delayed chain trace: {e}") # New function for generating true multi-service traces def generate_multi_service_trace_bg(): """Generate a trace that spans multiple services with true service.name differentiation""" try: # Create a unique trace ID for correlating spans trace_id = str(uuid.uuid4()) transaction_id = str(uuid.uuid4())[:8] logger.info(f"Generating multi-service trace. Transaction ID: {transaction_id}") # Simulate a microservice architecture with: # 1. Frontend service (web-ui) # 2. API Gateway (api-gateway) # 3. Authentication service (auth-service) # 4. User service (user-service) # 5. Notification service (notification-service) # 6. Database service (db-service) # Create a custom resource for each service web_ui_resource = Resource.create(attributes={SERVICE_NAME: "web-ui"}) api_gw_resource = Resource.create(attributes={SERVICE_NAME: "api-gateway"}) auth_resource = Resource.create(attributes={SERVICE_NAME: "auth-service"}) user_resource = Resource.create(attributes={SERVICE_NAME: "user-service"}) notif_resource = Resource.create(attributes={SERVICE_NAME: "notification-service"}) db_resource = Resource.create(attributes={SERVICE_NAME: "db-service"}) # Create tracers for each service web_ui_tracer = trace.get_tracer("web-ui-tracer", resource=web_ui_resource) api_gw_tracer = trace.get_tracer("api-gw-tracer", resource=api_gw_resource) auth_tracer = trace.get_tracer("auth-tracer", resource=auth_resource) user_tracer = trace.get_tracer("user-tracer", resource=user_resource) notif_tracer = trace.get_tracer("notif-tracer", resource=notif_resource) db_tracer = trace.get_tracer("db-tracer", resource=db_resource) # 1. 
Frontend service (web-ui) - User logs in with web_ui_tracer.start_as_current_span("login-page-render") as web_span: web_span.set_attribute("component", "web-ui") web_span.set_attribute("transaction.id", transaction_id) web_span.set_attribute("user.action", "login") web_span.set_attribute("http.method", "GET") web_span.set_attribute("http.url", "/login") time.sleep(0.1) # 2. Send login request to API Gateway with api_gw_tracer.start_as_current_span("api-gateway-login-handler") as api_span: api_span.set_attribute("component", "api-gateway") api_span.set_attribute("transaction.id", transaction_id) api_span.set_attribute("endpoint", "/api/v1/login") api_span.set_attribute("http.method", "POST") time.sleep(0.15) # 3. API Gateway calls Authentication Service with auth_tracer.start_as_current_span("authenticate-user") as auth_span: auth_span.set_attribute("component", "auth-service") auth_span.set_attribute("transaction.id", transaction_id) auth_span.set_attribute("auth.method", "password") time.sleep(0.2) # 4. Auth service calls User Service to retrieve user details with user_tracer.start_as_current_span("get-user-details") as user_span: user_span.set_attribute("component", "user-service") user_span.set_attribute("transaction.id", transaction_id) user_span.set_attribute("user.id", f"user_{random.randint(1000, 9999)}") # 5. User service calls DB Service with db_tracer.start_as_current_span("db-query") as db_span: db_span.set_attribute("component", "db-service") db_span.set_attribute("transaction.id", transaction_id) db_span.set_attribute("db.operation", "SELECT") db_span.set_attribute("db.table", "users") # Randomly introduce database latency if random.random() < 0.3: delay = random.uniform(0.5, 1.5) db_span.set_attribute("db.latency", delay) db_span.set_attribute("latency.category", "slow-query") time.sleep(delay) else: time.sleep(0.1) # 6. 
After successful login, send notification with notif_tracer.start_as_current_span("send-login-notification") as notif_span: notif_span.set_attribute("component", "notification-service") notif_span.set_attribute("transaction.id", transaction_id) notif_span.set_attribute("notification.type", "login_alert") notif_span.set_attribute("notification.channel", random.choice(["email", "sms", "push"])) time.sleep(0.15) logger.info(f"Generated multi-service trace with transaction ID: {transaction_id}") return transaction_id except Exception as e: logger.error(f"Error generating multi-service trace: {e}") return None def generate_trace_batch(): """Generates a batch of different trace types""" trace_generators = [ generate_simple_trace, generate_nested_trace, generate_error_trace, generate_high_latency_trace, generate_delayed_chain_trace, generate_multi_service_trace_bg # Add the new trace type ] # Randomly select which traces to generate with weighted probabilities weights = [0.20, 0.20, 0.15, 0.1, 0.15, 0.2] # Add weight for multi-service trace for _ in range(random.randint(3, 8)): # Generate 3-8 traces per batch selected_generator = random.choices(trace_generators, weights=weights, k=1)[0] selected_generator() time.sleep(random.uniform(0.1, 0.5)) # Small delay between traces def trace_generator_thread(): """Background thread that generates traces at regular intervals""" while True: try: generate_trace_batch() # Wait between 5-15 seconds before generating the next batch delay = random.uniform(5, 15) logger.info(f"Next trace batch in {delay:.2f} seconds") time.sleep(delay) except Exception as e: logger.error(f"Error in trace generation: {e}") time.sleep(5) # Wait before retrying # API endpoints @app.route('/') def home(): return """

OpenTelemetry Tail Sampling Demo

This app demonstrates OpenTelemetry tracing with Tail Sampling using Grafana Alloy.

The app automatically generates various types of traces in the background.

You can also trigger trace generation manually using these endpoints:

""" @app.route('/simple') def simple_trace(): generate_simple_trace() return {"status": "ok", "message": "Simple trace generated"} @app.route('/nested') def nested_trace(): generate_nested_trace() return {"status": "ok", "message": "Nested trace generated"} @app.route('/error') def error_trace(): generate_error_trace() return {"status": "ok", "message": "Error trace generated"} @app.route('/high-latency') def high_latency_trace(): generate_high_latency_trace() return {"status": "ok", "message": "High latency trace generated"} @app.route('/batch') def batch_trace(): generate_trace_batch() return {"status": "ok", "message": "Trace batch generated"} @app.route('/multi-service') def multi_service_trace(): transaction_id = generate_multi_service_trace_bg() return { "status": "ok", "message": "Multi-service trace generated", "transaction_id": transaction_id, "services": ["web-ui", "api-gateway", "auth-service", "user-service", "notification-service", "db-service"] } @app.route('/chain') def chain_trace(): with tracer.start_as_current_span("chain-root") as span: span.set_attribute("operation.step", "start") # Simulate a chain of service calls try: # Call ourselves to simulate microservice calls # In a real world example these would be different services service_b_url = f"http://localhost:8080/service/b?id={random.randint(1000, 9999)}" response = requests.get(service_b_url) return {"status": "ok", "message": "Chain trace generated", "data": response.json()} except Exception as e: span.record_exception(e) span.set_status(trace.StatusCode.ERROR, str(e)) return {"status": "error", "message": "Failed to complete chain"} @app.route('/service/b') def service_b(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span(f"service-b-handler") as span: span.set_attribute("service", "B") span.set_attribute("request.id", req_id) span.set_attribute("http.url", "/service/b") # For URL-based sampling time.sleep(0.1) # Simulate work # Call service C service_c_url = 
f"http://localhost:8080/service/c?id={req_id}" response = requests.get(service_c_url) return {"status": "ok", "message": "Service B completed", "data": response.json()} @app.route('/service/c') def service_c(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span(f"service-c-handler") as span: span.set_attribute("service", "C") span.set_attribute("request.id", req_id) span.set_attribute("http.url", "/service/c") # For URL-based sampling time.sleep(0.15) # Simulate work # Randomly fail sometimes to show error traces if random.random() < 0.2: # 20% chance of failure span.set_status(trace.StatusCode.ERROR, "Random failure") return {"status": "error", "message": "Service C failed randomly"} return {"status": "ok", "message": "Service C completed successfully"} # Add the delayed chain implementation @app.route('/delayed-chain') def delayed_chain_trace_endpoint(): with tracer.start_as_current_span("delayed-chain-root") as span: span.set_attribute("operation.step", "start") span.set_attribute("operation.type", "delayed-chain") try: # Start the chain with Service A service_a_url = f"http://localhost:8080/delayed/service-a?id={random.randint(1000, 9999)}" response = requests.get(service_a_url) return { "status": "ok", "message": "Delayed chain trace generated", "data": response.json() } except Exception as e: span.record_exception(e) span.set_status(trace.StatusCode.ERROR, str(e)) return {"status": "error", "message": "Failed to complete delayed chain"} @app.route('/delayed/service-a') def delayed_service_a(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-a-handler") as span: span.set_attribute("service", "A") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") span.set_attribute("http.url", "/delayed/service-a") time.sleep(0.1) # Normal latency # Call service B service_b_url = f"http://localhost:8080/delayed/service-b?id={req_id}" response = requests.get(service_b_url) 
return {"status": "ok", "message": "Service A completed", "data": response.json()} @app.route('/delayed/service-b') def delayed_service_b(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-b-handler") as span: span.set_attribute("service", "B") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") span.set_attribute("http.url", "/delayed/service-b") time.sleep(0.15) # Normal latency # Call service C service_c_url = f"http://localhost:8080/delayed/service-c?id={req_id}" response = requests.get(service_c_url) return {"status": "ok", "message": "Service B completed", "data": response.json()} @app.route('/delayed/service-c') def delayed_service_c(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-c-handler") as span: span.set_attribute("service", "C") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") span.set_attribute("http.url", "/delayed/service-c") time.sleep(0.2) # Normal latency # Call the slow service D service_d_url = f"http://localhost:8080/delayed/service-d?id={req_id}" response = requests.get(service_d_url) return {"status": "ok", "message": "Service C completed", "data": response.json()} @app.route('/delayed/service-d') def delayed_service_d(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-d-handler") as span: span.set_attribute("service", "D") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "high") span.set_attribute("latency.category", "bottleneck") span.set_attribute("http.url", "/delayed/service-d") # This service consistently has high latency (3-4 seconds) delay = random.uniform(3.0, 4.0) span.set_attribute("latency.seconds", delay) time.sleep(delay) # High latency # Call final service E service_e_url = f"http://localhost:8080/delayed/service-e?id={req_id}" response = requests.get(service_e_url) return {"status": "ok", "message": 
"Service D completed (with delay)", "data": response.json()} @app.route('/delayed/service-e') def delayed_service_e(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-e-handler") as span: span.set_attribute("service", "E") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") span.set_attribute("http.url", "/delayed/service-e") time.sleep(0.1) # Normal latency return {"status": "ok", "message": "Service E completed (chain end)"} if __name__ == '__main__': # Start the background trace generator thread trace_thread = threading.Thread(target=trace_generator_thread, daemon=True) trace_thread.start() logger.info("Starting the application with background trace generation") app.run(host='0.0.0.0', port=8080) ================================================ FILE: otel-tail-sampling/app/requirements.txt ================================================ flask requests opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests ================================================ FILE: otel-tail-sampling/config-otel.yaml ================================================ # # OTel Collector YAML Configuration for Tail Sampling # # This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine. 
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: # Tail Sampling: wait for complete traces before making sampling decisions tail_sampling: decision_wait: 10s num_traces: 100 expected_new_traces_per_sec: 10 policies: # Policy 1: Always sample traces with a specific attribute value - name: test-attribute-policy type: string_attribute string_attribute: key: test_attr_key_1 values: [test_attr_val_1] # Policy 2: Sample error traces - name: error-policy type: status_code status_code: status_codes: [ERROR] # Policy 3: Sample high latency traces (> 5s) - name: latency-policy type: latency latency: threshold_ms: 5000 # Policy 4: Sample traces matching a numeric attribute range - name: numeric-policy type: numeric_attribute numeric_attribute: key: key1 min_value: 70 max_value: 100 # Policy 5: URL-based policy to filter out health checks - name: url-filter-policy type: string_attribute string_attribute: key: http.url values: ["/health", "/metrics"] invert_match: true # Policy 6: Probabilistic sampling as a fallback (sample 10% of remaining traces) - name: probabilistic-policy type: probabilistic probabilistic: sampling_percentage: 10 batch: {} exporters: otlp/tempo: endpoint: tempo:4317 tls: insecure: true service: pipelines: traces: receivers: [otlp] processors: [tail_sampling, batch] exporters: [otlp/tempo] ================================================ FILE: otel-tail-sampling/config.alloy ================================================ /* * Alloy Configuration for OpenTelemetry Trace Collection with Tail Sampling */ // Receive OpenTelemetry traces otelcol.receiver.otlp "default" { http {} grpc {} output { traces = [otelcol.processor.tail_sampling.default.input] } } // Tail Sampling processor otelcol.processor.tail_sampling "default" { // Wait time to make a sampling decision decision_wait = "10s" // Number of traces kept in memory 
num_traces = 100 // Expected new traces per second expected_new_traces_per_sec = 10 // Policy 1: Always sample traces with a specific attribute value policy { name = "test-attribute-policy" type = "string_attribute" string_attribute { key = "test_attr_key_1" values = ["test_attr_val_1"] } } // Policy 2: Sample error traces policy { name = "error-policy" type = "status_code" status_code { status_codes = ["ERROR"] } } // Policy 3: Sample high latency traces policy { name = "latency-policy" type = "latency" latency { threshold_ms = 5000 // 5 seconds } } // Policy 4: Sample traces matching a numeric attribute range policy { name = "numeric-policy" type = "numeric_attribute" numeric_attribute { key = "key1" min_value = 70 max_value = 100 } } // Policy 5: URL-based policy to filter out health checks policy { name = "url-filter-policy" type = "string_attribute" string_attribute { key = "http.url" values = ["/health", "/metrics"] invert_match = true // Sample everything EXCEPT these URLs } } // Policy 6: Probabilistic sampling as a fallback (sample 10% of remaining traces) policy { name = "probabilistic-policy" type = "probabilistic" probabilistic { sampling_percentage = 10 } } output { traces = [otelcol.processor.batch.default.input] } } // Batch processor to improve performance otelcol.processor.batch "default" { output { traces = [otelcol.exporter.otlp.tempo.input] } } // Send sampled traces to Tempo otelcol.exporter.otlp "tempo" { client { endpoint = "tempo:4317" tls { insecure = true } } } livedebugging { enabled = true } ================================================ FILE: otel-tail-sampling/docker-compose-otel.yml ================================================ # OTel Engine Override # # Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config # instead of the River/HCL config.alloy file. 
# # Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # Stop: docker compose -f docker-compose.yml -f docker-compose-otel.yml down # services: alloy: command: otel --config=/etc/alloy/config-otel.yaml volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel Engine HTTP server ================================================ FILE: otel-tail-sampling/docker-compose.coda.yml ================================================ services: demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=demo-service - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo ================================================ FILE: otel-tail-sampling/docker-compose.yml ================================================ version: '3.8' services: # Prometheus for metrics collection prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --web.enable-otlp-receiver - --enable-feature=native-histograms - --enable-feature=exemplar-storage - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml # Tempo for tracing tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200/tcp # tempo volumes: - ./tempo-config.yaml:/etc/tempo.yaml depends_on: - prometheus # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app ports: - 3000:3000/tcp entrypoint: - 
sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: false version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: true version: 1 editable: false jsonData: serviceMap: datasourceUid: 'Prometheus' nodeGraph: enabled: true EOF /run.sh depends_on: - prometheus - tempo # Alloy for telemetry pipeline and tail sampling alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 # Alloy HTTP server - 4317:4317/tcp # OTLP gRPC - 4318:4318/tcp # OTLP HTTP volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy # Demo app that generates OpenTelemetry traces demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=demo-service - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo ================================================ FILE: otel-tail-sampling/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s ================================================ FILE: otel-tail-sampling/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info cache: background: writeback_goroutines: 5 caches: - roles: - frontend-search memcached: addresses: dns+memcached:11211 query_frontend: search: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 metadata_slo: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 trace_by_id: duration_slo: 100ms metrics:
max_duration: 200h # maximum duration of a metrics query, increase for local setups query_backend_after: 5m duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 distributor: receivers: # this configuration will listen on all ports and protocols that tempo is capable of. jaeger: # the receivers all come from the OpenTelemetry collector. more configuration information can protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver thrift_http: # endpoint: "tempo:14268" # for a production deployment you should only enable the receivers you need! grpc: endpoint: "tempo:14250" thrift_binary: endpoint: "tempo:6832" thrift_compact: endpoint: "tempo:6831" zipkin: endpoint: "tempo:9411" otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" opencensus: endpoint: "tempo:55678" ingester: max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally compactor: compaction: block_retention: 720h # overall Tempo trace retention.
set for demo purposes metrics_generator: registry: external_labels: source: tempo cluster: docker-compose storage: path: /var/tempo/generator/wal remote_write: - url: http://prometheus:9090/api/v1/write send_exemplars: true traces_storage: path: /var/tempo/generator/traces processor: local_blocks: filter_server_spans: false flush_to_storage: true storage: trace: backend: local # backend configuration to use wal: path: /var/tempo/wal # where to store the wal locally local: path: /var/tempo/blocks overrides: defaults: metrics_generator: processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator generate_native_histograms: both ================================================ FILE: otel-tracing-service-graphs/README.md ================================================ # Alloy Service Graphs with OpenTelemetry This example demonstrates how to use Grafana Alloy to generate service graphs from OpenTelemetry traces and send them to Prometheus via OTLP HTTP, instead of relying on Tempo's built-in metrics generator. ## Overview The example includes: - A sample Python Flask application that generates various types of traces - Grafana Alloy as the telemetry pipeline with service graph generation - Tempo for trace storage and querying (without metrics generation) - Prometheus with OTLP receiver enabled for metrics collection - Memcached for Tempo caching - Grafana for visualization ## Running the Demo 1. Clone the repository: ``` git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios ``` 2. Navigate to this example directory: ``` cd otel-tracing-service-graphs ``` 3. Run using Docker Compose: ``` docker compose up -d ``` Or use the centralized image management: ``` cd .. ./run-example.sh otel-tracing-service-graphs ``` 4. Access the demo application at http://localhost:8080 5. Access Grafana at http://localhost:3000 6. 
Access Prometheus at http://localhost:9090 ## What to Expect The demo application provides several endpoints that generate different types of traces: - **/simple**: Generates a simple trace with a single span - **/nested**: Generates a trace with nested spans (parent-child relationships) - **/error**: Generates a trace that includes an error - **/chain**: Simulates a chain of service calls to demonstrate distributed tracing After accessing these endpoints, you can view the traces and service graphs in Grafana. ## Alloy Service Graph Generation This example demonstrates using Alloy's `otelcol.connector.servicegraph` component to generate service graphs from traces, which offers several advantages over using Tempo's built-in metrics generator: 1. **More Flexibility**: Alloy's service graph connector allows for customization of dimensions and collection intervals 2. **Pipeline Integration**: The service graph metrics can be part of a larger telemetry pipeline with additional processing 3. **Reduced Load on Tempo**: By offloading the service graph generation to Alloy, Tempo can focus on trace storage and querying The key component in the Alloy configuration is: ``` otelcol.connector.servicegraph "default" { metrics_flush_interval = "10s" dimensions = ["http.method"] output { metrics = [otelcol.exporter.otlphttp.prometheus.input] } } ``` ## Prometheus OTLP Integration This example uses Prometheus's OTLP HTTP receiver endpoint. This approach has several benefits: 1. **Native OTLP Integration**: Uses the OpenTelemetry Protocol directly between Alloy and Prometheus 2. **Simplified Configuration**: Uses Prometheus's built-in OTLP receiver without needing special ports 3. 
**Better Metadata Handling**: Resource attributes from OTLP are properly promoted to Prometheus labels The OTLP HTTP exporter configuration in Alloy is: ``` otelcol.exporter.otlphttp "prometheus" { client { endpoint = "http://prometheus:9090/api/v1/otlp" tls { insecure = true } } } ``` And in Prometheus, we've enabled the OTLP receiver and configured resource attributes to be promoted to labels: ``` otlp: promote_resource_attributes: - service.instance.id - service.name - service.namespace - service.version - deployment.environment # ...and more relevant attributes ``` ## Viewing Service Graphs To view the service graph: 1. Open Grafana (http://localhost:3000) 2. Navigate to Explore 3. Select the Tempo data source 4. Click on the "Service Graph" tab 5. You should see a visual representation of the relationships between services The service graph metrics are stored in Prometheus with the following metrics: - `traces_service_graph_request_total`: Total number of calls between services - `traces_service_graph_request_failed_total`: Total number of failed calls between services - `traces_service_graph_request_server` / `traces_service_graph_request_client`: Histograms of request latency as seen by the server and the client (Prometheus exposes them with unit and histogram suffixes such as `_seconds_bucket`) The metrics are segmented by service name and HTTP method (the connector's configured `dimensions`), allowing you to break down the calls between services by request method. ## Architecture ``` ┌────────────┐ ┌──────────────────────┐ ┌───────┐ ┌─────────┐ │ Demo App │────▶│ Alloy │─────▶│ Tempo │─────▶│ Grafana │ │ (OTel SDK) │ │ ┌──────────────────┐ │ │ │ │ │ └────────────┘ │ │Service Graph Gen.│ │ └───────┘ └─────────┘ │ └────────┬─────────┘ │ ▲ └──────────┼───────────┘ │ │ │ ▼ │ ┌─────────┐ │ │Prometheus│──────────────────────────────────┘ │ (OTLP) │ └─────────┘ ``` In this architecture: 1. The Demo App generates traces using the OpenTelemetry SDK and sends them to Alloy 2. Alloy processes the traces and: - Generates service graph metrics using the servicegraph connector - Forwards the raw traces to Tempo 3. Service graph metrics are sent to Prometheus via OTLP HTTP 4. 
Grafana queries both Tempo for traces and Prometheus for service graph metrics ## Customizing The Alloy configuration can be further customized to add: - Additional processors for trace data - Filtering based on service names or other attributes - Custom dimensions for the service graph metrics (currently using service name and HTTP method) - Additional metrics exporters for different backend systems ================================================ FILE: otel-tracing-service-graphs/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY app.py . CMD ["python", "app.py"] ================================================ FILE: otel-tracing-service-graphs/app/app.py ================================================ import os import random import time import uuid from flask import Flask, request from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.resources import SERVICE_NAME, Resource from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.instrumentation.requests import RequestsInstrumentor import requests # Configure the tracer resource = Resource.create(attributes={ SERVICE_NAME: "trace-demo" }) trace.set_tracer_provider(TracerProvider(resource=resource)) # Configure the OTLP gRPC exporter. The endpoint is passed explicitly below, so it # takes precedence over any OTEL_EXPORTER_OTLP_ENDPOINT value set in the environment otlp_exporter = OTLPSpanExporter(endpoint="http://alloy:4317/v1/traces", insecure=True) span_processor = BatchSpanProcessor(span_exporter=otlp_exporter, max_export_batch_size=1) trace.get_tracer_provider().add_span_processor(span_processor) # Create a tracer tracer = 
trace.get_tracer(__name__) # Create a Flask application app = Flask(__name__) # Instrument Flask FlaskInstrumentor().instrument_app(app) # Instrument requests RequestsInstrumentor().instrument() @app.route('/') def home(): return """

OpenTelemetry Service Graph Demo

This app demonstrates OpenTelemetry tracing with Grafana Alloy and service graph generation.

""" @app.route('/simple') def simple_trace(): with tracer.start_as_current_span("simple-operation") as span: span.set_attribute("operation.type", "simple") span.set_attribute("operation.value", random.randint(1, 100)) time.sleep(0.1) # Simulate work return {"status": "ok", "message": "Simple trace generated"} @app.route('/nested') def nested_trace(): with tracer.start_as_current_span("parent-operation") as parent: parent.set_attribute("operation.type", "parent") time.sleep(0.05) # Simulate work with tracer.start_as_current_span("child-operation-1") as child1: child1.set_attribute("operation.type", "child") child1.set_attribute("child.number", 1) time.sleep(0.05) # Simulate work with tracer.start_as_current_span("child-operation-2") as child2: child2.set_attribute("operation.type", "child") child2.set_attribute("child.number", 2) time.sleep(0.05) # Simulate work with tracer.start_as_current_span("grandchild-operation") as grandchild: grandchild.set_attribute("operation.type", "grandchild") time.sleep(0.05) # Simulate work return {"status": "ok", "message": "Nested trace generated"} @app.route('/error') def error_trace(): with tracer.start_as_current_span("error-operation") as span: span.set_attribute("operation.type", "error") try: # Simulate an error result = 1 / 0 except Exception as e: span.record_exception(e) span.set_status(trace.StatusCode.ERROR, str(e)) return {"status": "error", "message": "Error trace generated"} @app.route('/chain') def chain_trace(): with tracer.start_as_current_span("chain-root") as span: span.set_attribute("operation.step", "start") # Simulate a chain of service calls try: # Call ourselves to simulate microservice calls # In a real world example these would be different services service_b_url = f"http://localhost:8080/service/b?id={random.randint(1000, 9999)}" response = requests.get(service_b_url) return {"status": "ok", "message": "Chain trace generated", "data": response.json()} except Exception as e: span.record_exception(e) 
span.set_status(trace.StatusCode.ERROR, str(e)) return {"status": "error", "message": "Failed to complete chain"} @app.route('/service/b') def service_b(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span(f"service-b-handler") as span: span.set_attribute("service", "B") span.set_attribute("request.id", req_id) time.sleep(0.1) # Simulate work # Call service C service_c_url = f"http://localhost:8080/service/c?id={req_id}" response = requests.get(service_c_url) return {"status": "ok", "message": "Service B completed", "data": response.json()} @app.route('/service/c') def service_c(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span(f"service-c-handler") as span: span.set_attribute("service", "C") span.set_attribute("request.id", req_id) time.sleep(0.15) # Simulate work # Randomly fail sometimes to show error traces if random.random() < 0.2: # 20% chance of failure span.set_status(trace.StatusCode.ERROR, "Random failure") return {"status": "error", "message": "Service C failed randomly"} return {"status": "ok", "message": "Service C completed successfully"} # New delayed chain implementation @app.route('/delayed-chain') def delayed_chain_trace(): with tracer.start_as_current_span("delayed-chain-root") as span: span.set_attribute("operation.step", "start") span.set_attribute("operation.type", "delayed-chain") try: # Start the chain with Service A service_a_url = f"http://localhost:8080/delayed/service-a?id={random.randint(1000, 9999)}" response = requests.get(service_a_url) return { "status": "ok", "message": "Delayed chain trace generated", "data": response.json() } except Exception as e: span.record_exception(e) span.set_status(trace.StatusCode.ERROR, str(e)) return {"status": "error", "message": "Failed to complete delayed chain"} @app.route('/delayed/service-a') def delayed_service_a(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-a-handler") as span: 
span.set_attribute("service", "A") span.set_attribute("client.service.name", "frontend") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") span.set_attribute("http.method", "GET") time.sleep(0.1) # Normal latency # Call service B service_b_url = f"http://localhost:8080/delayed/service-b?id={req_id}" response = requests.get(service_b_url) return {"status": "ok", "message": "Service A completed", "data": response.json()} @app.route('/delayed/service-b') def delayed_service_b(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-b-handler") as span: span.set_attribute("service", "B") span.set_attribute("client.service.name", "service-a") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") span.set_attribute("http.method", "GET") time.sleep(0.15) # Normal latency # Call service C service_c_url = f"http://localhost:8080/delayed/service-c?id={req_id}" response = requests.get(service_c_url) return {"status": "ok", "message": "Service B completed", "data": response.json()} @app.route('/delayed/service-c') def delayed_service_c(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-c-handler") as span: span.set_attribute("service", "C") span.set_attribute("client.service.name", "service-b") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") span.set_attribute("http.method", "GET") time.sleep(0.2) # Normal latency # Call the slow service D service_d_url = f"http://localhost:8080/delayed/service-d?id={req_id}" response = requests.get(service_d_url) return {"status": "ok", "message": "Service C completed", "data": response.json()} @app.route('/delayed/service-d') def delayed_service_d(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-d-handler") as span: span.set_attribute("service", "D") span.set_attribute("client.service.name", "service-c") 
span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "high") span.set_attribute("latency.category", "bottleneck") span.set_attribute("http.method", "GET") # This service consistently has high latency (3-4 seconds) delay = random.uniform(3.0, 4.0) span.set_attribute("latency.seconds", delay) time.sleep(delay) # High latency # Call final service E service_e_url = f"http://localhost:8080/delayed/service-e?id={req_id}" response = requests.get(service_e_url) return {"status": "ok", "message": "Service D completed (with delay)", "data": response.json()} @app.route('/delayed/service-e') def delayed_service_e(): req_id = request.args.get('id', 'unknown') with tracer.start_as_current_span("service-e-handler") as span: span.set_attribute("service", "E") span.set_attribute("client.service.name", "service-d") span.set_attribute("request.id", req_id) span.set_attribute("service.latency", "normal") span.set_attribute("http.method", "GET") time.sleep(0.1) # Normal latency return {"status": "ok", "message": "Service E completed (chain end)"} @app.route('/multi-service') def multi_service_trace(): transaction_id = generate_multi_service_trace() return { "status": "ok", "message": "Multi-service trace generated", "transaction_id": transaction_id, "services": ["web-ui", "api-gateway", "auth-service", "user-service", "notification-service", "db-service"] } # code fixed thanks to @hedss def generate_multi_service_trace(): """Generate a trace that spans multiple services with true service.name differentiation""" try: # Create a unique transaction ID for correlating spans transaction_id = str(uuid.uuid4())[:8] # Create a custom resource for each service web_ui_resource = Resource.create(attributes={SERVICE_NAME: "web-ui"}) api_gw_resource = Resource.create(attributes={SERVICE_NAME: "api-gateway"}) auth_resource = Resource.create(attributes={SERVICE_NAME: "auth-service"}) user_resource = Resource.create(attributes={SERVICE_NAME: "user-service"}) notif_resource = 
Resource.create(attributes={SERVICE_NAME: "notification-service"}) db_resource = Resource.create(attributes={SERVICE_NAME: "db-service"}) # Create trace providers with each resource web_ui_provider = TracerProvider(resource=web_ui_resource) api_gw_provider = TracerProvider(resource=api_gw_resource) auth_provider = TracerProvider(resource=auth_resource) user_provider = TracerProvider(resource=user_resource) notif_provider = TracerProvider(resource=notif_resource) db_provider = TracerProvider(resource=db_resource) # Connect the providers to the same OTLP exporter via span processors web_ui_provider.add_span_processor(span_processor) api_gw_provider.add_span_processor(span_processor) auth_provider.add_span_processor(span_processor) user_provider.add_span_processor(span_processor) notif_provider.add_span_processor(span_processor) db_provider.add_span_processor(span_processor) # Create tracers for each service using their respective providers web_ui_tracer = web_ui_provider.get_tracer("web-ui-tracer") api_gw_tracer = api_gw_provider.get_tracer("api-gw-tracer") auth_tracer = auth_provider.get_tracer("auth-tracer") user_tracer = user_provider.get_tracer("user-tracer") notif_tracer = notif_provider.get_tracer("notif-tracer") db_tracer = db_provider.get_tracer("db-tracer") # 1. Frontend service (web-ui) - User logs in with web_ui_tracer.start_as_current_span("login-page-render", kind=trace.SpanKind.SERVER) as web_span: web_span.set_attribute("component", "web-ui") web_span.set_attribute("transaction.id", transaction_id) web_span.set_attribute("user.action", "login") web_span.set_attribute("http.method", "GET") web_span.set_attribute("http.url", "/login") time.sleep(0.1) # 2. 
Send login request to API Gateway with web_ui_tracer.start_as_current_span("api-gateway-request", kind=trace.SpanKind.CLIENT) as web_client_span: web_client_span.set_attribute("component", "web-ui") web_client_span.set_attribute("transaction.id", transaction_id) web_client_span.set_attribute("http.method", "POST") web_client_span.set_attribute("http.url", "/api/v1/login") # API Gateway receives the request with api_gw_tracer.start_as_current_span("api-gateway-login-handler", kind=trace.SpanKind.SERVER) as api_span: api_span.set_attribute("component", "api-gateway") api_span.set_attribute("transaction.id", transaction_id) api_span.set_attribute("endpoint", "/api/v1/login") api_span.set_attribute("http.method", "POST") time.sleep(0.15) # 3. API Gateway calls Authentication Service with api_gw_tracer.start_as_current_span("auth-service-request", kind=trace.SpanKind.CLIENT) as api_client_span: api_client_span.set_attribute("component", "api-gateway") api_client_span.set_attribute("transaction.id", transaction_id) api_client_span.set_attribute("http.method", "POST") api_client_span.set_attribute("http.url", "/auth/authenticate") # Auth service receives the request with auth_tracer.start_as_current_span("authenticate-user", kind=trace.SpanKind.SERVER) as auth_span: auth_span.set_attribute("component", "auth-service") auth_span.set_attribute("transaction.id", transaction_id) auth_span.set_attribute("auth.method", "password") time.sleep(0.2) # 4. 
Auth service calls User Service with auth_tracer.start_as_current_span("user-service-request", kind=trace.SpanKind.CLIENT) as auth_client_span: auth_client_span.set_attribute("component", "auth-service") auth_client_span.set_attribute("transaction.id", transaction_id) auth_client_span.set_attribute("http.method", "GET") auth_client_span.set_attribute("http.url", "/user/details") # User service receives the request with user_tracer.start_as_current_span("get-user-details", kind=trace.SpanKind.SERVER) as user_span: user_span.set_attribute("component", "user-service") user_span.set_attribute("transaction.id", transaction_id) user_span.set_attribute("user.id", f"user_{random.randint(1000, 9999)}") # 5. User service calls DB Service with user_tracer.start_as_current_span("db-service-request", kind=trace.SpanKind.CLIENT) as user_client_span: user_client_span.set_attribute("component", "user-service") user_client_span.set_attribute("transaction.id", transaction_id) user_client_span.set_attribute("db.operation", "SELECT") user_client_span.set_attribute("db.table", "users") # DB service receives the request with db_tracer.start_as_current_span("db-query", kind=trace.SpanKind.SERVER) as db_span: db_span.set_attribute("component", "db-service") db_span.set_attribute("transaction.id", transaction_id) db_span.set_attribute("db.operation", "SELECT") db_span.set_attribute("db.table", "users") # Randomly introduce database latency if random.random() < 0.3: delay = random.uniform(0.5, 1.5) db_span.set_attribute("db.latency", delay) db_span.set_attribute("latency.category", "slow-query") time.sleep(delay) else: time.sleep(0.1) # 6. 
After successful login, send notification with api_gw_tracer.start_as_current_span("notification-service-request", kind=trace.SpanKind.CLIENT) as notif_client_span: notif_client_span.set_attribute("component", "api-gateway") notif_client_span.set_attribute("transaction.id", transaction_id) notif_client_span.set_attribute("http.method", "POST") notif_client_span.set_attribute("http.url", "/notifications/send") # Notification service receives the request with notif_tracer.start_as_current_span("send-login-notification", kind=trace.SpanKind.SERVER) as notif_span: notif_span.set_attribute("component", "notification-service") notif_span.set_attribute("transaction.id", transaction_id) notif_span.set_attribute("notification.type", "login_alert") notif_span.set_attribute("notification.channel", random.choice(["email", "sms", "push"])) time.sleep(0.15) return transaction_id except Exception as e: print(f"Error generating multi-service trace: {e}") return None if __name__ == '__main__': app.run(host='0.0.0.0', port=8080) ================================================ FILE: otel-tracing-service-graphs/app/requirements.txt ================================================ flask requests opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests ================================================ FILE: otel-tracing-service-graphs/config-otel.yaml ================================================ # # OTel Collector YAML Configuration for Service Graph Generation # # This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine. 
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: batch: {} connectors: servicegraph: metrics_flush_interval: 10s dimensions: - service.name - http.method store: max_items: 5000 ttl: 30s exporters: otlphttp/prometheus: endpoint: http://prometheus:9090/api/v1/otlp tls: insecure: true otlp/tempo: endpoint: tempo:4317 tls: insecure: true service: pipelines: traces: receivers: [otlp] processors: [batch] exporters: [servicegraph, otlp/tempo] metrics: receivers: [servicegraph] exporters: [otlphttp/prometheus] ================================================ FILE: otel-tracing-service-graphs/config.alloy ================================================ /* * Alloy Configuration for OpenTelemetry Trace Collection with Service Graph Generation */ // Receive OpenTelemetry traces otelcol.receiver.otlp "default" { http {} grpc {} output { traces = [otelcol.processor.batch.default.input] } } // Batch processor to improve performance otelcol.processor.batch "default" { output { traces = [ otelcol.connector.servicegraph.default.input, otelcol.exporter.otlp.tempo.input, ] } } // Service Graph Generator otelcol.connector.servicegraph "default" { metrics_flush_interval = "10s" dimensions = ["service.name", "http.method"] // Configure the span store for better pairing store { max_items = 5000 ttl = "30s" } output { metrics = [otelcol.exporter.otlphttp.prometheus.input] } } // Send service graph metrics to Prometheus via OTLP otelcol.exporter.otlphttp "prometheus" { client { endpoint = "http://prometheus:9090/api/v1/otlp" tls { insecure = true } } } // Send traces to Tempo for storage and visualization otelcol.exporter.otlp "tempo" { client { endpoint = "tempo:4317" tls { insecure = true } } } livedebugging { enabled = true } ================================================ FILE: otel-tracing-service-graphs/docker-compose-otel.yml 
================================================ # OTel Engine Override # # Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config # instead of the River/HCL config.alloy file. # # Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # Stop: docker compose -f docker-compose.yml -f docker-compose-otel.yml down # services: alloy: command: otel --config=/etc/alloy/config-otel.yaml volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel Engine HTTP server - 4317:4317 # OTLP gRPC - 4318:4318 # OTLP HTTP # Override demo-app endpoint to use standard OTLP gRPC port demo-app: environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=demo-service - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo ================================================ FILE: otel-tracing-service-graphs/docker-compose.coda.yml ================================================ services: demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} network_mode: host restart: unless-stopped environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:12345 - OTEL_SERVICE_NAME=demo-service - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo ================================================ FILE: otel-tracing-service-graphs/docker-compose.yml ================================================ version: '3.8' services: # Prometheus for metrics collection prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --web.enable-otlp-receiver - --enable-feature=native-histograms - --enable-feature=exemplar-storage - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml # Tempo for tracing without metrics generation tempo: image: 
grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200/tcp # tempo volumes: - ./tempo-config.yaml:/etc/tempo.yaml depends_on: - prometheus memcached: image: memcached:1.6.40@sha256:572b011ce33954ee809066d8cecbeb3ec98912109ee3be3663a3197425fd81ac container_name: memcached ports: - "11211:11211" environment: - MEMCACHED_MAX_MEMORY=64m # Set the maximum memory usage - MEMCACHED_THREADS=4 # Number of threads to use # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: false version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: true version: 1 editable: false jsonData: serviceMap: datasourceUid: 'Prometheus' nodeGraph: enabled: true EOF /run.sh depends_on: - prometheus - tempo # Alloy for telemetry pipeline and service graph generation alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 # Alloy HTTP server volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy # Demo app that generates OpenTelemetry traces demo-app: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:12345 - OTEL_SERVICE_NAME=demo-service - 
OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo ================================================ FILE: otel-tracing-service-graphs/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s otlp: # Recommended attributes to be promoted to labels. promote_resource_attributes: - service.instance.id - service.name - service.namespace - service.version - cloud.availability_zone - cloud.region - container.name - deployment.environment - deployment.environment.name - k8s.cluster.name - k8s.container.name - k8s.cronjob.name - k8s.daemonset.name - k8s.deployment.name - k8s.job.name - k8s.namespace.name - k8s.pod.name - k8s.replicaset.name - k8s.statefulset.name storage: tsdb: out_of_order_time_window: 30m ================================================ FILE: otel-tracing-service-graphs/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info cache: background: writeback_goroutines: 5 caches: - roles: - frontend-search memcached: addresses: dns+memcached:11211 query_frontend: search: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 metadata_slo: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 trace_by_id: duration_slo: 100ms metrics: max_duration: 200h # maximum duration of a metrics query, increase for local setups query_backend_after: 5m duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 distributor: receivers: # this configuration will listen on all ports and protocols that tempo is capable of. jaeger: # the receives all come from the OpenTelemetry collector. more configuration information can protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver thrift_http: # endpoint: "tempo:14268" # for a production deployment you should only enable the receivers you need! 
grpc: endpoint: "tempo:14250" thrift_binary: endpoint: "tempo:6832" thrift_compact: endpoint: "tempo:6831" zipkin: endpoint: "tempo:9411" otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" opencensus: endpoint: "tempo:55678" ingester: max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally compactor: compaction: block_retention: 720h # overall Tempo trace retention. set for demo purposes metrics_generator: registry: external_labels: source: tempo cluster: docker-compose storage: path: /var/tempo/generator/wal remote_write: - url: http://prometheus:9090/api/v1/write send_exemplars: true traces_storage: path: /var/tempo/generator/traces processor: local_blocks: filter_server_spans: false flush_to_storage: true storage: trace: backend: local # backend configuration to use wal: path: /var/tempo/wal # where to store the wal locally local: path: /var/tempo/blocks overrides: defaults: metrics_generator: processors: [local-blocks] # enables metrics generator ================================================ FILE: postgres-monitoring/README.md ================================================ # PostgreSQL Monitoring with Grafana Alloy This scenario demonstrates how to monitor a PostgreSQL database using Grafana Alloy's built-in `prometheus.exporter.postgres` component. Alloy scrapes PostgreSQL server metrics and forwards them to Prometheus via remote write. Grafana is pre-configured with Prometheus as a datasource so you can explore the collected metrics immediately. 
## Prerequisites - Docker - Docker Compose - Git ## Getting Started ```bash git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios/postgres-monitoring docker compose up -d ``` To use the centralized image versions from the repo root: ```bash cd alloy-scenarios ./run-example.sh postgres-monitoring ``` ## Access Points | Service | URL | |------------|----------------------------| | Grafana | http://localhost:3000 | | Alloy UI | http://localhost:12345 | | Prometheus | http://localhost:9090 | Grafana is configured with anonymous admin access enabled, so no login is required. ## What to Expect Once the stack is running, Alloy connects to the PostgreSQL instance and begins collecting metrics via the `prometheus.exporter.postgres` component. These metrics are scraped every 15 seconds and forwarded to Prometheus. Metrics you can explore in Grafana include: - **pg_up** -- Whether the PostgreSQL instance is reachable - **pg_stat_database_*/** -- Database-level statistics (transactions committed, rolled back, rows fetched, inserted, updated, deleted, deadlocks, temp files, etc.) - **pg_stat_bgwriter_*/** -- Background writer statistics (buffers written, checkpoints, etc.) - **pg_settings_*/** -- PostgreSQL server configuration settings exposed as metrics - **pg_stat_activity_*/** -- Connection and session activity - **pg_locks_*/** -- Lock statistics by mode ### Exploring Metrics 1. Open **Grafana** at http://localhost:3000 2. Go to **Explore** and select the **Prometheus** datasource 3. Search for metrics starting with `pg_` to browse all available PostgreSQL metrics ### Debugging the Pipeline 1. Open the **Alloy UI** at http://localhost:12345 2. Navigate to the component graph to see the pipeline: `prometheus.exporter.postgres` -> `prometheus.scrape` -> `prometheus.remote_write` 3. 
Use the **Live Debugging** feature (enabled in the config) to inspect data flowing through each component ## Stopping the Scenario ```bash docker compose down ``` ================================================ FILE: postgres-monitoring/config.alloy ================================================ // ############################################## // #### PostgreSQL Metrics Configuration #### // ############################################## livedebugging { enabled = true } // Expose PostgreSQL metrics using the built-in postgres exporter. prometheus.exporter.postgres "example" { data_source_names = ["postgresql://alloy:alloy@postgres:5432/alloy?sslmode=disable"] } // Scrape the postgres exporter targets. prometheus.scrape "postgres" { targets = prometheus.exporter.postgres.example.targets forward_to = [prometheus.remote_write.default.receiver] scrape_interval = "15s" } // Send metrics to the local Prometheus instance via remote write. prometheus.remote_write "default" { endpoint { url = "http://prometheus:9090/api/v1/write" } } ================================================ FILE: postgres-monitoring/docker-compose.coda.yml ================================================ services: postgres: image: postgres:18@sha256:78481659c47e862334611ccdaf7c369c986b3046da9857112f3b309114a65fb4 environment: POSTGRES_USER: alloy POSTGRES_PASSWORD: alloy POSTGRES_DB: alloy ports: - "5432:5432" healthcheck: test: ["CMD-SHELL", "pg_isready -U alloy"] interval: 5s timeout: 5s retries: 5 ================================================ FILE: postgres-monitoring/docker-compose.yml ================================================ services: postgres: image: postgres:18@sha256:78481659c47e862334611ccdaf7c369c986b3046da9857112f3b309114a65fb4 environment: POSTGRES_USER: alloy POSTGRES_PASSWORD: alloy POSTGRES_DB: alloy ports: - "5432:5432" healthcheck: test: ["CMD-SHELL", "pg_isready -U alloy"] interval: 5s timeout: 5s retries: 5 prometheus: image: 
prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --config.file=/etc/prometheus/prometheus.yml volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml ports: - "9090:9090" alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: postgres: condition: service_healthy grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh ================================================ FILE: postgres-monitoring/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s ================================================ FILE: rabbitmq-monitoring/README.md ================================================ # RabbitMQ Monitoring with Grafana Alloy This scenario demonstrates RabbitMQ observability with a single Alloy pipeline: - **Metrics** - `prometheus.scrape` collects RabbitMQ's built-in `/metrics` endpoint from the `rabbitmq_prometheus` plugin and remote-writes the samples to Prometheus. - **Logs** - `loki.source.docker` tails the RabbitMQ container logs from the Docker socket and sends them to Loki. 
## Architecture - **RabbitMQ** - the monitored broker, running the management and Prometheus plugins - **loadgen** - a small RabbitMQ PerfTest publisher that creates the durable `alloy-sample` queue and publishes one persistent message per second - **Grafana Alloy** - scrapes broker metrics, collects broker container logs, and forwards both signals - **Loki / Prometheus / Grafana** - local backends and visualization, with datasources auto-provisioned ## Running ```bash # From this directory docker compose up -d # Or from the repo root using centralized image versions ./run-example.sh rabbitmq-monitoring ``` ## Accessing - **Grafana**: http://localhost:3000 (no login required) - **Alloy UI**: http://localhost:12345 - **Prometheus**: http://localhost:9090 - **Loki**: http://localhost:3100 - **RabbitMQ Management UI**: http://localhost:15672 (`guest` / `guest`) - **RabbitMQ Prometheus endpoint**: http://localhost:15692/metrics ## Trying It Out Within about 30 seconds, open Grafana Explore and run these queries. ### Metrics ```promql rabbitmq_up ``` ```promql rabbitmq_queue_messages{queue="alloy-sample"} ``` ```promql rabbitmq_channels ``` The scenario sets `prometheus.return_per_object_metrics = true` so queue-level labels are visible on `/metrics`. ### Logs ```logql {job="rabbitmq"} ``` ```logql {job="rabbitmq"} |~ "accepting AMQP connection|authenticated and granted access" ``` RabbitMQ logs connection lifecycle events by default. Channel counts are best checked with metrics: ```promql rabbitmq_channels ``` ## Key Configuration - `enabled_plugins` enables `rabbitmq_management` and `rabbitmq_prometheus`. - `rabbitmq.conf` sends debug-level console logs to Docker and returns per-object queue metrics from `/metrics`. - `config.alloy` keeps the metrics and logs pipelines separate and labels RabbitMQ logs as `job="rabbitmq"`. 
## Stopping ```bash docker compose down -v ``` ================================================ FILE: rabbitmq-monitoring/config.alloy ================================================ // RabbitMQ Monitoring with Grafana Alloy. // Metrics: scrape RabbitMQ's built-in Prometheus endpoint. // Logs: collect RabbitMQ container logs from Docker and ship them to Loki. livedebugging { enabled = true } prometheus.scrape "rabbitmq" { targets = [{ __address__ = "rabbitmq:15692", job = "rabbitmq", }] forward_to = [prometheus.remote_write.local.receiver] scrape_interval = "15s" } prometheus.remote_write "local" { endpoint { url = "http://prometheus:9090/api/v1/write" } } discovery.docker "linux" { host = "unix:///var/run/docker.sock" } discovery.relabel "rabbitmq_logs" { targets = discovery.docker.linux.targets rule { source_labels = ["__meta_docker_container_name"] regex = "/rabbitmq-monitoring-rabbitmq" action = "keep" } rule { target_label = "job" replacement = "rabbitmq" } rule { source_labels = ["__meta_docker_container_name"] regex = "/(.*)" target_label = "container_name" } } loki.source.docker "rabbitmq" { host = "unix:///var/run/docker.sock" targets = discovery.docker.linux.targets relabel_rules = discovery.relabel.rabbitmq_logs.rules forward_to = [loki.write.local.receiver] } loki.write "local" { endpoint { url = "http://loki:3100/loki/api/v1/push" } } ================================================ FILE: rabbitmq-monitoring/docker-compose.coda.yml ================================================ services: rabbitmq: image: rabbitmq:${RABBITMQ_VERSION:-4.3.0-management} container_name: rabbitmq-monitoring-rabbitmq hostname: rabbitmq ports: - "5672:5672" - "15672:15672" - "15692:15692" environment: - RABBITMQ_DEFAULT_USER=guest - RABBITMQ_DEFAULT_PASS=guest volumes: - ./enabled_plugins:/etc/rabbitmq/enabled_plugins:ro - ./rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro healthcheck: test: ["CMD", "rabbitmq-diagnostics", "-q", "ping"] interval: 10s timeout: 5s retries: 12 
loadgen: image: pivotalrabbitmq/perf-test:${RABBITMQ_PERF_TEST_VERSION:-2.24.0} container_name: rabbitmq-monitoring-loadgen command: - --uri - amqp://guest:guest@rabbitmq:5672 - --queue - alloy-sample - --producers - "1" - --consumers - "0" - --rate - "1" - --size - "256" - --flag - persistent - --id - alloy-rabbitmq-demo depends_on: rabbitmq: condition: service_healthy restart: unless-stopped ================================================ FILE: rabbitmq-monitoring/docker-compose.yml ================================================ services: rabbitmq: image: rabbitmq:${RABBITMQ_VERSION:-4.3.0-management} container_name: rabbitmq-monitoring-rabbitmq hostname: rabbitmq ports: - "5672:5672" - "15672:15672" - "15692:15692" environment: - RABBITMQ_DEFAULT_USER=guest - RABBITMQ_DEFAULT_PASS=guest volumes: - ./enabled_plugins:/etc/rabbitmq/enabled_plugins:ro - ./rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro healthcheck: test: ["CMD", "rabbitmq-diagnostics", "-q", "ping"] interval: 10s timeout: 5s retries: 12 loadgen: image: pivotalrabbitmq/perf-test:${RABBITMQ_PERF_TEST_VERSION:-2.24.0} container_name: rabbitmq-monitoring-loadgen command: - --uri - amqp://guest:guest@rabbitmq:5672 - --queue - alloy-sample - --producers - "1" - --consumers - "0" - --rate - "1" - --size - "256" - --flag - persistent - --id - alloy-rabbitmq-demo depends_on: rabbitmq: condition: service_healthy restart: unless-stopped alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} container_name: rabbitmq-monitoring-alloy ports: - "12345:12345" volumes: - ./config.alloy:/etc/alloy/config.alloy - /var/run/docker.sock:/var/run/docker.sock:ro command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: rabbitmq: condition: service_healthy loki: condition: service_started prometheus: condition: service_started loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} container_name: rabbitmq-monitoring-loki ports: - "3100:3100" 
volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    container_name: rabbitmq-monitoring-prometheus
    # Alloy pushes samples to /api/v1/write, so the remote-write receiver must be enabled.
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    container_name: rabbitmq-monitoring-grafana
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    # Provision the Loki and Prometheus datasources at start-up, then hand over to
    # Grafana's regular entrypoint. The heredoc (<<EOF ... EOF) writes ds.yaml.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          access: proxy
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

================================================
FILE: rabbitmq-monitoring/enabled_plugins
================================================
[rabbitmq_management,rabbitmq_prometheus].
================================================ FILE: rabbitmq-monitoring/loki-config.yaml ================================================ auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true ingester: max_chunk_age: 5m ================================================ FILE: rabbitmq-monitoring/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s ================================================ FILE: rabbitmq-monitoring/rabbitmq.conf ================================================ prometheus.return_per_object_metrics = true log.console = true log.console.level = debug log.console.formatter.single_line = on log.file = false ================================================ FILE: redis-monitoring/README.md ================================================ # Redis Monitoring with Grafana Alloy This scenario demonstrates how to monitor a Redis instance using Grafana Alloy's built-in `prometheus.exporter.redis` component. 
## Architecture - **Redis** - The monitored Redis instance - **Grafana Alloy** - Collects Redis metrics via `prometheus.exporter.redis` and remote writes them to Prometheus - **Prometheus** - Stores the scraped metrics - **Grafana** - Visualizes Redis metrics (auto-provisioned with Prometheus datasource) ## Running ```bash # From this directory docker compose up -d # Or from the repo root using centralized image versions ./run-example.sh redis-monitoring ``` ## Accessing - **Grafana**: http://localhost:3000 (no login required) - **Alloy UI**: http://localhost:12345 - **Prometheus**: http://localhost:9090 ## Key Metrics Once running, you can query Redis metrics in Grafana or Prometheus. Some useful metrics include: - `redis_up` - Whether Redis is reachable - `redis_connected_clients` - Number of connected clients - `redis_used_memory_bytes` - Memory usage - `redis_commands_total` - Total commands processed - `redis_keyspace_hits_total` / `redis_keyspace_misses_total` - Cache hit ratio ## Stopping ```bash docker compose down ``` ================================================ FILE: redis-monitoring/config.alloy ================================================ // Redis Monitoring with Grafana Alloy // This configuration scrapes Redis metrics using the built-in prometheus.exporter.redis component // and remote writes them to Prometheus. 
livedebugging {
  enabled = true
}

// Built-in Redis exporter pointed at the compose service "redis".
prometheus.exporter.redis "default" {
  redis_addr = "redis:6379"
}

// Scrape the exporter's targets and hand the samples to remote write.
prometheus.scrape "redis" {
  targets    = prometheus.exporter.redis.default.targets
  forward_to = [prometheus.remote_write.default.receiver]
}

prometheus.remote_write "default" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}

================================================
FILE: redis-monitoring/docker-compose.coda.yml
================================================
services:
  redis:
    image: redis:8@sha256:0c341492924cad6f5483f9133e43bd6c51ecdecbcadfac5b51657393b6a7936c
    ports:
      - "6379:6379"

================================================
FILE: redis-monitoring/docker-compose.yml
================================================
services:
  redis:
    image: redis:8@sha256:0c341492924cad6f5483f9133e43bd6c51ecdecbcadfac5b51657393b6a7936c
    ports:
      - "6379:6379"

  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    # Alloy pushes samples to /api/v1/write, so the remote-write receiver must be enabled.
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # Provision the Prometheus datasource at start-up, then hand over to Grafana's
    # regular entrypoint. The heredoc (<<EOF ... EOF) writes the YAML below to ds.yaml.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - redis
      - prometheus

================================================
FILE: redis-monitoring/prom-config.yaml
================================================
global:
  scrape_interval: 15s
  evaluation_interval: 15s

================================================
FILE: renovate.json
================================================
{
  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
  "description": "Local additive config — extends whatever org-level renovate config the bot is configured with. Tracks the centralized version pins in image-versions.env so they stay current alongside the docker-compose fallback defaults.",
  "customManagers": [
    {
      "customType": "regex",
      "description": "Bump every VERSION variable in image-versions.env. Each line is preceded by a `# renovate: datasource=… depName=…` comment that tells the bot what the variable refers to.",
      "managerFilePatterns": [
        "/^image-versions\\.env$/"
      ],
      "matchStrings": [
        "# renovate: datasource=(?<datasource>.+?) depName=(?<depName>.+?)\\s+\\w+_VERSION=(?<currentValue>.+)"
      ]
    },
    {
      "customType": "regex",
      "description": "Bump grafana/k8s-monitoring chart version pinned in k8s/*/README.md install commands. The other two charts in each scenario (backend + grafana) are intentionally unpinned (`helm install` resolves latest at run time), so only k8s-monitoring needs tracking.",
      "managerFilePatterns": [
        "/^k8s/.+/README\\.md$/"
      ],
      "matchStrings": [
        "grafana/k8s-monitoring --version \"(?<currentValue>[^\"]+)\""
      ],
      "datasourceTemplate": "helm",
      "depNameTemplate": "k8s-monitoring",
      "registryUrlTemplate": "https://grafana.github.io/helm-charts"
    },
    {
      "customType": "regex",
      "description": "Sync ${*_VERSION:-default} fallbacks in every docker-compose file alongside image-versions.env updates. Captures depName from the image reference itself (e.g. `image: nginx/nginx-prometheus-exporter:${NGINX_EXPORTER_VERSION:-1.4.2}` → depName=nginx/nginx-prometheus-exporter, currentValue=1.4.2). One rule covers every variable — adding a new VERSION var to image-versions.env requires no change here as long as the compose line follows the convention `image: <repo>:${<NAME>_VERSION:-<default>}`. Renovate's docker-compose manager treats `${VAR}` substitution as a templated reference and won't update inline fallback defaults — without this customManager, the env file moves but the fallbacks drift, breaking `docker compose up` for users who don't pass --env-file.",
      "managerFilePatterns": [
        "/docker-compose(\\.coda)?\\.ya?ml$/"
      ],
      "matchStrings": [
        "image:\\s*(?<depName>\\S+?):\\$\\{[A-Z_]+_VERSION:-(?<currentValue>[^}]+)\\}"
      ],
      "datasourceTemplate": "docker"
    }
  ]
}

================================================
FILE: routing/README.MD
================================================
# Example scenario for grafana alloy routing

Simple example for cases where a singular source [e.g. OCP cluster log forwarder] is sending logs from multiple applications, and you need to route it to different loki instances [or in the example case, same loki instance, but different tenants].

The example covers an if-else scenario, where looking at the logs contents, the `tenantKey` loki request metadata is populated, for the logs to be stored in the appropriate tenant.

In this setup, alloy is a single instance that receives logs from multiple OCP clusters [test and prod]. It checks the origin of the log [via the `hostname` field in the request] and afterwards checks the type of log [via the custom `message.logger` field].

Diagram:

```
                                     - - - Loki tenant test app
OCP    - - -                        |
[test]       |                      | - - Loki tenant test audit
              ---> Alloy ---> Loki ---
             |                      | - - Loki tenant prod app
OCP    - - -                        |
[prod]                               - - - Loki tenant prod audit
```

## Testing

The provided docker compose has a local promtail-alloy-loki-grafana setup to mimic a real env [using promtail to mimic an OCP log forwarder]. Logs are stored in a local minio s3 bucket.

Use `docker compose up` to bring the system up. [note, if minio has errors starting up, create a `.customData` dir]

Put your log entries [with newline endings] inside the `support/promtail/myCustomLog.txt` file. They will be automatically pushed to loki via alloy.
Open grafana [localhost:3000], login with default admin [`admin/admin`] and go to `Explore`. You will have multiple loki datasources; use the query `{job="myApp"}`. Depending on the log content you put in the custom log file, the log will be stored in the appropriate loki tenant and is fetchable using the correct datasource. ## Message examples Test app message [goes to test app tenant] `{"@timestamp":"2025-10-31T09:40:25.229103272Z","hostname":"tos-worker-002.tos.example.come","level":"info","log_source":"container","log_type":"application","message":"{\"timestamp\":\"2025-10-31 10:40:25.229\",\"level\":\"INFO\",\"thread\":\"http-nio-8080-exec-5\",\"mdc\":{\"correlationId\":\"test\",\"uniqueTrackingId\":\"test\"},\"logger\":\"app\",\"message\":\"[START] msisdn=38763383706\",\"context\":\"default\"}"}` Test audit message [goes to test audit tenant] `{"@timestamp":"2025-10-31T09:40:25.229103272Z","hostname":"tos-worker-002.tos.example.come","level":"info","log_source":"container","log_type":"application","message":"{\"timestamp\":\"2025-10-31 10:40:25.229\",\"level\":\"INFO\",\"thread\":\"http-nio-8080-exec-5\",\"mdc\":{\"correlationId\":\"test\",\"uniqueTrackingId\":\"test\"},\"logger\":\"audit\",\"message\":\"[START] msisdn=38763383706\",\"context\":\"default\"}"}` Prod app message [goes to app tenant] `{"@timestamp":"2025-10-31T09:40:25.229103272Z","hostname":"pos-worker-001.pos.example.come","level":"info","log_source":"container","log_type":"application","message":"{\"timestamp\":\"2025-10-31 10:40:25.229\",\"level\":\"INFO\",\"thread\":\"http-nio-8080-exec-5\",\"mdc\":{\"correlationId\":\"test\",\"uniqueTrackingId\":\"test\"},\"logger\":\"app\",\"message\":\"[START] msisdn=38763383706\",\"context\":\"default\"}"}` Prod audit message [goes to audit tenant] `{"@timestamp":"2025-10-31T09:40:25.229103272Z","hostname":"pos-worker-001.pos.example.come","level":"info","log_source":"container","log_type":"application","message":"{\"timestamp\":\"2025-10-31 
10:40:25.229\",\"level\":\"INFO\",\"thread\":\"http-nio-8080-exec-5\",\"mdc\":{\"correlationId\":\"test\",\"uniqueTrackingId\":\"test\"},\"logger\":\"audit\",\"message\":\"[START] msisdn=38763383706\",\"context\":\"default\"}"}`

================================================
FILE: routing/config.alloy
================================================
//listen to api requests for incoming logs from OCP
loki.source.api "listener" {
  http {
    //listen_address = "" //defaults to all/localhost
    listen_port = 3005
  }
  forward_to = [loki.process.default_values.receiver]
}

//set a default label, so that all logs that passed thru alloy are marked as such
loki.process "default_values" {
  stage.static_labels {
    values = {
      source = "grafana-alloy",
    }
  }
  forward_to = [loki.process.redirect_env.receiver]
}

//we check the hostname field to see if the source is the test or prod cluster
//the two top-level stage.match blocks below are siblings with mutually exclusive
//selectors, so every entry is handled by exactly one of them
loki.process "redirect_env" {
  stage.json {
    expressions = {extractedHostname = "hostname"}
  }
  stage.labels {
    values = {hostnameLabel = "extractedHostname"}
  }

  stage.match {
    pipeline_name = "Send to test tenants if tos source"
    selector      = "{hostnameLabel =~ \"tos.*\"}"

    //default for test
    stage.tenant {
      value = "test_loki_app"
    }

    //take main payload
    stage.json {
      expressions = {payload = "message"}
    }

    //extract logger property from it
    stage.json {
      source      = "payload"
      expressions = {logger = "logger"}
    }

    //set it as a label, match selector works only with it
    stage.labels {
      values = {loggerLabel = "logger"}
    }

    //route only 'audit' logger types to audit tenant [rest goes to default - app tenant]
    stage.match {
      pipeline_name = "Audit log routing"
      selector      = "{loggerLabel = \"audit\"}"
      stage.tenant {
        value = "test_loki_audit"
      }
    }
  } //closes the test-cluster match; without it the prod match below would be nested inside it

  stage.match {
    pipeline_name = "Otherwise it is production - re check the logic above"
    selector      = "{hostnameLabel !~ \"tos.*\"}"

    //default value - app
    stage.tenant {
      value = "loki_app"
    }

    //take main payload
    stage.json {
      expressions = {payload = "message"}
    }

    //extract logger property from it
    stage.json {
      source      = "payload"
      expressions = {logger = "logger"}
    }

    stage.labels {
      values = {loggerLabel = "logger"}
    }

    //route only 'audit' logger types to audit tenant [rest goes to default - app tenant]
    stage.match {
      pipeline_name = "Audit log routing"
      selector      = "{loggerLabel = \"audit\"}"
      stage.tenant {
        value = "loki_audit"
      }
    }
  }

  forward_to = [loki.write.loki_default.receiver]
}

//push to the loki service from docker-compose.yaml; replace host:port for a real deployment
loki.write "loki_default" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

================================================
FILE: routing/docker-compose.yaml
================================================
services:
  minio:
    image: "minio/minio:RELEASE.2024-10-29T16-01-48Z@sha256:ebd2af76d40ff25ccc630533615f7ccd55fbe83d629a4b7c7a1b6311c1af3d6c"
    restart: "unless-stopped"
    entrypoint:
      - "sh"
      - "-euc"
      - "mkdir -p /data/loki && /usr/bin/docker-entrypoint.sh minio server --quiet --address 0.0.0.0:9000 --console-address ':9001' /data"
    volumes:
      - "./.customData/minio:/data"
    environment:
      - "MINIO_ROOT_USER=myuser"
      - "MINIO_ROOT_PASSWORD=mypass"
    ports:
      - "9000:9000"
      - "9001:9001"

  loki:
    image: "grafana/loki:latest@sha256:73e905b51a7f917f7a1075e4be68759df30226e03dcb3cd2213b989cc0dc8eb4"
    restart: "unless-stopped"
    command: "-config.file=/etc/loki/server.yml"
    volumes:
      # The config in this repository is support/loki/server.yaml; it is mounted at the
      # path the command above expects.
      - "./support/loki/server.yaml:/etc/loki/server.yml"
    ports:
      - "3100:3100"
      - "7946"
    depends_on:
      - "minio"

  grafana:
    image: "grafana/grafana:latest@sha256:0f86bada30d65ef9d0183b90c1e2682ac92d53d95da8bed322b984ea78a4a73a"
    restart: "unless-stopped"
    user: '0'
    volumes:
      - "./support/grafana/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml"
      - "./.customData/grafana:/var/lib/grafana"
    ports:
      - "3000:3000"
    depends_on:
      - "loki"

  alloy:
    image: "grafana/alloy:latest@sha256:51aeb9d829239345070619dad3edd6873186f913c84f45b365b74574fcb38ec0"
    restart: "unless-stopped"
    command: "run --server.http.listen-addr=0.0.0.0:3000 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy"
    volumes:
      - "./config.alloy:/etc/alloy/config.alloy"
    ports:
      # host 3005 -> Alloy HTTP server (container port 3000, see command above)
      - "3005:3000"
      # host 3015 -> loki.source.api listener (container port 3005, see config.alloy)
      - "3015:3005"
    depends_on:
      - "loki"

  ###Local app that generates logs
  promtail:
    image: "grafana/promtail:latest@sha256:6cfa64ec432b24a912d640e2edb940eeae2666f61861a66c121d763dd7241381"
    volumes:
      - "./support/promtail/promtail-config.yml:/etc/promtail/config.yml"
      - "./support/promtail/myCustomLog.txt:/var/log/myCustomLog.txt"
    ports:
      - "9080:9080"
    depends_on:
      - "alloy"

================================================
FILE: routing/support/grafana/datasources.yml
================================================
apiVersion: 1

# One datasource per Loki tenant: each sends its tenant ID in the X-Scope-OrgID header.
datasources:
  - name: "Loki"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "loki"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'fake'

  - name: "Loki app"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "lokiApp"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'loki_app'

  - name: "Loki audit"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "lokiAudit"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'loki_audit'

  - name: "Loki test app"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "lokiTestApp"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'test_loki_app'

  - name: "Loki test audit"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "lokiTestAudit"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'test_loki_audit'

================================================
FILE: routing/support/loki/server.yaml
================================================ auth_enabled: true server: http_listen_address: 0.0.0.0 grpc_listen_address: 0.0.0.0 http_listen_port: 3100 grpc_listen_port: 9095 log_level: "info" common: path_prefix: "/loki/data" storage: s3: endpoint: "minio:9000" insecure: true bucketnames: "loki" access_key_id: "myuser" secret_access_key: "mypass" s3forcepathstyle: true memberlist: dead_node_reclaim_time: "30s" gossip_to_dead_nodes_time: "15s" left_ingesters_timeout: "30s" gossip_interval: "2s" bind_port: 7946 bind_addr: - "0.0.0.0" join_members: - "loki" ingester: lifecycler: join_after: "10s" observe_period: "5s" ring: replication_factor: 1 kvstore: store: "memberlist" final_sleep: "0s" chunk_idle_period: "1m" wal: enabled: true dir: "/loki/wal" max_chunk_age: "1m" chunk_retain_period: "30s" chunk_encoding: "snappy" chunk_target_size: 1.572864e+06 chunk_block_size: 262144 flush_op_timeout: "10s" schema_config: configs: - from: "2020-08-01" store: "tsdb" object_store: "s3" schema: "v13" index: prefix: "index_" period: "24h" storage_config: boltdb_shipper: active_index_directory: "/tmp/index" cache_location: "/tmp/boltdb-cache" limits_config: max_cache_freshness_per_query: "10m" reject_old_samples: true reject_old_samples_max_age: "30m" split_queries_by_interval: "15m" ingestion_rate_mb: 10 ingestion_burst_size_mb: 20 table_manager: retention_deletes_enabled: true retention_period: "336h" query_range: max_retries: 5 align_queries_with_step: true parallelise_shardable_queries: true cache_results: true frontend: log_queries_longer_than: "5s" compress_responses: true max_outstanding_per_tenant: 2048 query_scheduler: max_outstanding_requests_per_tenant: 1024 querier: query_ingesters_within: "2h" compactor: working_directory: "/tmp/compactor" retention_enabled: true compaction_interval: 30m retention_delete_delay: 1h retention_delete_worker_count: 150 ================================================ FILE: routing/support/promtail/myCustomLog.txt 
================================================ ############################################################################ #find this in grafana via - {job="myApp"} |= `` ############################################################################ ############################################################################ 2025-10-15 - MY APP - Started logging by custom means... ================================================ FILE: routing/support/promtail/promtail-config.yml ================================================ server: http_listen_port: 9080 grpc_listen_port: 0 positions: filename: /tmp/positions.yaml clients: - url: http://alloy:3005/loki/api/v1/push scrape_configs: - job_name: logs static_configs: - targets: - localhost labels: job: myApp __path__: /var/log/myCustomLog.txt ================================================ FILE: run-example.sh ================================================ #!/bin/bash set -euo pipefail # Usage check if [ $# -lt 1 ]; then echo "Usage: $0 " echo "Available examples:" ls -d */ | grep -v "k8s\|img\|.git" | tr -d '/' exit 1 fi EXAMPLE_DIR=$1 # Check if the example directory exists if [ ! -d "$EXAMPLE_DIR" ]; then echo "Error: Example directory '$EXAMPLE_DIR' not found." exit 1 fi # Check if a docker-compose file exists in the example directory. # Some scenarios use .yaml instead of .yml; accept either. if [ ! -f "$EXAMPLE_DIR/docker-compose.yml" ] && [ ! -f "$EXAMPLE_DIR/docker-compose.yaml" ]; then echo "Error: No docker-compose.yml or docker-compose.yaml found in '$EXAMPLE_DIR'." exit 1 fi # Source the image versions if [ ! -f "image-versions.env" ]; then echo "Error: image-versions.env file not found." exit 1 fi # Run docker-compose in the example directory with the environment variables echo "Starting example: $EXAMPLE_DIR" (cd "$EXAMPLE_DIR" && docker compose --env-file ../image-versions.env up -d) echo "Example started successfully." 
echo "Access Grafana at http://localhost:3000" echo "To stop the example, run: cd $EXAMPLE_DIR && docker compose down" ================================================ FILE: self-monitoring/README.md ================================================ # Self-Monitoring with Grafana Alloy This example demonstrates how to configure Grafana Alloy to monitor itself, collecting both its own metrics and logs alongside other Docker containers. ## Prerequisites - Docker - Docker Compose - Git ## Running the Demo ### Step 1: Clone the repository ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` ### Step 2: Deploy the monitoring stack ```bash cd alloy-scenarios/self-monitoring docker-compose up -d ``` ### Step 3: Access Grafana Alloy UI Open your browser and go to `http://localhost:12345`. ### Step 4: Access Prometheus UI Open your browser and go to `http://localhost:9090`. ### Step 5: Access Loki Loki is available at `http://localhost:3100`. ## What This Demo Shows This scenario demonstrates: - **Metrics Collection**: Using `prometheus.exporter.self` to export Alloy's own internal metrics - **Log Collection**: Using `loki.source.docker` to collect logs from all Docker containers, including Alloy itself - **Service Discovery**: Automatic discovery of Docker containers with proper labeling - **Remote Write**: Sending metrics to Prometheus and logs to Loki ## Key Configuration Elements ### Self-Monitoring Metrics The `prometheus.exporter.self` component exposes Alloy's internal metrics: - Memory usage - CPU utilization - Component health - Scrape statistics ### Docker Log Collection The configuration automatically discovers and collects logs from all Docker containers running on the host, including: - Alloy's own logs - Prometheus logs - Loki logs - Any other containers running on the same Docker host ================================================ FILE: self-monitoring/config.alloy ================================================ prometheus.exporter.self 
"integrations_alloy_health" { } discovery.relabel "integrations_alloy_health" { targets = prometheus.exporter.self.integrations_alloy_health.targets rule { target_label = "instance" replacement = constants.hostname } rule { target_label = "container" replacement = "alloy" } } prometheus.scrape "integrations_alloy_health" { targets = array.concat( discovery.relabel.integrations_alloy_health.output, ) forward_to = [prometheus.relabel.integrations_alloy_health.receiver] job_name = "integrations/alloy" } prometheus.relabel "integrations_alloy_health" { forward_to = [prometheus.remote_write.default.receiver] } prometheus.remote_write "default" { endpoint { url = "http://prometheus:9090/api/v1/write" } } // ############################### // #### Logging Configuration #### // ############################### // Discover Docker containers and extract metadata. discovery.docker "linux" { host = "unix:///var/run/docker.sock" } // Define a relabeling rule to create a service name from the container name. discovery.relabel "logs_integrations_docker" { targets = [] // Extract the docker-compose service name from container names like // /grafana-pathfinder-app-alloy-1 -> alloy rule { source_labels = ["__meta_docker_container_name"] regex = "^/(?:.+-)?([^-]+)-(?:\\d+)$" target_label = "container" } rule { target_label = "instance" replacement = constants.hostname } } // Configure a loki.source.docker component to collect logs from Docker containers. 
loki.source.docker "default" { host = "unix:///var/run/docker.sock" targets = discovery.docker.linux.targets relabel_rules = discovery.relabel.logs_integrations_docker.rules forward_to = [loki.write.local.receiver] } loki.write "local" { endpoint { url = "http://loki:3100/loki/api/v1/push" } } ================================================ FILE: self-monitoring/docker-compose.yaml ================================================ services: prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --enable-feature=native-histograms - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - '3100:3100' volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 # Alloy HTTP server volumes: - ./config.alloy:/etc/alloy/config.alloy - /var/run/docker.sock:/var/run/docker.sock command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy ================================================ FILE: self-monitoring/loki-config.yaml ================================================ # This is a complete configuration to deploy Loki backed by the filesystem. # The index will be shipped to the storage via tsdb-shipper. 
auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true distributor: otlp_config: # List of default otlp resource attributes to be picked as index labels # CLI flag: -distributor.otlp.default_resource_attributes_as_index_labels default_resource_attributes_as_index_labels: [ service.name service.namespace service.instance.id deployment.environment deployment.environment.name cloud.region cloud.availability_zone k8s.cluster.name k8s.namespace.name k8s.container.name container.name k8s.replicaset.name k8s.deployment.name k8s.statefulset.name k8s.daemonset.name k8s.cronjob.name k8s.job.name, ] server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true ================================================ FILE: snmp/Readme.md ================================================ # Monitoring Linux with Alloy Grafana Alloy can be used to monitor Linux servers and containers. In this guide, we will show you how to deploy Grafana Alloy in a Docker environment to monitor Linux system metrics and logs. The setup consists of: * Node Exporter metrics for system performance monitoring * System logs collection with Loki ## Prerequisites * Git - You will need Git to clone the repository. * Docker and Docker Compose - This tutorial uses Docker to host Grafana, Loki, Prometheus, and Alloy. * Linux environment - Either a Linux host running Docker or a Linux VM. ## About this Demo This demo runs Alloy in a container alongside Grafana, Prometheus, and Loki, creating a self-contained monitoring stack. 
The Alloy container acts as a "fake Linux server" to demonstrate monitoring capabilities out of the box. In a production environment, you would typically install Alloy directly on each Linux server you want to monitor. ## Step 1: Clone the Repository Clone the repository to your machine: ```bash git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios/snmp ``` ## Step 2: Deploy the Monitoring Stack Use Docker Compose to deploy Grafana, Loki, Prometheus, and Alloy: ```bash docker-compose up -d ``` You can check the status of the containers: ```bash docker ps ``` Grafana should be running on [http://localhost:3000](http://localhost:3000). ## Step 3: Explore the Monitoring Data Once the stack is running, you can explore the collected metrics and logs: 1. Access Grafana at [http://localhost:3000](http://localhost:3000) (default credentials are admin/admin) 2. Import the Node Exporter dashboard to visualize system metrics: - Go to Dashboards → Import - Upload the JSON file from [here](https://grafana.com/api/dashboards/1860/revisions/37/download) - Select the Prometheus data source and click Import This community dashboard provides comprehensive system metrics including CPU, memory, disk, and network usage. ## Step 4: Viewing Logs Open your browser and go to [http://localhost:3000/a/grafana-lokiexplore-app](http://localhost:3000/a/grafana-lokiexplore-app). This will take you to the Loki explorer in Grafana. ## Deploying on Bare Metal To monitor actual Linux servers in production, you would: 1. Install Alloy directly on each Linux server 2. Modify the `config.alloy` file to point to your Prometheus and Loki instances: ``` prometheus.remote_write "local" { endpoint { url = "http://localhost:9090/api/v1/write" } } loki.write "local" { endpoint { url = "http://localhost:3100/loki/api/v1/push" } } ``` 3. Run Alloy as a service: ```bash sudo alloy run /path/to/config.alloy ``` ## Configuration Customization The included `config.alloy` file sets up: 1.
Node Exporter integration to collect system metrics 2. Log collection from system logs and journal 3. Relabeling rules to organize metrics and logs 4. Remote write endpoints for Prometheus and Loki You can customize which collectors are enabled/disabled and adjust scrape intervals in the configuration file. ## Troubleshooting If you encounter issues: * Check container logs: `docker-compose logs` * Verify Alloy is running: `docker-compose ps` * Ensure ports are not conflicting with existing services * Review the Alloy configuration in `config.alloy`

================================================
FILE: snmp/config.alloy
================================================
// --- Remote Write to Prometheus ---
// Ships every scraped sample to the Prometheus container's remote-write
// receiver (enabled with --web.enable-remote-write-receiver in compose).
prometheus.remote_write "remote" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}

// --- SNMP Exporter Configuration ---
prometheus.exporter.snmp "snmp_exporter" {
  // Must be the in-container path: docker-compose.yml mounts ./snmp.yml at
  // /etc/alloy/snmp.yml, so any other location fails with "file not found".
  config_file = "/etc/alloy/snmp.yml"

  target "tm" {
    address = "snmpd"
    // Module name as declared under `modules:` in snmp.yml.
    module = "CISCO"
    // Refers to the walk_param block below by its label; the lookup is
    // case-sensitive, so it has to be spelled exactly "cisco".
    walk_params = "cisco"
    labels = {
      "ilo_node" = "switch",
    }
  }

  walk_param "cisco" {
    retries = "2"
    timeout = "30s"
  }
}

// --- SNMP Scrape Configuration ---
// Stamp a stable job label on the exporter targets before scraping.
discovery.relabel "snmp_targets" {
  targets = prometheus.exporter.snmp.snmp_exporter.targets

  rule {
    target_label = "job"
    replacement  = "snmp"
  }
}

prometheus.scrape "snmp_targets" {
  scrape_interval = "30s"
  targets         = discovery.relabel.snmp_targets.output
  forward_to      = [prometheus.remote_write.remote.receiver]
}

// --- Enable Live Debugging ---
// The block is disabled unless `enabled` is set explicitly.
livedebugging {
  enabled = true
}

================================================ FILE: snmp/docker-compose.yml ================================================ version: '3.8' services: loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - 3100:3100/tcp volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --config.file=/etc/prometheus/prometheus.yml ports: -
9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false version: 1 editable: false - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 volumes: - ./config.alloy:/etc/alloy/config.alloy - ./snmp.yml:/etc/alloy/snmp.yml command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy ================================================ FILE: snmp/loki-config.yaml ================================================ # This is a complete configuration to deploy Loki backed by the filesystem. # The index will be shipped to the storage via tsdb-shipper.
auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true distributor: otlp_config: # List of default otlp resource attributes to be picked as index labels # CLI flag: -distributor.otlp.default_resource_attributes_as_index_labels default_resource_attributes_as_index_labels: [service.name service.namespace service.instance.id deployment.environment deployment.environment.name cloud.region cloud.availability_zone k8s.cluster.name k8s.namespace.name k8s.container.name container.name k8s.replicaset.name k8s.deployment.name k8s.statefulset.name k8s.daemonset.name k8s.cronjob.name k8s.job.name] server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true ================================================ FILE: snmp/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s ================================================ FILE: snmp/snmp.yml ================================================ modules: CISCO: walk: - 1.4.6.1.4.3.9.9.244.1.2.1.1.7 metrics: - name: ifInterface oid: 1.4.6.1.4.3.9.9.244.1.2.1.1.7 type: gauge help: A unique value, greater than zero, for each interface indexes: - labelname: ifInterface type: gauge auths: public_v1: community: security_level: noAuthNoPriv version: 1 public_v2: community: security_level: noAuthNoPriv version: 2 ================================================ FILE: syslog/README.md ================================================ # Syslog Scenario This scenario demonstrates how to use rsyslog and Alloy to monitor non RFC5424 compliant syslog messages.
Alloy by itself does not support non RFC5424 compliant syslog messages. However, we can use rsyslog to convert non RFC5424 compliant syslog messages to RFC5424 compliant syslog messages. ## Running the Demo ### Step 1: Clone the repository ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` ### Step 2: Deploy the monitoring stack ```bash cd alloy-scenarios/syslog docker-compose up -d ``` ### Step 3: Access Grafana Alloy UI Open your browser and go to `http://localhost:12345`. ### Step 4: Access Grafana UI Open your browser and go to `http://localhost:3000`. ================================================ FILE: syslog/config.alloy ================================================ livedebugging { enabled = true } loki.source.syslog "local" { listener { address = "0.0.0.0:51893" labels = { component = "loki.source.syslog", protocol = "tcp" } } listener { address = "0.0.0.0:51898" protocol = "udp" labels = { component = "loki.source.syslog", protocol = "udp"} } forward_to = [loki.write.local.receiver] } loki.write "local" { endpoint { url = "http://loki:3100/loki/api/v1/push" } } ================================================ FILE: syslog/docker-compose.coda.yml ================================================ services: rsyslog: image: rsyslog/syslog_appliance_alpine:latest@sha256:c0dd7cad9ff3234967ff59879590175b7590e8a5f5621ec49a85aff546b44a3b container_name: rsyslog ports: - "514:514/udp" - "514:514/tcp" volumes: - ./rsyslog.conf:/etc/rsyslog.conf syslog-simulator: image: python:${PYTHON_VERSION:-3.11-slim} container_name: syslog-simulator volumes: - ./syslog_simulator.py:/syslog_simulator.py environment: - SYSLOG_HOST=rsyslog - SYSLOG_PORT=514 depends_on: - rsyslog command: ["python3", "/syslog_simulator.py"] ================================================ FILE: syslog/docker-compose.yml ================================================ version: '3.8' services: # Rsyslog service rsyslog: image: 
rsyslog/syslog_appliance_alpine:latest@sha256:c0dd7cad9ff3234967ff59879590175b7590e8a5f5621ec49a85aff546b44a3b container_name: rsyslog ports: - "514:514/udp" # Standard syslog UDP port - "514:514/tcp" # Standard syslog TCP port (if needed) volumes: - ./rsyslog.conf:/etc/rsyslog.conf # Custom rsyslog configuration depends_on: - alloy # Syslog simulator using a Python script syslog-simulator: image: python:${PYTHON_VERSION:-3.11-slim} container_name: syslog-simulator volumes: - ./syslog_simulator.py:/syslog_simulator.py # Syslog simulator script environment: - SYSLOG_HOST=rsyslog - SYSLOG_PORT=514 depends_on: - rsyslog command: ["python3", "/syslog_simulator.py"] alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 - 51893:51893 - 51898:51898 volumes: - ./config.alloy:/etc/alloy/config.alloy - ./logs:/tmp/app-logs/ command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - loki loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - "3100:3100" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: false version: 1 editable: false EOF /run.sh volumes: rsyslog_data: ================================================ FILE: syslog/loki-config.yaml ================================================ # This is a complete configuration to deploy Loki backed by the filesystem. # The index will be shipped to the storage via tsdb-shipper.
auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true ================================================ FILE: syslog/rsyslog.conf ================================================ # Load necessary modules module(load="imudp") # For receiving UDP messages input(type="imudp" port="514") # TCP endpoint configuration *.* action(type="omfwd" target="alloy" port="51893" protocol="tcp" Template="RSYSLOG_SyslogProtocol23Format") ================================================ FILE: syslog/syslog_simulator.py ================================================ import socket import time import os import random from datetime import datetime # Get the target host and port from environment variables syslog_host = os.getenv('SYSLOG_HOST', 'localhost') syslog_port = int(os.getenv('SYSLOG_PORT', 514)) # Create a UDP socket sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) # Define log levels and messages log_levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"] messages = [ "System started successfully", "User login successful", "Configuration loaded", "Connection to database failed", "Data processed successfully", "Invalid API request received", "Memory usage high", "Disk space low", "Unknown error occurred", "Service restarted", ] # Generate and send syslog messages every few seconds while True: # Correct timestamp format timestamp = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') log_level = random.choice(log_levels) message_text = random.choice(messages) pid = random.randint(100, 999) # Simulate 
random process IDs app_name = "MyApp" hostname = socket.gethostname() msgid = '-' structured_data = '-' # Include the log level in the message body message_body = f"{log_level}: {message_text}" # Correct syslog message format message = f"<34>1 {timestamp} {hostname} {app_name} {pid} {msgid} {structured_data} {message_body}" sock.sendto(message.encode(), (syslog_host, syslog_port)) print(f"Sent syslog message to {syslog_host}:{syslog_port} - {message_body}") time.sleep(random.randint(3, 8)) # Send a message every 3-8 seconds ================================================ FILE: systemd-journal/README.md ================================================ # systemd journal to Loki — focused filtering recipes A focused logs-only scenario for shipping a Linux host's systemd journal to Loki, with filtering and label promotion tuned for keeping the index lean and queries fast. ## How this differs from `linux/` | Aspect | `linux/` (existing) | `systemd-journal/` (this) | |---|---|---| | Scope | Metrics + journal + flat files (full Linux observability suite) | **Journal only** — focused scenario | | Pipeline | Pass-through ingest, all units, all priorities | **Drops noisy units + drops info/debug priorities** | | Stack | Prom + Loki + Grafana + node_exporter | **Loki + Grafana only** | | Labels promoted | none specifically | `unit`, `priority`, `hostname` | | Demo intent | "monitor a Linux box end-to-end" | "show advanced journal filtering recipes" | If you want general-purpose Linux observability, use `linux/`. If you specifically need journal filtering recipes (drop noisy units, drop low-priority entries, label by unit/priority for fast filtering), this scenario is the minimal moving-parts version. ## Linux host required `loki.source.journal` reads `/var/log/journal` and `/run/log/journal`. **These directories only exist on Linux hosts running systemd**. On macOS or Windows Docker Desktop: - The bind mounts will resolve to empty directories (Docker creates them silently). 
- Alloy will start cleanly but the source will sit idle with no journal entries. - The scenario is functionally a no-op — there's no synthesised journal to fall back to. To exercise the scenario fully you need: - A Linux host (bare metal, VM, WSL2 with systemd, or a Linux VM on macOS such as OrbStack / Lima / multipass). - `systemd` writing journals to `/var/log/journal` (persistent) or `/run/log/journal` (volatile). Most distros ship with at least the volatile journal active. ## Running On a Linux host: ```bash cd systemd-journal docker compose up -d ``` Wait ~10 seconds, then open Grafana. ## Accessing - **Grafana**: http://localhost:3000 (no login required) - **Alloy UI**: http://localhost:12345 — confirm components are healthy and use livedebugging to inspect entries flowing through each stage - **Loki API**: http://localhost:3100 ## Trying it out Generate some journal traffic on the Linux host: ```bash # Trigger a notice logger -p user.notice "test from systemd-journal scenario" # Trigger an error logger -p user.err "this is a test error" # Tickle a service unit to produce events sudo systemctl restart cron 2>/dev/null || sudo systemctl restart crond ``` Then in Grafana Explore on Loki: ```logql # All journal entries (after filtering) {job="systemd-journal"} # Errors only {job="systemd-journal", priority=~"err|crit|alert|emerg"} # A specific unit {job="systemd-journal", unit="ssh.service"} # A specific host (useful when shipping from many) {job="systemd-journal", hostname="my-server"} # All recent NetworkManager events {job="systemd-journal", unit="NetworkManager.service"} ``` ## What's filtered out The pipeline drops these at the Alloy side: | Filter | What it drops | Why | |---|---|---| | `{unit=~"systemd-logind.service\|systemd-tmpfiles-clean.service\|cron.service"}` | Login session housekeeping, tmpfile cleanup, every cron tick | High-volume, low-signal in dev/ops dashboards | | `{priority=~"info\|debug"}` | LOG_INFO and LOG_DEBUG entries | Keep `notice` 
and above | To keep one of these back, edit `stage.match` in `config.alloy` — remove the corresponding entry from the regex. ## Why run Alloy as root The Alloy container runs with `user: "0:0"`. On most Linux distros, `/var/log/journal/*.journal` files are owned by `root:systemd-journal` with mode 0640. Reading them requires either being root or a member of the `systemd-journal` group. Running Alloy as root inside a container with a read-only bind-mount keeps things simple for a demo. In production, prefer running the Alloy native package as a service — it joins the right groups automatically. ## Stopping ```bash docker compose down -v ``` ## Customization ideas - **Promote more journal fields**: extend the `loki.relabel.journal` block. `__journal__pid` → `pid`, `__journal__exe` → `exe`, `__journal__cmdline` → `cmdline`, etc. - **Per-environment unit filters**: maintain different `stage.match` regexes for prod vs dev. - **Forward errors only**: add a `stage.match` keeping only `priority=~"err|crit|alert|emerg"` if you want a focused error stream. - **Multi-host fan-in**: deploy this on every Linux host with the same `loki.write` URL pointing at a central Loki cluster. ================================================ FILE: systemd-journal/config.alloy ================================================ // systemd journal → Loki, with filtering recipes. // // Demonstrates three patterns the broader `linux/` scenario doesn't: // 1. Promoting useful journal fields (`unit`, `priority`, `hostname`) // to Loki labels via `loki.relabel`. // 2. Dropping noisy systemd units that flood the journal but rarely // carry useful signal. // 3. Dropping low-priority entries (info/debug) at ingestion time // to keep Loki cardinality and storage low. // // Linux-host only — `loki.source.journal` reads /var/log/journal, // which doesn't exist on macOS or Windows. See README for details. 
livedebugging {
  enabled = true
}

// Translate the journal's underscore-prefixed metadata into clean
// Loki label names. The journal exposes a lot of fields; we promote
// only a few useful ones.
// This component is used purely as a rule container (forward_to is empty);
// loki.source.journal consumes its exported `rules`.
loki.relabel "journal" {
  forward_to = []

  rule {
    source_labels = ["__journal__systemd_unit"]
    target_label  = "unit"
  }

  rule {
    source_labels = ["__journal_priority_keyword"]
    target_label  = "priority"
  }

  rule {
    source_labels = ["__journal__hostname"]
    target_label  = "hostname"
  }
}

loki.source.journal "host" {
  // `path` is intentionally left unset: the component then searches both
  // /var/log/journal (persistent) and /run/log/journal (volatile), which are
  // the two directories bind-mounted by docker-compose.yml. Pinning it to
  // /var/log/journal would ignore hosts that only keep a volatile journal.
  max_age       = "12h"
  relabel_rules = loki.relabel.journal.rules
  labels        = { job = "systemd-journal" }
  forward_to    = [loki.process.journal.receiver]
}

loki.process "journal" {
  // Drop high-volume units that rarely carry actionable signal in a
  // generic dev/ops dashboard. Tune this list to your environment.
  stage.match {
    selector = `{unit=~"systemd-logind.service|systemd-tmpfiles-clean.service|cron.service"}`
    action   = "drop"
  }

  // Drop low-priority entries (info / debug). Keep notice and above.
  // Adjust if you want to keep info messages.
stage.match { selector = `{priority=~"info|debug"}` action = "drop" } forward_to = [loki.write.local.receiver] } loki.write "local" { endpoint { url = "http://loki:3100/loki/api/v1/push" } } ================================================ FILE: systemd-journal/docker-compose.yml ================================================ services: loki: image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10} ports: - "3100:3100/tcp" volumes: - ./loki-config.yaml:/etc/loki/local-config.yaml command: -config.file=/etc/loki/local-config.yaml grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - "3000:3000/tcp" entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Loki type: loki access: proxy orgId: 1 url: http://loki:3100 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} # Run as root so Alloy can read /var/log/journal — the journal files # are owned by root:systemd-journal with mode 0640 on most distros. user: "0:0" ports: - "12345:12345" volumes: - ./config.alloy:/etc/alloy/config.alloy # Bind-mount the host's journal read-only. On Linux hosts this # exposes the actual systemd journal. On macOS/Windows the path # doesn't exist and Docker creates an empty directory; Alloy # will run but the source will report "no journal entries".
- /var/log/journal:/var/log/journal:ro - /run/log/journal:/run/log/journal:ro command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - loki ================================================ FILE: systemd-journal/loki-config.yaml ================================================ auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true server: http_listen_port: 3100 common: ring: instance_addr: 0.0.0.0 kvstore: store: inmemory replication_factor: 1 path_prefix: /tmp/loki schema_config: configs: - from: 2020-05-15 store: tsdb object_store: filesystem schema: v13 index: prefix: index_ period: 24h storage_config: tsdb_shipper: active_index_directory: /tmp/loki/index cache_location: /tmp/loki/index_cache filesystem: directory: /tmp/loki/chunks pattern_ingester: enabled: true ingester: max_chunk_age: 5m ================================================ FILE: trace-delivery/README.md ================================================ # Trace Delivery Demo This scenario demonstrates how distributed tracing works using a realistic sofa delivery workflow. It shows the journey of a sofa order from the shop to the customer's house, passing through multiple services. ## Overview The demo includes five interconnected services simulating a sofa ordering and delivery process: 1. **Sofa Shop** - Where customers browse sofas and place orders 2. **Sofa Factory** - Manufactures the ordered sofas with detailed assembly steps 3. **Global Distribution Center** - Handles global logistics and shipping 4. **Local Distribution Center** - Manages local delivery logistics 5. **Customer House** - The final destination for delivery Each service generates spans as part of a complete trace that follows the sofa from order to delivery. This demo includes three main scenarios: 1. **Successful Delivery** - A complete, happy-path delivery with no issues 2. 
**Failed Delivery** - Simulated failures at different points in the delivery process 3. **Latency Issues** - Abnormal delays in one service affecting the entire delivery process ## Architecture ``` ┌────────────┐ ┌──────────────┐ ┌─────────────────────┐ ┌──────────────────┐ ┌────────────────┐ │ Sofa Shop │────▶│ Sofa Factory │────▶│ Global Distribution │────▶│ Local Distribution│────▶│ Customer House │ └────────────┘ └──────────────┘ └─────────────────────┘ └──────────────────┘ └────────────────┘ │ │ ▼ ┌────────────┐ │ Sofa Shop │ └────────────┘ (notification) ``` All services are instrumented with OpenTelemetry to generate traces, which are collected by Grafana Alloy and visualized in Grafana via Tempo. ## Demo Features - **Realistic Business Process**: Simulates a real-world business workflow with multiple services and dependencies - **Trace Context Propagation**: Demonstrates how trace context is passed between services - **Background Trace Generation**: Automatically generates traces for all scenarios periodically - **Nested Spans**: Shows detailed manufacturing steps with nested spans and span events - **Bidirectional Communication**: Local Distribution center notifies the Shop when delivery is dispatched - **Error Cases**: Shows how errors are recorded and propagated in traces with exceptions - **Latency Visualization**: Illustrates how performance bottlenecks appear in traces - **Span Events**: Each service adds detailed span events to provide context for operations - **Tail Sampling**: Demonstrates tail sampling policies that focus on errors, latency issues, and specific order attributes - **Service Graph**: Visualizes the connections between services ## Running the Demo 1. Clone the repository: ``` git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios ``` 2. Navigate to this example directory: ``` cd trace-delivery ``` 3. Run using Docker Compose: ``` docker compose up -d ``` Or use the centralized image management: ``` cd .. 
./run-example.sh trace-delivery ``` 4. Access the Sofa Shop at http://localhost:8080 ## Demo Scenarios ### 1. Successful Delivery Navigate to http://localhost:8080/demo/success to trigger a successful delivery flow, which will: - Create an order for a Classic Comfort sofa - Process it through all stages of the delivery pipeline - Show the detailed manufacturing steps with nested spans - Have the Local Distribution center notify the Shop of the dispatch - Complete delivery successfully - Generate a full trace that can be examined in Grafana ### 2. Failed Delivery Navigate to http://localhost:8080/demo/failure to simulate a failure scenario, which will: - Create an order for a Luxury Lounge sofa - Simulate a failure at one of the services (factory by default) - Record an actual exception in the trace with detailed error information - Generate an error trace that will be sampled by the error policy You can change where the failure occurs by adding a query parameter: - http://localhost:8080/demo/failure?service=sofa-factory - http://localhost:8080/demo/failure?service=global-distribution - http://localhost:8080/demo/failure?service=local-distribution ### 3. 
Latency Issues Navigate to http://localhost:8080/demo/latency to simulate a latency scenario, which will: - Create an order for a Limited Edition Designer sofa - Introduce significant latency in one service (factory by default) - Add span events explaining the cause of the latency - Demonstrate how tail sampling captures high-latency traces You can change where the latency occurs by adding a query parameter: - http://localhost:8080/demo/latency?service=sofa-factory - http://localhost:8080/demo/latency?service=global-distribution - http://localhost:8080/demo/latency?service=local-distribution ## Background Trace Generation The demo automatically generates traces in the background to populate your trace data: - Successful delivery traces (70% of background traces) - Failure scenarios (15% of background traces) - Latency scenarios (15% of background traces) This helps ensure you have data to analyze without having to manually trigger scenarios. ## Viewing Traces 1. Open Grafana at http://localhost:3000 2. Navigate to Explore 3. Select Tempo as the data source 4. Click on the "Search" tab and select filters like: - `delivery.status = "failed"` to see failed deliveries - `sofa.model = "limited-edition"` to see traces for limited edition sofas - `customer.type = "vip"` to see VIP customer orders - `background = true` to see background-generated traces - `scenario = "delivery-failure"` to see failure scenarios 5. Or explore the service graph by clicking the "Service Graph" tab ## Span Events Each span in the trace contains detailed events providing context about what's happening: - **Manufacturing**: Events for each assembly step like frame construction, spring installation, etc. - **Distribution**: Events for package preparation, routing, loading, etc. - **Delivery**: Events for delivery dispatched, delivered, etc. 
- **Failure**: Detailed information about what went wrong and where - **Latency**: Information about delays and their causes ## Tail Sampling Policies This demo configures Grafana Alloy with six tail sampling policies: 1. **Failed Delivery Policy**: Captures all traces with `delivery.status = "failed"` 2. **Error Policy**: Samples traces with errors 3. **Latency Policy**: Samples traces exceeding 5 seconds in duration 4. **VIP Customer Policy**: Samples all orders from VIP customers 5. **Limited Edition Policy**: Samples all orders for limited edition sofas 6. **Probabilistic Policy**: Samples 20% of all remaining traces These policies ensure important traces (errors, performance issues, VIP customers) are retained while still sampling a representative subset of normal traffic. ## Troubleshooting If you encounter issues: 1. **Missing services**: Ensure all containers are running with `docker compose ps` 2. **Network issues**: Check if services can communicate with each other 3. **Trace data missing**: Verify Alloy and Tempo are configured properly 4. **Service failures**: Check logs with `docker compose logs <service-name>` ## Customizing the Demo You can modify the demo in several ways: - Edit `app.py` to change service behavior, add new features, or adjust timing - Modify `config.alloy` to change sampling policies or add new connectors - Edit failure and latency probabilities in the script to increase/decrease error rates - Add new sofa models or customer types to expand the demo ## Learning from the Demo This demo helps understand: 1. How distributed tracing works across multiple services 2. How trace context is propagated through HTTP requests 3. How nested spans create a hierarchical view of operations 4. How span events provide detailed context about operations 5. How to use tail sampling to focus on important traces 6. How to troubleshoot errors and performance issues using traces 7.
How service graphs visualize the relationships between services ================================================ FILE: trace-delivery/app/Dockerfile ================================================ ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 FROM python:${PYTHON_VERSION} WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY *.py ./ # We'll pass the service name as an environment variable ENV SERVICE_PORT=8080 ENV PYTHONUNBUFFERED=1 CMD ["python", "app.py"] ================================================ FILE: trace-delivery/app/app.py ================================================ import os import random import time import uuid import logging import threading from flask import Flask, request, jsonify import requests from opentelemetry import trace from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.flask import FlaskInstrumentor from opentelemetry.instrumentation.requests import RequestsInstrumentor from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Get environment variables service_name = os.environ.get('OTEL_SERVICE_NAME', 'unknown-service') service_port = int(os.environ.get('SERVICE_PORT', '8080')) # Configure the tracer resource = Resource.create() # Use OTEL_RESOURCE_ATTRIBUTES environment variable trace.set_tracer_provider(TracerProvider(resource=resource)) # Configure the OTLP exporter otlp_exporter = OTLPSpanExporter() span_processor = BatchSpanProcessor(span_exporter=otlp_exporter) trace.get_tracer_provider().add_span_processor(span_processor) # Create a 
tracer tracer = trace.get_tracer(__name__) # Create a propagator for handling trace context propagator = TraceContextTextMapPropagator() # Create Flask application app = Flask(__name__) FlaskInstrumentor().instrument_app(app) RequestsInstrumentor().instrument() # Furniture models available sofa_models = [ {"id": "classic-001", "name": "Classic Comfort", "price": 899.99, "production_time": 2}, {"id": "modern-002", "name": "Modern Minimalist", "price": 1299.99, "production_time": 3}, {"id": "luxury-003", "name": "Luxury Lounge", "price": 2499.99, "production_time": 5}, {"id": "sectional-004", "name": "Sectional Supreme", "price": 1899.99, "production_time": 4}, {"id": "limited-edition", "name": "Limited Edition Designer", "price": 4999.99, "production_time": 7} ] # Customer types customer_types = ["regular", "premium", "vip"] # Distribution centers distribution_centers = { "global": ["New York", "Shanghai", "Berlin", "Sydney"], "local": ["North District", "South District", "East District", "West District"] } # Simulated failures by service failure_scenarios = { "sofa-factory": {"probability": 0.2, "message": "Production line issue: Unable to complete sofa manufacturing"}, "global-distribution": {"probability": 0.15, "message": "Item lost in global distribution center"}, "local-distribution": {"probability": 0.1, "message": "Delivery vehicle breakdown"} } # Simulated latency scenarios latency_scenarios = { "sofa-factory": {"probability": 0.1, "min_delay": 5, "max_delay": 8, "message": "Production backlog causing delays"}, "global-distribution": {"probability": 0.1, "min_delay": 6, "max_delay": 10, "message": "Customs inspection delay"}, "local-distribution": {"probability": 0.1, "min_delay": 3, "max_delay": 7, "message": "Traffic congestion affecting local delivery"} } # Generate a unique order ID with a prefix def generate_order_id(): return f"ORD-{uuid.uuid4().hex[:8].upper()}" # Select a random item from a list def random_item(items): return random.choice(items) # 
def should_fail(service_name, order):
    """Decide whether ``service_name`` must simulate a failure for ``order``.

    A failure is simulated only when the order names this service in its
    ``failure_service`` field and is tagged either as a failure demo
    (``demo == "failure"``) or as a background failure scenario
    (``scenario == "delivery-failure"``). Orders without those tags never
    fail; no probability is consulted here.

    Args:
        service_name: Name of the service asking the question.
        order: Order payload (dict) received by that service.

    Returns:
        bool: True when the failure must be simulated.
    """
    if order.get("failure_service") != service_name:
        return False
    tagged_as_demo = order.get("demo") == "failure"
    tagged_as_scenario = order.get("scenario") == "delivery-failure"
    return tagged_as_demo or tagged_as_scenario


def maybe_add_latency(service_name, span):
    """Randomly inject the configured latency scenario for ``service_name``.

    When the service has an entry in ``latency_scenarios`` and a random draw
    falls below that entry's ``probability``, a delay is picked uniformly
    between ``min_delay`` and ``max_delay``, recorded on ``span`` as
    ``latency.seconds`` / ``latency.reason``, and then slept for.

    Returns:
        tuple: ``(True, delay, reason)`` when latency was injected,
        ``(False, None, None)`` otherwise.
    """
    config = latency_scenarios.get(service_name)
    # No random draw is made for services without a latency scenario.
    if config is None or random.random() >= config["probability"]:
        return (False, None, None)

    seconds = random.uniform(config["min_delay"], config["max_delay"])
    cause = config["message"]
    span.set_attribute("latency.seconds", seconds)
    span.set_attribute("latency.reason", cause)
    time.sleep(seconds)
    return (True, seconds, cause)


# SOFA SHOP SERVICE (entry point)
@app.route('/')
def home():
    """Serve the landing page for whichever service this process runs as.

    The sofa shop returns its welcome page; every other service returns a
    short page naming itself.
    """
    # NOTE(review): both page strings look like they lost their markup/links
    # in this copy of the file - confirm against the original before editing.
    if service_name != "sofa-shop":
        return f"\n\n{service_name} service\n\nThis service is part of the trace delivery demo.\n\n"
    return """

Sofa Shop - Trace Delivery Demo

Welcome to our sofa shop! Here you can order sofas and track their delivery through our system.

Endpoints:

Demo Scenarios:

"""


# CATALOG ENDPOINT - SHOP SERVICE
@app.route('/catalog')
def catalog():
    """Return the list of sofa models as JSON (sofa-shop only, 404 elsewhere)."""
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    with tracer.start_as_current_span("view-catalog") as catalog_span:
        catalog_span.set_attribute("action", "view-catalog")
        return jsonify({"sofas": sofa_models})


# ORDER ENDPOINT - SHOP SERVICE
@app.route('/order')
def place_order():
    """Create a random order and forward it to the factory (sofa-shop only).

    Picks a random sofa model and customer type, records them on a
    ``place-order`` span, and POSTs the order to the factory's
    ``/manufacture`` endpoint with the trace context injected into the
    request headers.

    Returns:
        A JSON confirmation on HTTP 200 from the factory; otherwise a JSON
        error with status 500. Returns 404 when called on another service.
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    with tracer.start_as_current_span("place-order") as order_span:
        # Draw order: id first, then sofa, then customer type.
        order_id = generate_order_id()
        sofa = random_item(sofa_models)
        customer_type = random_item(customer_types)

        span_attributes = {
            "order.id": order_id,
            "sofa.model": sofa["id"],
            "sofa.name": sofa["name"],
            "sofa.price": sofa["price"],
            "customer.type": customer_type,
            "action": "place-order",
        }
        for attr_name, attr_value in span_attributes.items():
            order_span.set_attribute(attr_name, attr_value)

        order = {
            "order_id": order_id,
            "sofa": sofa,
            "customer_type": customer_type,
            "timestamp": time.time()
        }

        logger.info(f"New order placed: {order_id} for {sofa['name']}")

        # Hand the order to the factory, propagating the current trace context.
        try:
            factory_url = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081')
            carrier = {}
            propagator.inject(carrier)
            factory_response = requests.post(
                f"{factory_url}/manufacture",
                json=order,
                headers=carrier
            )

            if factory_response.status_code != 200:
                order_span.set_status(trace.StatusCode.ERROR)
                return jsonify({"error": "Failed to process order at factory", "details": factory_response.text}), 500

            # The parsed body is not used, but parsing is kept so that an
            # unparsable reply still ends up in the except branch below.
            factory_response.json()
            return jsonify({
                "message": "Order placed successfully!",
                "order_id": order_id,
                "sofa": sofa["name"],
                "customer_type": customer_type,
                "status": "manufacturing"
            })
        except Exception as e:
            order_span.record_exception(e)
            order_span.set_status(trace.StatusCode.ERROR, str(e))
            return jsonify({"error": f"Failed to connect to factory: {str(e)}"}), 500

# ORDER STATUS ENDPOINT - SHOP SERVICE
@app.route('/order-status')
def check_order_status():
    """Return a (randomly chosen) status for an order (sofa-shop only).

    Query parameters:
        order_id: Required; a 400 response is returned when it is missing.

    Returns:
        JSON with ``order_id``, ``status`` and ``last_update``; 404 when
        called on a service other than the shop.
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    order_id = request.args.get('order_id')
    if not order_id:
        return jsonify({"error": "No order ID provided"}), 400

    with tracer.start_as_current_span("check-order-status") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("action", "check-order-status")

        # In a real system, we would look up the order status in a database
        # For this demo, we'll return a random status
        statuses = ["manufactured", "picked up", "in global distribution",
                    "in local distribution", "out for delivery", "delivered"]
        status = random_item(statuses)

        return jsonify({
            "order_id": order_id,
            "status": status,
            "last_update": time.time()
        })


# DELIVERY NOTIFICATION ENDPOINT - SHOP SERVICE
@app.route('/delivery-notification', methods=['POST'])
def delivery_notification():
    """Receive a delivery notification posted by a downstream service.

    Records the notification on a ``process-delivery-notification`` span,
    logs it, and acknowledges it. Only available on the sofa shop (404
    elsewhere).
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    notification = request.json
    order_id = notification.get("order_id")
    notification_type = notification.get("notification_type")
    # The dispatch notification built in local_deliver carries
    # "dispatch_time", not "delivery_time"; accept either so the log line
    # below does not print "at None".
    delivery_time = notification.get("delivery_time", notification.get("dispatch_time"))

    with tracer.start_as_current_span("process-delivery-notification") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("notification.type", notification_type)
        span.set_attribute("action", "process-notification")

        # Add a span event for processing the notification
        span.add_event("notification_received", {
            "order_id": order_id,
            "notification_type": notification_type,
            "timestamp": time.time()
        })

        # In a real app, we would update the order status in the database
        # For this demo, we'll just log it
        logger.info(f"Notification received: Order {order_id} has been {notification_type} at {delivery_time}")

        # Simulate update to database or other processing
        time.sleep(0.1)

        # Add span event for completing notification processing
        span.add_event("notification_processed", {
            "order_id": order_id,
            "success": True,
            "timestamp": time.time()
        })

        return jsonify({
            "status": "success",
            "message": f"Notification for order {order_id} processed successfully",
            "notification_type": notification_type
        })


# MANUFACTURE ENDPOINT - FACTORY SERVICE
@app.route('/manufacture', methods=['POST'])
def manufacture():
    """Manufacture the ordered sofa and hand it to global distribution.

    Runs on the sofa factory only (404 elsewhere). The handler may simulate
    a failure (see ``should_fail``) or latency, then emits nested spans for
    each assembly step, and finally POSTs the order to the global
    distribution ``/pickup`` endpoint with the trace context injected.

    Returns:
        JSON describing the next step on success; a JSON error with status
        500 on a simulated failure or when the downstream hand-off fails.
    """
    if service_name != "sofa-factory":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    order = request.json
    order_id = order.get("order_id")
    sofa = order.get("sofa", {})
    is_background = order.get("background", False)

    with tracer.start_as_current_span("manufacture-sofa") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa.get("id", "unknown"))
        span.set_attribute("sofa.name", sofa.get("name", "unknown"))
        span.set_attribute("action", "manufacture")
        span.set_attribute("background", is_background)

        # Add a span event for manufacture start
        span.add_event("manufacture_started", {
            "order_id": order_id,
            "timestamp": time.time(),
            "sofa_model": sofa.get("name", "unknown")
        })

        # Check for simulated failure
        if should_fail(service_name, order):
            error_message = failure_scenarios[service_name]["message"]
            logger.error(f"Manufacturing failure for order {order_id}: {error_message}")
            span.set_attribute("error", True)
            span.set_attribute("error.message", error_message)
            span.set_attribute("delivery.status", "failed")

            # Add span event for the failure
            span.add_event("manufacture_failed", {
                "error": error_message,
                "timestamp": time.time()
            })

            # Raise and catch a real exception so the recorded exception
            # carries a traceback in the trace.
            try:
                raise Exception(f"Manufacturing process failed: {error_message}")
            except Exception as e:
                span.record_exception(e)
                span.set_status(trace.StatusCode.ERROR, str(e))

            return jsonify({"error": error_message}), 500

        # Add latency if applicable
        latency_result = (False, None, None)
        if order.get("demo") == "latency" and order.get("latency_service") == "sofa-factory":
            # For demo, explicitly add latency
            delay = random.uniform(5, 8)
            reason = "Production backlog causing delays"
            span.set_attribute("latency.seconds", delay)
            span.set_attribute("latency.reason", reason)
            time.sleep(delay)
            latency_result = (True, delay, reason)
        else:
            # Check for random latency
            latency_result = maybe_add_latency(service_name, span)

        # If latency was added, record the event
        if latency_result[0]:
            delay = latency_result[1]
            reason = latency_result[2]
            span.add_event("manufacture_delayed", {
                "delay_seconds": delay,
                "reason": reason,
                "timestamp": time.time()
            })

        # Create nested spans for the assembly process
        # 1. Frame construction
        with tracer.start_as_current_span("frame-construction") as frame_span:
            frame_span.set_attribute("order.id", order_id)
            frame_span.set_attribute("assembly.step", "frame")
            frame_span.set_attribute("material", "hardwood")
            # Simulate work
            time.sleep(0.2)
            frame_span.add_event("frame_completed", {
                "timestamp": time.time(),
                "quality_check": "passed"
            })

        # 2. Spring installation
        with tracer.start_as_current_span("spring-installation") as spring_span:
            spring_span.set_attribute("order.id", order_id)
            spring_span.set_attribute("assembly.step", "springs")
            spring_span.set_attribute("spring.count", 24)
            # Simulate work
            time.sleep(0.15)
            spring_span.add_event("springs_installed", {
                "timestamp": time.time(),
                "tension_test": "passed"
            })

        # 3. Cushion preparation
        with tracer.start_as_current_span("cushion-preparation") as cushion_span:
            cushion_span.set_attribute("order.id", order_id)
            cushion_span.set_attribute("assembly.step", "cushions")

            # Sub-step: foam cutting
            with tracer.start_as_current_span("foam-cutting") as foam_span:
                foam_span.set_attribute("material", "memory foam")
                foam_span.set_attribute("density", "high")
                time.sleep(0.1)

            # Sub-step: fabric cutting
            with tracer.start_as_current_span("fabric-cutting") as fabric_span:
                fabric_span.set_attribute("material", "premium leather" if sofa.get("id") == "luxury-003" else "fabric")
                time.sleep(0.1)

            # Sub-step: cushion assembly
            with tracer.start_as_current_span("cushion-assembly") as assembly_span:
                assembly_span.set_attribute("components", "foam + fabric + zippers")
                time.sleep(0.15)

            cushion_span.add_event("cushions_completed", {
                "timestamp": time.time()
            })

        # 4. Final assembly
        with tracer.start_as_current_span("final-assembly") as final_span:
            final_span.set_attribute("order.id", order_id)
            final_span.set_attribute("assembly.step", "final")
            # Simulate work
            time.sleep(0.25)
            final_span.add_event("assembly_completed", {
                "timestamp": time.time(),
                "inspector": f"Inspector #{random.randint(1, 10)}"
            })

        # Simulate manufacturing time (in addition to the assembly steps)
        production_time = sofa.get("production_time", 3)
        time.sleep(production_time / 20)  # Scale down for demo purposes

        # Add event for manufacturing completion
        span.add_event("manufacture_completed", {
            "order_id": order_id,
            "timestamp": time.time(),
            "quality_check": "passed",
            "inspector_id": f"QA-{random.randint(100, 999)}"
        })

        logger.info(f"Completed manufacturing for order {order_id}")

        # Request pickup from global distribution
        try:
            distribution_url = os.environ.get('SERVICE_DISTRIBUTION_URL', 'http://global-distribution:8082')
            headers = {}
            propagator.inject(headers)
            response = requests.post(
                f"{distribution_url}/pickup",
                json=order,
                headers=headers
            )

            if response.status_code == 200:
                return jsonify({
                    "order_id": order_id,
                    "status": "manufactured",
                    "next_step": "global distribution"
                })
            else:
                error_message = f"Global distribution pickup failed: {response.text}"
                span.set_status(trace.StatusCode.ERROR)
                span.set_attribute("delivery.status", "failed")
                return jsonify({"error": error_message}), 500
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            span.set_attribute("delivery.status", "failed")
            return jsonify({"error": f"Failed to connect to global distribution: {str(e)}"}), 500


# PICKUP ENDPOINT - GLOBAL DISTRIBUTION SERVICE
@app.route('/pickup', methods=['POST'])
def global_pickup():
    """Process an order at a global distribution center and forward it.

    Runs on the global-distribution service only (404 elsewhere). May
    simulate a failure or latency, emits nested spans for inventory and
    logistics, then POSTs the order to the local distribution ``/deliver``
    endpoint with the trace context injected.

    Returns:
        JSON describing the next step on success; a JSON error with status
        500 on a simulated failure or when the downstream hand-off fails.
    """
    if service_name != "global-distribution":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    order = request.json
    order_id = order.get("order_id")
    sofa = order.get("sofa", {})

    with tracer.start_as_current_span("global-distribution-pickup") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa.get("id", "unknown"))
        distribution_center = random_item(distribution_centers["global"])
        span.set_attribute("distribution.center", distribution_center)
        span.set_attribute("action", "global-pickup")

        # Add event for starting the pickup process
        span.add_event("global_pickup_started", {
            "order_id": order_id,
            "distribution_center": distribution_center,
            "timestamp": time.time()
        })

        # Check for simulated failure
        if should_fail(service_name, order):
            error_message = failure_scenarios[service_name]["message"]
            logger.error(f"Global distribution failure for order {order_id}: {error_message}")
            span.set_attribute("error", True)
            span.set_attribute("error.message", error_message)
            span.set_attribute("delivery.status", "failed")

            # Add event for the failure
            span.add_event("global_pickup_failed", {
                "error": error_message,
                "timestamp": time.time()
            })

            # Raise and catch a real exception so the recorded exception
            # carries a traceback in the trace.
            try:
                raise Exception(f"Global distribution failed: {error_message}")
            except Exception as e:
                span.record_exception(e)
                span.set_status(trace.StatusCode.ERROR, str(e))

            return jsonify({"error": error_message}), 500

        # Add latency if applicable
        latency_result = (False, None, None)
        if order.get("demo") == "latency" and order.get("latency_service") == "global-distribution":
            # For demo, explicitly add latency
            delay = random.uniform(6, 10)
            reason = "Customs inspection delay"
            span.set_attribute("latency.seconds", delay)
            span.set_attribute("latency.reason", reason)
            time.sleep(delay)
            latency_result = (True, delay, reason)
        else:
            # Check for random latency
            latency_result = maybe_add_latency(service_name, span)

        # If latency was added, record the event
        if latency_result[0]:
            delay = latency_result[1]
            reason = latency_result[2]
            span.add_event("global_pickup_delayed", {
                "delay_seconds": delay,
                "reason": reason,
                "timestamp": time.time()
            })

        # Create nested spans for logistics operations
        with tracer.start_as_current_span("inventory-processing") as inventory_span:
            inventory_span.set_attribute("order.id", order_id)
            inventory_span.set_attribute("operation", "inventory")
            inventory_span.set_attribute("location", distribution_center)
            # Simulate inventory processing
            time.sleep(0.1)
            inventory_span.add_event("inventory_processed", {
                "warehouse": f"{distribution_center}-{random.randint(1, 5)}",
                "timestamp": time.time()
            })

        with tracer.start_as_current_span("global-logistics") as logistics_span:
            logistics_span.set_attribute("order.id", order_id)
            logistics_span.set_attribute("operation", "logistics")
            # Simulate logistics processing
            time.sleep(0.2)
            # Select random transport type
            transport = random.choice(["air", "sea", "road", "rail"])
            logistics_span.set_attribute("transport.type", transport)
            logistics_span.add_event("transport_arranged", {
                "type": transport,
                "carrier": f"Carrier-{random.randint(100, 999)}",
                "timestamp": time.time()
            })

        # Simulate processing time
        time.sleep(0.3)

        # Add event for successful pickup
        span.add_event("global_pickup_completed", {
            "order_id": order_id,
            "distribution_center": distribution_center,
            "timestamp": time.time()
        })

        logger.info(f"Global distribution processed order {order_id}")

        # Forward to local distribution
        try:
            local_url = os.environ.get('SERVICE_LOCAL_URL', 'http://local-distribution:8083')
            headers = {}
            propagator.inject(headers)
            response = requests.post(
                f"{local_url}/deliver",
                json=order,
                headers=headers
            )

            if response.status_code == 200:
                return jsonify({
                    "order_id": order_id,
                    "status": "in global distribution",
                    "next_step": "local distribution"
                })
            else:
                error_message = f"Local distribution handoff failed: {response.text}"
                span.set_status(trace.StatusCode.ERROR)
                span.set_attribute("delivery.status", "failed")
                return jsonify({"error": error_message}), 500
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            span.set_attribute("delivery.status", "failed")
            return jsonify({"error": f"Failed to connect to local distribution: {str(e)}"}), 500


# DELIVER ENDPOINT - LOCAL DISTRIBUTION SERVICE
@app.route('/deliver', methods=['POST'])
def local_deliver():
    """Prepare local delivery, notify the shop, and deliver to the customer.

    Runs on the local-distribution service only (404 elsewhere). May
    simulate a failure or latency, emits nested spans for packaging and
    route planning, sends a best-effort dispatch notification to the shop,
    and finally POSTs the order to the customer house ``/receive`` endpoint.

    The shop base URL is read from ``SERVICE_SHOP_URL`` (default
    ``http://sofa-shop:8080``), like the other downstream URLs in this file.

    Returns:
        JSON describing the next step on success; a JSON error with status
        500 on a simulated failure or when the customer hand-off fails.
    """
    if service_name != "local-distribution":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    order = request.json
    order_id = order.get("order_id")
    sofa = order.get("sofa", {})

    with tracer.start_as_current_span("local-distribution-delivery") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa.get("id", "unknown"))
        distribution_center = random_item(distribution_centers["local"])
        span.set_attribute("distribution.center", distribution_center)
        span.set_attribute("action", "local-delivery")

        # Add event for starting local delivery
        span.add_event("local_delivery_started", {
            "order_id": order_id,
            "distribution_center": distribution_center,
            "timestamp": time.time()
        })

        # Check for simulated failure
        if should_fail(service_name, order):
            error_message = failure_scenarios[service_name]["message"]
            logger.error(f"Local distribution failure for order {order_id}: {error_message}")
            span.set_attribute("error", True)
            span.set_attribute("error.message", error_message)
            span.set_attribute("delivery.status", "failed")

            # Add event for the failure
            span.add_event("local_delivery_failed", {
                "error": error_message,
                "timestamp": time.time()
            })

            # Raise and catch a real exception so the recorded exception
            # carries a traceback in the trace.
            try:
                raise Exception(f"Local delivery failed: {error_message}")
            except Exception as e:
                span.record_exception(e)
                span.set_status(trace.StatusCode.ERROR, str(e))

            return jsonify({"error": error_message}), 500

        # Add latency if applicable
        latency_result = (False, None, None)
        if order.get("demo") == "latency" and order.get("latency_service") == "local-distribution":
            # For demo, explicitly add latency
            delay = random.uniform(3, 7)
            reason = "Traffic congestion affecting local delivery"
            span.set_attribute("latency.seconds", delay)
            span.set_attribute("latency.reason", reason)
            time.sleep(delay)
            latency_result = (True, delay, reason)
        else:
            # Check for random latency
            latency_result = maybe_add_latency(service_name, span)

        # If latency was added, record the event
        if latency_result[0]:
            delay = latency_result[1]
            reason = latency_result[2]
            span.add_event("local_delivery_delayed", {
                "delay_seconds": delay,
                "reason": reason,
                "timestamp": time.time()
            })

        # Create nested spans for local delivery operations
        with tracer.start_as_current_span("package-preparation") as prep_span:
            prep_span.set_attribute("order.id", order_id)
            prep_span.set_attribute("operation", "package-prep")
            # Simulate packaging operations
            time.sleep(0.15)
            prep_span.add_event("package_prepared", {
                "packaging_type": "heavy-duty",
                "timestamp": time.time()
            })

        with tracer.start_as_current_span("delivery-route-planning") as route_span:
            route_span.set_attribute("order.id", order_id)
            route_span.set_attribute("operation", "route-planning")
            # Simulate route planning
            time.sleep(0.15)
            # Pick random delivery details (reused in the shop notification below)
            vehicle = random.choice(["van", "truck", "specialized transport"])
            route_span.set_attribute("delivery.vehicle", vehicle)
            driver = f"Driver-{random.randint(100, 999)}"
            route_span.set_attribute("delivery.driver", driver)
            route_span.add_event("route_planned", {
                "vehicle": vehicle,
                "driver": driver,
                "estimated_arrival": time.time() + 3600,  # 1 hour from now
                "timestamp": time.time()
            })

        # Simulate processing time
        time.sleep(0.4)

        # Add event for successfully loaded for delivery
        span.add_event("local_delivery_loaded", {
            "order_id": order_id,
            "distribution_center": distribution_center,
            "timestamp": time.time()
        })

        logger.info(f"Local distribution processed order {order_id}")

        # Notify the shop that the order has been dispatched for delivery
        with tracer.start_as_current_span("notify-shop-delivery-dispatched") as notify_span:
            notify_span.set_attribute("order.id", order_id)
            notify_span.set_attribute("action", "notify-shop")

            # Create the notification
            notification = {
                "order_id": order_id,
                "sofa": sofa,
                "customer_type": order.get("customer_type", "regular"),
                "dispatch_time": time.time(),
                "notification_type": "delivery_dispatched",
                "vehicle": vehicle,
                "driver": driver,
                "distribution_center": distribution_center
            }

            # Send notification to shop; the base URL is configurable like
            # the other service URLs and defaults to the previous value.
            shop_base_url = os.environ.get('SERVICE_SHOP_URL', 'http://sofa-shop:8080')
            shop_url = f"{shop_base_url}/delivery-notification"
            headers = {}
            propagator.inject(headers)

            notify_span.add_event("sending_notification", {
                "target": "sofa-shop",
                "notification_type": "delivery_dispatched",
                "timestamp": time.time()
            })

            # Try to send the notification - don't fail the whole delivery if this fails
            try:
                requests.post(
                    shop_url,
                    json=notification,
                    headers=headers,
                    timeout=1  # Short timeout so we don't block if shop is down
                )
                notify_span.add_event("notification_sent", {
                    "success": True,
                    "timestamp": time.time()
                })
            except Exception as notify_err:
                logger.warning(f"Failed to notify shop of dispatch: {str(notify_err)}")
                notify_span.record_exception(notify_err)
                notify_span.set_status(trace.StatusCode.ERROR, str(notify_err))
                notify_span.add_event("notification_failed", {
                    "success": False,
                    "error": str(notify_err),
                    "timestamp": time.time()
                })

        # Deliver to customer
        try:
            customer_url = os.environ.get('SERVICE_CUSTOMER_URL', 'http://customer-house:8084')
            headers = {}
            propagator.inject(headers)
            response = requests.post(
                f"{customer_url}/receive",
                json=order,
                headers=headers
            )

            if response.status_code == 200:
                span.add_event("local_delivery_completed", {
                    "order_id": order_id,
                    "timestamp": time.time()
                })
                return jsonify({
                    "order_id": order_id,
                    "status": "out for delivery",
                    "next_step": "customer delivery"
                })
            else:
                error_message = f"Customer delivery failed: {response.text}"
                span.set_status(trace.StatusCode.ERROR)
                span.set_attribute("delivery.status", "failed")
                return jsonify({"error": error_message}), 500
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            span.set_attribute("delivery.status", "failed")
            return jsonify({"error": f"Failed to connect to customer house: {str(e)}"}), 500


# RECEIVE ENDPOINT - CUSTOMER HOUSE SERVICE
@app.route('/receive', methods=['POST'])
def customer_receive():
    """Accept the delivered sofa at the customer house (404 elsewhere).

    Marks the span with ``delivery.status = "delivered"`` and returns a
    JSON receipt that includes a random satisfaction score (85-100 for VIP
    customers, 70-95 otherwise).
    """
    if service_name != "customer-house":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    order = request.json
    order_id = order.get("order_id")
    sofa = order.get("sofa", {})
    customer_type = order.get("customer_type", "regular")

    with tracer.start_as_current_span("customer-house-receive") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa.get("id", "unknown"))
        span.set_attribute("customer.type", customer_type)
        span.set_attribute("action", "customer-receive")
        span.set_attribute("delivery.status", "delivered")

        # Add span event for delivery
        span.add_event("sofa_delivered", {
            "order_id": order_id,
            "timestamp": time.time(),
            "customer_type": customer_type
        })

        # Simulate final delivery
        time.sleep(0.2)

        logger.info(f"Order {order_id} successfully delivered to customer")

        # Generate customer satisfaction score - VIP customers are generally more satisfied
        satisfaction = random.randint(85, 100) if customer_type == "vip" else random.randint(70, 95)

        return jsonify({
            "order_id": order_id,
            "status": "delivered",
            "delivery_time": time.time(),
            "satisfaction": satisfaction
        })


# DEMO ENDPOINTS - SHOP SERVICE
@app.route('/demo/success')
def demo_success():
    """Start a demo order that is not tagged for failure or latency.

    Sofa-shop only (404 elsewhere). Uses the first sofa model and a
    ``regular`` customer, and forwards the order to the factory.
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    # NOTE(review): this only changes the shop process's own environment and
    # nothing in the visible code reads it back; kept for compatibility.
    os.environ["FORCE_SUCCESS"] = "true"

    with tracer.start_as_current_span("demo-success-flow") as span:
        # Use a predefined sofa for the demo
        order_id = generate_order_id()
        sofa = sofa_models[0]  # Classic sofa
        customer_type = "regular"

        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa["id"])
        span.set_attribute("sofa.name", sofa["name"])
        span.set_attribute("customer.type", customer_type)
        span.set_attribute("demo", "success-flow")

        # Create order
        order = {
            "order_id": order_id,
            "sofa": sofa,
            "customer_type": customer_type,
            "timestamp": time.time(),
            "demo": "success"
        }

        logger.info(f"Demo success flow initiated: {order_id}")

        # Forward to factory for manufacturing
        try:
            factory_url = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081')
            headers = {}
            propagator.inject(headers)
            response = requests.post(
                f"{factory_url}/manufacture",
                json=order,
                headers=headers
            )

            if response.status_code == 200:
                # NOTE(review): trace_id is emitted as the raw integer from
                # the span context; consider a 32-char hex string if it is
                # meant to be pasted into a trace search - confirm with users.
                return jsonify({
                    "message": "Success demo initiated!",
                    "order_id": order_id,
                    "sofa": sofa["name"],
                    "trace_id": span.get_span_context().trace_id
                })
            else:
                return jsonify({"error": "Demo failed to start", "details": response.text}), 500
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return jsonify({"error": f"Demo failed to start: {str(e)}"}), 500


@app.route('/demo/failure')
def demo_failure_endpoint():
    """HTTP entry point for the failure demo; delegates to ``demo_failure``."""
    return demo_failure()


def demo_failure(failure_service=None, is_background=False):
    """Start an order that is tagged to fail in a chosen service.

    Args:
        failure_service: Service that should fail. When omitted, the
            ``service`` query parameter of the current request is used,
            defaulting to ``sofa-factory``.
        is_background: True when called by the background generator rather
            than from an HTTP request; in that case nothing is returned.

    Returns:
        A Flask JSON response for HTTP callers, or ``None`` for background
        callers.
    """
    if service_name != "sofa-shop":
        if is_background:
            # jsonify needs a Flask application context, which a background
            # caller may not have; background callers ignore the result anyway.
            return None
        return jsonify({"error": f"Not available in {service_name}"}), 404

    # Resolve the target once and keep it in a local. Reading it back from
    # os.environ (process-global) lets a concurrent request or the background
    # generator change it mid-flow, so the span attribute, the order payload
    # and the response could disagree.
    os.environ["FORCE_FAILURE"] = "true"
    target_service = failure_service or request.args.get('service', 'sofa-factory')
    # Still exported for compatibility with anything that reads it.
    os.environ["FAILURE_SERVICE"] = target_service

    with tracer.start_as_current_span("background-failure-scenario" if is_background else "demo-failure-flow") as span:
        # Use a predefined sofa for the demo
        order_id = generate_order_id()
        sofa = sofa_models[2]  # Luxury sofa
        customer_type = "premium"

        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa["id"])
        span.set_attribute("sofa.name", sofa["name"])
        span.set_attribute("customer.type", customer_type)
        span.set_attribute("demo", "failure-flow")
        span.set_attribute("background", is_background)
        span.set_attribute("scenario", "delivery-failure")
        span.set_attribute("failure_service", target_service)

        # Create order
        order = {
            "order_id": order_id,
            "sofa": sofa,
            "customer_type": customer_type,
            "timestamp": time.time(),
            "demo": "failure",
            "background": is_background,
            "scenario": "delivery-failure",
            "failure_service": target_service
        }

        logger.info(f"{'Background' if is_background else 'Demo'} failure flow initiated: {order_id} (failure in {target_service})")

        # Forward to factory for manufacturing
        try:
            factory_url = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081')
            headers = {}
            propagator.inject(headers)
            # The response is intentionally not inspected: a failing
            # downstream chain is the expected outcome of this demo.
            requests.post(
                f"{factory_url}/manufacture",
                json=order,
                headers=headers
            )

            if is_background:
                return None
            else:
                return jsonify({
                    "message": "Failure demo initiated!",
                    "order_id": order_id,
                    "sofa": sofa["name"],
                    "failure_service": target_service,
                    "trace_id": span.get_span_context().trace_id
                })
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            if is_background:
                logger.error(f"Background demo failed to start: {str(e)}")
                return None
            else:
                return jsonify({"error": f"Demo failed to start: {str(e)}"}), 500
@app.route('/demo/latency') def demo_latency_endpoint(): return demo_latency() def demo_latency(latency_service=None, is_background=False): if service_name != "sofa-shop": return jsonify({"error": f"Not available in {service_name}"}), 404 # Set environment variables for this specific demo os.environ["FORCE_LATENCY"] = "true" os.environ["LATENCY_SERVICE"] = latency_service or request.args.get('service', 'sofa-factory') with tracer.start_as_current_span("background-latency-scenario" if is_background else "demo-latency-flow") as span: # Use a predefined sofa for the demo order_id = generate_order_id() sofa = sofa_models[4] # Limited edition customer_type = "vip" span.set_attribute("order.id", order_id) span.set_attribute("sofa.model", sofa["id"]) span.set_attribute("sofa.name", sofa["name"]) span.set_attribute("customer.type", customer_type) span.set_attribute("demo", "latency-flow") span.set_attribute("background", is_background) span.set_attribute("scenario", "delivery-latency") span.set_attribute("latency_service", os.environ["LATENCY_SERVICE"]) # Create order order = { "order_id": order_id, "sofa": sofa, "customer_type": customer_type, "timestamp": time.time(), "demo": "latency", "background": is_background, "scenario": "delivery-latency", "latency_service": os.environ["LATENCY_SERVICE"] } logger.info(f"{'Background' if is_background else 'Demo'} latency flow initiated: {order_id} (latency in {os.environ['LATENCY_SERVICE']})") # Forward to factory for manufacturing try: factory_url = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081') headers = {} propagator.inject(headers) response = requests.post( f"{factory_url}/manufacture", json=order, headers=headers ) if is_background: return None else: return jsonify({ "message": "Latency demo initiated!", "order_id": order_id, "sofa": sofa["name"], "latency_service": os.environ["LATENCY_SERVICE"], "trace_id": span.get_span_context().trace_id }) except Exception as e: span.record_exception(e) 
span.set_status(trace.StatusCode.ERROR, str(e)) if is_background: logger.error(f"Background demo failed to start: {str(e)}") return None else: return jsonify({"error": f"Demo failed to start: {str(e)}"}), 500 # Background trace generation functions def generate_random_trace(): """Generate a random trace in the background""" if service_name != "sofa-shop": return # Only the shop should generate random traces # Randomly choose between normal order, error scenario, or latency scenario scenario_type = random.choices( ["normal", "error", "latency"], weights=[0.7, 0.15, 0.15], k=1 )[0] try: if scenario_type == "normal": # Normal order flow order_id = generate_order_id() sofa = random_item(sofa_models) customer_type = random_item(customer_types) with tracer.start_as_current_span("background-successful-order") as span: span.set_attribute("order.id", order_id) span.set_attribute("sofa.model", sofa["id"]) span.set_attribute("sofa.name", sofa["name"]) span.set_attribute("sofa.price", sofa["price"]) span.set_attribute("customer.type", customer_type) span.set_attribute("action", "place-order") span.set_attribute("background", True) span.set_attribute("scenario", "successful-delivery") # Add a span event for order creation span.add_event("order_created", { "order_id": order_id, "timestamp": time.time(), "customer_type": customer_type, "scenario": "successful-delivery" }) # Create order order = { "order_id": order_id, "sofa": sofa, "customer_type": customer_type, "timestamp": time.time(), "background": True, "scenario": "successful-delivery" } logger.info(f"Background successful order placed: {order_id} for {sofa['name']}") # Forward to factory for manufacturing factory_url = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081') headers = {} propagator.inject(headers) requests.post( f"{factory_url}/manufacture", json=order, headers=headers ) elif scenario_type == "error": # Error scenario failure_service = random.choice(list(failure_scenarios.keys())) 
demo_failure(failure_service=failure_service, is_background=True) elif scenario_type == "latency": # Latency scenario latency_service = random.choice(list(latency_scenarios.keys())) demo_latency(latency_service=latency_service, is_background=True) except Exception as e: logger.error(f"Error generating background trace: {str(e)}") def trace_generator_thread(): """Background thread that generates traces at regular intervals""" while True: try: # Only generate random traces if we're the sofa-shop service if service_name == "sofa-shop": generate_random_trace() # Wait between 20-60 seconds before generating the next trace delay = random.uniform(10, 20) logger.info(f"Next background trace in {delay:.2f} seconds") time.sleep(delay) except Exception as e: logger.error(f"Error in trace generation thread: {e}") time.sleep(10) # Wait before retrying if __name__ == '__main__': logger.info(f"Starting {service_name} service on port {service_port}") # Start the background trace generator thread (only for sofa-shop) if service_name == "sofa-shop": trace_thread = threading.Thread(target=trace_generator_thread, daemon=True) trace_thread.start() logger.info("Started background trace generator") app.run(host='0.0.0.0', port=service_port) ================================================ FILE: trace-delivery/app/requirements.txt ================================================ flask requests opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests ================================================ FILE: trace-delivery/config-otel.yaml ================================================ # # OTel Collector YAML Configuration for Sofa Delivery Trace Demo # # This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine. 
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # receivers: otlp: protocols: grpc: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 processors: batch: {} exporters: otlp/tempo: endpoint: tempo:4317 tls: insecure: true service: pipelines: traces: receivers: [otlp] processors: [batch] exporters: [otlp/tempo] ================================================ FILE: trace-delivery/config.alloy ================================================ /* * Alloy Configuration for Sofa Delivery Trace Demo */ // Receive OpenTelemetry traces otelcol.receiver.otlp "default" { http {} grpc {} output { traces = [otelcol.processor.batch.default.input] } } // Batch processor to improve performance otelcol.processor.batch "default" { output { traces = [otelcol.exporter.otlp.tempo.input] } } // Send traces to Tempo otelcol.exporter.otlp "tempo" { client { endpoint = "tempo:4317" tls { insecure = true } } } livedebugging { enabled = true } ================================================ FILE: trace-delivery/docker-compose-otel.yml ================================================ # OTel Engine Override # # Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config # instead of the River/HCL config.alloy file. 
# # Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d # Stop: docker compose -f docker-compose.yml -f docker-compose-otel.yml down # services: alloy: command: otel --config=/etc/alloy/config-otel.yaml volumes: - ./config-otel.yaml:/etc/alloy/config-otel.yaml ports: - 8888:8888 # OTel Engine HTTP server ================================================ FILE: trace-delivery/docker-compose.coda.yml ================================================ services: # Sofa Shop Service sofa-shop: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=sofa-shop - OTEL_RESOURCE_ATTRIBUTES=service.name=sofa-shop,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_FACTORY_URL=http://sofa-factory:8081 depends_on: - sofa-factory restart: on-failure # Sofa Factory Service sofa-factory: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8081:8081 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=sofa-factory - OTEL_RESOURCE_ATTRIBUTES=service.name=sofa-factory,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_PORT=8081 - SERVICE_DISTRIBUTION_URL=http://global-distribution:8082 depends_on: - global-distribution restart: on-failure # Global Distribution Service global-distribution: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8082:8082 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=global-distribution - OTEL_RESOURCE_ATTRIBUTES=service.name=global-distribution,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_PORT=8082 - SERVICE_LOCAL_URL=http://local-distribution:8083 depends_on: - local-distribution restart: on-failure # Local Distribution Service local-distribution: build: context: ./app 
dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8083:8083 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=local-distribution - OTEL_RESOURCE_ATTRIBUTES=service.name=local-distribution,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_PORT=8083 - SERVICE_CUSTOMER_URL=http://customer-house:8084 depends_on: - customer-house restart: on-failure # Customer House Service customer-house: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8084:8084 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=customer-house - OTEL_RESOURCE_ATTRIBUTES=service.name=customer-house,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_PORT=8084 restart: on-failure ================================================ FILE: trace-delivery/docker-compose.yml ================================================ version: '3.8' services: # Prometheus for metrics collection prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} command: - --web.enable-remote-write-receiver - --web.enable-otlp-receiver - --enable-feature=native-histograms - --enable-feature=exemplar-storage - --config.file=/etc/prometheus/prometheus.yml ports: - 9090:9090/tcp volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml memcached: image: memcached:1.6.40@sha256:572b011ce33954ee809066d8cecbeb3ec98912109ee3be3663a3197425fd81ac container_name: memcached ports: - "11211:11211" environment: - MEMCACHED_MAX_MEMORY=64m # Set the maximum memory usage - MEMCACHED_THREADS=4 # Number of threads to use # Tempo for tracing tempo: image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4} command: ["-config.file=/etc/tempo.yaml"] ports: - 3200:3200/tcp # tempo volumes: - ./tempo-config.yaml:/etc/tempo.yaml depends_on: - prometheus - memcached # Grafana for visualization grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} environment: - 
GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app ports: - 3000:3000/tcp entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: false version: 1 editable: false - name: Tempo type: tempo access: proxy orgId: 1 url: http://tempo:3200 basicAuth: false isDefault: true version: 1 editable: false jsonData: serviceMap: datasourceUid: 'Prometheus' nodeGraph: enabled: true EOF /run.sh depends_on: - prometheus - tempo # Alloy for telemetry pipeline alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} ports: - 12345:12345 # Alloy HTTP server - 4317:4317/tcp # OTLP gRPC (used by our services) - 4318:4318/tcp # OTLP HTTP (used by our services) volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: - tempo - prometheus # Sofa Shop Service sofa-shop: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8080:8080 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=sofa-shop - OTEL_RESOURCE_ATTRIBUTES=service.name=sofa-shop,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_FACTORY_URL=http://sofa-factory:8081 depends_on: - alloy - sofa-factory restart: on-failure # Sofa Factory Service sofa-factory: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8081:8081 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=sofa-factory - 
OTEL_RESOURCE_ATTRIBUTES=service.name=sofa-factory,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_PORT=8081 - SERVICE_DISTRIBUTION_URL=http://global-distribution:8082 depends_on: - alloy - global-distribution restart: on-failure # Global Distribution Service global-distribution: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8082:8082 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=global-distribution - OTEL_RESOURCE_ATTRIBUTES=service.name=global-distribution,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_PORT=8082 - SERVICE_LOCAL_URL=http://local-distribution:8083 depends_on: - alloy - local-distribution restart: on-failure # Local Distribution Service local-distribution: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8083:8083 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=local-distribution - OTEL_RESOURCE_ATTRIBUTES=service.name=local-distribution,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_PORT=8083 - SERVICE_CUSTOMER_URL=http://customer-house:8084 depends_on: - alloy - customer-house restart: on-failure # Customer House Service customer-house: build: context: ./app dockerfile: Dockerfile args: - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim} ports: - 8084:8084 environment: - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317 - OTEL_SERVICE_NAME=customer-house - OTEL_RESOURCE_ATTRIBUTES=service.name=customer-house,service.version=1.0.0,deployment.environment=delivery-demo - SERVICE_PORT=8084 depends_on: - alloy restart: on-failure ================================================ FILE: trace-delivery/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s otlp: # Recommended attributes to be promoted to labels. 
promote_resource_attributes: - service.name - service.namespace - service.version - deployment.environment storage: tsdb: out_of_order_time_window: 30m ================================================ FILE: trace-delivery/tempo-config.yaml ================================================ stream_over_http_enabled: true server: http_listen_port: 3200 log_level: info cache: background: writeback_goroutines: 5 caches: - roles: - frontend-search memcached: addresses: dns+memcached:11211 query_frontend: search: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 metadata_slo: duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 trace_by_id: duration_slo: 100ms metrics: max_duration: 200h # maximum duration of a metrics query, increase for local setups query_backend_after: 5m duration_slo: 5s throughput_bytes_slo: 1.073741824e+09 distributor: receivers: # this configuration will listen on all ports and protocols that tempo is capable of. jaeger: # the receives all come from the OpenTelemetry collector. more configuration information can protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver thrift_http: # endpoint: "tempo:14268" # for a production deployment you should only enable the receivers you need! grpc: endpoint: "tempo:14250" thrift_binary: endpoint: "tempo:6832" thrift_compact: endpoint: "tempo:6831" zipkin: endpoint: "tempo:9411" otlp: protocols: grpc: endpoint: "tempo:4317" http: endpoint: "tempo:4318" opencensus: endpoint: "tempo:55678" ingester: max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally compactor: compaction: block_retention: 720h # overall Tempo trace retention. 
set for demo purposes metrics_generator: registry: external_labels: source: tempo cluster: docker-compose storage: path: /var/tempo/generator/wal remote_write: - url: http://prometheus:9090/api/v1/write send_exemplars: true traces_storage: path: /var/tempo/generator/traces processor: local_blocks: filter_server_spans: false flush_to_storage: true storage: trace: backend: local # backend configuration to use wal: path: /var/tempo/wal # where to store the wal locally local: path: /var/tempo/blocks overrides: defaults: metrics_generator: processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator generate_native_histograms: both ================================================ FILE: vault-secrets/README.md ================================================ # Vault secrets with Grafana Alloy Demonstrates Alloy's [`remote.vault`](https://grafana.com/docs/alloy/latest/reference/components/remote/remote.vault/) component pulling `prometheus.remote_write` basic_auth credentials from HashiCorp Vault at runtime, and shows that rotating the Vault secret is picked up without restarting Alloy. ## Overview | Service | Role | | ------------- | ------------------------------------------------------------------- | | `vault` | HashiCorp Vault in dev mode. Boots, then seeds `secret/alloy/remote-write` from its entrypoint before unsealing the healthcheck. | | `nginx-auth` | Basic-auth reverse proxy in front of Prometheus's remote-write API. | | `prometheus` | Receives remote-writes from Alloy. | | `grafana` | Pre-provisioned with Prometheus as the default datasource. | | `alloy` | Scrapes its own `/metrics` and remote-writes via `nginx-auth`, with `basic_auth` credentials sourced from Vault. 
| ``` ┌─────────────┐ reread 30s │ │ ┌──── remote.vault ◀──────────│ Vault │ │ (auth.token) │ │ ▼ └─────────────┘ ┌────────┐ ▲ │ Alloy │ scrape self → remote_write │ vault kv put └────────┘ (basic_auth from Vault) │ via rotate.sh │ │ ▼ │ ┌─────────────────┐ updated htpasswd │ │ nginx-auth │◀────────────────────────┘ │ (basic_auth) │ via rotate.sh └─────────────────┘ │ ▼ ┌────────────┐ │ Prometheus │ └────────────┘ ▲ │ ┌────────────┐ │ Grafana │ └────────────┘ ``` ## Running ```bash docker compose up -d # or, from the repo root: ./run-example.sh vault-secrets ``` | Service | URL | | ----------- | ---------------------------------------------- | | Grafana | http://localhost:3000 | | Alloy UI | http://localhost:12345 | | Prometheus | http://localhost:9090 | | Vault | http://localhost:8200 (token: `root-token-for-demo`) | | nginx-auth | http://localhost:8080/api/v1/write (basic-auth required) | ## What to expect on a fresh boot 1. Watch nginx accept Alloy's writes: ```bash docker compose logs --tail=20 nginx-auth ``` You should see `200` responses with `user=alloy`. 2. Confirm the seeded secret in Vault: ```bash docker exec -e VAULT_ADDR=http://127.0.0.1:8200 \ -e VAULT_TOKEN=root-token-for-demo \ vault-secrets-vault vault kv get secret/alloy/remote-write ``` 3. Inspect the Alloy pipeline at http://localhost:12345 — `prometheus.remote_write.via_nginx` should be healthy with no last-error. 4. Verify metrics flowed to Prometheus: ```bash curl -s 'http://localhost:9090/api/v1/query?query=up' | jq '.data.result' ``` ## Demonstrating credential rotation The interesting moment is the `401 → 200` transition: rotating nginx's htpasswd makes Alloy fail auth immediately, then Alloy recovers automatically once the Vault secret is updated and `remote.vault` re-reads (≤ 30 s). ```bash # Step 1 — rotate htpasswd, reload nginx. Alloy starts 401-ing. ./rotate.sh htpasswd hunter2 # Watch nginx logs for 401s with user=- docker compose logs -f nginx-auth # Step 2 — update Vault to the new value. Alloy catches up within # reread_frequency (30s) and goes back to 200 with user=alloy. 
./rotate.sh vault hunter2 # Or do both in one go with a built-in 5s gap to make the 401 window # observable: ./rotate.sh both rotated-password ``` You can also rotate Vault directly without the helper: ```bash docker exec -e VAULT_ADDR=http://127.0.0.1:8200 \ -e VAULT_TOKEN=root-token-for-demo \ vault-secrets-vault \ vault kv put secret/alloy/remote-write username=alloy password=hunter2 ``` ## Inspecting Vault ```bash # Read the current secret docker exec -e VAULT_ADDR=http://127.0.0.1:8200 \ -e VAULT_TOKEN=root-token-for-demo \ vault-secrets-vault vault kv get secret/alloy/remote-write # Open the UI open http://localhost:8200 # Token: root-token-for-demo ``` ## Notes and caveats - **Root token is hardcoded.** `root-token-for-demo` is fine for a demo, never for production. The real-world swap-in is `auth.approle` (with a wrapped role-id/secret-id) or `auth.kubernetes` — same component, different `auth.*` block. - **`convert.nonsensitive` on `basic_auth.username`.** `remote.vault.creds.data.username` is a `Secret`; `basic_auth.username` expects a plain `string`, so it has to be unwrapped. `basic_auth.password` accepts `Secret` directly, so it doesn't need the conversion. Forgetting `convert.nonsensitive` on the username is the single most common mistake — the error is "expected string, got secret" at config load. - **nginx is the source of truth for the credential.** If you update Vault but forget to update the htpasswd file, Alloy will 401 forever — that's the deliberate demo property, not a bug. - **Vault dev-mode is in-memory.** A `docker compose down` followed by `up` resets the secret to `initial-password`. - **Production caveat for the basic-auth path itself:** `Authorization: Basic …` is base64-encoded, not encrypted. In production this hop must be TLS — out of scope for this demo. 
## Stopping ```bash docker compose down --remove-orphans ``` ================================================ FILE: vault-secrets/auth/htpasswd ================================================ alloy:$2y$05$yXToETJn9D.sOxFM3036b.l2/FkJU1iN2CIuWYAqIIgT7xSMDvJtO ================================================ FILE: vault-secrets/config.alloy ================================================ // vault-secrets scenario // // remote.vault pulls remote_write basic_auth credentials from HashiCorp // Vault at runtime. reread_frequency makes Alloy pick up rotated values // without a restart — see README for the rotation demo. livedebugging { enabled = true } remote.vault "creds" { server = "http://vault:8200" // path = the KV mount; key = the secret path within that mount. // Alloy handles the KV v2 /data/ prefix internally. path = "secret" key = "alloy/remote-write" reread_frequency = "30s" auth.token { token = "root-token-for-demo" } } prometheus.exporter.self "self" {} prometheus.scrape "self" { targets = prometheus.exporter.self.self.targets forward_to = [prometheus.remote_write.via_nginx.receiver] scrape_interval = "10s" } prometheus.remote_write "via_nginx" { endpoint { url = "http://nginx-auth/api/v1/write" basic_auth { username = convert.nonsensitive(remote.vault.creds.data.username) password = remote.vault.creds.data.password } } } ================================================ FILE: vault-secrets/docker-compose.yml ================================================ services: vault: image: hashicorp/vault:${VAULT_VERSION:-2.0.0} container_name: vault-secrets-vault ports: - "8200:8200" environment: VAULT_ADDR: http://127.0.0.1:8200 VAULT_TOKEN: root-token-for-demo # Start dev-mode in the background, wait for readiness, then seed # secret/alloy/remote-write. The wait keeps Vault as PID 1. entrypoint: - sh - -euc - | vault server -dev \ -dev-listen-address=0.0.0.0:8200 \ -dev-root-token-id=root-token-for-demo & VAULT_PID=$$! 
until vault status >/dev/null 2>&1; do sleep 1; done vault kv put secret/alloy/remote-write \ username=alloy \ password=initial-password echo "seeded secret/alloy/remote-write" wait $$VAULT_PID healthcheck: # Pass only once the secret has been seeded — otherwise Alloy may # start before the KV write lands and fail its first reread. test: ["CMD", "sh", "-c", "vault kv get secret/alloy/remote-write >/dev/null 2>&1"] interval: 5s timeout: 3s retries: 20 nginx-auth: image: nginx:${NGINX_VERSION:-1.30-alpine} container_name: vault-secrets-nginx-auth ports: - "8080:80" volumes: - ./nginx.conf:/etc/nginx/nginx.conf:ro - ./auth/htpasswd:/etc/nginx/htpasswd:ro depends_on: - prometheus prometheus: image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3} container_name: vault-secrets-prometheus command: - --web.enable-remote-write-receiver - --config.file=/etc/prometheus/prometheus.yml volumes: - ./prom-config.yaml:/etc/prometheus/prometheus.yml ports: - "9090:9090" grafana: image: grafana/grafana:${GRAFANA_VERSION:-13.0.1} container_name: vault-secrets-grafana environment: - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_BASIC_ENABLED=false ports: - "3000:3000/tcp" entrypoint: - sh - -euc - | mkdir -p /etc/grafana/provisioning/datasources cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml apiVersion: 1 datasources: - name: Prometheus type: prometheus orgId: 1 url: http://prometheus:9090 basicAuth: false isDefault: true version: 1 editable: false EOF /run.sh alloy: image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1} container_name: vault-secrets-alloy ports: - "12345:12345" volumes: - ./config.alloy:/etc/alloy/config.alloy command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy depends_on: vault: condition: service_healthy nginx-auth: condition: service_started prometheus: condition: service_started ================================================ FILE: vault-secrets/nginx.conf 
================================================ worker_processes 1; events { worker_connections 1024; } http { log_format auth '$remote_addr user=$remote_user [$time_local] ' '"$request" $status $body_bytes_sent'; access_log /dev/stdout auth; error_log /dev/stderr warn; upstream prom { server prometheus:9090; } server { listen 80; server_name _; location /api/v1/write { auth_basic "alloy-remote-write"; auth_basic_user_file /etc/nginx/htpasswd; proxy_pass http://prom/api/v1/write; proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Forwarded-For $remote_addr; } location = /healthz { access_log off; return 200 "ok\n"; } } } ================================================ FILE: vault-secrets/prom-config.yaml ================================================ global: scrape_interval: 15s evaluation_interval: 15s ================================================ FILE: vault-secrets/rotate.sh ================================================ #!/usr/bin/env bash # Demo helper for the vault-secrets scenario. 
# # Usage: # ./rotate.sh htpasswd # update nginx htpasswd + reload # ./rotate.sh vault # update the Vault secret # ./rotate.sh both # do both, with a 5s gap so the # # 401 window is visible set -euo pipefail cmd=${1:-} pw=${2:-} if [[ -z "$cmd" || -z "$pw" ]]; then echo "usage: rotate.sh htpasswd|vault|both " >&2 exit 2 fi cd "$(dirname "$0")" rotate_htpasswd() { echo ">> generating new bcrypt entry for alloy" docker run --rm httpd:2.4-alpine htpasswd -nbB -C 5 alloy "$pw" \ > auth/htpasswd echo ">> reloading nginx" docker exec vault-secrets-nginx-auth nginx -s reload } rotate_vault() { echo ">> writing new credentials to Vault" docker exec \ -e VAULT_ADDR=http://127.0.0.1:8200 \ -e VAULT_TOKEN=root-token-for-demo \ vault-secrets-vault \ vault kv put secret/alloy/remote-write \ username=alloy \ password="$pw" } case "$cmd" in htpasswd) rotate_htpasswd ;; vault) rotate_vault ;; both) rotate_htpasswd echo ">> nginx flipped; Alloy will 401 until Vault catches up. Sleeping 5s..." sleep 5 rotate_vault ;; *) echo "unknown command: $cmd" >&2 exit 2 ;; esac ================================================ FILE: windows/README.md ================================================ # Monitoring Windows with Alloy Grafana Alloy can be used to monitor Windows servers and desktops. In this guide we will show you how to install Grafana Alloy on a Windows machine and how to configure it to monitor the following system attributes: * Windows Performance Metrics * Windows Event Logs ## Prerequisites * Git - You will need Git to clone the repository. * Docker - In this tutorial we assume you are using Docker desktop for Windows. This is where we host Grafana, Loki and Prometheus. Note that you can also install native Windows versions of Grafana, Loki and Prometheus if you prefer or host them on a Linux server. * Windows Server or Desktop - We will be monitoring a Windows machine, so you will need a Windows server or desktop to monitor. 
* Admin access to the Windows machine - You will need admin access to the Windows machine to install Grafana Alloy and configure it to collect metrics and logs. ## Step 1: Clone the Repository Clone the repository to your Windows machine. ```bash git clone https://github.com/grafana/alloy-scenarios.git ``` ## Step 2: Deploy Grafana, Loki and Prometheus First, you need to deploy Grafana, Loki and Prometheus on your Windows machine. Within this tutorial, we have included a docker-compose file that will deploy Grafana, Loki and Prometheus on your Windows machine. ```bash cd alloy-scenarios/windows docker-compose up -d ``` You can check the status of the containers by running the following command: ```bash docker ps ``` Grafana should be running on [http://localhost:3000](http://localhost:3000). ## Step 3: Install Grafana Alloy Follow the instructions in the [Grafana Alloy documentation](https://grafana.com/docs/alloy/latest/set-up/install/windows/) to install Grafana Alloy on your Windows machine. Recommended steps: * Install Grafana Alloy as a Windows service. * Use Windows Installer to install Grafana Alloy. Make sure to also check out the [Grafana Alloy configuration](https://grafana.com/docs/alloy/latest/set-up/configuration/) documentation. Personal recommendation: If you would like to see the Alloy UI from a remote machine you need to change the run arguments of the Grafana Alloy service. To do this: 1. Open the Registry Editor. 2. Navigate to `HKEY_LOCAL_MACHINE\SOFTWARE\GrafanaLabs\Alloy`. 3. Double click on `Arguments`. 4. Change the contents to the following: ``` run C:\Program Files\GrafanaLabs\Alloy\config.alloy --storage.path=C:\ProgramData\GrafanaLabs\Alloy\data --server.http.listen-addr=0.0.0.0:12345 ``` 5. Restart the Grafana Alloy service. (Search for `Services` in the start menu, find `Grafana Alloy`, right click and restart) You should now be able to access the Alloy UI from a remote machine by going to `http://WINDOWS_MACHINE_IP:12345`, replacing `WINDOWS_MACHINE_IP` with the address of the Windows machine. 
## Step 4: Configure Grafana Alloy to Monitor Windows Now that you have Grafana Alloy installed, you need to configure it to monitor your Windows machine. Grafana Alloy will currently be running a default configuration file. This needs to be replaced with the `config.alloy` file that is included in the `alloy-scenarios/windows` directory. To do this: 1. Stop the Grafana Alloy service. 2. Replace the `config.alloy` file in `C:\Program Files\GrafanaLabs\Alloy` with the `config.alloy` file from the `alloy-scenarios/windows` directory. 3. Start the Grafana Alloy service. 4. Open your browser and go to `http://localhost:12345` to access the Alloy UI. ## Step 5: Viewing the Windows Performance Metrics and Event Logs You will now be able to view the Windows Performance Metrics and Event Logs in Grafana: * Open your browser and go to [http://localhost:3000/explore/metrics](http://localhost:3000/explore/metrics). This will take you to the metrics explorer in Grafana. * Open your browser and go to [http://localhost:3000/a/grafana-lokiexplore-app](http://localhost:3000/a/grafana-lokiexplore-app). This will take you to the Loki explorer in Grafana. ================================================ FILE: windows/config.alloy ================================================ // #################################### // Windows Server Metrics Configuration // #################################### prometheus.exporter.windows "default" { enabled_collectors = ["cpu","cs","logical_disk","net","os","service","system", "memory", "scheduled_task", "tcp"] } // Configure a prometheus.scrape component to collect windows metrics. 
prometheus.scrape "example" {
  // Scrape the targets exposed by the Windows exporter component and hand
  // the samples to the remote_write component below.
  targets    = prometheus.exporter.windows.default.targets
  forward_to = [prometheus.remote_write.demo.receiver]
}

// Push scraped metrics to the Prometheus container from docker-compose.yml
// (started with --web.enable-remote-write-receiver).
prometheus.remote_write "demo" {
  endpoint {
    url = "http://localhost:9090/api/v1/write"
  }
}

// ####################################
// Windows Server Logs Configuration
// ####################################

// Read the Application event log channel.
loki.source.windowsevent "application" {
  eventlog_name          = "Application"
  use_incoming_timestamp = true
  forward_to             = [loki.process.endpoint.receiver]
}

// Read the System event log channel.
loki.source.windowsevent "System" {
  eventlog_name          = "System"
  use_incoming_timestamp = true
  forward_to             = [loki.process.endpoint.receiver]
}

// Shared processing pipeline for both event log channels.
loki.process "endpoint" {
  forward_to = [loki.write.endpoint.receiver]

  // Pull top-level fields out of the JSON entry into the extracted map.
  stage.json {
    expressions = {
      message       = "",
      Overwritten   = "",
      source        = "",
      computer      = "",
      eventRecordID = "",
      channel       = "",
      component_id  = "",
      execution     = "",
    }
  }

  // Extract nested fields from the "execution" object (e.g. processId, processName).
  stage.json {
    source      = "execution"
    expressions = {
      processId   = "",
      processName = "",
    }
  }

  // Attach selected extracted values as structured metadata rather than labels.
  stage.structured_metadata {
    values = {
      "eventRecordID"         = "",
      "channel"               = "",
      "component_id"          = "",
      "execution_processId"   = "processId",
      "execution_processName" = "processName",
    }
  }

  // Parse the extracted "message" value into further extracted fields.
  stage.eventlogmessage {
    source             = "message"
    overwrite_existing = true
  }

  // Use the extracted "source" value as the service_name label.
  stage.labels {
    values = {
      "service_name" = "source",
    }
  }

  // Replace the log line with the extracted "message" value.
  stage.output {
    source = "message"
  }
}

// Push processed logs to the Loki container from docker-compose.yml.
loki.write "endpoint" {
  endpoint {
    url = "http://localhost:3100/loki/api/v1/push"
  }
}

livedebugging {}


================================================
FILE: windows/docker-compose.yml
================================================
version: '3.8'

services:
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - 3100:3100/tcp
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      # Required so Alloy's prometheus.remote_write can push to /api/v1/write.
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - 9090:9090/tcp
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # Write the datasource provisioning file via a here-document, then start
    # Grafana. The `<<EOF >` redirection is required: a plain `cat < file`
    # would try to read a file that does not exist yet and, under `sh -e`,
    # abort before /run.sh is reached.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: windows/loki-config.yaml
================================================

# This is a complete configuration to deploy Loki backed by the filesystem.
# The index will be shipped to the storage via tsdb-shipper.

auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

server:
  http_listen_port: 3100

common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

# Note: We are setting the max chunk age far lower than the default expected value
# This is due to the fact this scenario is used within the LogCLI demo and we need a short flush time.
# To show how logcli stats --since 24h '{service_name="Delivery World", package_size="Large"}' works.
ingester: max_chunk_age: 5m # Should be 2 hours ================================================ FILE: windows/prom-config.yaml ================================================ # my global config global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s). ================================================ FILE: windows-events/README.md ================================================ # Windows Security Event Logs with Grafana Alloy A focused logs-only scenario for shipping the **Windows Security event channel** to Loki, with filtering and field-extraction tuned for SOC-style queries (logon attempts, privilege escalation, account changes). ## How this differs from the [`windows/`](../windows/) scenario | Aspect | `windows/` (broad) | `windows-events/` (this) | |---|---|---| | Channels | Application + System + Performance metrics | **Security** only | | Processing | Pass-through with basic JSON parsing | **Drops noise event IDs** + extracts security-specific fields as labels | | Backend | Loki + Prometheus + Grafana | **Loki + Grafana** (no metrics) | | Demo intent | "ship Windows logs to Loki" | "make Security events queryable for SOC use cases" | If you want general-purpose Windows monitoring, use `windows/`. If you specifically care about Security audit events, use this one. ## Prerequisites - A Windows host (Server or Desktop) with admin access — `loki.source.windowsevent` reads from the Windows Event Log API and only runs on Windows. - Docker Desktop for Windows (or any Linux machine you can reach over the network) for the Loki/Grafana backend. - Git, to clone the repo. 
## Step 1 — Backend (Loki + Grafana) On the machine that will host the backend (the Windows host itself, or any Linux machine): ```bash git clone https://github.com/grafana/alloy-scenarios.git cd alloy-scenarios/windows-events docker compose up -d ``` Grafana is on `http://<backend-host>:3000` with the Loki datasource already provisioned. ## Step 2 — Install Alloy on the Windows host Follow the [Windows install guide](https://grafana.com/docs/alloy/latest/set-up/install/windows/). Recommended: Windows Installer + Windows Service. If your backend is on a different machine than the Windows host, edit the `loki.write` URL in `config.alloy` from `http://localhost:3100` to `http://<backend-host>:3100`. ## Step 3 — Replace the Alloy config 1. Stop the `Grafana Alloy` Windows service. 2. Replace `C:\Program Files\GrafanaLabs\Alloy\config.alloy` with the [`config.alloy`](./config.alloy) from this directory. 3. Start the service. 4. Open `http://localhost:12345` to confirm the components load without error. ## Step 4 — Generate Security events To see traffic, trigger some auditable actions on the Windows host: - **Failed logon (4625)**: try to log in with a wrong password from a remote machine, or run `runas /user:fakeuser cmd` and enter a wrong password. - **Successful logon (4624)**: log out and back in, or open a new RDP session. - **User created (4720)**: `net user testuser P@ssw0rd /add` from an admin shell. - **Privilege use (4672)**: any action requiring Administrator elevation. Some of these only generate events if the corresponding **audit policy** is enabled. Check `auditpol /get /category:*` on the Windows host; enable additional audit policies via `auditpol /set /subcategory:"<subcategory-name>" /success:enable /failure:enable` if needed.
## Step 5 — Query in Grafana ```logql # All Security events {eventlog_name="Security"} # Failed logons {eventlog_name="Security", event_id="4625"} # Successful logons by a specific user {eventlog_name="Security", event_id="4624", target_user_name="alice"} # All events affecting a specific user account {eventlog_name="Security", target_user_name="alice"} # Recent privileged-operation events {eventlog_name="Security", event_id=~"4672|4673"} ``` The promoted labels are `event_id`, `subject_user_name`, `target_user_name`, and `logon_type`. Other event fields (computer, eventRecordID, channel) are kept as **structured metadata** — queryable with a label filter after the stream selector (for example `| computer="HOST01"`) without inflating the label index. ## What's filtered out The pipeline drops these event IDs on the Alloy side: | Event ID | Description | Why dropped | |---|---|---| | 4658 | Handle to an object was closed | Pairs with 4656/4663; on its own rarely actionable | | 4690 | Attempt to duplicate a handle to an object | Audit noise | | 4674 | Operation attempted on a privileged object | Fires for routine privileged ops | | 5379 | Credential Manager credentials were read | Frequent false-positive in normal use | If you want one of these back, edit `stage.match` in `config.alloy` to remove the corresponding ID from the `event_id=~"…"` regex. ## Stopping ```bash docker compose down -v ``` Stop the Alloy Windows service separately if you no longer want it running. ================================================ FILE: windows-events/config.alloy ================================================ // ################################################################### // Windows Security Event Log → Loki, with filtering and field labels // ################################################################### // // Differs from the broader `windows/` scenario in three ways: // 1. Security channel only (Application + System are covered there) // 2. Drops high-volume audit-noise event IDs that bury real signal // 3.
Promotes security-specific fields (subject_user_name,
//      target_user_name, logon_type) to labels for SOC-style queries
//
// Run target: a Windows host with Alloy installed natively. The
// docker-compose.yml in this directory only runs Loki + Grafana;
// Alloy itself is a Windows service.

livedebugging {}

// Ingest the Security channel. `use_incoming_timestamp = true` keeps
// the original event time rather than the time Alloy received it,
// which matters when replaying historical logs after an Alloy restart.
loki.source.windowsevent "security" {
  eventlog_name          = "Security"
  use_incoming_timestamp = true
  forward_to             = [loki.process.security.receiver]
}

loki.process "security" {
  // Step 1: parse the windowsevent JSON wrapper. `event_id` is pulled
  // out here so the later label and drop stages have a value to act on.
  stage.json {
    expressions = {
      message       = "",
      event_id      = "",
      eventRecordID = "",
      channel       = "",
      computer      = "",
    }
  }

  // Step 2: parse the extracted event message into additional
  // extracted fields. The exact keys depend on event type.
  // NOTE(review): the keys this stage produces are derived from the
  // message text; confirm in the Alloy live-debugging UI that
  // subject_user_name / target_user_name / logon_type actually appear
  // under those names, otherwise the matching labels stay empty.
  stage.eventlogmessage {
    source             = "message"
    overwrite_existing = true
  }

  // Step 3: promote useful fields to labels. Indexed labels make
  // "show me all failed logons by username" queries cheap. This must
  // run before the drop stage below: `stage.match` selects on labels,
  // so `event_id` has to be a label by the time the selector runs.
  stage.labels {
    values = {
      event_id          = "",
      subject_user_name = "",
      target_user_name  = "",
      logon_type        = "",
    }
  }

  // Step 4: drop high-noise event IDs that are rarely useful in a
  // SOC dashboard but consume most of the Security log volume:
  //   4658 — handle to an object closed
  //   4690 — attempt to duplicate a handle to an object
  //   4674 — operation attempted on a privileged object
  //   5379 — Credential Manager credentials read
  stage.match {
    selector = `{event_id=~"4658|4690|4674|5379"}`
    action   = "drop"
  }

  // Step 5: keep heavyweight fields out of labels but searchable
  // via structured metadata.
  stage.structured_metadata {
    values = {
      eventRecordID = "",
      channel       = "",
      computer      = "",
    }
  }

  forward_to = [loki.write.endpoint.receiver]
}

// Push processed Security events to the Loki backend.
loki.write "endpoint" {
  endpoint {
    url = "http://localhost:3100/loki/api/v1/push"
  }
}


================================================
FILE: windows-events/docker-compose.yml
================================================
services:
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100/tcp"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    # Write the datasource provisioning file via a here-document, then start
    # Grafana. The `<<EOF >` redirection is required: a plain `cat < file`
    # would try to read a file that does not exist yet and, under `sh -e`,
    # abort before /run.sh is reached.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: windows-events/loki-config.yaml
================================================
auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

server:
  http_listen_port: 3100

common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

ingester:
  max_chunk_age: 5m