Repository: grafana/alloy-scenarios
Branch: main
Commit: fb8fd5ed16bb
Files: 430
Total size: 1.3 MB

Directory structure:
gitextract_o8zi_7a5/

├── .coda/
│   ├── coda-start.service
│   ├── coda-start.sh
│   └── packer-install.sh
├── .cursor/
│   ├── docker-example.mdc
│   └── k8s-example.mdc
├── .github/
│   ├── k8s-scenarios.json
│   ├── scenario-list.txt
│   └── workflows/
│       ├── check-image-versions.yml
│       ├── validate-k8s-scenarios.yml
│       └── validate-scenarios.yml
├── .gitignore
├── CLAUDE.md
├── LICENSE
├── README.md
├── app-instrumentation/
│   └── logging/
│       └── popular-logging-frameworks/
│           ├── README.md
│           ├── alloy/
│           │   ├── config.alloy
│           │   └── helper.alloy
│           ├── cpp/
│           │   ├── CMakeLists.txt
│           │   ├── Dockerfile
│           │   └── main.cpp
│           ├── csharp/
│           │   ├── Dockerfile
│           │   ├── LoggingExample.csproj
│           │   └── Program.cs
│           ├── docker-compose.coda.yml
│           ├── docker-compose.yml
│           ├── go/
│           │   ├── Dockerfile
│           │   ├── go.mod
│           │   ├── go.sum
│           │   └── main.go
│           ├── java/
│           │   ├── App.java
│           │   ├── Dockerfile
│           │   └── logback.xml
│           ├── javascript/
│           │   ├── Dockerfile
│           │   └── app.js
│           ├── loki-config.yaml
│           ├── php/
│           │   ├── Dockerfile
│           │   └── app.php
│           └── python/
│               ├── Dockerfile
│               └── app.py
├── aws-firehose-logs/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.yml
│   ├── firehose_sender.py
│   └── loki-config.yaml
├── blackbox-probing/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── prom-config.yaml
├── cloudwatch-metrics/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.yml
│   ├── prom-config.yaml
│   └── seed-metrics.py
├── coda
├── continuous-profiling/
│   ├── README.md
│   ├── app/
│   │   ├── go.mod
│   │   └── main.go
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   └── docker-compose.yml
├── docker-monitoring/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.yml
│   ├── grafana/
│   │   └── datasources/
│   │       └── default.yml
│   └── loki-config.yaml
├── elasticsearch-monitoring/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── prom-config.yaml
├── faro-frontend-observability/
│   ├── README.md
│   ├── app/
│   │   └── index.html
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── loki-config.yaml
├── game-of-tracing/
│   ├── AGENTS.md
│   ├── CLAUDE.md
│   ├── README.md
│   ├── SPAN_LINKS.md
│   ├── ai_opponent/
│   │   ├── CLAUDE.md
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── ai_server.py
│   │   ├── requirements.txt
│   │   └── telemetry.py
│   ├── app/
│   │   ├── CLAUDE.md
│   │   ├── Dockerfile
│   │   ├── game_config.py
│   │   ├── location_server.py
│   │   ├── requirements.txt
│   │   ├── run_game.py
│   │   └── telemetry.py
│   ├── config-otel.yaml
│   ├── config.alloy
│   ├── docker-compose-otel.yml
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── grafana/
│   │   ├── dashboards/
│   │   │   ├── War of Kingdoms-1747821967780.json
│   │   │   └── dashboards.yaml
│   │   └── datasources/
│   │       └── defaults.yml
│   ├── loki-config.yaml
│   ├── prom-config.yaml
│   ├── pyroscope-config.yaml
│   ├── tempo-config.yaml
│   └── war_map/
│       ├── CLAUDE.md
│       ├── Dockerfile
│       ├── app.py
│       ├── requirements.txt
│       ├── static/
│       │   └── css/
│       │       └── style.css
│       ├── telemetry.py
│       └── templates/
│           ├── index.html
│           ├── layout.html
│           ├── map.html
│           ├── map_picker.html
│           ├── replay.html
│           └── replay_session.html
├── gelf-log-ingestion/
│   ├── README.md
│   ├── app/
│   │   └── main.py
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── loki-config.yaml
├── image-versions.env
├── k8s/
│   ├── README.md
│   ├── events/
│   │   ├── README.md
│   │   ├── alloy-config.yaml
│   │   ├── alloy-deployment.yaml
│   │   ├── alloy-rbac.yaml
│   │   ├── grafana-values.yml
│   │   ├── kind.yml
│   │   └── loki-values.yml
│   ├── logs/
│   │   ├── README.md
│   │   ├── grafana-values.yml
│   │   ├── k8s-monitoring-values.yml
│   │   ├── killercoda/
│   │   │   └── loki-values.yml
│   │   ├── kind.yml
│   │   └── loki-values.yml
│   ├── metrics/
│   │   ├── README.md
│   │   ├── grafana-values.yml
│   │   ├── k8s-monitoring-values.yml
│   │   ├── kind.yml
│   │   └── prometheus-values.yml
│   ├── profiling/
│   │   ├── README.md
│   │   ├── grafana-values.yml
│   │   ├── k8s-monitoring-values.yml
│   │   ├── kind.yml
│   │   └── pyroscope-values.yml
│   └── tracing/
│       ├── README.md
│       ├── grafana-values.yml
│       ├── k8s-monitoring-values.yml
│       ├── kind.yml
│       └── tempo-values.yml
├── kafka/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── gen_log.sh
│   └── loki-config.yaml
├── linux/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.yml
│   ├── loki-config.yaml
│   └── prom-config.yaml
├── log-api-gateway/
│   ├── README.md
│   ├── app/
│   │   └── producer.py
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── loki-config.yaml
├── log-secret-filtering/
│   ├── README.md
│   ├── app/
│   │   └── main.py
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── loki-config.yaml
├── logs-file/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── loki-config.yaml
│   └── main.py
├── logs-tcp/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── loki-config.yaml
│   └── simulator.py
├── mail-house/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── loki-config.yaml
│   └── main.py
├── memcached-monitoring/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── prom-config.yaml
├── mysql-monitoring/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── prom-config.yaml
├── nginx-monitoring/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.yml
│   ├── loki-config.yaml
│   ├── nginx.conf
│   └── prom-config.yaml
├── otel-basic-tracing/
│   ├── README.md
│   ├── app/
│   │   ├── Dockerfile
│   │   ├── app.py
│   │   └── requirements.txt
│   ├── config-otel.yaml
│   ├── config.alloy
│   ├── docker-compose-otel.yml
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── prom-config.yaml
│   └── tempo-config.yaml
├── otel-examples/
│   ├── README.md
│   ├── cost-control/
│   │   ├── README.md
│   │   ├── app/
│   │   │   ├── Dockerfile
│   │   │   ├── app.py
│   │   │   └── requirements.txt
│   │   ├── config-otel.yaml
│   │   ├── config.alloy
│   │   ├── docker-compose.coda.yml
│   │   ├── docker-compose.yml
│   │   ├── loki-config.yaml
│   │   └── tempo-config.yaml
│   ├── count-connector/
│   │   ├── README.md
│   │   ├── app/
│   │   │   ├── Dockerfile
│   │   │   ├── app.py
│   │   │   └── requirements.txt
│   │   ├── config-otel.yaml
│   │   ├── config.alloy
│   │   ├── docker-compose.coda.yml
│   │   ├── docker-compose.yml
│   │   ├── loki-config.yaml
│   │   ├── prom-config.yaml
│   │   └── tempo-config.yaml
│   ├── filelog-processing/
│   │   ├── README.md
│   │   ├── app/
│   │   │   └── generate_logs.py
│   │   ├── config-otel.yaml
│   │   ├── config.alloy
│   │   ├── docker-compose.coda.yml
│   │   ├── docker-compose.yml
│   │   └── loki-config.yaml
│   ├── host-metrics/
│   │   ├── README.md
│   │   ├── config-otel.yaml
│   │   ├── config.alloy
│   │   ├── docker-compose.coda.yml
│   │   ├── docker-compose.yml
│   │   └── prom-config.yaml
│   ├── kafka-buffer/
│   │   ├── README.md
│   │   ├── app/
│   │   │   ├── Dockerfile
│   │   │   ├── app.py
│   │   │   └── requirements.txt
│   │   ├── config-otel.yaml
│   │   ├── config.alloy
│   │   ├── docker-compose.coda.yml
│   │   ├── docker-compose.yml
│   │   └── tempo-config.yaml
│   ├── multi-pipeline-fanout/
│   │   ├── README.md
│   │   ├── app/
│   │   │   ├── Dockerfile
│   │   │   ├── app.py
│   │   │   └── requirements.txt
│   │   ├── config-otel.yaml
│   │   ├── config.alloy
│   │   ├── docker-compose.coda.yml
│   │   ├── docker-compose.yml
│   │   ├── prom-config.yaml
│   │   └── tempo-config.yaml
│   ├── ottl-transform/
│   │   ├── README.md
│   │   ├── app/
│   │   │   ├── Dockerfile
│   │   │   ├── app.py
│   │   │   └── requirements.txt
│   │   ├── config-otel.yaml
│   │   ├── config.alloy
│   │   ├── docker-compose.coda.yml
│   │   ├── docker-compose.yml
│   │   ├── loki-config.yaml
│   │   └── tempo-config.yaml
│   ├── pii-redaction/
│   │   ├── README.md
│   │   ├── app/
│   │   │   ├── Dockerfile
│   │   │   ├── app.py
│   │   │   └── requirements.txt
│   │   ├── config-otel.yaml
│   │   ├── config.alloy
│   │   ├── docker-compose.coda.yml
│   │   ├── docker-compose.yml
│   │   ├── loki-config.yaml
│   │   └── tempo-config.yaml
│   ├── resource-enrichment/
│   │   ├── README.md
│   │   ├── app/
│   │   │   ├── Dockerfile
│   │   │   ├── app.py
│   │   │   └── requirements.txt
│   │   ├── config-otel.yaml
│   │   ├── config.alloy
│   │   ├── docker-compose.coda.yml
│   │   ├── docker-compose.yml
│   │   ├── prom-config.yaml
│   │   └── tempo-config.yaml
│   └── routing-multi-tenant/
│       ├── README.md
│       ├── app/
│       │   ├── generate_logs.py
│       │   └── requirements.txt
│       ├── config-otel.yaml
│       ├── config.alloy
│       ├── docker-compose.coda.yml
│       ├── docker-compose.yml
│       └── loki-config.yaml
├── otel-metrics-pipeline/
│   ├── README.md
│   ├── app/
│   │   └── main.py
│   ├── config-otel.yaml
│   ├── config.alloy
│   ├── docker-compose-otel.yml
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── prom-config.yaml
├── otel-span-metrics/
│   ├── README.md
│   ├── app/
│   │   ├── load.py
│   │   ├── main.py
│   │   └── requirements.txt
│   ├── config-otel.yaml
│   ├── config.alloy
│   ├── docker-compose-otel.yml
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── prom-config.yaml
│   └── tempo-config.yaml
├── otel-tail-sampling/
│   ├── README.md
│   ├── app/
│   │   ├── Dockerfile
│   │   ├── app.py
│   │   └── requirements.txt
│   ├── config-otel.yaml
│   ├── config.alloy
│   ├── docker-compose-otel.yml
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── prom-config.yaml
│   └── tempo-config.yaml
├── otel-tracing-service-graphs/
│   ├── README.md
│   ├── app/
│   │   ├── Dockerfile
│   │   ├── app.py
│   │   └── requirements.txt
│   ├── config-otel.yaml
│   ├── config.alloy
│   ├── docker-compose-otel.yml
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── prom-config.yaml
│   └── tempo-config.yaml
├── postgres-monitoring/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── prom-config.yaml
├── rabbitmq-monitoring/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── enabled_plugins
│   ├── loki-config.yaml
│   ├── prom-config.yaml
│   └── rabbitmq.conf
├── redis-monitoring/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   └── prom-config.yaml
├── renovate.json
├── routing/
│   ├── README.MD
│   ├── config.alloy
│   ├── docker-compose.yaml
│   └── support/
│       ├── grafana/
│       │   └── datasources.yml
│       ├── loki/
│       │   └── server.yaml
│       └── promtail/
│           ├── myCustomLog.txt
│           └── promtail-config.yml
├── run-example.sh
├── self-monitoring/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.yaml
│   └── loki-config.yaml
├── snmp/
│   ├── Readme.md
│   ├── config.alloy
│   ├── docker-compose.yml
│   ├── loki-config.yaml
│   ├── prom-config.yaml
│   └── snmp.yml
├── syslog/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── loki-config.yaml
│   ├── rsyslog.conf
│   └── syslog_simulator.py
├── systemd-journal/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.yml
│   └── loki-config.yaml
├── trace-delivery/
│   ├── README.md
│   ├── app/
│   │   ├── Dockerfile
│   │   ├── app.py
│   │   └── requirements.txt
│   ├── config-otel.yaml
│   ├── config.alloy
│   ├── docker-compose-otel.yml
│   ├── docker-compose.coda.yml
│   ├── docker-compose.yml
│   ├── prom-config.yaml
│   └── tempo-config.yaml
├── vault-secrets/
│   ├── README.md
│   ├── auth/
│   │   └── htpasswd
│   ├── config.alloy
│   ├── docker-compose.yml
│   ├── nginx.conf
│   ├── prom-config.yaml
│   └── rotate.sh
├── windows/
│   ├── README.md
│   ├── config.alloy
│   ├── docker-compose.yml
│   ├── loki-config.yaml
│   └── prom-config.yaml
└── windows-events/
    ├── README.md
    ├── config.alloy
    ├── docker-compose.yml
    └── loki-config.yaml

================================================
FILE CONTENTS
================================================

================================================
FILE: .coda/coda-start.service
================================================
# systemd unit that runs /usr/local/bin/coda-start.sh once at boot.
[Unit]
Description=Coda Alloy Scenario Start
# Order this unit after networking and Docker; Requires= makes Docker a hard
# dependency, Wants= makes network-online a soft one.
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service

[Service]
# oneshot: the unit counts as "starting" until the script exits.
Type=oneshot
ExecStart=/usr/local/bin/coda-start.sh
WorkingDirectory=/opt/alloy-scenarios
# Route the script's stdout and stderr to the journal.
StandardOutput=journal
StandardError=journal
# Keep the unit reported as active after the script has exited.
RemainAfterExit=yes

[Install]
WantedBy=multi-user.target


================================================
FILE: .coda/coda-start.sh
================================================
#!/usr/bin/env bash
# Boot-time entry point for a Coda scenario host.
#
# 1. Waits (up to 120s) for user_data to write the scenario name to
#    /etc/coda/scenario.
# 2. Refreshes the baked alloy-scenarios checkout from origin/main
#    (best effort; falls back to the baked copy on failure).
# 3. Replaces itself with `coda start <scenario>`.
#
# Exits 1 if the scenario file never appears or holds no scenario name.
set -euo pipefail

SCENARIO_FILE="/etc/coda/scenario"
REPO_DIR="/opt/alloy-scenarios"

# Wait for the scenario file to be written by user_data.
# Test with -s (exists AND is non-empty) rather than -f: a shell redirect
# creates the file before any content is written, so -f alone can win that
# race and read an empty scenario name.
echo "Waiting for ${SCENARIO_FILE}..."
timeout=120
elapsed=0
while [[ ! -s "$SCENARIO_FILE" ]]; do
  sleep 2
  elapsed=$((elapsed + 2))
  if [[ $elapsed -ge $timeout ]]; then
    echo "Timed out waiting for ${SCENARIO_FILE} after ${timeout}s" >&2
    exit 1
  fi
done

SCENARIO="$(cat "$SCENARIO_FILE")"
# A file holding only newlines passes -s, but command substitution strips
# them; fail here instead of running `coda start ""`.
if [[ -z "$SCENARIO" ]]; then
  echo "Scenario file ${SCENARIO_FILE} does not contain a scenario name" >&2
  exit 1
fi
echo "Scenario: ${SCENARIO}"

# Pull latest changes from main so new scenarios are always available.
# Explicitly fetch+reset main to handle AMIs built from non-main branches.
# The && / || chain makes the update best-effort: any failing step only
# prints the warning and the script continues with the baked checkout.
echo "Updating alloy-scenarios repo..."
git -C "$REPO_DIR" fetch origin main 2>/dev/null \
  && git -C "$REPO_DIR" checkout main 2>/dev/null \
  && git -C "$REPO_DIR" reset --hard origin/main 2>/dev/null \
  || echo "Warning: git update failed, using baked version"

# Start the scenario (builds images on demand).
# exec replaces this shell, so the exit status seen by systemd is coda's.
exec "$REPO_DIR/coda" start "$SCENARIO"


================================================
FILE: .coda/packer-install.sh
================================================
#!/usr/bin/env bash
# Packer provisioner that prepares an AMI for running coda scenarios.
#
# Usage: packer-install.sh [INSTALL_DIR]
#   INSTALL_DIR  existing alloy-scenarios checkout (default: /opt/alloy-scenarios).
#                The consuming Packer template must clone the repo there
#                before calling this script.
#
# Scenario images are deliberately NOT built here: `coda start` builds them
# on demand, so new scenarios work without re-baking the AMI.
set -euo pipefail

INSTALL_DIR="${1:-/opt/alloy-scenarios}"

readonly HOSTS_ENTRY='127.0.0.1 alloy'
readonly UNIT_NAME='coda-start.service'

# Print a provisioning step banner.
step() {
  echo "==> $*"
}

step "Adding host aliases for alloy"
# Append the alias only when that exact line is not already present.
if ! grep -qxF "$HOSTS_ENTRY" /etc/hosts; then
  echo "$HOSTS_ENTRY" >> /etc/hosts
fi

step "Symlinking coda CLI"
chmod +x "${INSTALL_DIR}/coda"
ln -sf "${INSTALL_DIR}/coda" /usr/local/bin/coda

step "Pre-pulling common base images"
# Only widely shared base images are pulled, to speed up first boot.
# A failed pull is tolerated; scenario-specific images come from `coda start`.
for base_image in "python:3.11-slim" "apache/kafka:3.9.0"; do
  docker pull "$base_image" || true
done

step "Installing systemd services"
cp "${INSTALL_DIR}/.coda/${UNIT_NAME}" "/etc/systemd/system/${UNIT_NAME}"
install -m 0755 "${INSTALL_DIR}/.coda/coda-start.sh" /usr/local/bin/coda-start.sh
systemctl daemon-reload

step "Done"


================================================
FILE: .cursor/docker-example.mdc
================================================
---
description: creating a new alloy docker example
globs: 
alwaysApply: false
---
# Grafana Alloy Docker Example Template

This template provides a comprehensive structure for creating a new Grafana Alloy example using Docker Compose. It includes all the necessary components to monitor your application or system with the LGMT stack (Loki, Grafana, Metrics/Prometheus, Tempo).

## Directory Structure

```
your-example-name/
├── config.alloy            # Alloy configuration file
├── docker-compose.yml      # Docker Compose configuration
├── loki-config.yaml        # Loki configuration
├── prom-config.yaml        # Prometheus configuration
├── tempo-config.yaml       # Tempo configuration (optional)
├── README.md               # Documentation for your example
└── [additional files...]   # Any additional files needed for your example
```

## Docker Compose Template

Below is a template for your `docker-compose.yml` file that includes all components of the LGMT stack. You can customize it based on your specific needs.

```yaml
version: '3.8'

services:
  # Loki: log store, started with the config file mounted from this directory
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.7}
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    ports:
      - "3100:3100/tcp"

  # Prometheus: metrics store with the remote-write and OTLP receivers enabled
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.10.0}
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090/tcp"
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--web.enable-remote-write-receiver"
      - "--web.enable-otlp-receiver"
      - "--enable-feature=native-histograms"
      - "--enable-feature=exemplar-storage"

  # Memcached for Tempo (referenced by the `cache` section of tempo-config.yaml)
  memcached:
    image: memcached:1.6.29
    container_name: memcached
    ports:
      - "11211:11211"
    # The official memcached image does not read MEMCACHED_* environment
    # variables, so tuning has to be passed as command-line flags.
    command:
      - memcached
      - "--memory-limit=64"  # Maximum cache memory, in megabytes
      - "--threads=4"        # Number of threads to use

  # One-shot helper: runs as root and chowns the shared ./tempo-data directory
  # to 10001:10001 (presumably the user Tempo runs as — confirm for your image).
  # The &tempoImage anchor is reused by the tempo service.
  tempo-init:
    image: &tempoImage grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.1}
    user: root
    volumes:
      - ./tempo-data:/var/tempo
    entrypoint: ["chown", "10001:10001", "/var/tempo"]

  # Tempo: trace backend; started after the permission fix-up, cache and Prometheus
  tempo:
    image: *tempoImage
    depends_on:
      - tempo-init
      - memcached
      - prometheus
    command:
      - "-config.file=/etc/tempo.yaml"
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml
      - ./tempo-data:/var/tempo
    ports:
      - "3200:3200/tcp"    # tempo
      - "4317:4317/tcp"    # otlp grpc
      - "4318:4318/tcp"    # otlp http
      - "14268:14268/tcp"  # jaeger thrift http
      - "14250:14250/tcp"  # jaeger grpc
      - "6831:6831/udp"    # jaeger thrift compact
      - "6832:6832/udp"    # jaeger thrift binary
      - "9411:9411/tcp"    # zipkin

  # Grafana for visualization
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-12.4.0}
    # Demo-only auth: anonymous visitors get the Admin role and basic auth is off.
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
      - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app
    ports:
      - 3000:3000/tcp
    # Replace the image entrypoint with a small shell script that writes the
    # datasource provisioning file (Loki, Prometheus, Tempo) and then hands
    # over to /run.sh. Everything below `- |` is the script text itself, so
    # do not add YAML comments inside it.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        EOF
        /run.sh
    # Start Grafana only after the backends it queries have been started.
    depends_on:
      - prometheus
      - tempo

  # Alloy: the telemetry pipeline, driven by ./config.alloy
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.14.0}
    # Folded scalar: the lines below are joined with spaces into one command.
    command: >-
      run
      --server.http.listen-addr=0.0.0.0:12345
      --storage.path=/var/lib/alloy/data
      /etc/alloy/config.alloy
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - /var/run/docker.sock:/var/run/docker.sock  # For Docker monitoring (optional)
    ports:
      - "12345:12345"      # Alloy HTTP server
```

## Configuration Files

### Loki Configuration (loki-config.yaml)

```yaml
# Single-tenant mode: no authentication
auth_enabled: false

server:
  http_listen_port: 3100

# Single instance: in-memory ring, one replica, everything under /tmp/loki
common:
  path_prefix: /tmp/loki
  replication_factor: 1
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory

# TSDB index (schema v13) with chunks on the local filesystem
schema_config:
  configs:
    - from: 2020-05-15
      schema: v13
      store: tsdb
      object_store: filesystem
      index:
        prefix: index_
        period: 24h

storage_config:
  filesystem:
    directory: /tmp/loki/chunks
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

ingester:
  max_chunk_age: 2h

pattern_ingester:
  enabled: true
```

### Prometheus Configuration (prom-config.yaml)

```yaml
global:
  evaluation_interval: 15s
  scrape_interval: 15s

# Accept samples that arrive up to 30 minutes out of order
storage:
  tsdb:
    out_of_order_time_window: 30m

otlp:
  # Recommended attributes to be promoted to labels.
  promote_resource_attributes:
    - service.instance.id
    - service.name
    - service.namespace
    - service.version
    - cloud.availability_zone
    - cloud.region
    - container.name
    - deployment.environment
    - deployment.environment.name
    - k8s.cluster.name
    - k8s.container.name
    - k8s.namespace.name
    - k8s.pod.name

scrape_configs:
  # Prometheus scraping itself
  - job_name: prometheus
    static_configs:
      - targets:
          - localhost:9090

  # Alloy's HTTP server on the compose network
  - job_name: alloy
    static_configs:
      - targets:
          - alloy:12345
```

### Tempo Configuration (tempo-config.yaml)

```yaml
# HTTP listener for the Tempo API
server:
  http_listen_port: 3200
  log_level: info

# Use the memcached container as the cache for the frontend-search role
cache:
  background:
    writeback_goroutines: 5
  caches:
    - roles:
        - frontend-search
      memcached:
        addresses: dns+memcached:11211

query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
    metadata_slo:
      duration_slo: 5s
      throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 100ms
  metrics:
    max_duration: 200h                # maximum duration of a metrics query, increase for local setups
    query_backend_after: 5m
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09

# Trace ingest endpoints, one per supported protocol
distributor:
  receivers:
    jaeger:
      protocols:
        thrift_http:
          endpoint: "tempo:14268"
        grpc:
          endpoint: "tempo:14250"
        thrift_binary:
          endpoint: "tempo:6832"
        thrift_compact:
          endpoint: "tempo:6831"
    zipkin:
      endpoint: "tempo:9411"
    otlp:
      protocols:
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
    opencensus:
      endpoint: "tempo:55678"

ingester:
  max_block_duration: 5m

compactor:
  compaction:
    block_retention: 720h

# Note: The metrics_generator section below can be enabled for built-in service graphs.
# Alternatively, use Alloy's servicegraph connector as shown in the
# otel-tracing-service-graphs example.
# metrics_generator:
#   registry:
#     external_labels:
#       source: tempo
#       cluster: docker-compose
#   storage:
#     path: /var/tempo/generator/wal
#     remote_write:
#       - url: http://prometheus:9090/api/v1/write
#         send_exemplars: true
#   traces_storage:
#     path: /var/tempo/generator/traces
#   processor:
#     local_blocks:
#       filter_server_spans: false
#       flush_to_storage: true

# Local-disk trace storage under /var/tempo (the mounted ./tempo-data volume)
storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks

# Note: Service graph generation is commented out to allow using Alloy for this purpose.
# overrides:
#   defaults:
#     metrics_generator:
#       processors: [service-graphs, span-metrics, local-blocks]
#       generate_native_histograms: both
```

### Alloy Configuration with Service Graph Generation (config.alloy)

```river
/*
 * Trace pipeline: OTLP in -> batch -> Tempo, with service-graph metrics
 * derived from the same spans and pushed to Prometheus over OTLP/HTTP.
 */

// Entry point: accept OTLP traces over gRPC and HTTP with default settings.
otelcol.receiver.otlp "default" {
  grpc {}
  http {}

  output {
    traces = [otelcol.processor.batch.default.input]
  }
}

// Group spans into batches, then fan out to the service-graph connector and Tempo.
otelcol.processor.batch "default" {
  output {
    traces = [
      otelcol.connector.servicegraph.default.input,
      otelcol.exporter.otlp.tempo.input,
    ]
  }
}

// Trace storage: forward spans to Tempo's OTLP gRPC endpoint without TLS.
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo:4317"

    tls {
      insecure = true
    }
  }
}

// Build service-graph metrics from spans, flushed every 10s,
// with http.method as an extra dimension.
otelcol.connector.servicegraph "default" {
  dimensions             = ["http.method"]
  metrics_flush_interval = "10s"

  output {
    metrics = [otelcol.exporter.otlphttp.prometheus.input]
  }
}

// Metrics sink: Prometheus' OTLP HTTP receiver.
otelcol.exporter.otlphttp "prometheus" {
  client {
    endpoint = "http://prometheus:9090/api/v1/otlp"

    tls {
      insecure = true
    }
  }
}
```

## README Template

The README.md file for your example should include:

1. A brief description of what the example demonstrates
2. Instructions for running the example
3. What to expect after running the example
4. Any additional steps or configuration needed

Example:

```markdown
# Your Example Name

Brief description of what this example demonstrates and its purpose.

## Overview

The example includes:
- Component 1 (brief description)
- Component 2 (brief description)
- ...

## Running the Demo

1. Clone the repository:
   ```
   git clone https://github.com/grafana/alloy-scenarios.git
   cd alloy-scenarios
   ```

2. Navigate to this example directory:
   ```
   cd your-example-name
   ```

3. Run using Docker Compose:
   ```
   docker compose up -d
   ```
   
   Or use the centralized image management:
   ```
   cd ..
   ./run-example.sh your-example-name
   ```

4. Access Grafana at http://localhost:3000

## What to Expect

Describe what the user should see after running the example, including:
- What metrics/logs are being collected
- Any dashboards that are automatically set up
- How to interact with the example

## Service Graphs (if applicable)

If your example includes service graph visualization capabilities:

1. Open Grafana (http://localhost:3000)
2. Navigate to Explore
3. Select the Tempo data source
4. Click on the "Service Graph" tab
5. You should see a visual representation of the relationships between services

## Architecture

```
┌────────────┐     ┌──────────┐      ┌───────┐      ┌─────────┐
│ Component1 │────▶│ Component2│─────▶│Component3│──▶│ Grafana │
└────────────┘     └──────────┘      └───┬───┘      └─────────┘
                                         │                ▲
                                         ▼                │
                                    ┌─────────┐           │
                                    │Component4│───────────┘
                                    └─────────┘
```

Brief explanation of the architecture and data flow.

## Additional Configuration

Any additional steps or configuration that might be needed.
```

## Customizing Your Example

To create your own example:

1. Create a new directory with your example name at the root of the repository
2. Copy the template files shown above into that directory
3. Customize the files for your specific use case
4. Update the README.md with specific instructions for your example
5. Add your example to the main README.md table with a link and description


================================================
FILE: .cursor/k8s-example.mdc
================================================
---
description: creating a new alloy kubernetes example
globs: 
alwaysApply: false
---
# Grafana Alloy Kubernetes Example Template

This template provides a comprehensive structure for creating a new Grafana Alloy example using Kubernetes. It is based on the Kubernetes Monitoring Helm chart, which removes the need to hand-write Alloy configuration and deploys Alloy with best practices for monitoring Kubernetes clusters.

## Directory Structure

```
your-k8s-example-name/
├── k8s-monitoring-values.yml   # K8s monitoring helm chart values
├── loki-values.yml             # Loki helm chart values
├── grafana-values.yml          # Grafana helm chart values
├── kind.yml                    # Kind cluster configuration (optional)
├── README.md                   # Documentation for your example
└── [additional files...]       # Any additional files needed for your example
```

## Kubernetes Configuration Files

### Kind Cluster Configuration (kind.yml)

If you're using Kind for local development, you can use this configuration:

```yaml
# Single-node kind cluster; container port 30000 is mapped to host port 30000.
apiVersion: kind.x-k8s.io/v1alpha4
kind: Cluster
nodes:
  - role: control-plane
    extraPortMappings:
      - hostPort: 30000
        containerPort: 30000
```

### K8s Monitoring Helm Chart Values (k8s-monitoring-values.yml)

This file configures the Kubernetes Monitoring Helm chart with Alloy settings:

```yaml
---
cluster:
  name: example-monitoring

# Where collected data is sent
destinations:
  - type: loki
    name: loki
    url: http://loki-gateway.meta.svc.cluster.local/loki/api/v1/push

# ---- Logs and events (handled by the alloy-logs collector) ----

# Cluster Events Collection
clusterEvents:
  enabled: true
  collector: alloy-logs
  namespaces:
    - meta
    - default

# Node Logs Collection
nodeLogs:
  enabled: true
  collector: alloy-logs

# Pod Logs Collection
podLogs:
  enabled: true
  collector: alloy-logs
  gatherMethod: kubernetesApi
  namespaces:
    - meta
    - default
  labelsToKeep:
    - app_kubernetes_io_name
    - container
    - instance
    - job
    - level
    - namespace
    - service_name
    - service_namespace
    - deployment_environment
    - deployment_environment_name
  structuredMetadata:
    pod: pod  # Set structured metadata "pod" from label "pod"

# ---- Metrics (handled by the alloy-metrics collector) ----

# Node Metrics Collection
nodeMetrics:
  enabled: true
  collector: alloy-metrics

# Pod Metrics Collection
podMetrics:
  enabled: true
  collector: alloy-metrics
  namespaces:
    - meta
    - default

# Kubernetes API Server Metrics
kubernetesMetrics:
  enabled: true
  collector: alloy-metrics

# ---- Traces and profiles ----

# Traces Collection (if applicable)
traces:
  enabled: true
  collector: alloy-receiver
  namespaces:
    - meta
    - default

# Profiles Collection (if applicable)
profiles:
  enabled: true
  collector: alloy-profiles
  namespaces:
    - meta
    - default

# ---- Collectors Configuration ----
# Each enabled collector runs with clustering turned on; the singleton is off.

alloy-singleton:
  enabled: false

alloy-metrics:
  enabled: true
  alloy:
    clustering:
      enabled: true

alloy-logs:
  enabled: true
  alloy:
    clustering:
      enabled: true
    mounts:
      varlog: true

alloy-profiles:
  enabled: true
  alloy:
    clustering:
      enabled: true

alloy-receiver:
  enabled: true
  alloy:
    clustering:
      enabled: true
```

### Loki Helm Chart Values (loki-values.yml)

Configuration for the Loki Helm chart:

```yaml
---
loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      - from: 2024-01-01
        store: tsdb
        object_store: s3
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  ingester:
    chunk_encoding: snappy
  tracing:
    enabled: true
  pattern_ingester:
    enabled: true
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
  ruler:
    enable_api: true
  querier:
    max_concurrent: 4

minio:
  enabled: true

deploymentMode: SingleBinary
singleBinary:
  replicas: 1
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 2Gi
  extraEnv:
    - name: GOMEMLIMIT
      value: 3750MiB

chunksCache:
  writebackSizeLimit: 10MB

# Zero out replica counts of other deployment modes
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0

ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0
```

### Grafana Helm Chart Values (grafana-values.yml)

Configuration for the Grafana Helm chart:

```yaml
---
persistence:
  type: pvc
  enabled: true

# DO NOT DO THIS IN PRODUCTION USE CASES
adminUser: admin
adminPassword: adminadminadmin
# CONSIDER USING AN EXISTING SECRET
# admin:
#  existingSecret: ""
#  userKey: admin-user
#  passwordKey: admin-password

service:
  enabled: true
  type: ClusterIP

datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
    - name: Loki
      type: loki
      access: proxy
      orgId: 1
      url: http://loki-gateway.meta.svc.cluster.local:80
      basicAuth: false
      isDefault: false
      version: 1
      editable: false
    - name: Prometheus
      type: prometheus
      access: proxy
      orgId: 1
      url: http://prometheus-server.meta.svc.cluster.local:80
      basicAuth: false
      isDefault: true
      version: 1
      editable: false
    - name: Tempo
      type: tempo
      access: proxy
      orgId: 1
      url: http://tempo.meta.svc.cluster.local:80
      basicAuth: false
      isDefault: false
      version: 1
      editable: false
```

## README Template

Here's a template for your example's README.md:

````markdown
# Your Kubernetes Example Name

Brief description of what this example demonstrates and its purpose.

## Prerequisites

- Kubernetes cluster (or Kind for local development)
- Helm (v3.x)
- kubectl

## Setup

### 1. Create a Kubernetes Cluster (Optional, if using Kind)

```bash
kind create cluster --config kind.yml
```

### 2. Create a Namespace for Monitoring

```bash
kubectl create namespace meta
```

### 3. Install Loki

Add the Grafana Helm repository if you haven't already:

```bash
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
```

Install Loki:

```bash
helm install --values loki-values.yml loki grafana/loki -n meta
```

### 4. Install Grafana

```bash
helm install --values grafana-values.yml grafana grafana/grafana --namespace meta
```

### 5. Install Kubernetes Monitoring (with Alloy)

```bash
helm install --values ./k8s-monitoring-values.yml k8s grafana/k8s-monitoring -n meta
```

## Accessing the Dashboard

### Port Forward Grafana

```bash
kubectl port-forward -n meta svc/grafana 3000:80
```

Navigate to http://localhost:3000 in your browser. The default credentials are:
- Username: admin
- Password: adminadminadmin

## What to Expect

Describe what the user should see after setting up the example, including:
- What metrics/logs are being collected
- Any dashboards that are automatically set up
- How to interact with the example

## Cleanup

To remove the deployed resources:

```bash
helm uninstall k8s -n meta
helm uninstall grafana -n meta
helm uninstall loki -n meta
kubectl delete namespace meta
```

If you created a Kind cluster:

```bash
kind delete cluster
```
````

## Customizing Your Example

To create your own example:

1. Create a new directory with your example name at the root of the repository
2. Copy the template files shown above into that directory
3. Customize the files for your specific use case
4. Update the README.md with specific instructions for your example
5. Add your example to the main README.md table with a link and description

## Typical Use Cases for Kubernetes Examples

1. **Logs Collection**: Collecting and analyzing logs from applications running in Kubernetes
2. **Metrics Monitoring**: Monitoring application and infrastructure metrics
3. **Tracing**: Distributed tracing for microservices
4. **Profiling**: Performance profiling of applications
5. **Combined Observability**: Demonstrating how to use all telemetry types together

## Special Considerations for Kubernetes

- **Resource Limits**: Adjust resource requests and limits based on your cluster capacity
- **Persistent Storage**: Configure appropriate storage classes for your environment
- **Security**: In production environments, use proper authentication methods
- **Network Policies**: Consider adding network policies if required for your environment


================================================
FILE: .github/k8s-scenarios.json
================================================
{
  "metrics": [
    { "release": "prometheus", "chart": "prometheus-community/prometheus", "values": "prometheus-values.yml" },
    { "release": "grafana",    "chart": "grafana/grafana",                 "values": "grafana-values.yml" },
    { "release": "k8s",        "chart": "grafana/k8s-monitoring",          "values": "k8s-monitoring-values.yml", "version": "^4.0.0" }
  ],
  "logs": [
    { "release": "loki",       "chart": "grafana/loki",                    "values": "loki-values.yml" },
    { "release": "grafana",    "chart": "grafana/grafana",                 "values": "grafana-values.yml" },
    { "release": "k8s",        "chart": "grafana/k8s-monitoring",          "values": "k8s-monitoring-values.yml", "version": "^4.0.0" }
  ],
  "tracing": [
    { "release": "tempo",      "chart": "grafana/tempo",                   "values": "tempo-values.yml" },
    { "release": "grafana",    "chart": "grafana/grafana",                 "values": "grafana-values.yml" },
    { "release": "k8s",        "chart": "grafana/k8s-monitoring",          "values": "k8s-monitoring-values.yml", "version": "^4.0.0" }
  ],
  "profiling": [
    { "release": "pyroscope",  "chart": "grafana/pyroscope",               "values": "pyroscope-values.yml" },
    { "release": "grafana",    "chart": "grafana/grafana",                 "values": "grafana-values.yml" },
    { "release": "k8s",        "chart": "grafana/k8s-monitoring",          "values": "k8s-monitoring-values.yml", "version": "^4.0.0" }
  ],
  "events": [
    { "release": "loki",       "chart": "grafana/loki",                    "values": "loki-values.yml" },
    { "release": "grafana",    "chart": "grafana/grafana",                 "values": "grafana-values.yml" }
  ]
}


================================================
FILE: .github/scenario-list.txt
================================================
aws-firehose-logs
blackbox-probing
continuous-profiling
docker-monitoring
elasticsearch-monitoring
faro-frontend-observability
game-of-tracing
gelf-log-ingestion
kafka
linux
log-api-gateway
log-secret-filtering
logs-file
logs-tcp
mail-house
memcached-monitoring
mysql-monitoring
nginx-monitoring
otel-basic-tracing
otel-metrics-pipeline
otel-span-metrics
otel-tail-sampling
otel-tracing-service-graphs
postgres-monitoring
redis-monitoring
routing
self-monitoring
snmp
syslog
systemd-journal
trace-delivery
vault-secrets
windows
windows-events


================================================
FILE: .github/workflows/check-image-versions.yml
================================================
name: check-image-versions

# Drift guard: every ${VAR:-default} fallback in a docker-compose file
# must match the value of VAR in image-versions.env.
#
# Without this check, renovate's docker manager (which updates fallbacks
# in compose files) and the customManager in renovate.json (which
# updates image-versions.env) can fall out of lockstep — leaving anyone
# who runs `docker compose up` without `--env-file image-versions.env`
# on stale versions.

# Triggers: PRs that touch any compose-file variant, the pinned-versions
# file, or this workflow itself; plus every push to main.
on:
  pull_request:
    paths:
      - '**/docker-compose.yml'
      - '**/docker-compose.yaml'
      - '**/docker-compose.coda.yml'
      - '**/docker-compose.coda.yaml'
      - 'image-versions.env'
      - '.github/workflows/check-image-versions.yml'
  # No paths filter here, so the check re-runs on every push to main.
  push:
    branches: [main]

# Read-only token: the job only checks out the repo and reads files.
permissions:
  contents: read

jobs:
  check:
    name: Compose fallbacks vs image-versions.env
    runs-on: ubuntu-latest
    timeout-minutes: 3
    steps:
      - name: Harden runner
        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
        with:
          egress-policy: block
          allowed-endpoints: >
            api.github.com:443
            github.com:443

      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Compare fallbacks against image-versions.env
        # Fails the job when any ${VAR:-default} fallback in a compose file
        # differs from VAR's value in image-versions.env. Variables that
        # appear in a compose file but not in image-versions.env only warn.
        run: |
          set -euo pipefail

          # Build a map of VAR=value from image-versions.env.
          # Names may contain digits after the first character (e.g.
          # K6_VERSION); a letters-only pattern would skip them silently.
          declare -A want
          while IFS='=' read -r k v; do
            [[ "$k" =~ ^[A-Z_][A-Z0-9_]*_VERSION$ ]] || continue
            want[$k]="$v"
          done < <(grep -E '^[A-Z_][A-Z0-9_]*_VERSION=' image-versions.env)

          echo "Tracking ${#want[@]} version variables:"
          for k in "${!want[@]}"; do
            echo "  $k=${want[$k]}"
          done
          echo

          # Scan every fallback. Pattern: ${VAR:-default}
          # Group 1 is the variable name, group 2 the fallback value.
          fallback_re='\$\{([A-Z_][A-Z0-9_]*_VERSION):-([^}]+)\}'
          mismatches=0
          while IFS= read -r -d '' f; do
            # `|| [ -n "$line" ]` keeps a final line that lacks a trailing
            # newline from being dropped.
            while IFS= read -r line || [ -n "$line" ]; do
              # A line can hold more than one fallback; keep matching on
              # whatever is left of the line after each hit.
              rest="$line"
              while [[ "$rest" =~ $fallback_re ]]; do
                var="${BASH_REMATCH[1]}"
                fallback="${BASH_REMATCH[2]}"
                # Consume through this match before any `continue`, so the
                # loop always advances.
                rest="${rest#*"${BASH_REMATCH[0]}"}"
                expected="${want[$var]:-}"
                if [ -z "$expected" ]; then
                  echo "::warning file=$f::unknown variable $var (not in image-versions.env)"
                  continue
                fi
                if [ "$fallback" != "$expected" ]; then
                  echo "::error file=$f::\${$var:-$fallback} should be \${$var:-$expected}"
                  mismatches=$((mismatches+1))
                fi
              done
            done < "$f"
          done < <(find . -type f \
                    \( -name 'docker-compose.yml' -o -name 'docker-compose.yaml' \
                       -o -name 'docker-compose.coda.yml' -o -name 'docker-compose.coda.yaml' \) \
                    -not -path '*/k8s/*' -not -path '*/.git/*' -print0)

          if [ "$mismatches" -gt 0 ]; then
            echo
            echo "::error::Found $mismatches drift(s). Update either the fallback in the compose file or image-versions.env."
            exit 1
          fi
          echo "OK — all fallbacks match image-versions.env"


================================================
FILE: .github/workflows/validate-k8s-scenarios.yml
================================================
name: validate-k8s-scenarios

# Lightweight validation for k8s scenarios under k8s/. Mirrors the
# defense-in-depth posture of validate-scenarios.yml (docker), but
# without paying the cost of a real cluster on every PR:
#
#   validate (every PR):  helm template + kubeconform per chart per scenario.
#                         Renders offline, validates against k8s API schemas.
#   kind-integration:     opt-in via workflow_dispatch only. Boots kind,
#                         helm-installs all charts, waits for pods Ready.
#
# Defense-in-depth (same as the docker workflow):
#   - permissions: contents: read       (no token write, no secrets)
#   - harden-runner egress allowlist    (compromised tool can't phone home)
#   - third-party actions SHA-pinned    (tag pushes can't sneak in)
#   - direct binary downloads, version-pinned (helm, kubeconform)
#   - github-hosted ephemeral runners
#   - pull_request, not pull_request_target

on:
  # PR runs are limited to changes under k8s/, the scenario→chart map, or
  # this workflow file.
  pull_request:
    paths:
      - 'k8s/**'
      - '.github/k8s-scenarios.json'
      - '.github/workflows/validate-k8s-scenarios.yml'
  # Manual runs always execute `validate`; the two inputs below only
  # control the optional kind-integration job.
  workflow_dispatch:
    inputs:
      kind_integration:
        description: 'Run the kind-cluster integration job after validation'
        type: boolean
        default: false
      scenario:
        description: 'Which scenario(s) to run kind-integration for ("all" or comma-separated subset, e.g. "metrics,logs")'
        type: string
        default: 'all'

permissions:
  contents: read

# PR runs share one group per PR number, so a newer push cancels the older
# run. Events without a PR number fall back to the unique run_id and
# therefore never cancel each other.
concurrency:
  group: validate-k8s-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

# Tool pins read by the install steps. KUBERNETES_VERSION is the API
# version kubeconform validates rendered manifests against.
env:
  HELM_VERSION: 'v4.1.4'
  KUBECONFORM_VERSION: 'v0.6.7'
  KUBERNETES_VERSION: '1.31.0'

jobs:
  # ──────────────────────────────────────────────────────────────────
  # validate: helm template + kubeconform per chart for each scenario
  # in the matrix. Pure offline — no API server, no real cluster.
  # ──────────────────────────────────────────────────────────────────
  validate:
    name: Validate ${{ matrix.scenario }}
    runs-on: ubuntu-latest
    timeout-minutes: 8
    strategy:
      # Let every scenario report on its own instead of cancelling the
      # remaining matrix entries after the first failure.
      fail-fast: false
      matrix:
        # Each entry is both a directory under k8s/ and a key in
        # .github/k8s-scenarios.json.
        scenario: [metrics, logs, tracing, profiling, events]
    steps:
      - name: Harden runner
        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
        with:
          egress-policy: block
          # Hosts the later steps in this job reach: get.helm.sh (helm
          # tarball), github.com (kubeconform release),
          # raw.githubusercontent.com (datree CRD schema catalog), the
          # grafana and prometheus-community chart repos, and pypi.org /
          # files.pythonhosted.org (pip install yamllint).
          # NOTE(review): charts.bitnami.com is not referenced by any step
          # in this job — confirm it is still required.
          allowed-endpoints: >
            api.github.com:443
            github.com:443
            objects.githubusercontent.com:443
            release-assets.githubusercontent.com:443
            raw.githubusercontent.com:443
            get.helm.sh:443
            grafana.github.io:443
            prometheus-community.github.io:443
            charts.bitnami.com:443
            pypi.org:443
            files.pythonhosted.org:443

      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install helm + kubeconform + yamllint
        run: |
          set -euo pipefail

          # Download locations, built from the workflow-level version pins.
          helm_tarball="https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz"
          kubeconform_tarball="https://github.com/yannh/kubeconform/releases/download/${KUBECONFORM_VERSION}/kubeconform-linux-amd64.tar.gz"

          # Helm: version-pinned upstream tarball. Verifying a checksum
          # would be better, but Helm doesn't publish stable per-tag
          # checksums in a consumable way, so the pin plus the egress
          # allowlist is the workable compromise.
          curl -fsSL "$helm_tarball" | tar -xz -C /tmp
          sudo install -m 0755 /tmp/linux-amd64/helm /usr/local/bin/helm

          # kubeconform: version-pinned. Only the binary is pulled out of
          # the release archive, straight into /usr/local/bin.
          curl -fsSL "$kubeconform_tarball" | sudo tar -xz -C /usr/local/bin/ kubeconform

          # yamllint: installed with the python3 + pip that ubuntu-latest
          # already provides.
          sudo pip install --quiet yamllint

          # Record the installed versions in the job log.
          helm version --short
          kubeconform -v
          yamllint --version

      - name: Helm repo bootstrap
        # Registers the two repos that the chart references in
        # .github/k8s-scenarios.json resolve against (grafana/… and
        # prometheus-community/…).
        run: |
          set -euo pipefail
          helm repo add grafana https://grafana.github.io/helm-charts
          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
          helm repo update

      - name: yamllint values files
        # Loose ruleset — values files commonly use long datasource URLs and
        # don't need a leading `---`. Run as advisory: don't fail the job on
        # style; we want it for hygiene signal, not blocking.
        continue-on-error: true
        # Lints the whole scenario directory, not only the values files
        # listed in k8s-scenarios.json.
        run: |
          yamllint -d "{extends: relaxed, rules: {line-length: disable, document-start: disable}}" \
            k8s/${{ matrix.scenario }}/

      - name: Helm template + kubeconform per chart
        # For every chart entry of this scenario: render it with its values
        # file, then validate the rendered manifests. Failures are collected
        # so all charts are attempted before the step reports its result.
        run: |
          set -euo pipefail
          mkdir -p /tmp/rendered
          # Set to 1 on any render/validation failure; becomes the exit code.
          fail=0

          # Iterate the scenario's chart list. helm template against the
          # remote chart triggers values.schema.json validation upstream
          # (most grafana charts ship a schema), then kubeconform validates
          # the rendered Kubernetes API objects against the target version.
          while IFS= read -r entry; do
            release=$(jq -r '.release' <<<"$entry")
            chart=$(jq -r '.chart'    <<<"$entry")
            values=$(jq -r '.values'  <<<"$entry")
            # `version` is optional in the JSON; a missing key becomes "".
            version=$(jq -r '.version // ""' <<<"$entry")
            values_path="k8s/${{ matrix.scenario }}/$values"

            # Array form so an absent version contributes no arguments.
            ver_arg=()
            [ -n "$version" ] && ver_arg=(--version "$version")

            echo "::group::helm template $release ($chart${version:+ @$version})"
            out="/tmp/rendered/${{ matrix.scenario }}-$release.yaml"
            # stderr goes to a sibling .err file and is only printed when
            # rendering fails; both files live under /tmp/rendered.
            if ! helm template "$release" "$chart" "${ver_arg[@]}" \
                  -f "$values_path" > "$out" 2> "/tmp/rendered/${{ matrix.scenario }}-$release.err"; then
              echo "::error::helm template failed for $release"
              cat "/tmp/rendered/${{ matrix.scenario }}-$release.err"
              fail=1
              echo "::endgroup::"
              # Nothing was rendered, so skip kubeconform for this chart.
              continue
            fi
            lines=$(wc -l < "$out")
            echo "Rendered $lines lines to $out"
            echo "::endgroup::"

            echo "::group::kubeconform $release"
            # -ignore-missing-schemas: skip CRDs whose schemas aren't in the
            # datree catalog (catching built-in K8s API drift is the real
            # signal; CRD validation is the chart maintainer's responsibility).
            if ! kubeconform -strict -summary \
                  -kubernetes-version "$KUBERNETES_VERSION" \
                  -schema-location default \
                  -schema-location 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{.Group}}/{{.ResourceKind}}_{{.ResourceAPIVersion}}.json' \
                  -ignore-missing-schemas \
                  "$out"; then
              echo "::error::kubeconform failed for $release"
              fail=1
            fi
            echo "::endgroup::"
          # One compact JSON object per line: the chart entries listed under
          # this scenario's key.
          done < <(jq -c --arg s "${{ matrix.scenario }}" '.[$s][]' .github/k8s-scenarios.json)

          exit $fail

      - name: Upload rendered manifests
        # always(): upload whatever landed in /tmp/rendered (rendered YAML
        # and the helm .err files) even when the previous step failed.
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          # One artifact per matrix entry.
          name: rendered-${{ matrix.scenario }}
          path: /tmp/rendered/
          retention-days: 7

  # ──────────────────────────────────────────────────────────────────
  # kind-integration: Boots a real kind cluster and helm-installs all
  # charts for the scenario. Heavy — only on workflow_dispatch.
  # ──────────────────────────────────────────────────────────────────
  kind-integration:
    name: Kind integration ${{ matrix.scenario }}
    # Both conditions are required: a manual dispatch AND the
    # kind_integration input set to true. PR runs never reach this job.
    if: github.event_name == 'workflow_dispatch' && inputs.kind_integration == true
    runs-on: ubuntu-latest
    timeout-minutes: 25
    strategy:
      fail-fast: false
      matrix:
        # Same list as the validate job; the "Filter by scenario input"
        # step decides per entry whether the remaining steps run.
        scenario: [metrics, logs, tracing, profiling, events]
    steps:
      - name: Harden runner
        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
        with:
          egress-policy: block
          # Adds image registries on top of the validate allowlist —
          # helm install actually pulls images for kind to schedule.
          allowed-endpoints: >
            api.github.com:443
            github.com:443
            objects.githubusercontent.com:443
            release-assets.githubusercontent.com:443
            raw.githubusercontent.com:443
            get.helm.sh:443
            grafana.github.io:443
            prometheus-community.github.io:443
            charts.bitnami.com:443
            registry-1.docker.io:443
            auth.docker.io:443
            production.cloudflare.docker.com:443
            ghcr.io:443
            quay.io:443
            cdn.quay.io:443
            grafana.com:443
            mcr.microsoft.com:443
            public.ecr.aws:443

      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Filter by scenario input
        id: filter
        # Sets the step output `run` to true/false; every later step in this
        # job is gated on it.
        #
        # User-supplied workflow_dispatch input is passed via env, NOT
        # interpolated directly into the run block, to prevent
        # template-injection (zizmor: template-injection rule).
        # matrix.scenario IS safe to interpolate directly because it's
        # constrained to the static list above.
        env:
          USER_SCENARIO: ${{ inputs.scenario }}
          MATRIX_SCENARIO: ${{ matrix.scenario }}
        run: |
          set -euo pipefail
          # Drop all whitespace so "metrics, logs" selects the same subset
          # as "metrics,logs" and " all " still means all.
          selection=$(tr -d '[:space:]' <<<"$USER_SCENARIO")
          if [ "$selection" = "all" ]; then
            echo "run=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # -F: compare the scenario name as a fixed string, not a regex.
          # -x: the whole comma-separated item must match.
          if grep -Fqx "$MATRIX_SCENARIO" <(tr ',' '\n' <<<"$selection"); then
            echo "run=true" >> "$GITHUB_OUTPUT"
          else
            echo "run=false" >> "$GITHUB_OUTPUT"
            echo "::notice::Skipping $MATRIX_SCENARIO (not in user-selected subset '$USER_SCENARIO')"
          fi

      - name: Install helm
        if: steps.filter.outputs.run == 'true'
        # Same version-pinned tarball install as the validate job.
        run: |
          # pipefail: a failed download must fail the step rather than be
          # masked by the exit status of tar at the end of the pipeline.
          set -euo pipefail
          curl -fsSL "https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz" \
            | tar -xz -C /tmp
          sudo install -m 0755 /tmp/linux-amd64/helm /usr/local/bin/helm

      - name: Create kind cluster
        if: steps.filter.outputs.run == 'true'
        uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0
        with:
          # Each scenario directory supplies its own kind.yml; the cluster
          # is named after the scenario.
          config: k8s/${{ matrix.scenario }}/kind.yml
          cluster_name: ${{ matrix.scenario }}

      - name: Helm bootstrap + install all charts
        if: steps.filter.outputs.run == 'true'
        # Installs every chart listed for this scenario into the `meta`
        # namespace, in the order given in .github/k8s-scenarios.json.
        run: |
          set -euo pipefail
          helm repo add grafana https://grafana.github.io/helm-charts
          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
          helm repo update
          # `|| true`: any failure here is ignored; helm install below also
          # passes --create-namespace, so the namespace is created either way.
          kubectl create namespace meta || true

          while IFS= read -r entry; do
            release=$(jq -r '.release' <<<"$entry")
            chart=$(jq -r '.chart'    <<<"$entry")
            values=$(jq -r '.values'  <<<"$entry")
            # `version` is optional in the JSON; a missing key becomes "".
            version=$(jq -r '.version // ""' <<<"$entry")
            values_path="k8s/${{ matrix.scenario }}/$values"

            # Array form so an absent version contributes no arguments.
            ver_arg=()
            [ -n "$version" ] && ver_arg=(--version "$version")

            echo "::group::helm install $release ($chart)"
            # Under `set -e` a failed install aborts the step right here, so
            # later charts are not attempted and ::endgroup:: is not printed.
            helm install "$release" "$chart" "${ver_arg[@]}" \
              -f "$values_path" -n meta --create-namespace \
              --wait --timeout 5m
            echo "::endgroup::"
          done < <(jq -c --arg s "${{ matrix.scenario }}" '.[$s][]' .github/k8s-scenarios.json)

      - name: Wait for pods Ready in meta namespace
        if: steps.filter.outputs.run == 'true'
        # Final gate: every pod in `meta` must report Ready within 10
        # minutes. On timeout, dump pod state for debugging and fail.
        run: |
          if kubectl wait --for=condition=Ready pods --all -n meta --timeout=10m; then
            # Success: leave a pod listing in the log for reference.
            kubectl get pods -n meta -o wide
          else
            echo "::error::Pods did not become Ready"
            kubectl get pods -n meta -o wide
            kubectl describe pods -n meta
            exit 1
          fi


================================================
FILE: .github/workflows/validate-scenarios.yml
================================================
name: validate-scenarios

# Boots every scenario whose files were touched by the PR, after a CVE
# scan of every image the scenario will run. Designed to make renovate
# dependency PRs reviewable on signal rather than diff-eyeballing alone.
#
# Defense-in-depth (intentional, not paranoia):
#   - permissions: contents: read       — no token write, no secrets
#   - third-party actions SHA-pinned    — tag pushes can't sneak in
#   - trivy advisory scan before boot   — known-bad images flagged in PR
#   - github-hosted ephemeral runners   — runner state is not persisted
#
# Triggered on pull_request (NOT pull_request_target): fork PRs run
# without secrets, which is the safe default. Updating this file
# requires the same scrutiny as updating any third-party action SHA.

on:
  pull_request:
    # In GitHub path filters `*` does not cross a `/`, so each pattern is
    # tied to a fixed depth: `*/x` matches files directly inside a
    # top-level directory, `*/*/x` one level below that.
    # NOTE(review): files nested deeper than these patterns reach will not
    # trigger this workflow — confirm that is intended.
    paths:
      - '*/docker-compose.yml'
      - '*/docker-compose.yaml'
      - '*/docker-compose.coda.yml'
      - '*/Dockerfile'
      - '*/config.alloy'
      - '*/app/**'
      - '*/*/Dockerfile'
      - '*/*/requirements.txt'
      - '*/*/package.json'
      - '*/*/*.csproj'
      - 'image-versions.env'
      - '.github/scenario-list.txt'
      - '.github/workflows/validate-scenarios.yml'
  # Manual trigger — runs the full matrix without the sampling cap, so a
  # maintainer can validate a cross-cutting change (e.g. an LGMT bump
  # that touches every scenario) before merging. PRs auto-sample when
  # affected count exceeds MATRIX_CAP; workflow_dispatch always runs all.
  workflow_dispatch: {}

env:
  # Maximum scenarios to validate on a PR before sampling kicks in.
  # Picked so a typical big update finishes within ~30 min wall-clock
  # at the configured max-parallel; bypassed by workflow_dispatch.
  MATRIX_CAP: '8'

permissions:
  contents: read

concurrency:
  # `pull_request.number || run_id` keeps PR runs grouped (and superseded
  # by force-pushes) while still giving every workflow_dispatch run its
  # own slot — manual full-matrix runs shouldn't cancel each other.
  group: validate-scenarios-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

jobs:
  # ──────────────────────────────────────────────────────────────────
  # detect: Map changed files to top-level scenario directories.
  # Pure shell — no third-party action — to keep the supply-chain
  # surface minimal.
  # ──────────────────────────────────────────────────────────────────
  detect:
    name: Detect affected scenarios
    runs-on: ubuntu-latest
    timeout-minutes: 5
    # Consumed by downstream jobs:
    #   scenarios   JSON array of scenario names to validate (the matrix)
    #   count       number of entries in `scenarios`
    #   count_full  number of affected scenarios before sampling
    #   sampled     "true" when the matrix was cut down to MATRIX_CAP
    outputs:
      scenarios: ${{ steps.filter.outputs.scenarios }}
      count: ${{ steps.filter.outputs.count }}
      count_full: ${{ steps.filter.outputs.count_full }}
      sampled: ${{ steps.filter.outputs.sampled }}
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0

      - name: Compute affected scenarios
        id: filter
        env:
          EVENT_NAME: ${{ github.event_name }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          set -euo pipefail

          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            # Manual run: validate every scenario in the canonical list.
            # No diff to compute; sampling cap is bypassed.
            cp .github/scenario-list.txt /tmp/affected.txt
          else
            # The base sha may not be in the local clone with a shallow
            # checkout; fetch-depth: 0 avoids that, but be belt-and-braces.
            git fetch origin "$BASE_SHA" "$HEAD_SHA" --depth=200 2>/dev/null || true

            git diff --name-only "$BASE_SHA" "$HEAD_SHA" > /tmp/changed.txt

            if grep -Fxq 'image-versions.env' /tmp/changed.txt; then
              # image-versions.env is a root-level file, so the
              # first-path-segment mapping in the other branch can never
              # attribute it to a scenario — yet every scenario's images
              # are resolved through it (--env-file image-versions.env).
              # Treat a change to it as touching every scenario and let
              # the sampling cap below bound the matrix.
              cp .github/scenario-list.txt /tmp/affected.txt
            else
              # Map every changed file to its first path segment.
              # Root-level files have no directory segment; awk drops
              # those.
              awk -F/ 'NF>1 {print $1}' /tmp/changed.txt \
                | sort -u > /tmp/segments.txt

              # Intersect with the canonical scenario list. `|| true` keeps
              # the pipeline alive when there's no overlap (e.g. a PR that
              # only touches docs/CI).
              grep -Fxf /tmp/segments.txt .github/scenario-list.txt \
                | sort -u > /tmp/affected.txt || true
            fi
          fi

          count_full=$(wc -l < /tmp/affected.txt | tr -d ' ')
          sampled=false

          # Sampling cap: when a single PR touches more than MATRIX_CAP
          # scenarios (typical for image-versions.env / shared-base
          # changes), validate a deterministic representative subset
          # rather than the full matrix. Maintainers can run the full
          # matrix via workflow_dispatch before merging if signal on
          # every scenario is wanted.
          #
          # Determinism: sort by the SHA-256 of "<scenario><commit>".
          # Same commit → same subset, so re-runs are stable. Different
          # commits get different subsets, so coverage rotates over
          # time across many big-update PRs.
          if [ "$EVENT_NAME" != "workflow_dispatch" ] \
             && [ "$count_full" -gt "$MATRIX_CAP" ]; then
            sampled=true
            commit_hash="${HEAD_SHA:-$GITHUB_SHA}"
            while read -r line; do
              [ -z "$line" ] && continue
              key=$(printf "%s%s" "$line" "$commit_hash" \
                    | sha256sum | head -c 16)
              printf "%s\t%s\n" "$key" "$line"
            done < /tmp/affected.txt \
              | sort | head -n "$MATRIX_CAP" | cut -f2 > /tmp/active.txt
          else
            cp /tmp/affected.txt /tmp/active.txt
          fi

          count=$(wc -l < /tmp/active.txt | tr -d ' ')
          # Turn the newline-separated list into a compact JSON array,
          # dropping empty entries.
          scenarios=$(jq -Rsc 'split("\n") | map(select(length>0))' /tmp/active.txt)

          echo "scenarios=$scenarios"     >> "$GITHUB_OUTPUT"
          echo "count=$count"             >> "$GITHUB_OUTPUT"
          echo "count_full=$count_full"   >> "$GITHUB_OUTPUT"
          echo "sampled=$sampled"         >> "$GITHUB_OUTPUT"

          {
            echo "## Affected scenarios"
            echo
            if [ "$count_full" = "0" ]; then
              echo "_None — PR does not touch any scenario directory._"
            elif [ "$sampled" = "true" ]; then
              echo "**$count_full** scenarios affected; sampled **$count** for validation (cap is \`$MATRIX_CAP\`)."
              echo
              echo "Trigger \`workflow_dispatch\` on this branch to validate the full matrix."
              echo
              echo "Sampled subset:"
              echo '```'
              cat /tmp/active.txt
              echo '```'
              echo
              echo "<details><summary>Full affected list ($count_full)</summary>"
              echo
              echo '```'
              cat /tmp/affected.txt
              echo '```'
              echo
              echo "</details>"
            else
              echo "Count: \`$count\`"
              echo
              echo '```'
              cat /tmp/active.txt
              echo '```'
            fi
          } >> "$GITHUB_STEP_SUMMARY"

          if [ "$sampled" = "true" ]; then
            echo "::warning::Sampled $count of $count_full affected scenarios. Run workflow_dispatch on this branch to validate them all."
          fi

  # ──────────────────────────────────────────────────────────────────
  # scan: For each affected scenario, resolve every image reference
  # via `docker compose config --images`, then trivy-scan each one.
  # Hard-fails on HIGH/CRITICAL CVEs that have a fix available.
  # ──────────────────────────────────────────────────────────────────
  scan:
    name: Scan images
    needs: detect
    if: needs.detect.outputs.count != '0'
    runs-on: ubuntu-latest
    timeout-minutes: 10
    strategy:
      fail-fast: false
      max-parallel: 6
      matrix:
        scenario: ${{ fromJSON(needs.detect.outputs.scenarios) }}
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Compute today's UTC date for cache key
        id: date
        # Exposes `today` (UTC, YYYY-MM-DD) as a step output.
        run: printf 'today=%s\n' "$(date -u +%F)" >> "$GITHUB_OUTPUT"

      - name: Restore trivy DB cache
        # Trivy fetches a fresh vulnerability DB on every cold scan
        # (~30 MB, ~5-10 s per scenario from mirror.gcr.io). Caching
        # the DB shaves the cold-pull off every matrix entry after the
        # first one of the day. Key rotates daily so the DB stays
        # fresh; the restore-keys fallback is intentional — even a
        # stale-by-hours DB is far better than a cold fetch.
        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          path: /tmp/trivy-cache
          key: trivy-db-${{ steps.date.outputs.today }}
          restore-keys: |
            trivy-db-

      - name: Resolve images for ${{ matrix.scenario }}
        id: images
        run: |
          set -euo pipefail
          # Try .yml first, fall back to .yaml (some scenarios use either).
          compose_file=""
          for ext in yml yaml; do
            f="${{ matrix.scenario }}/docker-compose.$ext"
            if [ -f "$f" ]; then compose_file="$f"; break; fi
          done
          if [ -z "$compose_file" ]; then
            echo "No docker-compose found for ${{ matrix.scenario }}" >&2
            exit 1
          fi

          # `docker compose config --images` returns service-name defaults
          # like `game-of-tracing-ai-opponent` for `build:`-only services
          # — those don't exist in any registry, so trivy fails with
          # UNAUTHORIZED. Filter to services with an explicit `image:`
          # field (third-party registry artifacts only). Locally-built
          # images aren't directly scanned; their FROM base image (e.g.
          # python:3.11-slim) lives in the Dockerfile and is tracked
          # separately by renovate's docker manager.
          docker compose -f "$compose_file" \
            --env-file image-versions.env \
            config --format json \
            | jq -r '.services | to_entries[]
                     | select(.value.image != null)
                     | .value.image' \
            | sort -u > /tmp/images.txt
          echo "Images to scan:"
          cat /tmp/images.txt
          if [ ! -s /tmp/images.txt ]; then
            echo "::notice::No third-party images to scan in this scenario (all services build locally)."
          fi

      - name: Trivy scan each image (advisory)
        # Run trivy via its own docker image (digest-pinned). No
        # docker.sock mount: trivy pulls the target image itself rather
        # than reaching into the host's docker — keeps the trivy
        # container from gaining root-equivalent access on the runner.
        #
        # Advisory mode: HIGH/CRITICAL findings are reported via the job
        # log + step summary table + ::warning:: annotations, but the
        # step always exits 0. These are demo scenarios; upstream LGMT
        # images regularly carry HIGH-with-fix findings between releases
        # and blocking every PR until they catch up isn't useful. Treat
        # the report as a signal to bump base images, not a merge gate.
        env:
          # Suppress ANSI escapes so the log + summary parse cleanly
          NO_COLOR: '1'
        run: |
          set -euo pipefail
          TRIVY_IMAGE='aquasec/trivy:0.66.0@sha256:086971aaf400beebd94e8300fd8ea623774419597169156cec56eec5b00dfb1e'

          # Pre-pull once so loop iterations don't re-resolve.
          docker pull "$TRIVY_IMAGE"

          mkdir -p /tmp/trivy-cache
          report_log=/tmp/trivy-output.log
          : > "$report_log"

          while IFS= read -r img; do
            [ -z "$img" ] && continue
            echo "::group::Scanning $img"
            echo "=== $img ===" >> "$report_log"
            # `|| true` so a non-zero trivy exit (had findings) doesn't
            # abort the loop — we want to scan every image.
            docker run --rm \
                -e NO_COLOR=1 \
                -v /tmp/trivy-cache:/root/.cache/trivy \
                "$TRIVY_IMAGE" image \
                --severity HIGH,CRITICAL \
                --ignore-unfixed \
                --no-progress \
                --timeout 5m \
                "$img" 2>&1 | tee -a "$report_log" || true
            echo "::endgroup::"
          done < /tmp/images.txt

          # Per-image summary table for the PR's step summary.
          {
            echo "## CVE scan: ${{ matrix.scenario }}"
            echo
            if [ ! -s /tmp/images.txt ]; then
              echo "_No third-party images to scan (all services build locally)._"
            else
              echo "| Image | HIGH | CRITICAL |"
              echo "|---|---:|---:|"
              current=""
              h=0; c=0
              while IFS= read -r line; do
                if [[ "$line" =~ ^===\ (.+)\ ===$ ]]; then
                  if [ -n "$current" ]; then
                    echo "| \`$current\` | $h | $c |"
                  fi
                  current="${BASH_REMATCH[1]}"
                  h=0; c=0
                elif [[ "$line" =~ Total:\ [0-9]+\ \(HIGH:\ ([0-9]+),\ CRITICAL:\ ([0-9]+)\) ]]; then
                  h=$((h + ${BASH_REMATCH[1]}))
                  c=$((c + ${BASH_REMATCH[2]}))
                fi
              done < "$report_log"
              if [ -n "$current" ]; then
                echo "| \`$current\` | $h | $c |"
              fi
              echo
              echo "_HIGH+CRITICAL counts are unfixed CVEs with patches available upstream. Findings here don't block merge — see the job log for the full per-CVE table. Upgrade base images via the relevant renovate PR when fixes appear in a published release._"
            fi
          } >> "$GITHUB_STEP_SUMMARY"

          # Emit a single ::warning:: if anything was found, so the PR
          # gets an inline annotation pointing at the job summary.
          if grep -qE 'Total:\ [^0]' "$report_log"; then
            echo "::warning::trivy found HIGH/CRITICAL unfixed CVEs in scanned images for ${{ matrix.scenario }}. See job summary for per-image counts and the log for details."
          fi

  # ──────────────────────────────────────────────────────────────────
  # smoke: For each affected scenario, boot it via run-example.sh,
  # wait until something healthy answers (Grafana, then Alloy, then
  # Prometheus), check that no container has exited, then tear down.
  # ──────────────────────────────────────────────────────────────────
  smoke:
    name: Smoke test
    needs: [detect, scan]
    runs-on: ubuntu-latest
    timeout-minutes: 15
    strategy:
      fail-fast: false
      max-parallel: 4
      matrix:
        scenario: ${{ fromJSON(needs.detect.outputs.scenarios) }}
    env:
      # Hand the scenario path to the scripts as data. Expanding the
      # matrix expression inside a `run:` body pastes the directory
      # name into the shell source, where quotes or command
      # substitutions in a PR-supplied name would be executed.
      MATRIX_SCENARIO: ${{ matrix.scenario }}
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Boot ${{ matrix.scenario }}
        run: |
          set -euo pipefail
          chmod +x ./run-example.sh
          ./run-example.sh "$MATRIX_SCENARIO"

      - name: Wait for a healthy endpoint (Grafana, Alloy, or Prometheus)
        run: |
          set -euo pipefail
          # Probe in priority order. Most scenarios expose Grafana on
          # :3000; self-monitoring exposes Alloy on :12345 instead;
          # routing remaps Alloy. Grafana wins when present, else any
          # ready endpoint counts as bring-up success.
          probes=(
            "http://localhost:3000/api/health"
            "http://localhost:12345/-/ready"
            "http://localhost:9090/-/ready"
          )

          deadline=$(( $(date +%s) + 180 ))   # 3 min total
          while [ "$(date +%s)" -lt "$deadline" ]; do
            for url in "${probes[@]}"; do
              # `|| true`: a refused connection or timeout is expected
              # while the stack is still starting; only the status code
              # matters here.
              code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "$url" 2>/dev/null || true)
              if [ "$code" = "200" ]; then
                echo "Healthy: $url"
                exit 0
              fi
            done
            sleep 5
          done

          echo "::error::No probe endpoint became healthy within 3 min"
          exit 1

      - name: Verify no exited containers
        run: |
          set -euo pipefail
          cd "$MATRIX_SCENARIO"
          # `docker compose ps --status exited` lists any container that
          # crashed during bring-up. An empty list is the pass case.
          # NOTE(review): the filter matches on state only, so a one-shot
          # container that exits 0 would also be listed — confirm no
          # scenario relies on such a service.
          exited=$(docker compose ps --status exited --format '{{.Name}}' || true)
          if [ -n "$exited" ]; then
            echo "::error::Exited containers detected:"
            echo "$exited"
            exit 1
          fi

      - name: Dump container logs on failure
        if: failure()
        run: |
          cd "$MATRIX_SCENARIO"
          docker compose logs --no-color || true

      - name: Tear down
        # Runs even after a failed step so containers and volumes never
        # outlive the job.
        if: always()
        run: |
          cd "$MATRIX_SCENARIO"
          docker compose down --volumes --remove-orphans || true


================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


================================================
FILE: CLAUDE.md
================================================
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

This is a collection of self-contained demonstration scenarios for **Grafana Alloy**, the telemetry collection and processing pipeline. Each scenario lives in its own top-level directory and showcases a specific monitoring use case using the **LGMT stack** (Loki, Grafana, Metrics/Prometheus, Tempo).

## Running Scenarios

```bash
# Option 1: Direct (uses default image versions in docker-compose.yml)
cd <scenario-dir> && docker compose up -d

# Option 2: Centralized image versions (from repo root)
./run-example.sh <scenario-dir>

# Stop a scenario
cd <scenario-dir> && docker compose down
```

Image versions are centralized in `image-versions.env` at the repo root. Docker-compose files reference these via `${VAR:-default}` syntax.

Kubernetes scenarios (under `k8s/`) use Helm charts instead of Docker Compose — see their individual READMEs.

## Scenario Structure

Docker-based scenarios generally follow this layout (backend config files appear only for the backends a scenario actually uses):

```
scenario-name/
├── docker-compose.yml      # LGMT stack + Alloy (infrastructure only)
├── docker-compose.coda.yml # Demo app services (run via coda CLI or -f flag)
├── config.alloy             # Alloy pipeline configuration (River/HCL syntax)
├── loki-config.yaml         # Loki backend config
├── prom-config.yaml         # Prometheus backend config
├── tempo-config.yaml        # Tempo config (if tracing is involved)
├── README.md                # What the scenario demonstrates and how to use it
└── app/                     # Optional demo application (typically Python/Flask)
```

## Alloy Configuration Language

`config.alloy` files use Alloy's River syntax (HCL-like). Pipelines follow a consistent pattern:

1. **Receivers/Sources** — ingest data (`loki.source.*`, `otelcol.receiver.*`, `prometheus.exporter.*`)
2. **Processors/Transformers** — parse, relabel, batch (`loki.process.*`, `discovery.relabel`, `otelcol.processor.*`)
3. **Writers/Exporters** — send to backends (`loki.write.*`, `prometheus.remote_write.*`, `otelcol.exporter.*`)

Components are wired together by passing outputs to inputs (e.g., `forward_to = [loki.write.default.receiver]`).

## Creating a New Scenario

Templates exist in `.cursor/docker-example.mdc` (Docker) and `.cursor/k8s-example.mdc` (Kubernetes) with full boilerplate for all config files.

Checklist for a new scenario:
1. Create a new top-level directory named after the scenario
2. Include `docker-compose.yml`, `config.alloy`, backend configs, and `README.md`
3. Use `${VAR:-default}` for image versions matching `image-versions.env` keys
4. Grafana service should auto-provision datasources via entrypoint script (see template)
5. Add the scenario to the main `README.md` table
6. Alloy UI is available at `http://localhost:12345` for debugging pipelines

## Key Conventions

- Grafana runs on port 3000 with anonymous admin auth enabled (no login required)
- Alloy HTTP server runs on port 12345
- Python demo apps use OpenTelemetry SDK for instrumentation (`telemetry.py` pattern)
- Backend configs (loki, prometheus, tempo) are minimal single-instance dev configs — not production-ready


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
<p align="center">
  <img src="./img/banner.png" alt="Grafana Alloy Scenarios Banner" width="300"/>
</p>

# Grafana Alloy Scenarios

A collection of self-contained, runnable scenarios demonstrating how to use [Grafana Alloy](https://grafana.com/docs/alloy/) for telemetry collection and processing. Each scenario includes a full LGMT stack (Loki, Grafana, Metrics via Prometheus, Tempo) with pre-configured dashboards so you can explore immediately.

## Getting Started

### Prerequisites

- [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://docs.docker.com/compose/install/)

### Run a scenario

```bash
# Option 1: Navigate to the scenario directory
cd <scenario-dir> && docker compose up -d

# Option 2: Use centralized image management (from repo root)
./run-example.sh <scenario-dir>
```

The centralized approach manages all Docker image versions in a single `image-versions.env` file, making it easy to update images across all scenarios.

### Access the stack

Once a scenario is running:

- **Grafana**: [http://localhost:3000](http://localhost:3000) (no login required)
- **Alloy UI**: [http://localhost:12345](http://localhost:12345) (pipeline debugging)

### Run with the Coda app overlay

Many scenarios include a `docker-compose.coda.yml` file that defines the demo application services separately from the infrastructure stack. Where it is present, you can run just the observability backend on its own, or layer in the app when you're ready:

```bash
# Infrastructure only
cd <scenario-dir> && docker compose up -d

# Infrastructure + demo app
cd <scenario-dir> && docker compose -f docker-compose.yml -f docker-compose.coda.yml up -d
```

If you have the `coda` CLI installed, it manages the app overlay automatically:

```bash
coda start <scenario-dir>   # Start app containers
coda stop <scenario-dir>    # Stop app containers
coda status <scenario-dir>  # Show container status
coda list                   # List all available scenarios
```

### Stop a scenario

```bash
cd <scenario-dir> && docker compose down
```

## Scenarios

### Logs

| Scenario | Description |
| -------- | ----------- |
| [GELF log ingestion](gelf-log-ingestion/) | Ingest structured logs from applications using the GELF (Graylog Extended Log Format) protocol over UDP. |
| [Kafka logs](kafka/) | Consume and process logs from Apache Kafka topics. |
| [Log API gateway](log-api-gateway/) | Use Alloy as a centralized log gateway that accepts logs via a Loki-compatible push API endpoint. |
| [Log routing](routing/) | Route logs from multiple sources to different Loki tenants based on log content and origin. |
| [Log secret filtering](log-secret-filtering/) | Automatically redact sensitive credentials and secrets from logs using pattern matching before storage. |
| [Logs from file](logs-file/) | Monitor and tail log files using Alloy. |
| [Logs over TCP](logs-tcp/) | Receive and process TCP logs in JSON format. |
| [Popular logging frameworks](app-instrumentation/logging/popular-logging-frameworks/) | Parse logs from popular logging frameworks across 7 programming languages. |
| [Structured log parsing](mail-house/) | Parse structured logs into labels and structured metadata. |
| [Syslog monitoring](syslog/) | Monitor non-RFC5424 compliant syslog messages using `rsyslog` and Alloy. |

### Tracing

| Scenario | Description |
| -------- | ----------- |
| [Distributed tracing](trace-delivery/) | Learn distributed tracing through a sofa delivery workflow from order to doorstep. |
| [Game of tracing](game-of-tracing/) | An interactive strategy game teaching distributed tracing, sampling, and service graphs. |
| [OpenTelemetry basic tracing](otel-basic-tracing/) | Collect and visualize OpenTelemetry traces using Alloy and Tempo. |
| [OpenTelemetry service graphs](otel-tracing-service-graphs/) | Generate service graphs using the Alloy `servicegraph` connector. |
| [OpenTelemetry span metrics](otel-span-metrics/) | Generate RED metrics (Request rate, Error rate, Duration) from OpenTelemetry traces using the span metrics connector. |
| [OpenTelemetry tail sampling](otel-tail-sampling/) | Apply tail sampling policies to OpenTelemetry traces with Alloy and Tempo. |

### Metrics

| Scenario | Description |
| -------- | ----------- |
| [Blackbox probing](blackbox-probing/) | Monitor endpoint availability and response times using synthetic HTTP probes. |
| [OTel metrics pipeline](otel-metrics-pipeline/) | Forward OpenTelemetry metrics from applications through Alloy with batching and transformation into Prometheus. |

### Profiling

| Scenario | Description |
| -------- | ----------- |
| [Continuous profiling](continuous-profiling/) | Collect and visualize CPU, memory, and goroutine profiles from Go applications using Grafana Pyroscope. |

### Secrets and configuration

| Scenario | Description |
| -------- | ----------- |
| [Vault secrets](vault-secrets/) | Pull `prometheus.remote_write` basic_auth credentials from HashiCorp Vault at runtime using `remote.vault`, with hot-reload on rotation. |

### Frontend

| Scenario | Description |
| -------- | ----------- |
| [Faro frontend observability](faro-frontend-observability/) | Collect frontend web telemetry (logs, errors, web vitals) from browser applications using the Faro Web SDK. |

### Cloud Monitoring

| Scenario | Description |
| -------- | ----------- |
| [CloudWatch metrics](cloudwatch-metrics/) | Pull AWS CloudWatch metrics into Prometheus via `prometheus.exporter.cloudwatch`. Uses LocalStack for offline reproducibility — no AWS account required. |

### Infrastructure Monitoring

| Scenario | Description |
| -------- | ----------- |
| [Docker monitoring](docker-monitoring/) | Monitor Docker container metrics and logs. |
| [Monitor Linux](linux/) | Monitor a Linux server's system metrics using Alloy. |
| [Monitor Windows](windows/) | Monitor Windows system metrics and Event Logs. |
| [Self-monitoring](self-monitoring/) | Configure Alloy to monitor itself, collecting its own metrics and logs. |
| [SNMP monitoring](snmp/) | Monitor SNMP devices using the Alloy SNMP exporter. |

### Database and Cache Monitoring

| Scenario | Description |
| -------- | ----------- |
| [Elasticsearch monitoring](elasticsearch-monitoring/) | Monitor Elasticsearch cluster health, node status, and performance metrics. |
| [Memcached monitoring](memcached-monitoring/) | Monitor Memcached instance metrics including connections, memory usage, and command performance. |
| [MySQL monitoring](mysql-monitoring/) | Monitor MySQL database server metrics and performance indicators. |
| [PostgreSQL monitoring](postgres-monitoring/) | Monitor PostgreSQL transaction statistics, connections, and server configuration. |
| [RabbitMQ monitoring](rabbitmq-monitoring/) | Monitor RabbitMQ queue, connection, and channel metrics plus broker container logs. |
| [Redis monitoring](redis-monitoring/) | Monitor Redis instance metrics including connections, memory usage, and command throughput. |

### Kubernetes

| Scenario | Description |
| -------- | ----------- |
| [Kubernetes](k8s/) | A series of scenarios demonstrating Alloy setup using the Kubernetes monitoring Helm chart. See subdirectories for telemetry-specific examples. |

### OTel Engine Examples (Experimental)

Alloy v1.14+ includes an experimental **OTel Engine** that runs standard OpenTelemetry Collector YAML configs directly. These scenarios use `alloy otel` instead of River/HCL syntax. See the [OTel examples README](otel-examples/) for details.

| Scenario | Description |
| -------- | ----------- |
| [File log processing](otel-examples/filelog-processing/) | Collect and parse mixed-format log files using the OTel `filelog` receiver with operator chains. |
| [PII redaction](otel-examples/pii-redaction/) | Scrub credit cards, emails, and IPs from traces and logs using OTTL `replace_pattern`. |
| [Multi-tenant routing](otel-examples/routing-multi-tenant/) | Route logs to different Loki tenants based on resource attributes using fan-out and filter. |
| [Cost control](otel-examples/cost-control/) | Drop health checks, filter debug logs, and apply probabilistic sampling to cut telemetry volume. |
| [Resource enrichment](otel-examples/resource-enrichment/) | Auto-attach host, OS, and Docker metadata to all signals via `resourcedetection`. |
| [Count connector](otel-examples/count-connector/) | Derive request rate and error rate metrics from traces and logs using the `count` connector. |
| [OTTL transform cookbook](otel-examples/ottl-transform/) | A cookbook of OTTL patterns: JSON parsing, severity mapping, attribute promotion, truncation. |
| [Host metrics](otel-examples/host-metrics/) | Collect CPU, memory, disk, and network metrics using the `hostmetrics` receiver. |
| [Multi-pipeline fan-out](otel-examples/multi-pipeline-fanout/) | Send traces to two backends with different processing per destination. |
| [Kafka buffer](otel-examples/kafka-buffer/) | Buffer traces through Kafka for durability and backpressure handling. |

## Contributing

Contributions of scenarios or improvements to scenarios are welcome. You can contribute in several ways:

### Suggest a scenario

If you have an idea for a scenario but don't have time to implement it:

1. Open an [issue](https://github.com/grafana/alloy-scenarios/issues/new) with the label `scenario-suggestion`
2. Describe the scenario and what it would demonstrate
3. Explain why this would be valuable to the community
4. Outline any special requirements or considerations

### Contribute a scenario

If you'd like to contribute a complete scenario:

1. Fork this repository and create a branch
2. Create a directory in the root of this repository with a descriptive name for your scenario
3. Follow the [scenario template](#scenario-template) below
4. Submit a pull request with your scenario

### Improve a scenario

To improve a scenario:

1. Fork this repository and create a branch
2. Make your improvements to the scenario
3. Submit a pull request with a clear description of your changes

### Scenario template

When creating a scenario, include the following files:

- `docker-compose.yml` - Docker Compose file with the LGMT stack
- `docker-compose.coda.yml` - Docker Compose override with the demo app services (for use with the `coda` CLI or `-f` flag)
- `config.alloy` - Alloy configuration file for the scenario
- `README.md` - Documentation explaining the scenario
- Any additional files needed for your scenario, such as scripts or data files

### Scenario checklist

Before submitting your scenario, ensure that you have:

- [ ] Created a directory in the root of this repository with a descriptive name
- [ ] Included a docker-compose.yml file with the necessary components, such as LGMT stack or subset
- [ ] Created a complete config.alloy file that demonstrates the monitoring approach
- [ ] Written a README.md with:
  - A clear description of what the scenario demonstrates
  - Prerequisites for running the demo
  - Step-by-step instructions for running the demo
  - Expected output and what to look for
  - Screenshots if applicable
  - Explanation of key configuration elements
- [ ] Added the scenario to the table in this README.md
- [ ] Ensured the scenario works with the centralized image management system
- [ ] Verified all components start correctly with `docker compose up -d`

### Best practices for scenarios

- Keep the scenario focused on demonstrating one concept
- Use clear, descriptive component and variable names
- Add comments to explain complex parts of your Alloy configuration
- Consider including a "Customizing" section in your README.md
- Provide sample queries for Grafana/Prometheus/Loki/Tempo that work with your scenario
- Use environment variables for versions and configurable parameters

## Get help

If you have questions about creating a scenario or need help with Alloy:

- Join the [Grafana Labs Community Forums](https://community.grafana.com/)
- Check the [Grafana Alloy documentation](https://grafana.com/docs/alloy/)

## License

This repository is licensed under the Apache License, Version 2.0. Refer to [LICENSE](LICENSE) for the full license text.


================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/README.md
================================================
# App Instrumentation - Structured Logging with Alloy Parsing

This directory contains a comprehensive **Alloy tutorial** demonstrating how to parse structured logs from 7 popular programming languages using modern logging frameworks. Each language uses industry-standard structured logging libraries, and all logs are processed through a unified Alloy pipeline for collection, parsing, and storage in Loki.

## 🎯 Tutorial Objectives

- **Learn Alloy log parsing**: Understand how to parse different log formats using `loki.process` stages
- **Multi-language support**: Handle logs from 7 different programming languages in a single pipeline
- **Structured logging**: Demonstrate modern logging practices with structured data
- **Real-world scenarios**: Show practical log parsing for containerized applications

## Languages and Modern Logging Frameworks

| Language | Logging Framework | Type | Key Features | Docker Base Image |
|----------|------------------|------|--------------|-------------------|
| **JavaScript** | `Pino` | JSON structured | High performance, child loggers, ndjson output | `node:22-alpine` |
| **Python** | `logging` module | Structured text | Built-in standard library with custom formatting | `python:3.12-slim` |
| **Java** | `SLF4J + Logback` | Structured text | Parameterized messages, MDC context, thread info | `openjdk:26-slim` |
| **C#** | `Microsoft.Extensions.Logging` | Structured text | .NET standard framework, event IDs, structured data | `mcr.microsoft.com/dotnet/*:9.0` |
| **C++** | `spdlog` | Structured text | High performance, source location, thread-safe | `ubuntu:24.04` |
| **Go** | `Zap` | JSON structured | High performance, named loggers, structured fields | `golang:1.23-alpine` |
| **PHP** | `Monolog` | Structured text | Context arrays, processors, multiple handlers | `php:8.3-cli-alpine` |

## Directory Structure

```
app-instrumentation/logging/popular-logging-frameworks/
├── alloy/
│   ├── config.alloy          # Main Alloy configuration
│   └── helper.alloy           # Language-specific log parsers
├── javascript/
│   ├── app.js                 # Pino structured logging
│   └── Dockerfile
├── python/
│   ├── app.py                 # Python logging with custom format
│   └── Dockerfile
├── java/
│   ├── App.java               # SLF4J + Logback
│   ├── logback.xml
│   └── Dockerfile
├── csharp/
│   ├── Program.cs            # Microsoft.Extensions.Logging
│   ├── LoggingExample.csproj
│   └── Dockerfile
├── cpp/
│   ├── main.cpp              # spdlog structured logging
│   ├── CMakeLists.txt
│   └── Dockerfile
├── go/
│   ├── main.go               # Zap JSON logging
│   ├── go.mod
│   ├── go.sum
│   └── Dockerfile
├── php/
│   ├── app.php               # Monolog with context
│   └── Dockerfile
├── docker-compose.yml         # Complete stack with Loki + Grafana
├── loki-config.yaml
└── README.md
```

## 🔍 Alloy Parsing Features Demonstrated

### Core Alloy Components Used
- **`loki.source.docker`**: Automatic Docker container log discovery
- **`loki.process`**: Multi-stage log parsing pipeline
- **`discovery.docker`**: Container metadata extraction
- **`discovery.relabel`**: Label transformation and routing

### Advanced Parsing Techniques
Each language parser demonstrates different Alloy parsing capabilities:

- **Regex parsing** (`stage.regex`): Extract structured fields from text logs
- **JSON parsing** (`stage.json`): Handle native JSON log formats  
- **Multiline handling** (`stage.multiline`): Process stack traces and exception logs
- **Label management** (`stage.labels`): Efficient indexing for filtering
- **Structured metadata** (`stage.structured_metadata`): Searchable non-indexed data
- **Timestamp parsing** (`stage.timestamp`): Multiple timestamp format support
- **Template formatting** (`stage.template`): Custom output formatting
- **Conditional logic**: Level conversion, error prioritization

### Language-Specific Parsing Examples

| Language | Primary Challenge | Alloy Solution |
|----------|------------------|----------------|
| **JavaScript (Pino)** | JSON numeric levels | Template stage for level conversion |
| **Python** | Custom text format | Regex extraction with line numbers |
| **Java (Logback)** | Multi-line stack traces | Multiline stage + regex parsing |
| **C#** | Event IDs and namespaces | Regex parsing with structured metadata |
| **C++** | Source location details | Complex regex for file:line extraction |
| **Go (Zap)** | Unix timestamps | Timestamp parsing with fractional seconds |
| **PHP (Monolog)** | Nested JSON context | Multiple JSON parsing stages |

## 🚀 Quick Start Tutorial

### Step 1: Clone the Repository

```bash
git clone https://github.com/grafana/alloy-scenarios.git
cd alloy-scenarios/app-instrumentation/logging/popular-logging-frameworks
```

### Step 2: Launch the Complete Stack

```bash
# Build and run all applications with Alloy + Loki + Grafana
docker compose up --build

# Run in detached mode to see clean output
docker compose up --build -d
```

This starts:
- **7 language applications** generating structured logs
- **Alloy** parsing and forwarding logs to Loki
- **Loki** storing parsed logs with labels and metadata
- **Grafana** for log visualization and querying

### Step 3: Explore the Logs

- Head to http://localhost:3000/a/grafana-lokiexplore-app to see the logs in Grafana
- Each language has its own service name (app), so you can choose which language's parsed logs you want to see

## 📚 Learning Outcomes

After completing this tutorial, you'll understand:

### Alloy Concepts
- **Multi-stage processing**: How to chain `loki.process` stages for complex parsing
- **Component composition**: Using `import.file` to modularize configurations
- **Discovery patterns**: Automatic service discovery with Docker integration
- **Label vs. metadata strategy**: When to use indexed labels vs. structured metadata

### Log Parsing Techniques
- **Regex mastery**: Complex pattern matching for text log formats
- **JSON handling**: Extracting nested fields from structured logs
- **Timestamp parsing**: Supporting multiple timestamp formats across languages
- **Multiline processing**: Handling stack traces and exception logs
- **Conditional formatting**: Template logic for log transformation

### Real-World Patterns
- **Language-specific challenges**: Understanding unique parsing requirements per language
- **Performance considerations**: Efficient labeling and metadata strategies
- **Observability best practices**: Structured logging principles across tech stacks
- **Container log collection**: Production-ready log aggregation patterns

## 🔧 Configuration Details

### Language-Specific Parsing Challenges

Each language presents unique parsing requirements:

#### JavaScript (Pino)
```alloy
// Challenge: Numeric log levels (10, 20, 30, 40, 50, 60)
stage.template {
  source = "level"
  template = "{{- if eq .level_num \"30\" -}}info{{- else if eq .level_num \"50\" -}}error{{- end -}}"
}
```

#### Java (Logback)  
```alloy
// Challenge: Multi-line stack traces
stage.multiline {
  firstline = "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}"
}
```

#### Go (Zap)
```alloy
// Challenge: Unix timestamp with fractional seconds
stage.timestamp {
  source = "ts"
  format = "Unix"
}
```

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/alloy/config.alloy
================================================
// ###############################
// #### Main Logging Configuration ####
// ###############################

// Load the log parsing helper module from /etc/alloy/helper.alloy.
// It is referenced below under the name "helper" to instantiate the
// "app_logs_parser" custom component, which contains the per-language
// parsing pipelines (Python, JavaScript, Go, Java, C#, PHP, and C++).
import.file "helper" {
  filename = "/etc/alloy/helper.alloy"
}

// Discover containers by querying the local Docker daemon over its Unix socket.
// The exported targets are consumed by loki.source.docker further down, so no
// container has to be listed by hand.
discovery.docker "linux" {
  host = "unix:///var/run/docker.sock"  // Local Docker daemon socket
}

// Relabeling rules for Docker log targets.
// Only the exported `rules` of this component are used (loki.source.docker
// references them through `relabel_rules`), which is why `targets` is left
// empty here.
discovery.relabel "logs_integrations_docker" {
  targets = []

  // Build a "service_name" label from the Docker container name, dropping the
  // leading slash (for example "/python-app" becomes service_name="python-app").
  // The parsing pipelines select on this label to choose a language parser.
  rule {
    target_label  = "service_name"
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
  }
}

// Create one instance of the custom parser declared in helper.alloy.
// Entries arrive on its exported `parser_input` receiver (see
// loki.source.docker below) and, once parsed, are handed to the receivers
// listed in `write_to`.
helper.app_logs_parser "default" {
  write_to = [loki.write.local.receiver]  // Parsed logs go to the Loki writer
}

// Read the logs of every discovered container and pass them, unparsed, to the
// language-aware parser instance.
loki.source.docker "default" {
  // Same Docker daemon socket that discovery.docker talks to.
  host = "unix:///var/run/docker.sock"

  // Containers to read from, as found by discovery.docker.
  targets = discovery.docker.linux.targets

  // Rules that derive service_name from the container name.
  relabel_rules = discovery.relabel.logs_integrations_docker.rules

  // Static label attached to every entry collected here.
  labels = {"platform" = "docker"}

  // Raw entries are delivered to the parser's input receiver.
  forward_to = [helper.app_logs_parser.default.parser_input]
}

// Loki writer: the final destination of the parsed log entries.
// Its exported `receiver` is what the parser instance writes to.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"  // Push API of the "loki" service
  }
}

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/alloy/helper.alloy
================================================
declare "app_logs_parser" {
  // write_to is a required argument: the list of receivers that parsed log
  // entries are forwarded to (config.alloy passes the Loki writer's receiver).
  //
  // Inside this module the value is read as argument.write_to.value.
  argument "write_to" {
    optional = false
  }

  // loki.process.app_logs_parser is our component which executes the parsing,
  // passing parsed logs to argument.write_to.value.
  loki.process "app_logs_parser" {

    // ## Python Processing ##
    // Runs only for entries whose service_name label equals "python".
    stage.match {
      selector      = "{service_name=\"python\"}"
      pipeline_name = "python"

      // Split the line into timestamp, file, line_num, level and msg.
      // Expected shape: "2025-06-17 09:54:15,283 - main.py:25 - INFO - Starting application"
      stage.regex {
        expression = "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}) - (?P<file>[^:]+):(?P<line_num>\\d+) - (?P<level>[^ ]+) - (?P<msg>.*)"
      }

      // Promote file and level to labels so they can be used in stream selectors.
      // Labels are indexed, so they suit values with a small set of distinct
      // entries rather than high-cardinality data.
      stage.labels {
        values = {
          level = "",
          file  = "",
        }
      }

      // Use the timestamp parsed from the line as the entry timestamp, so
      // entries are ordered by when the application logged them.
      stage.timestamp {
        format = "2006-01-02 15:04:05,000"
        source = "timestamp"
      }

      // Attach the line number as structured metadata (not an indexed label).
      stage.structured_metadata {
        values = {
          line_num = "",
        }
      }

      // Build the replacement log line in a temporary extracted value named
      // "output", keeping a layout close to the original line.
      stage.template {
        template = "{{.file}} - {{.line_num}} - {{.level}} - {{.msg}}"
        source   = "output"
      }

      // Replace the log body with the "output" value; this is the line that is
      // sent on to Loki.
      stage.output {
        source = "output"
      }
    }

    // ## Node.js Processing ##
    // Runs only for entries whose service_name label equals "javascript".
    stage.match {
      pipeline_name = "javascript"
      selector = "{service_name=\"javascript\"}"

      // Extract fields from JSON-formatted Pino logs.
      // Pino's numeric `level` is stored as `level_num` so that a readable
      // `level` value can be derived from it in the next stage.
      stage.json {
        expressions = {
          level_num        = "level",
          time             = "time",
          pid              = "pid",
          hostname         = "hostname",
          msg              = "msg",
          obj              = "obj",
          counter          = "counter",
          component        = "component",
          query            = "query",
          duration         = "duration",
          version          = "version",
          method           = "method",
          path             = "path",
          status           = "status",
          nested_obj       = "nested.obj",
          nested_timestamp = "nested.timestamp",
          err_type         = "err.type",
          err_message      = "err.message",
          err_stack        = "err.stack",
        }
      }

      // Convert Pino's numeric log levels to human-readable strings.
      // Pino uses numbers: 10=trace, 20=debug, 30=info, 40=warn, 50=error, 60=fatal.
      // Any other value is mapped to "unknown".
      stage.template {
        source   = "level"
        template = "{{- if eq .level_num \"10\" -}}trace{{- else if eq .level_num \"20\" -}}debug{{- else if eq .level_num \"30\" -}}info{{- else if eq .level_num \"40\" -}}warn{{- else if eq .level_num \"50\" -}}error{{- else if eq .level_num \"60\" -}}fatal{{- else -}}unknown{{- end -}}"
      }

      // Promote hostname, component and level to labels.
      // Only values that this pipeline actually extracts are listed; the
      // previous `file` entry had no matching extracted value and never
      // produced a label.
      stage.labels {
        values = {
          hostname  = "",
          component = "",
          level     = "",
        }
      }

      // Set the entry timestamp from Pino's `time` field (Unix milliseconds).
      stage.timestamp {
        source = "time"
        format = "UnixMs"
      }

      // Attach process info, request details and error information as
      // structured metadata so they are searchable without becoming labels.
      stage.structured_metadata {
        values = {
          level_num        = "",
          pid              = "",
          query            = "",
          duration         = "",
          version          = "",
          method           = "",
          path             = "",
          status           = "",
          nested_obj       = "",
          nested_timestamp = "",
          err_type         = "",
          err_message      = "",
          err_stack        = "",
        }
      }

      // Build the output line; when an error message is present it is shown
      // instead of the regular message.
      stage.template {
        source   = "output"
        template = "{{.hostname}} - {{.level}} - {{ if .err_message }}{{ .err_message }}{{ else }}{{ .msg }}{{ end }}"
      }

      // Apply the formatted output as the final log message sent to Loki.
      stage.output {
        source = "output"
      }
    }

    // ## Go Processing ##
    // Runs only for entries whose service_name label equals "go".
    stage.match {
      pipeline_name = "go"
      selector = "{service_name=\"go\"}"

      // Extract fields from Zap's JSON-structured logs into the extracted map.
      // Nested fields are flattened to underscore names (nested.obj -> nested_obj).
      stage.json {
        expressions = {
          level            = "level",
          ts               = "ts",
          logger           = "logger",
          caller           = "caller",
          msg              = "msg",
          answer           = "answer",
          obj              = "obj",
          counter          = "counter",
          feature          = "feature",
          query            = "query",
          duration         = "duration",
          method           = "method",
          path             = "path",
          status           = "status",
          requestId        = "requestId",
          context1         = "context1",
          context2         = "context2",
          error            = "error",
          stacktrace       = "stacktrace",
          nested_obj       = "nested.obj",
          nested_timestamp = "nested.timestamp",
        }
      }

      // Promote logger name and level to labels for filtering by logger
      // (e.g., database, api) and severity.
      stage.labels {
        values = {
          logger = "",
          level  = "",
        }
      }

      // Zap writes `ts` as seconds since the Unix epoch with a fractional part
      // (for example 1750342991.0445938). `format` must be a predefined format
      // name or a Go reference-time layout, not a sample value, so the
      // predefined "Unix" format is used.
      stage.timestamp {
        source = "ts"
        format = "Unix"
      }

      // Attach contextual fields as structured metadata.
      // Each map value names the extracted key to read; an empty string means
      // "same as the metadata name". The nested fields were extracted as
      // nested_obj / nested_timestamp, so they are looked up under those names.
      stage.structured_metadata {
        values = {
          caller           = "",
          answer           = "",
          obj              = "",
          counter          = "",
          feature          = "",
          query            = "",
          duration         = "",
          method           = "",
          path             = "",
          status           = "",
          requestId        = "",
          context1         = "",
          context2         = "",
          error            = "",
          stacktrace       = "",
          nested_obj       = "",
          nested_timestamp = "",
        }
      }

      // Build a compact output line showing logger, level, and message; the
      // remaining fields stay available as structured metadata.
      stage.template {
        source   = "output"
        template = "{{.logger}} - {{.level}} - {{.msg}}"
      }

      // Apply the formatted output as the final log message.
      stage.output {
        source = "output"
      }
    }

    // ## Java Processing ##
    // Runs only for entries whose service_name label equals "java".
    stage.match {
      pipeline_name = "java"
      selector = "{service_name=\"java\"}"

      // Group multi-line entries (such as exception stack traces) into one
      // entry. A new entry starts at every line matching `firstline`; other
      // lines are appended to the entry in progress.
      stage.multiline {
        firstline = "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\+\\d{4}\\[[^\\]]+\\]\\s+[A-Z]+\\s+\\w+\\s+-\\s+"
      }

      // Parse Logback's format into timestamp, thread, level, logger and msg.
      // Format: "2024-01-15T14:41:02.423+0000[main] INFO App - Starting application"
      // Everything after the first line is captured as `stacktrace`. The
      // capture uses (?s:...) because `.` does not match newlines by default,
      // which would otherwise keep only the first line of the trace.
      stage.regex {
        expression = "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\+\\d{4})\\[(?P<thread>[^\\]]+)\\] (?P<level>[A-Z]+)\\s+(?P<logger>[^ ]+) - (?P<msg>[^\n]*)(?:\\n(?P<stacktrace>(?s:.*)))?"
      }

      // Promote logger and level to labels for filtering by class and severity.
      stage.labels {
        values = {
          logger = "",
          level  = "",
        }
      }

      // Set the entry timestamp from the parsed value (milliseconds plus a
      // numeric zone offset such as +0000).
      stage.timestamp {
        source = "timestamp"
        format = "2006-01-02T15:04:05.000-0700"
      }

      // Attach thread name and stack trace as structured metadata.
      stage.structured_metadata {
        values = {
          thread     = "",
          stacktrace = "",
        }
      }

      // Build the output line from logger, level, and message.
      // The stack trace is not part of the line; it is kept in metadata.
      stage.template {
        source   = "output"
        template = "{{.logger}} - {{.level}} - {{.msg}}"
      }

      // Apply the formatted output as the final log message.
      stage.output {
        source = "output"
      }
    }

    // ## C# Processing ##
    // Runs only for entries whose service_name label equals "csharp".
    stage.match {
      selector      = "{service_name=\"csharp\"}"
      pipeline_name = "csharp"

      // Join the lines of one .NET log record into a single entry. A record
      // starts at a line shaped like "<timestamp> <level>: <logger>[<event id>]".
      stage.multiline {
        firstline = "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3} [a-z]+: [^\\[]+\\[\\d+\\]"
      }

      // Parse the joined record. The expression expects the message on the
      // line after the header, indented:
      //   2024-01-15 14:41:02.423 info: Microsoft.Extensions.Hosting[1]
      //         Starting application
      stage.regex {
        expression = "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) (?P<level>[a-z]+): (?P<logger>[^\\[]+)\\[(?P<event_id>\\d+)\\]\\n\\s+(?P<msg>.*)"
      }

      // Promote the logger name (e.g., Microsoft.Extensions.Hosting) and the
      // level to labels for filtering.
      stage.labels {
        values = {
          level  = "",
          logger = "",
        }
      }

      // Set the entry timestamp from the parsed value; the format carries no
      // timezone information.
      stage.timestamp {
        format = "2006-01-02 15:04:05.000"
        source = "timestamp"
      }

      // Attach the event ID as structured metadata.
      stage.structured_metadata {
        values = {
          event_id = "",
        }
      }

      // Build the output line from logger, level, and message, matching the
      // layout used by the other language pipelines.
      stage.template {
        template = "{{.logger}} - {{.level}} - {{.msg}}"
        source   = "output"
      }

      // Replace the log body with the formatted output.
      stage.output {
        source = "output"
      }
    }

    // ## PHP Processing ##
    // Runs only for entries whose service_name label equals "php".
    stage.match {
      selector      = "{service_name=\"php\"}"
      pipeline_name = "php"

      // Split a Monolog line into timestamp, logger, level, msg, and the two
      // trailing blocks (context, then extra), which are kept as raw text in
      // context_json and extra_json for the JSON stages below.
      // Format: "[2024-01-15T14:41:02.123456+00:00] app.INFO: hello world {"counter":42} {"environment":"production"}"
      stage.regex {
        expression = "^\\[(?P<timestamp>[^\\]]+)\\] (?P<logger>[^.]+)\\.(?P<level>[A-Z]+): (?P<msg>.*?) (?P<context_json>\\[\\]|\\{.*?\\}) (?P<extra_json>\\{.*?\\})$"
      }

      // Promote logger name and level to labels for filtering.
      stage.labels {
        values = {
          level  = "",
          logger = "",
        }
      }

      // Set the entry timestamp from the parsed value (microsecond precision
      // with a "+00:00"-style zone offset).
      stage.timestamp {
        format = "2006-01-02T15:04:05.000000-07:00"
        source = "timestamp"
      }

      // Read application fields out of the context block.
      stage.json {
        source      = "context_json"
        expressions = {
          affected_service = "affected_service",
          counter          = "counter",
          duration         = "duration",
          error_code       = "error_code",
          exception        = "exception",
          method           = "method",
          obj              = "obj",
          path             = "path",
          query            = "query",
          status           = "status",
        }
      }

      // Read the environment name out of the extra block.
      stage.json {
        source      = "extra_json"
        expressions = {
          environment = "environment",
        }
      }

      // Attach the context and environment fields as structured metadata so
      // they stay searchable without becoming labels.
      stage.structured_metadata {
        values = {
          affected_service = "",
          counter          = "",
          duration         = "",
          environment      = "",
          error_code       = "",
          exception        = "",
          method           = "",
          obj              = "",
          path             = "",
          query            = "",
          status           = "",
        }
      }

      // Build the output line from logger, level, and message; the context
      // details remain available in structured metadata.
      stage.template {
        template = "{{.logger}} - {{.level}} - {{.msg}}"
        source   = "output"
      }

      // Replace the log body with the formatted output.
      stage.output {
        source = "output"
      }
    }

    // ## C++ Processing ##
    // Only entries whose service_name label equals "cpp" run through the nested stages below.
    stage.match {
      pipeline_name = "cpp"
      selector = "{service_name=\"cpp\"}"

      // Parse the line layout produced by the C++ example's logging pattern, including source location.
      // Format: "2024-01-15 14:41:02.423 [info] [logger] [thread 1] [main.cpp:25 main] - Starting application"
      // Captures: timestamp, level, logger, thread, file, line_num, function, msg.
      stage.regex {
        expression = "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) \\[(?P<level>[^\\]]+)\\] \\[(?P<logger>[^\\]]+)\\] \\[(?P<thread>[^\\]]+)\\] \\[(?P<file>[^:]+):(?P<line_num>\\d+) (?P<function>[^\\]]+)\\] - (?P<msg>.*)"
      }

      // Promote logger, level, and source file to labels.
      // An empty value means "use the extracted field with the same name as the label".
      stage.labels {
        values = {
          logger = "",
          level  = "",
          file   = "",
        }
      }

      // Use the captured timestamp as the entry time (Go reference-time layout, millisecond precision).
      // The layout carries no zone information.
      stage.timestamp {
        source = "timestamp"
        format = "2006-01-02 15:04:05.000"
      }

      // Attach thread, line number, and function name as structured metadata rather than labels.
      stage.structured_metadata {
        values = {
          thread   = "",
          line_num = "",
          function = "",
        }
      }

      // Build the final log line "<file>:<line> <function> - <level> - <msg>" into the extracted key "output".
      stage.template {
        source = "output"
        template = "{{.file}}:{{.line_num}} {{.function}} - {{.level}} - {{.msg}}"
      }

      // Replace the log line with the templated "output" value.
      stage.output {
        source = "output"
      }
    }

    // Send processed logs to our argument.
    forward_to = argument.write_to.value
  }

  // export.parser_input exports a value to the module consumer.
  export "parser_input" {
    // Expose the receiver of loki.process so the module importer can send
    // logs to our loki.process component.
    value = loki.process.app_logs_parser.receiver
  }
}

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/cpp/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.16)
project(LoggingExample)

# Build as C++17 and fail configuration if the compiler cannot provide it.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Find required packages
find_package(PkgConfig REQUIRED)
find_package(Threads REQUIRED)

# Fetch spdlog at a pinned tag and build it as part of this project.
include(FetchContent)
FetchContent_Declare(
    spdlog
    GIT_REPOSITORY https://github.com/gabime/spdlog.git
    GIT_TAG v1.12.0
)
FetchContent_MakeAvailable(spdlog)

# Create executable
add_executable(logging_example main.cpp)

# main.cpp logs through the SPDLOG_LOGGER_* macros. spdlog removes, at compile
# time, every macro call below SPDLOG_ACTIVE_LEVEL, which defaults to "info".
# Without this definition the SPDLOG_LOGGER_DEBUG call is compiled out and the
# debug line never appears, even though the runtime level is set to debug.
target_compile_definitions(logging_example
    PRIVATE
    SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG
)

# Link libraries
target_link_libraries(logging_example
    PRIVATE
    spdlog::spdlog
    Threads::Threads
)

# Compiler-specific warning flags
if(MSVC)
    target_compile_options(logging_example PRIVATE /W4)
else()
    target_compile_options(logging_example PRIVATE -Wall -Wextra -Wpedantic)
endif()

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/cpp/Dockerfile
================================================
# Single-stage image: the toolchain, the sources, and the built binary all live
# in the same image. The base is pinned by digest.
FROM ubuntu:26.04@sha256:f3d28607ddd78734bb7f71f117f3c6706c666b8b76cbff7c9ff6e5718d46ff64

# Install build dependencies (git is needed because CMake fetches spdlog at
# configure time) and drop the apt lists in the same layer.
RUN apt-get update && apt-get install -y \
    cmake \
    g++ \
    make \
    git \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY . .

# Build the application.
# The default generator here is a single-config one (Makefiles), which ignores
# "--config Release"; the build type has to be chosen at configure time via
# CMAKE_BUILD_TYPE. "--config" is kept so multi-config generators still work.
RUN cmake -B build -S . -DCMAKE_BUILD_TYPE=Release && \
    cmake --build build --config Release

# Run the application
CMD ["./build/logging_example"]

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/cpp/main.cpp
================================================
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_color_sinks.h>
#include <chrono>
#include <thread>

// Entry point: configures a stdout spdlog logger and then emits one log line
// per second forever, cycling through the five severity levels.
int main() {
    // Named logger "logger" writing to stdout; it is also installed as the default logger.
    auto console = spdlog::stdout_color_mt("logger");
    spdlog::set_default_logger(console);
    // Runtime threshold: debug and above.
    spdlog::set_level(spdlog::level::debug);
    // Line layout: timestamp with milliseconds, [level], [logger name],
    // [thread id], [source file:line function], then the message.
    // The file/line/function fields are only populated for the SPDLOG_LOGGER_* macros used below.
    spdlog::set_pattern(
        "%Y-%m-%d %H:%M:%S.%e [%^%l%$] [%n] [thread %t] [%s:%# %!] - %v"
    );

    int counter = 0;

    SPDLOG_LOGGER_INFO(console, "Starting C++ basic logging example");
    SPDLOG_LOGGER_INFO(console, "Demonstrating spdlog formatting");

    // Runs until the process is killed; the return below is never reached.
    while (true) {
        counter++;
        // Selects which severity to emit on this iteration (0..4).
        int logType = counter % 5;

        switch (logType) {
            case 0:
                // NOTE: the SPDLOG_LOGGER_* macros are filtered at compile time by
                // SPDLOG_ACTIVE_LEVEL; this call only emits if the build sets that
                // macro to debug or lower, independent of set_level() above.
                SPDLOG_LOGGER_DEBUG(console, "Basic debug message, counter: {}", counter);
                break;
            case 1:
                SPDLOG_LOGGER_INFO(console, "Information message, counter: {}", counter);
                break;
            case 2:
                SPDLOG_LOGGER_WARN(console, "Warning message, counter: {}", counter);
                break;
            case 3:
                SPDLOG_LOGGER_ERROR(console, "Error message, counter: {}", counter);
                break;
            case 4:
                SPDLOG_LOGGER_CRITICAL(console, "Critical message, counter: {}", counter);
                break;
        }

        // Pace the output at one line per second.
        std::this_thread::sleep_for(std::chrono::seconds(1));
    }

    return 0;
}


================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/csharp/Dockerfile
================================================
# Stage 1 ("build"): restore and publish with the full SDK image (pinned by digest).
FROM mcr.microsoft.com/dotnet/sdk:9.0@sha256:0300d42309afd86168fa57d62db79020a34ee396d39c9634844b9c0ab285ea55 AS build
WORKDIR /app

# Copy only the project file first so the restore layer is reused while the
# project file is unchanged.
COPY *.csproj .
RUN dotnet restore

# Copy the remaining sources and publish a Release build into /app/out.
COPY . .
RUN dotnet publish -c Release -o out

# Stage 2: runtime-only image; only the published output is carried over.
FROM mcr.microsoft.com/dotnet/runtime:9.0@sha256:7590f1b7e124fe7a4b7cffa5f6f9958f2c02a22bf5bd7a0387a84b88cddf4057
WORKDIR /app
COPY --from=build /app/out .

# The assembly name follows the project file name (LoggingExample.csproj).
ENTRYPOINT ["dotnet", "LoggingExample.dll"] 

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/csharp/LoggingExample.csproj
================================================
<!-- SDK-style project for the C# logging example console application. -->
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <!-- Console executable targeting .NET 9, matching the 9.0 images in the Dockerfile. -->
    <OutputType>Exe</OutputType>
    <TargetFramework>net9.0</TargetFramework>
    <!-- Nullable reference type analysis is on; Program.cs declares its logger field as nullable. -->
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <!-- Generic host plus Microsoft.Extensions.Logging and its console provider, all on the same version. -->
    <PackageReference Include="Microsoft.Extensions.Hosting" Version="10.0.7" />
    <PackageReference Include="Microsoft.Extensions.Logging" Version="10.0.7" />
    <PackageReference Include="Microsoft.Extensions.Logging.Console" Version="10.0.7" />
  </ItemGroup>

</Project> 

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/csharp/Program.cs
================================================
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using System;
using System.Threading.Tasks;

namespace LoggingExample
{
    /// <summary>
    /// Demo application that writes one console log entry per second, cycling
    /// through the Debug, Information, Warning, Error and Critical levels.
    /// </summary>
    class Program
    {
        private static ILogger<Program>? _logger;

        /// <summary>
        /// Builds a host whose only logging provider is the simple console
        /// formatter, then logs forever at a one-second interval.
        /// </summary>
        /// <param name="args">Command-line arguments forwarded to the host builder.</param>
        static async Task Main(string[] args)
        {
            // Configure logging with proper formatting
            using var host = Host.CreateDefaultBuilder(args)
                .ConfigureLogging(logging =>
                {
                    // Drop the providers registered by the default builder so
                    // the console provider below is the only sink.
                    logging.ClearProviders();

                    // Setting TimestampFormat/IncludeScopes on ConsoleLoggerOptions
                    // (via AddConsole) is obsolete; the same settings live on the
                    // simple console formatter's options, which is the formatter
                    // AddConsole used by default anyway.
                    logging.AddSimpleConsole(options =>
                    {
                        // The trailing space separates the timestamp from the level.
                        options.TimestampFormat = "yyyy-MM-dd HH:mm:ss.fff ";
                        options.IncludeScopes = false;
                    });
                    logging.SetMinimumLevel(LogLevel.Debug);
                })
                .Build();

            // The host is only used as a DI container here; it is never started.
            _logger = host.Services.GetRequiredService<ILogger<Program>>();

            int counter = 0;

            _logger.LogInformation("Starting C# basic logging example");
            _logger.LogInformation("Demonstrating Microsoft.Extensions.Logging");

            // Infinite loop with different log levels
            while (true)
            {
                counter++;

                // Cycle through different log levels
                int logType = counter % 5;

                switch (logType)
                {
                    case 0:
                        _logger.LogDebug("Basic debug message, counter: {Counter}", counter);
                        break;
                    case 1:
                        _logger.LogInformation("Information message, counter: {Counter}", counter);
                        break;
                    case 2:
                        _logger.LogWarning("Warning message, counter: {Counter}", counter);
                        break;
                    case 3:
                        _logger.LogError("Error message, counter: {Counter}", counter);
                        break;
                    case 4:
                        _logger.LogCritical("Critical message, counter: {Counter}", counter);
                        break;
                }

                // Wait 1 second before next log
                await Task.Delay(1000);
            }
        }
    }
}

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/docker-compose.coda.yml
================================================
# Application containers only: one service per language example, each built
# from its own sub-directory. No Loki, Grafana, or Alloy services are defined here.
services:
  # Node.js example
  javascript-logging:
    container_name: javascript
    build:
      context: ./javascript
      dockerfile: Dockerfile
    environment:
      NODE_ENV: production
    restart: unless-stopped

  # Python example
  python-logging:
    container_name: python
    build:
      context: ./python
      dockerfile: Dockerfile
    environment:
      PYTHON_ENV: production
    restart: unless-stopped

  # Java example
  java-logging:
    container_name: java
    build:
      context: ./java
      dockerfile: Dockerfile
    environment:
      JAVA_ENV: production
    restart: unless-stopped

  # C# example
  csharp-logging:
    container_name: csharp
    build:
      context: ./csharp
      dockerfile: Dockerfile
    environment:
      DOTNET_ENVIRONMENT: Production
    restart: unless-stopped

  # C++ example
  cpp-logging:
    container_name: cpp
    build:
      context: ./cpp
      dockerfile: Dockerfile
    environment:
      CPP_ENV: production
    restart: unless-stopped

  # Go example
  go-logging:
    container_name: go
    build:
      context: ./go
      dockerfile: Dockerfile
    environment:
      GO_ENV: production
    restart: unless-stopped

  # PHP example
  php-logging:
    container_name: php
    build:
      context: ./php
      dockerfile: Dockerfile
    environment:
      PHP_ENV: production
    restart: unless-stopped


================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/docker-compose.yml
================================================
# Full demo stack: seven language example apps plus Loki, Grafana, and Alloy.
# The top-level "version" key is intentionally absent: current Docker Compose
# ignores it and prints an "obsolete" warning when it is present.
services:
  # --- Example applications (one per language, each built from its own directory) ---
  # NOTE(review): the Alloy pipeline selects on service_name values such as
  # "php" and "cpp"; presumably these come from container_name — confirm in
  # alloy/config.alloy before renaming any container.
  javascript-logging:
    build:
      context: ./javascript
      dockerfile: Dockerfile
    container_name: javascript
    environment:
      - NODE_ENV=production
    restart: unless-stopped

  python-logging:
    build:
      context: ./python
      dockerfile: Dockerfile
    container_name: python
    environment:
      - PYTHON_ENV=production
    restart: unless-stopped

  java-logging:
    build:
      context: ./java
      dockerfile: Dockerfile
    container_name: java
    environment:
      - JAVA_ENV=production
    restart: unless-stopped

  csharp-logging:
    build:
      context: ./csharp
      dockerfile: Dockerfile
    container_name: csharp
    environment:
      - DOTNET_ENVIRONMENT=Production
    restart: unless-stopped

  cpp-logging:
    build:
      context: ./cpp
      dockerfile: Dockerfile
    container_name: cpp
    environment:
      - CPP_ENV=production
    restart: unless-stopped

  go-logging:
    build:
      context: ./go
      dockerfile: Dockerfile
    container_name: go
    environment:
      - GO_ENV=production
    restart: unless-stopped

  php-logging:
    build:
      context: ./php
      dockerfile: Dockerfile
    container_name: php
    environment:
      - PHP_ENV=production
    restart: unless-stopped

  # --- Observability backend ---
  # Loki, started with the config file mounted from this directory.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    container_name: loki
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Grafana with anonymous Admin access. The entrypoint writes a Loki
  # datasource provisioning file and then hands over to the image's /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    container_name: grafana
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh

  # Alloy: mounts the ./alloy config directory and the Docker socket, and
  # serves its HTTP UI/API on 12345. Ports 4317/4318 are published as well.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    container_name: alloy
    ports:
      - "12345:12345"
      - "4317:4317"
      - "4318:4318"
    volumes:
      - ./alloy/:/etc/alloy/
      - /var/run/docker.sock:/var/run/docker.sock
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy

# Give the default network a fixed name instead of the project-derived one.
networks:
  default:
    name: logging-examples-network

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/go/Dockerfile
================================================
# Single-stage image: the binary is built and run inside the Go toolchain image
# (pinned by digest).
FROM golang:1.26-alpine@sha256:91eda9776261207ea25fd06b5b7fed8d397dd2c0a283e77f2ab6e91bfa71079d

WORKDIR /app

# Copy go.mod and go.sum first so the module download layer is reused while
# the dependency files are unchanged.
COPY go.mod go.sum ./
RUN go mod download

# Copy source code
COPY main.go .

# Compile the single source file into ./logging_example.
RUN go build -o logging_example main.go

CMD ["./logging_example"] 

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/go/go.mod
================================================
module logging-example

go 1.23

require go.uber.org/zap v1.28.0

require go.uber.org/multierr v1.10.0 // indirect


================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/go/go.sum
================================================
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ=
go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.28.0 h1:IZzaP1Fv73/T/pBMLk4VutPl36uNC+OSUh3JLG3FIjo=
go.uber.org/zap v1.28.0/go.mod h1:rDLpOi171uODNm/mxFcuYWxDsqWSAVkFdX4XojSKg/Q=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=


================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/go/main.go
================================================
package main

import (
	"errors"
	"strconv"
	"time"

	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

// main configures a JSON Zap logger on stdout at debug level and then emits
// one demo log entry per second forever, rotating through twelve examples
// (plain messages, structured fields, nested objects, errors, named child
// loggers) plus two periodic extras.
func main() {
	// Configure Zap logger for JSON output to stdout
	config := zap.NewProductionConfig()
	config.Level = zap.NewAtomicLevelAt(zap.DebugLevel)
	config.OutputPaths = []string{"stdout"}

	logger, err := config.Build()
	if err != nil {
		panic(err)
	}
	// The loop below never returns, so this only matters if the loop is ever
	// given an exit path.
	defer logger.Sync()

	// Create child loggers for different components
	appLogger := logger.Named("app")
	dbLogger := logger.Named("database")
	apiLogger := logger.Named("api")

	counter := 0

	appLogger.Info("Starting Go basic logging example with Zap")
	appLogger.Info("Demonstrating Zap structured logging features")

	// Infinite loop with different logging examples
	for {
		counter++

		// Cycle through different logging examples
		logType := counter % 12

		switch logType {
		case 0:
			appLogger.Info("hello world")
		case 1:
			appLogger.Error("this is at error level")
		case 2:
			appLogger.Info("the answer is 42", zap.Int("answer", 42))
		case 3:
			appLogger.Info("hello world", zap.Int("obj", 42))
		case 4:
			appLogger.Info("hello world with counter",
				zap.Int("obj", 42),
				zap.Int("counter", counter))
		case 5:
			appLogger.Info("nested object",
				zap.Object("nested", zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error {
					enc.AddInt("obj", 42)
					enc.AddTime("timestamp", time.Now())
					return nil
				})))
		case 6:
			appLogger.Error("simulated error", zap.Error(errors.New("kaboom")))
		case 7:
			appLogger.Info("hello from app component!")
		case 8:
			dbLogger.Warn("slow query detected",
				zap.String("query", "SELECT * FROM users"),
				zap.Duration("duration", 250*time.Millisecond))
		case 9:
			apiLogger.Info("API request completed",
				zap.String("method", "GET"),
				zap.String("path", "/api/users"),
				zap.Int("status", 200))
		case 10:
			// Format the counter as decimal text. string(rune(counter)) would
			// instead produce the Unicode character with that code point
			// (e.g. a control character for counter == 10).
			tempChild := appLogger.With(zap.String("requestId", "req-"+strconv.Itoa(counter)))
			tempChild.Debug("this is a debug statement via child")
		case 11:
			appLogger.Error("error with additional context",
				zap.Error(errors.New("kaboom")),
				zap.String("context1", "additional"),
				zap.String("context2", "information"))
		}

		// Occasionally demonstrate sugar logger
		if counter%15 == 0 {
			sugar := logger.Sugar()
			sugar.Infow("using sugar logger",
				"counter", counter,
				"feature", "sugar")
		}

		// Occasionally demonstrate different log levels
		if counter%20 == 0 {
			appLogger.Debug("this is a debug message", zap.Int("counter", counter))
			appLogger.Warn("this is a warning message", zap.Int("counter", counter))
		}

		// Wait 1 second before next log
		time.Sleep(1 * time.Second)
	}
}


================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/java/App.java
================================================
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;

/**
 * Demo application that writes one SLF4J log entry per second, rotating
 * through twelve examples and adding two periodic extras. Runs until the
 * thread is interrupted.
 */
public class App {
    private static final Logger logger = LoggerFactory.getLogger(App.class);
    private static final Logger appLogger = LoggerFactory.getLogger("app");
    private static final Logger dbLogger = LoggerFactory.getLogger("database");
    private static final Logger apiLogger = LoggerFactory.getLogger("api");

    /**
     * Logs two start-up lines and then loops: one example entry per tick,
     * the periodic extras when due, then a one-second sleep.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        logger.info("Starting Java basic logging example with SLF4J + Logback");
        logger.info("Demonstrating SLF4J structured logging features");

        // tick takes the values 1, 2, 3, ... for as long as the loop runs.
        for (int tick = 1; ; tick++) {
            // Which of the twelve examples to emit on this tick.
            final int example = tick % 12;

            switch (example) {
                case 0: {
                    logger.info("hello world");
                    break;
                }
                case 1: {
                    logger.error("this is at error level");
                    break;
                }
                case 2: {
                    logger.info("the answer is {}", 42);
                    break;
                }
                case 3: {
                    logger.info("hello world with obj {}", 42);
                    break;
                }
                case 4: {
                    logger.info("hello world with counter {} and obj {}", tick, 42);
                    break;
                }
                case 5: {
                    logger.info("nested object with timestamp {} and value {}",
                            java.time.LocalDateTime.now(), 42);
                    break;
                }
                case 6: {
                    // A trailing Throwable argument is logged together with the message.
                    Exception boom = new RuntimeException("kaboom");
                    logger.error("simulated error", boom);
                    break;
                }
                case 7: {
                    appLogger.info("hello from app component!");
                    break;
                }
                case 8: {
                    dbLogger.warn("slow query detected: {} took {}ms",
                            "SELECT * FROM users", 250);
                    break;
                }
                case 9: {
                    apiLogger.info("API request completed: {} {} status={}",
                            "GET", "/api/users", 200);
                    break;
                }
                case 10: {
                    // MDC (Mapped Diagnostic Context): set for one statement, then cleared.
                    MDC.put("requestId", "req-" + tick);
                    logger.debug("this is a debug statement with MDC context");
                    MDC.clear();
                    break;
                }
                case 11: {
                    Exception failure = new RuntimeException("kaboom");
                    logger.error("error with additional context: {} {}",
                            "additional", "information", failure);
                    break;
                }
            }

            // Every 15th tick: an extra debug/warn pair.
            if (tick % 15 == 0) {
                logger.debug("this is a debug message with counter {}", tick);
                logger.warn("this is a warning message with counter {}", tick);
            }

            // Every 20th tick: an entry logged while two MDC keys are set.
            if (tick % 20 == 0) {
                MDC.put("userId", "user123");
                MDC.put("sessionId", "session456");
                logger.info("using MDC for contextual logging");
                MDC.clear();
            }

            // One-second pause; an interrupt restores the flag, is logged, and ends the loop.
            try {
                Thread.sleep(1000);
            } catch (InterruptedException interrupted) {
                Thread.currentThread().interrupt();
                logger.warn("Thread interrupted: {}", interrupted.getMessage());
                break;
            }
        }
    }
}

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/java/Dockerfile
================================================
# Single-stage image (pinned by digest): jars are downloaded, App.java is
# compiled, and the program is run, all in the same JDK image.
FROM openjdk:26-slim@sha256:63814a9d8bbea6d39d5ce9c91843bec5e9d9d1d1bc2bade4bb57ba70c0839553

WORKDIR /app

# Download SLF4J API, Logback dependencies, and Jackson for JSON encoding.
# The jars land directly in /app and are referenced by file name on the classpath below.
# NOTE(review): App.java imports only org.slf4j and logback.xml uses a plain
# pattern encoder, so the Jackson jars look unused — confirm before removing.
RUN apt-get update && apt-get install -y wget && \
    wget https://repo1.maven.org/maven2/org/slf4j/slf4j-api/2.0.9/slf4j-api-2.0.9.jar && \
    wget https://repo1.maven.org/maven2/ch/qos/logback/logback-classic/1.4.14/logback-classic-1.4.14.jar && \
    wget https://repo1.maven.org/maven2/ch/qos/logback/logback-core/1.4.14/logback-core-1.4.14.jar && \
    wget https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.16.1/jackson-core-2.16.1.jar && \
    wget https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-databind/2.16.1/jackson-databind-2.16.1.jar && \
    wget https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-annotations/2.16.1/jackson-annotations-2.16.1.jar && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# logback.xml sits in /app, which the runtime classpath includes as ".".
COPY App.java .
COPY logback.xml .

# Compile against the downloaded jars; the jar list must stay in sync with the CMD below.
RUN javac -cp "slf4j-api-2.0.9.jar:logback-classic-1.4.14.jar:logback-core-1.4.14.jar:jackson-core-2.16.1.jar:jackson-databind-2.16.1.jar:jackson-annotations-2.16.1.jar" App.java

# Run with "." (compiled class and logback.xml) plus the same jars on the classpath.
CMD ["java", "-cp", ".:slf4j-api-2.0.9.jar:logback-classic-1.4.14.jar:logback-core-1.4.14.jar:jackson-core-2.16.1.jar:jackson-databind-2.16.1.jar:jackson-annotations-2.16.1.jar", "App"] 

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/java/logback.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <!-- Console appender with standard format.
         Layout: ISO-like timestamp with milliseconds and numeric zone offset,
         immediately followed (no space) by [thread], then the level padded to
         5 characters, the logger name abbreviated to at most 36 characters,
         " - ", and the message. -->
    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{yyyy-MM-dd'T'HH:mm:ss.SSSZ}[%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <!-- Root logger configuration: everything at DEBUG and above goes to the console. -->
    <root level="DEBUG">
        <appender-ref ref="STDOUT" />
    </root>
    
    <!-- Specific logger configurations for the named loggers created in App.java.
         They use the same level as the root logger. -->
    <logger name="app" level="DEBUG" />
    <logger name="database" level="DEBUG" />
    <logger name="api" level="DEBUG" />
</configuration> 

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/javascript/Dockerfile
================================================
# Node.js example image (base pinned by digest).
FROM node:24-alpine@sha256:d1b3b4da11eefd5941e7f0b9cf17783fc99d9c6fc34884a665f40a06dbdfc94f

WORKDIR /app

# Create package.json and install pino with pino-pretty for better output formatting.
# The manifest is generated inline, so there is no lockfile; versions float
# within the caret ranges at build time.
RUN echo '{"name": "logging-example", "version": "1.0.0", "dependencies": {"pino": "^8.17.2", "pino-pretty": "^10.3.1"}}' > package.json
RUN npm install

COPY app.js .

# Marks the script executable; the CMD below starts it through "node" explicitly.
RUN chmod +x app.js

CMD ["node", "app.js"] 

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/javascript/app.js
================================================
#!/usr/bin/env node

// Pino's primary usage writes ndjson to `stdout`:
const pino = require('pino')()

// However, if "human readable" output is desired,
// `pino-pretty` can be provided as the destination
// stream by uncommenting the following line in place
// of the previous declaration:
// const pino = require('pino')(require('pino-pretty')())

let counter = 0;

pino.info('Starting JavaScript basic logging example with Pino');
pino.info('Demonstrating various Pino logging features');

// Create child loggers with different contexts
const appLogger = pino.child({ component: 'app' });
const dbLogger = pino.child({ component: 'database' });
const apiLogger = pino.child({ component: 'api', version: '1.0' });

// Emits one demo log entry per call, chosen by the call number modulo 12,
// and periodically toggles the root logger's level to show debug/trace output.
function logMessage() {
    counter += 1;

    // Index of the example to emit on this call (0..11).
    const example = counter % 12;

    switch (example) {
        case 0: {
            pino.info('hello world');
            break;
        }
        case 1: {
            pino.error('this is at error level');
            break;
        }
        case 2: {
            pino.info('the answer is %d', 42);
            break;
        }
        case 3: {
            pino.info({ obj: 42 }, 'hello world');
            break;
        }
        case 4: {
            pino.info({ obj: 42, counter: counter }, 'hello world with counter');
            break;
        }
        case 5: {
            pino.info({ nested: { obj: 42, timestamp: new Date() } }, 'nested object');
            break;
        }
        case 6: {
            pino.error(new Error('simulated error'));
            break;
        }
        case 7: {
            appLogger.info('hello from app component!');
            break;
        }
        case 8: {
            dbLogger.warn({ query: 'SELECT * FROM users', duration: 250 }, 'slow query detected');
            break;
        }
        case 9: {
            apiLogger.info({ method: 'GET', path: '/api/users', status: 200 }, 'API request completed');
            break;
        }
        case 10: {
            // Short-lived child logger carrying a per-call request id.
            const requestLogger = pino.child({ requestId: `req-${counter}` });
            requestLogger.debug('this is a debug statement via child');
            break;
        }
        case 11: {
            pino.info(new Error('kaboom'), 'with', 'additional', 'context');
            break;
        }
    }

    // Every 20th call: lower the root level to debug, log once, and switch
    // back to info half a second later.
    const isDebugDemoTick = counter % 20 === 0;
    if (isDebugDemoTick) {
        pino.level = 'debug';
        pino.debug('switched to debug level - this should now be visible');
        setTimeout(() => {
            pino.level = 'info';
            pino.info('switched back to info level');
        }, 500);
    }

    // Every 25th call: drop to trace for a single statement, then restore
    // whatever level was active before.
    const isTraceDemoTick = counter % 25 === 0;
    if (isTraceDemoTick) {
        const previousLevel = pino.level;
        pino.level = 'trace';
        pino.trace('this is a trace statement');
        pino.level = previousLevel;
    }
}

// Log every 1 second infinitely
setInterval(logMessage, 1000); 

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/loki-config.yaml
================================================

# This is a complete configuration to deploy Loki backed by the filesystem.
# The index will be shipped to the storage via tsdb-shipper.

# Single-tenant mode: no authentication in front of Loki.
auth_enabled: false

limits_config:
  # Required because the Alloy pipeline attaches structured metadata to entries.
  allow_structured_metadata: true
  volume_enabled: true


server:
  http_listen_port: 3100

# Settings shared by all components: in-memory ring, a single replica,
# and /tmp/loki as the base directory.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# One schema period starting 2020-05-15: TSDB index, filesystem object store,
# schema v13, one index table per 24h.
schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

# Index and chunks are all kept under /tmp/loki inside the container, so
# NOTE(review): data presumably does not survive container re-creation unless
# a volume is mounted there — confirm against the compose file.
storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/php/Dockerfile
================================================
# PHP CLI example image (base pinned by digest).
FROM php:8.5-cli-alpine@sha256:6ca76906d789edfac74e5f109c800b71e571bd313277133eaddc079733ee0b65

WORKDIR /app

# Install Composer by piping the official installer script into php;
# the result is placed at /usr/local/bin/composer.
RUN curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer

# Create composer.json for Monolog. The manifest is generated inline, so there
# is no lockfile; the version floats within ^3.5 at build time.
RUN echo '{"require": {"monolog/monolog": "^3.5"}}' > composer.json

# Install dependencies into /app/vendor, which app.php loads via vendor/autoload.php.
RUN composer install --no-dev --optimize-autoloader

COPY app.php .

CMD ["php", "app.php"] 

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/php/app.php
================================================
<?php

/**
 * Monolog logging example.
 *
 * Writes one log record per second to stdout, forever, cycling through six
 * record shapes (plain info, error, info with context, a "database" warning,
 * an "api" info, and an emergency carrying an exception) so that a log
 * collector has a varied stream to ingest.
 */

require_once 'vendor/autoload.php';

use Monolog\Logger;
use Monolog\Handler\StreamHandler;
// Exception is a global class and this script is not namespaced, so it needs
// no `use` import. A `use Exception;` here would only make PHP emit
// "The use statement with non-compound name 'Exception' has no effect"
// every time the script starts.

// Create the main logger
$logger = new Logger('app');

// Create a console handler that writes to stdout
$consoleHandler = new StreamHandler('php://stdout', Logger::DEBUG);

// Push the handler onto the logger
$logger->pushHandler($consoleHandler);

// Add a processor to inject an 'environment' extra field into every log entry
$logger->pushProcessor(function ($record) {
    $record['extra']['environment'] = 'production'; // You can set any value or use getenv() etc.
    return $record;
});

// Create component-specific loggers if you want.
// withName() returns a copy of $logger under a different channel name, so the
// handler and processor registered above apply to these as well.
$appLogger = $logger->withName('app');
$dbLogger = $logger->withName('database');
$apiLogger = $logger->withName('api');

$counter = 0;

$logger->info("Starting PHP basic logging example with Monolog");
$logger->info("Demonstrating Monolog structured logging features");

while (true) {
    $counter++;
    // $counter % 6 picks which of the six record shapes to emit this second.
    $logType = $counter % 6;

    switch ($logType) {
        case 0:
            $logger->info("hello world");
            break;
        case 1:
            $logger->error("this is at error level");
            break;
        case 2:
            // Info record with a structured context array.
            $logger->info("hello world with counter", [
                'counter' => $counter,
                'obj' => 42
            ]);
            break;
        case 3:
            // Emitted on the "database" channel.
            $dbLogger->warning("slow query detected", [
                'query' => 'SELECT * FROM users',
                'duration' => 250
            ]);
            break;
        case 4:
            // Emitted on the "api" channel.
            $apiLogger->info("API request completed", [
                'method' => 'GET',
                'path' => '/api/users',
                'status' => 200
            ]);
            break;
        case 5:
            // Fatal error with stack trace
            // The exception is only constructed, never thrown; it is passed
            // in the context under the 'exception' key.
            $fatalException = new Exception("Critical system failure - database connection lost");
            $logger->emergency("System encountered a fatal error", [
                'exception' => $fatalException,
                'error_code' => 'DB_CONNECTION_LOST',
                'affected_service' => 'user_authentication'
            ]);
            break;
    }

    // Wait 1 second before next log
    sleep(1);
}


================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/python/Dockerfile
================================================
# Image for the Python logging example. The base image is pinned by digest so
# rebuilds are reproducible. No extra packages are installed.
FROM python:3.12-slim@sha256:46cb7cc2877e60fbd5e21a9ae6115c30ace7a077b9f8772da879e4590c18c2e3

WORKDIR /app

COPY app.py .

# Mark the script executable. The CMD below starts it through the interpreter
# explicitly, so this only matters if someone runs ./app.py by hand.
RUN chmod +x app.py

CMD ["python", "app.py"] 

================================================
FILE: app-instrumentation/logging/popular-logging-frameworks/python/app.py
================================================
#!/usr/bin/env python3

import logging
import time

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format= '%(asctime)s - %(filename)s:%(lineno)d - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
    ]
)

logger = logging.getLogger(__name__)

def main():
    counter = 0
    
    logger.info("Starting Python basic logging example")
    logger.info("Demonstrating Python logging module")
    
    # Infinite loop with different log levels
    while True:
        counter += 1
        
        # Cycle through different log levels
        log_type = counter % 5
        
        if log_type == 0:
            logger.debug(f"Basic debug message, counter: {counter}")
        elif log_type == 1:
            logger.info(f"Information message, counter: {counter}")
        elif log_type == 2:
            logger.warning(f"Warning message, counter: {counter}")
        elif log_type == 3:
            logger.error(f"Error message, counter: {counter}")
        elif log_type == 4:
            logger.critical(f"Critical message, counter: {counter}")
        
        # Wait 1 second before next log
        time.sleep(1)

if __name__ == "__main__":
    main() 

================================================
FILE: aws-firehose-logs/README.md
================================================
# AWS Kinesis Data Firehose to Loki — no AWS account required

Demonstrates `loki.source.awsfirehose`, the HTTP receiver that accepts AWS Kinesis Data Firehose's documented delivery format. **You don't need an AWS account or any AWS SDKs** — Firehose is just an HTTPS POST in a known JSON shape, and this scenario emulates the producer with a small Python container.

This is the same producer-emulator pattern used by [`syslog/`](../syslog/) and [`gelf-log-ingestion/`](../gelf-log-ingestion/).

## Architecture

- **`alloy`** runs `loki.source.awsfirehose` on port `:9999`, listening at `/awsfirehose/api/v1/push`
- **`firehose-sender`** (Python) generates synthetic CloudWatch-style log batches every 5 seconds and POSTs them to Alloy in the documented Firehose delivery format (records array with gzip-compressed, base64-encoded data fields)
- **`loki`** + **`grafana`** for storage and visualization, with the Loki datasource auto-provisioned

Each batch goes to one of three log streams, picked at random:
1. VPC flow logs on `eni-0abc1234-all` (log group `/aws/vpc/flowlogs`)
2. VPC flow logs on `eni-0def5678-all` (same log group, different stream)
3. Lambda invocation logs on `2026/04/28/[$LATEST]abc` (log group `/aws/lambda/checkout-service`)

## Running

```bash
# From this directory
docker compose up -d

# Or from the repo root
./run-example.sh aws-firehose-logs
```

## Accessing

- **Grafana**: http://localhost:3000 (no login)
- **Alloy UI**: http://localhost:12345 — confirm components healthy, use livedebugging to watch records flow through
- **Firehose endpoint**: http://localhost:9999/awsfirehose/api/v1/push (POSTable from your laptop)
- **Loki API**: http://localhost:3100

## Trying it out

Within ~10 seconds of bring-up, the sender starts producing batches. In Grafana Explore on Loki:

```logql
# All Firehose-delivered logs
{log_group=~".+"}

# Just VPC flow logs
{log_group="/aws/vpc/flowlogs"}

# A specific ENI
{log_group="/aws/vpc/flowlogs", log_stream="eni-0abc1234-all"}

# Lambda invocations
{log_group="/aws/lambda/checkout-service"}

# Just the data records (vs control messages)
{msg_type="DATA_MESSAGE"}
```

The promoted labels `log_group`, `log_stream`, and `msg_type` come from the CloudWatch envelope — `loki.source.awsfirehose` automatically attaches `__aws_cw_log_group`, `__aws_cw_log_stream`, and `__aws_cw_msg_type` discovery labels when the records contain a CloudWatch subscription filter envelope; this scenario's `loki.relabel` block promotes them.

## Send your own records

The receiver is just an HTTP endpoint. From your laptop:

```bash
curl -X POST http://localhost:9999/awsfirehose/api/v1/push \
  -H 'Content-Type: application/json' \
  -d '{
    "requestId": "test-1",
    "timestamp": 1234567890,
    "records": [
      {"data": "'$(printf '{"messageType":"DATA_MESSAGE","logGroup":"/manual","logStream":"laptop","logEvents":[{"id":"x","timestamp":1234567890000,"message":"hi from curl"}]}' | gzip | base64 | tr -d '\n')'"}
    ]
  }'
```

This adds a one-off entry visible at `{log_group="/manual"}`.

## Differences from real Firehose

This scenario emulates the wire format. A real Firehose delivery stream has a few additional concerns the demo doesn't cover:

- **Authentication**: real Firehose includes an `X-Amz-Firehose-Access-Key` header that the receiver validates. `loki.source.awsfirehose` supports this via the `access_key` argument; we leave it disabled in the demo for ease of trying it from curl. In production, **always** set an access key.
- **TLS**: real Firehose requires HTTPS. Add `tls { cert_file = ..., key_file = ... }` to the Alloy `http` block in production.
- **Retry semantics**: real Firehose retries on 5xx and partial successes. The Python sender here just logs failures and moves on.
- **Custom labels via header**: real Firehose can set `X-Amz-Firehose-Common-Attributes` (label names prefixed `lbl_`). Try adding this to your own producer to see additional discovery labels appear.

## Stopping

```bash
docker compose down -v
```


================================================
FILE: aws-firehose-logs/config.alloy
================================================
// AWS Kinesis Data Firehose → Loki, no AWS account required.
//
// `loki.source.awsfirehose` is just an HTTP endpoint that accepts
// Firehose's documented delivery format (a `records` array of base64
// blobs). A small Python sender container in this scenario fakes the
// producer side, posting CloudWatch-style log batches every few
// seconds. The component auto-detects the CloudWatch envelope and
// attaches the `__aws_cw_*` discovery labels we relabel below.

livedebugging { enabled = true }

// CloudWatch envelope discovery labels are exposed by
// `loki.source.awsfirehose` only via its `relabel_rules` argument
// (same pattern as `loki.source.journal`). They are NOT attached to
// outgoing entries by default — running them through a standalone
// `loki.relabel` after the source would see no `__aws_cw_*` labels.
// Rule container only: `forward_to` is empty because no entries are routed
// through this component. `loki.source.awsfirehose` below consumes its
// exported `rules` instead.
loki.relabel "firehose" {
	forward_to = []

	// Each rule copies one `__aws_cw_*` discovery label into a plain label.
	rule {
		source_labels = ["__aws_cw_log_group"]
		target_label  = "log_group"
	}
	rule {
		source_labels = ["__aws_cw_log_stream"]
		target_label  = "log_stream"
	}
	rule {
		source_labels = ["__aws_cw_msg_type"]
		target_label  = "msg_type"
	}
}

// HTTP receiver for Firehose-format deliveries. Listens on all interfaces on
// port 9999, applies the relabel rules defined above, and hands entries to
// the Loki writer.
loki.source.awsfirehose "fake" {
	http {
		listen_address = "0.0.0.0"
		listen_port    = 9999
	}
	relabel_rules = loki.relabel.firehose.rules
	forward_to    = [loki.write.local.receiver]
}

// Push received entries to the `loki` service's push API.
loki.write "local" {
	endpoint {
		url = "http://loki:3100/loki/api/v1/push"
	}
}


================================================
FILE: aws-firehose-logs/docker-compose.yml
================================================
services:

  # Log store. Uses the loki-config.yaml that sits next to this file.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100/tcp"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Grafana with anonymous Admin access. The entrypoint writes a Loki
  # datasource provisioning file and then hands over to /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  # Alloy runs config.alloy. Port 12345 is its HTTP server (see the
  # --server.http.listen-addr flag); port 9999 is the Firehose receiver
  # configured in config.alloy.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"
      - "9999:9999"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - loki

  # Fake Firehose producer: runs firehose_sender.py, which POSTs synthetic
  # batches to the Alloy receiver. The environment variables below are read
  # by the script.
  firehose-sender:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./firehose_sender.py:/firehose_sender.py:ro
    environment:
      - ALLOY_FIREHOSE_URL=http://alloy:9999/awsfirehose/api/v1/push
      - INTERVAL_SECONDS=5
      - EVENTS_PER_BATCH=8
    depends_on:
      - alloy
    # -u keeps Python's output unbuffered.
    command: ["python3", "-u", "/firehose_sender.py"]
    restart: unless-stopped


================================================
FILE: aws-firehose-logs/firehose_sender.py
================================================
"""Fake AWS Kinesis Firehose producer for the aws-firehose-logs scenario.

Generates synthetic VPC-flow-style log batches, wraps them in the
CloudWatch logs subscription envelope (so Alloy attaches the
`__aws_cw_*` discovery labels), then posts them to Alloy's
`loki.source.awsfirehose` HTTP endpoint in the documented Firehose
delivery format.

No AWS account or SDK required — this is just an HTTP client.
"""

import base64
import gzip
import json
import os
import random
import sys
import time
import uuid
from datetime import datetime, timezone
from urllib import request as urlrequest

ENDPOINT = os.environ.get(
    "ALLOY_FIREHOSE_URL",
    "http://alloy:9999/awsfirehose/api/v1/push",
)
INTERVAL = float(os.environ.get("INTERVAL_SECONDS", "5"))
EVENTS_PER_BATCH = int(os.environ.get("EVENTS_PER_BATCH", "8"))

LOG_GROUPS = [
    ("/aws/vpc/flowlogs", "eni-0abc1234-all"),
    ("/aws/vpc/flowlogs", "eni-0def5678-all"),
    ("/aws/lambda/checkout-service", "2026/04/28/[$LATEST]abc"),
]

ACTIONS = ["ACCEPT", "REJECT"]


def vpc_flow_line() -> str:
    """Return one synthetic VPC-flow-style record as a space-separated line.

    Source and destination are random ``10.0.x.y`` addresses, packet and byte
    counts are random, and the action is drawn from ``ACTIONS`` with a 9:1
    weighting. The two timestamps are the current epoch second and 30 seconds
    before it. All other fields are fixed literals.
    """

    def random_address() -> str:
        # Third octet 0-255, fourth octet 1-254.
        return f"10.0.{random.randint(0,255)}.{random.randint(1,254)}"

    source_ip = random_address()
    dest_ip = random_address()
    byte_count = random.randint(40, 65000)
    packet_count = random.randint(1, 50)
    (verdict,) = random.choices(ACTIONS, weights=[9, 1])
    window_end = int(time.time())
    window_start = window_end - 30

    fields = [
        "2",
        "123456789012",
        "eni-0abc1234",
        source_ip,
        dest_ip,
        "12345",
        "443",
        "6",
        str(packet_count),
        str(byte_count),
        str(window_start),
        str(window_end),
        verdict,
        "OK",
    ]
    return " ".join(fields)


def lambda_log_line() -> str:
    """Return one synthetic Lambda-style log line.

    The line has the form
    ``<UTC ISO-8601 timestamp>Z <LEVEL> RequestId: <uuid4> processing checkout``.
    The level is picked at random, with INFO three times as likely as WARN
    or ERROR.
    """
    levels = ["INFO", "INFO", "INFO", "WARN", "ERROR"]
    level = random.choice(levels)
    request_id = str(uuid.uuid4())
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop the tzinfo so isoformat() still omits the "+00:00"
    # offset; the literal "Z" suffix is appended below as before.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return f"{now_utc.isoformat()}Z {level} RequestId: {request_id} processing checkout"


def cloudwatch_envelope(log_group: str, log_stream: str, line_fn) -> dict:
    """Wrap freshly generated log lines in a CloudWatch subscription envelope.

    ``line_fn`` is called with no arguments once per event,
    ``EVENTS_PER_BATCH`` times, to produce each event's message. Every event
    also gets a new UUID and the current time in epoch milliseconds.

    See: https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/SubscriptionFilters.html
    """
    events = []
    for _ in range(EVENTS_PER_BATCH):
        events.append(
            {
                "id": str(uuid.uuid4()),
                "timestamp": int(time.time() * 1000),
                "message": line_fn(),
            }
        )

    envelope = {
        "messageType": "DATA_MESSAGE",
        "owner": "123456789012",
        "logGroup": log_group,
        "logStream": log_stream,
        "subscriptionFilters": ["AlloyDemo"],
        "logEvents": events,
    }
    return envelope


def encode_record(envelope: dict) -> dict:
    """Turn an envelope dict into a single Firehose record.

    The envelope is serialised to JSON, gzip-compressed, and base64-encoded.
    The resulting text becomes the record's ``data`` field. See:
    https://docs.aws.amazon.com/firehose/latest/dev/httpdeliveryrequestresponse.html
    """
    gzipped = gzip.compress(json.dumps(envelope).encode())
    encoded = base64.b64encode(gzipped).decode()
    return {"data": encoded}


def send_batch() -> None:
    """Build one batch for a randomly chosen log stream and POST it.

    Lambda log groups get Lambda-style lines and everything else gets
    VPC-flow-style lines. The outcome, either the HTTP status or the failure,
    is printed. Errors are reported and swallowed so the caller's loop keeps
    running.
    """
    group, stream = random.choice(LOG_GROUPS)
    if "lambda" in group:
        make_line = lambda_log_line
    else:
        make_line = vpc_flow_line
    envelope = cloudwatch_envelope(group, stream, make_line)

    request_id = str(uuid.uuid4())
    body = {
        "requestId": request_id,
        "timestamp": int(time.time() * 1000),
        "records": [encode_record(envelope)],
    }
    headers = {
        "Content-Type": "application/json",
        "X-Amz-Firehose-Request-Id": request_id,
    }
    payload = json.dumps(body).encode()
    req = urlrequest.Request(ENDPOINT, data=payload, headers=headers)

    target = f"{group}/{stream}"
    try:
        with urlrequest.urlopen(req, timeout=5) as resp:
            print(f"POST {target}: {resp.status}", flush=True)
    except Exception as e:
        # Best effort: report the failure and let the next batch try again.
        print(f"POST {target}: FAILED {e}", flush=True)


def main() -> int:
    """Send batches forever, pausing ``INTERVAL`` seconds between them.

    The loop never exits, so despite the annotation no value is returned.
    """
    # Short head start so Alloy's HTTP listener is up before the first POST.
    startup_delay_seconds = 3
    time.sleep(startup_delay_seconds)

    while True:
        send_batch()
        time.sleep(INTERVAL)


if __name__ == "__main__":
    sys.exit(main() or 0)


================================================
FILE: aws-firehose-logs/loki-config.yaml
================================================
# Loki configuration for the aws-firehose-logs scenario; all state is kept on
# the local filesystem under /tmp/loki.

# Authentication is switched off for this local demo.
auth_enabled: false

# Limits: accept structured metadata on incoming entries and turn on the
# volume feature.
limits_config:
  allow_structured_metadata: true
  volume_enabled: true

# HTTP port Loki listens on; the Alloy writer and the Grafana datasource in
# this scenario both target loki:3100.
server:
  http_listen_port: 3100

# Settings shared by Loki's components: an in-memory ring key-value store,
# a replication factor of 1, and /tmp/loki as the base path for on-disk state.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# One schema period starting 2020-05-15: TSDB index, filesystem object store,
# schema v13, index tables prefixed "index_" with a 24h period.
schema_config:
  configs:
    - from: 2020-05-15
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

# On-disk locations for the TSDB index, its cache, and the chunks.
storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

# Turn on the pattern ingester.
pattern_ingester:
  enabled: true

# NOTE(review): max_chunk_age is set to 5m here, presumably so chunks are
# flushed quickly in a short-lived demo; confirm the intent.
ingester:
  max_chunk_age: 5m


================================================
FILE: blackbox-probing/README.md
================================================
# Blackbox Probing

This scenario demonstrates **synthetic monitoring** and **HTTP endpoint probing** using Grafana Alloy's `prometheus.exporter.blackbox` component.

## Overview

Blackbox probing (also known as synthetic monitoring) tests the availability and responsiveness of services from an external perspective. Instead of instrumenting applications to export metrics, the blackbox exporter actively probes endpoints and reports whether they are reachable, how long they take to respond, and other HTTP-level details.

This scenario probes two targets:
- **nginx** — a simple web server running on port 80
- **prometheus** — the Prometheus server running on port 9090

## Architecture

```
Alloy (blackbox exporter) --probes--> nginx:80
                          --probes--> prometheus:9090
                          --writes--> Prometheus (remote write)
Grafana --queries--> Prometheus
```

## Running

```bash
# From this directory
docker compose up -d

# Or from the repo root
./run-example.sh blackbox-probing
```

## Accessing the Stack

| Service    | URL                        |
|------------|----------------------------|
| Grafana    | http://localhost:3000       |
| Alloy UI   | http://localhost:12345      |
| Prometheus | http://localhost:9090       |
| nginx      | http://localhost:8080       |

## Key Metrics

Once running, you can query these metrics in Grafana or Prometheus:

- `probe_success` — 1 if the probe succeeded, 0 if it failed
- `probe_duration_seconds` — total time the probe took
- `probe_http_status_code` — HTTP status code returned by the target
- `probe_http_duration_seconds` — duration of each phase of the HTTP request (resolve, connect, tls, processing, transfer)

## Stopping

```bash
docker compose down
```


================================================
FILE: blackbox-probing/config.alloy
================================================
// --- Remote Write to Prometheus ---
// Destination for every metric collected below: the `prometheus` service's
// remote-write endpoint.
prometheus.remote_write "remote" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}

// --- Blackbox Exporter Configuration ---
// The inline `config` string defines a single module, `http_2xx`, which uses
// the http prober with a 5s timeout. Each `target` block names one endpoint
// to probe with that module.
prometheus.exporter.blackbox "default" {
	config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }"

	target {
		name    = "nginx"
		address = "http://nginx:80"
		module  = "http_2xx"
	}

	target {
		name    = "prometheus"
		address = "http://prometheus:9090"
		module  = "http_2xx"
	}
}

// --- Blackbox Scrape Configuration ---
// Scrape the exporter's targets every 15s and forward the samples to the
// remote-write component above.
prometheus.scrape "blackbox_targets" {
	scrape_interval = "15s"
	targets         = prometheus.exporter.blackbox.default.targets
	forward_to      = [prometheus.remote_write.remote.receiver]
}

// --- Enable Live Debugging ---
livedebugging {
	enabled = true
}


================================================
FILE: blackbox-probing/docker-compose.coda.yml
================================================
# NOTE(review): this file defines only the nginx probe target, with the same
# image and port mapping as docker-compose.yml. It is presumably the
# "app containers" set that the repo's `coda` script starts; confirm.
services:
  nginx:
    image: nginx:latest@sha256:1881968aff6f7cdcc4b888c00a11f4ce241ad7ec957e0cb4a9e19e93a3ff87ea
    ports:
      - 8080:80/tcp


================================================
FILE: blackbox-probing/docker-compose.yml
================================================

services:

  # Probe target: a stock nginx reachable at nginx:80 inside the Compose
  # network and on host port 8080.
  nginx:
    image: nginx:latest@sha256:1881968aff6f7cdcc4b888c00a11f4ce241ad7ec957e0cb4a9e19e93a3ff87ea
    ports:
      - "8080:80/tcp"

  # Metrics store, and the second probe target. The remote-write receiver is
  # enabled so Alloy can push probe results to /api/v1/write.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090/tcp"
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  # Grafana with anonymous Admin access. The entrypoint writes a Prometheus
  # datasource provisioning file and then hands over to /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  # Alloy runs config.alloy; port 12345 is its HTTP server (see the
  # --server.http.listen-addr flag).
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy


================================================
FILE: blackbox-probing/prom-config.yaml
================================================
# Minimal Prometheus configuration
# No scrape_configs are defined: in this scenario Alloy pushes the probe
# metrics to Prometheus over remote write (see config.alloy).
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: cloudwatch-metrics/README.md
================================================
# AWS CloudWatch metrics — no AWS account required

Demonstrates `prometheus.exporter.cloudwatch`, Alloy's built-in wrapper around [YACE](https://github.com/nerdswords/yet-another-cloudwatch-exporter). **No real AWS account or live infrastructure needed** — [LocalStack](https://localstack.cloud/) emulates the CloudWatch and STS APIs locally, and a small Python seeder container plants synthetic `EC2/CPUUtilization` data points every 30 s.

This is the same offline-reproducibility pattern used by [`aws-firehose-logs/`](../aws-firehose-logs/).

## Architecture

```
metric-seeder (Python)
  └── put_metric_data → LocalStack CloudWatch (:4566)
                              ↑
                        Alloy prometheus.exporter.cloudwatch
                              ↓
                        prometheus.scrape → prometheus.remote_write
                              ↓
                        Prometheus (:9090)
                              ↑
                        Grafana (:3000)
```

- **`localstack`** — emulates `cloudwatch` + `sts` APIs; no AWS credentials required
- **`metric-seeder`** — pushes `CPUUtilization` (random 5–85 %) for `i-1234567890abcdef0` every 30 s
- **`alloy`** — runs `prometheus.exporter.cloudwatch` pointed at LocalStack via `AWS_ENDPOINT_URL`; scrapes every 60 s and remote-writes to Prometheus
- **`prometheus`** — stores and serves metrics
- **`grafana`** — visualises with Prometheus datasource auto-provisioned

## Running

```bash
# From this directory
docker compose up -d

# Or from the repo root
./run-example.sh cloudwatch-metrics
```

LocalStack and the metric-seeder start first; Alloy waits for LocalStack to be healthy before scraping.

## Accessing

| Service | URL |
|---|---|
| **Grafana** | http://localhost:3000 (no login) |
| **Prometheus** | http://localhost:9090 |
| **Alloy UI** | http://localhost:12345 |
| **LocalStack** | http://localhost:4566/_localstack/health |

## Trying it out

Within ~90 s of bring-up (LocalStack ready → seeder plants first points → Alloy scrapes → Prometheus ingests), metrics appear in Prometheus.

Open **Grafana → Explore → Prometheus** and run:

```promql
# CPU utilisation for the seeded EC2 instance
aws_ec2_cpuutilization_average

# Maximum CPU in the last 5 m
aws_ec2_cpuutilization_maximum

# All CloudWatch-sourced metrics
{job="cloudwatch/localstack/ec2_cpu"}
```

Or query Prometheus directly:

```bash
curl -sG 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=aws_ec2_cpuutilization_average' | jq .
```

In the **Alloy UI** (http://localhost:12345), navigate to **Graph** to see the pipeline:
`prometheus.exporter.cloudwatch.localstack` → `prometheus.scrape.cloudwatch` → `prometheus.remote_write.local`

Use **livedebugging** on `prometheus.scrape.cloudwatch` to watch metrics flow through in real time.

## Adapting for real AWS

To point this scenario at real CloudWatch instead of LocalStack:

1. Remove the `localstack` and `metric-seeder` services from `docker-compose.yml`
2. Remove the `AWS_ENDPOINT_URL` environment variable from the `alloy` service
3. Set real credentials:
   ```yaml
   environment:
     - AWS_ACCESS_KEY_ID=<your-key>
     - AWS_SECRET_ACCESS_KEY=<your-secret>
     - AWS_DEFAULT_REGION=us-east-1
   ```
4. Update the `dimensions` in `config.alloy` to match a real `InstanceId` in your account

The `config.alloy` static job configuration and Alloy pipeline are identical for both LocalStack and real AWS.


================================================
FILE: cloudwatch-metrics/config.alloy
================================================
// AWS CloudWatch metrics → Prometheus — no AWS account required.
//
// Uses LocalStack to emulate CloudWatch locally. A companion `metric-seeder`
// container pushes synthetic EC2/CPUUtilization data points every 30 s so
// Alloy has real data to scrape immediately on start-up.
//
// `prometheus.exporter.cloudwatch` wraps YACE and honours AWS SDK v2 endpoint
// overrides; we point it at LocalStack via AWS_ENDPOINT_URL in docker-compose.

livedebugging { enabled = true }

// Static job: no live EC2 discovery needed — we target the exact InstanceId
// that the metric-seeder plants in LocalStack CloudWatch.
prometheus.exporter.cloudwatch "localstack" {
	sts_region = "us-east-1"

	// One static job for the AWS/EC2 namespace in us-east-1.
	static "ec2_cpu" {
		regions   = ["us-east-1"]
		namespace = "AWS/EC2"

		// Must match the InstanceId the metric-seeder writes data for.
		dimensions = {
			"InstanceId" = "i-1234567890abcdef0",
		}

		// Request the Average and Maximum statistics of CPUUtilization
		// with a 1m period.
		metric {
			name       = "CPUUtilization"
			statistics = ["Average", "Maximum"]
			period     = "1m"
		}
	}
}

// Scrape the exporter every 60 s — CloudWatch data points are coarse-grained
// so there is no benefit in scraping more frequently.
prometheus.scrape "cloudwatch" {
	targets         = prometheus.exporter.cloudwatch.localstack.targets
	forward_to      = [prometheus.remote_write.local.receiver]
	scrape_interval = "60s"
}

// Remote-write to the local Prometheus instance.
prometheus.remote_write "local" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}


================================================
FILE: cloudwatch-metrics/docker-compose.yml
================================================
services:

  # LocalStack emulates the CloudWatch + STS APIs locally.
  # No real AWS account or credentials needed.
  localstack:
    image: localstack/localstack:${LOCALSTACK_VERSION:-4.4.0}
    ports:
      - "4566:4566"
    environment:
      - SERVICES=cloudwatch,sts
      - DEFAULT_REGION=us-east-1
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:4566/_localstack/health"]
      interval: 5s
      timeout: 5s
      retries: 15

  # Pushes synthetic EC2/CPUUtilization data into LocalStack every 30 s.
  metric-seeder:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./seed-metrics.py:/seed-metrics.py:ro
    environment:
      - AWS_ACCESS_KEY_ID=test
      - AWS_SECRET_ACCESS_KEY=test
      - AWS_DEFAULT_REGION=us-east-1
      - AWS_ENDPOINT_URL=http://localstack:4566
      - INTERVAL_SECONDS=30
    command: >
      sh -c "pip install boto3 --quiet && python -u /seed-metrics.py"
    depends_on:
      localstack:
        condition: service_healthy
    restart: unless-stopped

  # Metrics store. The remote-write receiver is enabled so Alloy can push
  # samples to /api/v1/write.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  # Grafana with anonymous Admin access. The entrypoint writes a Prometheus
  # datasource provisioning file and then hands over to /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh
    depends_on:
      - prometheus

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    environment:
      # Point AWS SDK v2 at LocalStack instead of real AWS endpoints.
      - AWS_ACCESS_KEY_ID=test
      - AWS_SECRET_ACCESS_KEY=test
      - AWS_DEFAULT_REGION=us-east-1
      - AWS_ENDPOINT_URL=http://localstack:4566
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      localstack:
        condition: service_healthy
      prometheus:
        condition: service_started


================================================
FILE: cloudwatch-metrics/prom-config.yaml
================================================
# No scrape_configs are defined: in this scenario Alloy pushes the CloudWatch
# metrics to Prometheus over remote write (see config.alloy).
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: cloudwatch-metrics/seed-metrics.py
================================================
"""
CloudWatch metric seeder for LocalStack.

Pushes synthetic EC2 CPUUtilization data points into LocalStack every
INTERVAL_SECONDS so that prometheus.exporter.cloudwatch has something
to scrape immediately without a real AWS account.
"""
import os
import random
import time

import boto3
from botocore.config import Config

# Settings, overridable through environment variables.
ENDPOINT    = os.getenv("AWS_ENDPOINT_URL", "http://localstack:4566")
REGION      = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
INTERVAL    = int(os.getenv("INTERVAL_SECONDS", "30"))  # seconds between pushes
INSTANCE_ID = "i-1234567890abcdef0"  # fixed synthetic InstanceId dimension value

# CloudWatch client bound to ENDPOINT; botocore retries each call up to
# 5 attempts before raising.
cw = boto3.client(
    "cloudwatch",
    endpoint_url=ENDPOINT,
    region_name=REGION,
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID", "test"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY", "test"),
    config=Config(retries={"max_attempts": 5}),
)

print(f"Seeder started — pushing to {ENDPOINT} every {INTERVAL}s", flush=True)

# Push one random CPUUtilization sample per interval, forever.
while True:
    cpu = round(random.uniform(5.0, 85.0), 2)
    try:
        cw.put_metric_data(
            Namespace="AWS/EC2",
            MetricData=[
                {
                    "MetricName": "CPUUtilization",
                    "Dimensions": [{"Name": "InstanceId", "Value": INSTANCE_ID}],
                    "Value": cpu,
                    "Unit": "Percent",
                }
            ],
        )
    except Exception as exc:
        # A failed push (for example the endpoint being briefly unreachable)
        # must not end the seeder: report it and try again on the next tick.
        print(f"  ! put_metric_data failed, retrying in {INTERVAL}s: {exc}", flush=True)
    else:
        print(f"  → CPUUtilization={cpu}%  instance={INSTANCE_ID}", flush=True)
    time.sleep(INTERVAL)


================================================
FILE: coda
================================================
#!/usr/bin/env bash
set -euo pipefail

# Locate the repository checkout. Coda VMs keep it at /opt/alloy-scenarios
# (this script is symlinked from /usr/local/bin/coda there). Anywhere else,
# fall back to the directory containing this script, which is only correct
# when the script is invoked directly rather than through a symlink.
REPO_DIR="/opt/alloy-scenarios"
if [[ ! -d "$REPO_DIR" ]]; then
  REPO_DIR="$(cd "$(dirname "$0")" && pwd)"
fi
SCENARIO_FILE="/etc/coda/scenario"
ENV_FILE="${REPO_DIR}/image-versions.env"

# Print the command summary to stdout and terminate the script with status 1.
usage() {
  printf '%s\n' \
    "Usage: coda <command> [scenario]" \
    "" \
    "Commands:" \
    "  start  [scenario]   Start app containers for a scenario" \
    "  stop   [scenario]   Stop app containers for a scenario" \
    "  status [scenario]   Show container status for a scenario" \
    "  list                List all available scenarios" \
    "" \
    "If no scenario is given, reads from ${SCENARIO_FILE}."
  exit 1
}

# Print the scenario to operate on.
#   $1 (optional) - scenario given on the command line.
# When $1 is missing or empty, the contents of SCENARIO_FILE are printed
# instead; if that file does not exist, an error goes to stderr and the
# (sub)shell exits with status 1.
resolve_scenario() {
  local requested="${1:-}"

  if [[ -n "$requested" ]]; then
    echo "$requested"
    return
  fi

  if [[ ! -f "$SCENARIO_FILE" ]]; then
    echo "Error: no scenario specified and ${SCENARIO_FILE} not found" >&2
    exit 1
  fi

  # Assign first so that a failing `cat` is not masked by `echo`.
  local from_file
  from_file="$(cat "$SCENARIO_FILE")"
  echo "$from_file"
}

# Print the `docker compose` options for a scenario on one line:
#   -f <compose file> --env-file <ENV_FILE> -p <project name>
#   $1 - scenario path relative to REPO_DIR (may contain slashes).
# Exits with status 1 (message on stderr) when the scenario has no
# docker-compose.coda.yml.
#
# Every word is shell-quoted with printf %q because callers feed this output
# back through `eval`; without quoting, a path containing spaces or shell
# metacharacters would be split or interpreted when it is re-parsed.
compose_args() {
  local scenario="$1"
  local dir="${REPO_DIR}/${scenario}"
  local compose_file="${dir}/docker-compose.coda.yml"

  if [[ ! -f "$compose_file" ]]; then
    echo "Error: ${compose_file} not found" >&2
    exit 1
  fi

  # Sanitize project name: replace / with -
  local project_name="coda-${scenario//\//-}"

  printf '%q ' -f "$compose_file" --env-file "$ENV_FILE" -p "$project_name"
  printf '\n'
}

# start: bring up (and build) the containers of a scenario in the background.
#   $1 (optional) - scenario; resolved through resolve_scenario when absent.
cmd_start() {
  # Declared separately from the assignments so that a failing helper still
  # aborts the script under `set -e`.
  local name compose_opts
  name="$(resolve_scenario "${1:-}")"
  compose_opts="$(compose_args "$name")"
  echo "Starting scenario: ${name}"
  eval docker compose $compose_opts up -d --build
}

# stop: take down the containers of a scenario.
#   $1 (optional) - scenario; resolved through resolve_scenario when absent.
cmd_stop() {
  # Declared separately from the assignments so that a failing helper still
  # aborts the script under `set -e`.
  local name compose_opts
  name="$(resolve_scenario "${1:-}")"
  compose_opts="$(compose_args "$name")"
  echo "Stopping scenario: ${name}"
  eval docker compose $compose_opts down
}

# status: show `docker compose ps` for a scenario.
#   $1 (optional) - scenario; resolved through resolve_scenario when absent.
cmd_status() {
  # Declared separately from the assignments so that a failing helper still
  # aborts the script under `set -e`.
  local name compose_opts
  name="$(resolve_scenario "${1:-}")"
  compose_opts="$(compose_args "$name")"
  eval docker compose $compose_opts ps
}

# list: print every scenario (directory containing docker-compose.coda.yml)
# below REPO_DIR, as a sorted list of paths relative to REPO_DIR.
#
# The REPO_DIR prefix and the file-name suffix are removed with parameter
# expansion rather than sed, so characters in REPO_DIR that are special to
# sed (such as `|`, `.` or `[`) cannot break or distort the listing.
cmd_list() {
  echo "Available scenarios:"
  local compose_path rel
  find "$REPO_DIR" -name docker-compose.coda.yml 2>/dev/null \
    | while IFS= read -r compose_path; do
        rel="${compose_path#"${REPO_DIR}/"}"
        printf '%s\n' "${rel%/docker-compose.coda.yml}"
      done \
    | sort \
    | while IFS= read -r rel; do echo "  $rel"; done
}

# Entry point: a sub-command is mandatory; anything unrecognised prints usage.
if (( $# < 1 )); then
  usage
fi

command="$1"
shift

# Remaining arguments (the optional scenario) are passed through unchanged.
case "$command" in
  list)   cmd_list ;;
  start)  cmd_start "$@" ;;
  stop)   cmd_stop "$@" ;;
  status) cmd_status "$@" ;;
  *)      usage ;;
esac


================================================
FILE: continuous-profiling/README.md
================================================
# Continuous Profiling

This scenario demonstrates continuous profiling of a Go application using Grafana Alloy's `pyroscope.scrape` and `pyroscope.write` components, with Grafana Pyroscope as the profiling backend.

## Overview

The example includes:
- **demo-app** -- A Go application that performs CPU-intensive and memory-intensive work, exposing standard pprof endpoints on port 6060
- **alloy** -- Grafana Alloy configured to scrape pprof profiles from the demo app and forward them to Pyroscope
- **pyroscope** -- Grafana Pyroscope for storing and querying profiling data
- **grafana** -- Grafana with the Pyroscope datasource pre-configured for visualizing profiles

## Running the Demo

1. Clone the repository:
   ```
   git clone https://github.com/grafana/alloy-scenarios.git
   cd alloy-scenarios
   ```

2. Navigate to this example directory:
   ```
   cd continuous-profiling
   ```

3. Run using Docker Compose:
   ```
   docker compose up -d
   ```

   Or use the centralized image management:
   ```
   cd ..
   ./run-example.sh continuous-profiling
   ```

4. Access Grafana at http://localhost:3000

## What to Expect

After starting the scenario, Alloy will scrape the following profile types from the demo app every 15 seconds:

- **CPU** -- Identifies functions consuming the most CPU time (the `cpuIntensive` goroutine)
- **Memory (heap)** -- Shows memory allocation patterns (the `memoryIntensive` goroutine allocating 1MB chunks)
- **Goroutine** -- Displays active goroutines and their stack traces
- **Mutex** -- Captures mutex contention profiles
- **Block** -- Captures blocking operation profiles

To view profiles:

1. Open Grafana at http://localhost:3000
2. Navigate to **Explore**
3. Select the **Pyroscope** datasource
4. Choose a profile type (e.g., `process_cpu`) and the `demo-app` service
5. You should see flame graphs showing where the application spends its time and allocates memory

## Architecture

```
┌───────────┐     scrape pprof     ┌───────────┐     push profiles     ┌────────────┐
│  demo-app │◀─────────────────────│   Alloy   │─────────────────────▶│ Pyroscope  │
│  :6060    │     /debug/pprof/*   │  :12345   │                      │   :4040    │
└───────────┘                      └───────────┘                      └─────┬──────┘
                                                                            │
                                                                            ▼
                                                                      ┌──────────┐
                                                                      │ Grafana  │
                                                                      │  :3000   │
                                                                      └──────────┘
```

## Useful Links

- Alloy UI: http://localhost:12345 -- Inspect the Alloy pipeline and component status
- Grafana: http://localhost:3000 -- Explore profiles via the Pyroscope datasource
- Pyroscope: http://localhost:4040 -- Direct access to the Pyroscope UI
- Demo app pprof index: http://localhost:6060/debug/pprof/ -- Raw pprof endpoints


================================================
FILE: continuous-profiling/app/go.mod
================================================
// Module definition for the continuous-profiling demo application.
module demo

// Go version directive for this module.
go 1.23


================================================
FILE: continuous-profiling/app/main.go
================================================
package main

import (
	"fmt"
	"math/rand"
	"net/http"
	_ "net/http/pprof"
	"time"
)

// cpuIntensive generates CPU load forever: each round adds up one million
// pseudo-random integers and then sleeps for 100ms. It never returns.
func cpuIntensive() {
	const roundSize = 1000000
	for {
		total := 0
		for n := 0; n < roundSize; n++ {
			total += rand.Intn(100)
		}
		time.Sleep(100 * time.Millisecond)
	}
}

// memoryIntensive generates heap activity forever: every 500ms it allocates a
// 1MB buffer, fills it with pseudo-random bytes and appends it to a slice
// that is trimmed from the front once it holds more than 50 buffers.
// It never returns.
func memoryIntensive() {
	const (
		chunkSize = 1024 * 1024 // 1MB
		maxChunks = 50
	)

	var retained [][]byte
	for {
		buf := make([]byte, chunkSize)
		for idx := range buf {
			buf[idx] = byte(rand.Intn(256))
		}

		retained = append(retained, buf)
		if len(retained) > maxChunks {
			// Drop the oldest buffer from the window.
			retained = retained[1:]
		}

		time.Sleep(500 * time.Millisecond)
	}
}

// main starts the two background workload goroutines and then serves HTTP on
// :6060 using the default mux (handler nil), which is where the blank
// net/http/pprof import registers its /debug/pprof/ handlers.
func main() {
	go cpuIntensive()
	go memoryIntensive()

	fmt.Println("Demo app running on :6060 with pprof endpoints")

	// ListenAndServe blocks and only returns with an error (for example when
	// the port is already in use). Surface it and exit non-zero instead of
	// letting main return silently with status 0.
	if err := http.ListenAndServe(":6060", nil); err != nil {
		panic(fmt.Sprintf("pprof HTTP server failed: %v", err))
	}
}


================================================
FILE: continuous-profiling/config.alloy
================================================
// Turn on Alloy's live debugging feature for this configuration.
livedebugging {
	enabled = true
}

// Pull pprof profiles from the demo Go application every 15 seconds and hand
// them to the pyroscope.write "default" component.
pyroscope.scrape "default" {
	forward_to      = [pyroscope.write.default.receiver]
	scrape_interval = "15s"

	// Single static target; service_name is attached as a label.
	targets = [
		{"__address__" = "demo-app:6060", "service_name" = "demo-app"},
	]

	// Each profile type collected by this scenario is enabled explicitly.
	profiling_config {
		profile.process_cpu {
			enabled = true
		}

		profile.memory {
			enabled = true
		}

		profile.goroutine {
			enabled = true
		}

		profile.mutex {
			enabled = true
		}

		profile.block {
			enabled = true
		}
	}
}

// Send the profiles received from pyroscope.scrape to the Pyroscope server.
pyroscope.write "default" {
	endpoint {
		url = "http://pyroscope:4040"
	}
}


================================================
FILE: continuous-profiling/docker-compose.coda.yml
================================================
services:
  # Demo Go application: runs ./app/main.go from the mounted source directory
  # and publishes port 6060.
  demo-app:
    image: golang:1.26@sha256:2981696eed011d747340d7252620932677929cce7d2d539602f56a8d7e9b660b
    working_dir: /app
    command: go run main.go
    volumes:
      - ./app:/app
    ports:
      - "6060:6060"


================================================
FILE: continuous-profiling/docker-compose.yml
================================================

services:
  # Demo Go application with pprof endpoints: runs ./app/main.go from the
  # mounted source directory and publishes port 6060.
  demo-app:
    image: golang:1.26@sha256:2981696eed011d747340d7252620932677929cce7d2d539602f56a8d7e9b660b
    working_dir: /app
    command: go run main.go
    volumes:
      - ./app:/app
    ports:
      - "6060:6060"

  # Pyroscope: storage and UI for the collected profiles, published on 4040.
  pyroscope:
    image: grafana/pyroscope:2.0.1@sha256:704889ae04768d982a0a71935bb054948993ddc3fe80234611d20877ba8be4c9
    ports:
      - "4040:4040"

  # Grafana for visualization, published on port 3000 with a Pyroscope
  # datasource provisioned at container start.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      # Anonymous access is enabled with the Admin org role; basic auth is
      # disabled.
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # The entrypoint writes the datasource provisioning file (pointing at
    # http://pyroscope:4040) and then runs /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Pyroscope
          type: grafana-pyroscope-datasource
          access: proxy
          orgId: 1
          url: http://pyroscope:4040
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh
    depends_on:
      - pyroscope

  # Alloy telemetry pipeline: runs ./config.alloy after demo-app and pyroscope
  # have been started.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    ports:
      - "12345:12345"  # Alloy HTTP server
    depends_on:
      - demo-app
      - pyroscope


================================================
FILE: docker-monitoring/README.md
================================================
# Docker Monitoring with Grafana Alloy

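This example demonstrates how to monitor Docker containers using Grafana Alloy: container metrics are collected with cAdvisor and sent to Prometheus, and container logs are sent to Loki.

## Prerequisites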
- Docker
- Docker Compose
- Git

## Running the Demo

### Step 1: Clone the repository
```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

### Step 2: Deploy the monitoring stack
```bash
cd alloy-scenarios/docker-monitoring
docker-compose up -d
```

> **Note (macOS Docker Desktop):** If Alloy cannot connect to the Docker socket, you may need to change the volume mount in `docker-compose.yml` from `/var/run/docker.sock` to `/var/run/docker.sock.raw`. This is a workaround specific to some versions of Docker Desktop on macOS.

### Step 3: Access Grafana Alloy UI
Open your browser and go to `http://localhost:12345`. 

### Step 4: Access Grafana UI
Open your browser and go to `http://localhost:3000`.


================================================
FILE: docker-monitoring/config.alloy
================================================
// ###############################
// #### Metrics Configuration ####
// ###############################

// Run the built-in cAdvisor exporter to expose container metrics
// (docker_only is enabled).
prometheus.exporter.cadvisor "example" {
  docker_only = true
}

// Give every cAdvisor target fixed `job` and `instance` labels before it is
// scraped.
discovery.relabel "example" {
  targets = prometheus.exporter.cadvisor.example.targets

  // job is always "integrations/docker".
  rule {
    replacement  = "integrations/docker"
    target_label = "job"
  }

  // instance is the value of constants.hostname.
  rule {
    replacement  = constants.hostname
    target_label = "instance"
  }
}

// Scrape the relabelled cAdvisor targets every 10 seconds and pass the samples
// to the remote_write component.
prometheus.scrape "scraper" {
  scrape_interval = "10s"
  targets         = discovery.relabel.example.output
  forward_to      = [prometheus.remote_write.demo.receiver]
}

// Configure a prometheus.remote_write component to send metrics to a Prometheus server.
// The URL is the remote-write endpoint of the `prometheus` service.
prometheus.remote_write "demo" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}

// ###############################
// #### Logging Configuration ####
// ###############################

// Discover Docker containers and extract metadata.
// Talks to the Docker daemon through the Unix socket at /var/run/docker.sock.
discovery.docker "linux" {
  host = "unix:///var/run/docker.sock"
}

// Relabeling rules for Docker log targets. No targets are processed here
// (targets is empty); only the exported `rules` are consumed, by
// loki.source.docker.
discovery.relabel "logs_integrations_docker" {
  targets = []

  // Copy the part of the Docker container name after the leading "/" into the
  // container_name label (relies on the default replacement being the first
  // capture group).
  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "container_name"
  }

  // instance is the value of constants.hostname.
  rule {
    replacement  = constants.hostname
    target_label = "instance"
  }
}


// Collect logs from the containers found by discovery.docker.linux, apply the
// relabeling rules defined above, and pass the entries to loki.process.
loki.source.docker "default" {
  host          = "unix:///var/run/docker.sock"
  targets       = discovery.docker.linux.targets
  relabel_rules = discovery.relabel.logs_integrations_docker.rules
  forward_to    = [loki.process.docker_logs.receiver]
}

// Process and filter Docker logs before sending to Loki.
// Example: Drop logs from infrastructure containers.
// Modify the regex pattern to match container names you want to exclude.
loki.process "docker_logs" {
  forward_to = [loki.write.local.receiver]

  // Drop entries whose container_name matches alloy, grafana or loki.
  stage.drop {
    source     = "container_name"
    expression = "(alloy|grafana|loki)"
  }
}

// Push the processed log entries to the `loki` service.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

================================================
FILE: docker-monitoring/docker-compose.yml
================================================
version: '3'
services:
  # Loki, started with the scenario's loki-config.yaml and published on 3100.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    ports:
      - "3100:3100"
  # Prometheus with the remote-write receiver enabled. No configuration file
  # is mounted here, so /etc/prometheus/prometheus.yml must come from the
  # image itself.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    ports:
      - "9090:9090"
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
  # Grafana UI, published on port 3000 with Loki and Prometheus datasources
  # provisioned at container start (Prometheus is the default).
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    ports:
      - "3000:3000/tcp"
    environment:
      # Anonymous access is enabled with the Admin org role; basic auth is
      # disabled.
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    # The entrypoint writes the datasource provisioning file and then runs
    # /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  # Grafana Alloy running ./config.alloy. It runs privileged with the Docker
  # socket and several host paths mounted (presumably required by the cAdvisor
  # exporter and Docker log collection in config.alloy; confirm before
  # trimming).
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    privileged: true
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    environment:
      ALLOY_DEPLOY_MODE: docker
    ports:
      - "12345:12345"
      - "4317:4317"
      - "4318:4318"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - /proc:/rootproc:ro
      - /var/run/docker.sock:/var/run/docker.sock
      - /sys:/sys:ro
      - /:/rootfs:ro
      - /dev/disk/:/dev/disk:ro
      - /var/lib/docker/:/var/lib/docker:ro
    devices:
      - /dev/kmsg
    extra_hosts:
      - "host.docker.internal:host-gateway"

================================================
FILE: docker-monitoring/grafana/datasources/default.yml
================================================
# Grafana datasource provisioning file declaring a single Loki datasource.
# NOTE(review): docker-compose.yml in this scenario writes its datasources
# inline from the grafana entrypoint and mounts no volume for this file —
# confirm whether this file is still used.
apiVersion: 1
datasources:
- name: Loki
  type: loki
  access: proxy
  url: http://loki:3100


================================================
FILE: docker-monitoring/loki-config.yaml
================================================

# This is a complete configuration to deploy Loki backed by the filesystem.
# The index will be shipped to the storage via tsdb-shipper.

# Authentication is turned off for this local setup.
auth_enabled: false

# Structured metadata and the volume feature are both enabled.
limits_config:
  allow_structured_metadata: true
  volume_enabled: true

distributor:
  otlp_config:
    # List of default otlp resource attributes to be picked as index labels
    # CLI flag: -distributor.otlp.default_resource_attributes_as_index_labels
    #
    # Written as a block sequence with one attribute per item. The previous
    # flow sequence had no commas between the names, so YAML parsed it as a
    # single string containing all of them.
    default_resource_attributes_as_index_labels:
      - service.name
      - service.namespace
      - service.instance.id
      - deployment.environment
      - deployment.environment.name
      - cloud.region
      - cloud.availability_zone
      - k8s.cluster.name
      - k8s.namespace.name
      - k8s.container.name
      - container.name
      - k8s.replicaset.name
      - k8s.deployment.name
      - k8s.statefulset.name
      - k8s.daemonset.name
      - k8s.cronjob.name
      - k8s.job.name


# HTTP API port.
server:
  http_listen_port: 3100

# Single-instance setup: in-memory ring key-value store, replication factor 1,
# with /tmp/loki as the path prefix.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# TSDB index with filesystem object storage, schema v13, 24h index period.
schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

# Index and chunk locations, all under /tmp/loki.
storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

================================================
FILE: elasticsearch-monitoring/README.md
================================================
# Elasticsearch Monitoring with Grafana Alloy

This scenario demonstrates how to monitor an Elasticsearch instance using Grafana Alloy's built-in `prometheus.exporter.elasticsearch` component.

## Architecture

- **Elasticsearch** - The monitored Elasticsearch instance (single-node, security disabled)
- **Grafana Alloy** - Collects Elasticsearch metrics via `prometheus.exporter.elasticsearch` and remote writes them to Prometheus
- **Prometheus** - Stores the scraped metrics
- **Grafana** - Visualizes Elasticsearch metrics (auto-provisioned with Prometheus datasource)

## Running

```bash
# From this directory
docker compose up -d

# Or from the repo root using centralized image versions
./run-example.sh elasticsearch-monitoring
```

## Accessing

- **Grafana**: http://localhost:3000 (no login required)
- **Alloy UI**: http://localhost:12345
- **Prometheus**: http://localhost:9090
- **Elasticsearch**: http://localhost:9200

## Key Metrics

Once running, you can query Elasticsearch metrics in Grafana or Prometheus. Some useful metrics include:

- `elasticsearch_cluster_health_status` - Cluster health (green/yellow/red)
- `elasticsearch_cluster_health_number_of_nodes` - Number of nodes in the cluster
- `elasticsearch_indices_docs_total` - Total number of documents
- `elasticsearch_indices_store_size_bytes` - Total store size
- `elasticsearch_jvm_memory_used_bytes` - JVM memory usage
- `elasticsearch_process_cpu_percent` - CPU usage
- `elasticsearch_breakers_tripped` - Circuit breaker trip count

Metrics are scraped every 30s, as set by `scrape_interval` in `config.alloy` — adjust it if you need finer or coarser resolution.

## Stopping

```bash
docker compose down
```


================================================
FILE: elasticsearch-monitoring/config.alloy
================================================
// Elasticsearch Monitoring with Grafana Alloy
// This configuration scrapes Elasticsearch metrics using the built-in prometheus.exporter.elasticsearch component
// and remote writes them to Prometheus.

// Turn on Alloy's live debugging feature for this configuration.
livedebugging {
	enabled = true
}

// Built-in Elasticsearch exporter pointed at the `elasticsearch` service.
prometheus.exporter.elasticsearch "default" {
	address = "http://elasticsearch:9200"
}

// Scrape the exporter's targets every 30 seconds and pass the samples to the
// remote_write component.
prometheus.scrape "elasticsearch" {
	scrape_interval = "30s"
	targets         = prometheus.exporter.elasticsearch.default.targets
	forward_to      = [prometheus.remote_write.default.receiver]
}

// Send the scraped metrics to the remote-write endpoint of the `prometheus`
// service.
prometheus.remote_write "default" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}


================================================
FILE: elasticsearch-monitoring/docker-compose.coda.yml
================================================
services:
  # Elasticsearch as a single node with security disabled and a 512 MB JVM
  # heap, published on port 9200.
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0@sha256:2f602552550869fb29b6fd5848c5118d3ef3a2e1d5d45802e3ab9088cb2de8e2
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms512m -Xmx512m
    ports:
      - "9200:9200"


================================================
FILE: elasticsearch-monitoring/docker-compose.yml
================================================
services:
  # Elasticsearch as a single node with security disabled and a 512 MB JVM
  # heap, published on port 9200.
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0@sha256:2f602552550869fb29b6fd5848c5118d3ef3a2e1d5d45802e3ab9088cb2de8e2
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms512m -Xmx512m
    ports:
      - "9200:9200"

  # Prometheus with the remote-write receiver enabled, configured from
  # ./prom-config.yaml and published on port 9090.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  # Grafana UI, published on port 3000 with a Prometheus datasource
  # provisioned at container start.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      # Anonymous access is enabled with the Admin org role; basic auth is
      # disabled.
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # The entrypoint writes the datasource provisioning file (pointing at
    # http://prometheus:9090) and then runs /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  # Grafana Alloy running ./config.alloy after elasticsearch and prometheus
  # have been started; its HTTP server is published on 12345.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    ports:
      - "12345:12345"
    depends_on:
      - elasticsearch
      - prometheus


================================================
FILE: elasticsearch-monitoring/prom-config.yaml
================================================
# Prometheus configuration for this scenario. Only global defaults are set;
# this file defines no scrape_configs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: faro-frontend-observability/README.md
================================================
# Faro Frontend Observability

This scenario demonstrates collecting frontend web telemetry using Grafana Alloy's `faro.receiver` component and the [Grafana Faro Web SDK](https://github.com/grafana/faro-web-sdk).

The Faro Web SDK runs in the browser and captures logs, errors, events, and web vitals, then sends them to Alloy's Faro receiver endpoint. Alloy forwards the collected telemetry to Loki for storage and querying.

## Architecture

```
Browser (Faro Web SDK) --> Alloy (faro.receiver :12347) --> Loki (:3100)
                                                                |
                                                           Grafana (:3000)
```

## Getting Started

1. Start all services:

```bash
docker compose up -d
```

2. Open the demo web page at [http://localhost:8080](http://localhost:8080).

3. Click the buttons to generate telemetry:
   - **Send Log** -- pushes an info-level log message
   - **Throw Error** -- catches and reports a JavaScript error
   - **Send Event** -- sends a custom event with metadata
   - **Unhandled Error** -- triggers an uncaught exception (automatically captured by Faro)

4. View the collected telemetry in Grafana:
   - Open [http://localhost:3000](http://localhost:3000)
   - Go to **Explore** and select the **Loki** datasource
   - Query with `{service_name="faro-demo"}` to see all frontend telemetry

## Services

| Service | URL | Description |
|---------|-----|-------------|
| Web (nginx) | [http://localhost:8080](http://localhost:8080) | Demo frontend page with Faro Web SDK |
| Alloy | [http://localhost:12345](http://localhost:12345) | Alloy UI for pipeline debugging |
| Alloy Faro Receiver | `http://localhost:12347/collect` | Faro SDK collection endpoint |
| Loki | [http://localhost:3100](http://localhost:3100) | Log aggregation backend |
| Grafana | [http://localhost:3000](http://localhost:3000) | Visualization and querying |

## Alloy Pipeline

The `config.alloy` pipeline is straightforward:

1. **`faro.receiver`** -- listens on port 12347 for Faro Web SDK payloads with CORS enabled for all origins
2. **`loki.write`** -- forwards the received logs to Loki

## Cleanup

```bash
docker compose down
```


================================================
FILE: faro-frontend-observability/app/index.html
================================================
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Faro Frontend Observability Demo</title>
    <script src="https://unpkg.com/@grafana/faro-web-sdk@latest/dist/bundle/faro-web-sdk.iife.js"></script>
    <style>
        body { font-family: sans-serif; max-width: 600px; margin: 40px auto; padding: 0 20px; }
        button { padding: 10px 20px; margin: 5px; cursor: pointer; font-size: 14px; }
        .error { background: #ff4444; color: white; border: none; border-radius: 4px; }
        .log { background: #4488ff; color: white; border: none; border-radius: 4px; }
        .event { background: #44bb44; color: white; border: none; border-radius: 4px; }
        #output { margin-top: 20px; padding: 10px; background: #f0f0f0; border-radius: 4px; min-height: 100px; font-family: monospace; font-size: 12px; }
    </style>
</head>
<body>
    <h1>Faro Frontend Observability Demo</h1>
    <p>Click the buttons below to generate frontend telemetry. Check Grafana Loki for the collected data.</p>

    <button class="log" onclick="sendLog()">Send Log</button>
    <button class="error" onclick="throwError()">Throw Error</button>
    <button class="event" onclick="sendEvent()">Send Event</button>
    <button class="error" onclick="unhandledError()">Unhandled Error</button>

    <div id="output">Telemetry output will appear here...</div>

    <script>
        // Initialise the Faro Web SDK. Telemetry is sent to the collector URL
        // below and tagged with the app name, version and environment.
        var faro = window.GrafanaFaroWebSdk.initializeFaro({
            url: 'http://localhost:12347/collect',
            app: {
                name: 'faro-demo',
                version: '1.0.0',
                environment: 'development',
            },
        });

        // On-page activity log: the newest message is placed at the top of
        // the #output element, prefixed with an ISO timestamp.
        var output = document.getElementById('output');
        function log(msg) {
            var stamped = new Date().toISOString() + ' - ' + msg;
            output.innerHTML = stamped + '<br>' + output.innerHTML;
        }

        // Push one info-level log line to Faro and note it on the page.
        function sendLog() {
            var context = { level: 'info' };
            faro.api.pushLog(['User clicked the log button'], context);
            log('Sent log to Faro');
        }

        // Throw a demo error, catch it immediately and report it to Faro
        // explicitly through pushError.
        function throwError() {
            try {
                throw new Error('Demo error from button click');
            } catch (e) {
                faro.api.pushError(e);
                log('Sent error to Faro: ' + e.message);
            }
        }

        // Push a custom `button_click` event, with the button name and the
        // current time as string attributes, and note it on the page.
        function sendEvent() {
            var attributes = { button: 'event', timestamp: Date.now().toString() };
            faro.api.pushEvent('button_click', attributes);
            log('Sent event to Faro');
        }

        // Trigger an uncaught exception: 100ms after the click a function that
        // is not defined anywhere is called, outside any try/catch.
        function unhandledError() {
            log('Throwing unhandled error...');
            setTimeout(function() { undefinedFunction(); }, 100);
        }
    </script>
</body>
</html>


================================================
FILE: faro-frontend-observability/config.alloy
================================================
// Turn on Alloy's live debugging feature for this configuration.
livedebugging {
	enabled = true
}

// Accept telemetry pushed by the Faro Web SDK and pass the resulting log
// entries to the loki.write "local" component.
faro.receiver "default" {
	output {
		logs = [loki.write.local.receiver]
	}

	server {
		listen_port    = 12347
		listen_address = "0.0.0.0"

		// Requests from any origin are accepted.
		cors_allowed_origins = ["*"]
	}
}

// Push the received log entries to the `loki` service.
loki.write "local" {
	endpoint {
		url = "http://loki:3100/loki/api/v1/push"
	}
}


================================================
FILE: faro-frontend-observability/docker-compose.coda.yml
================================================
services:
  # Nginx serving the demo page from ./app (mounted read-only) on host port
  # 8080.
  web:
    image: nginx:latest@sha256:1881968aff6f7cdcc4b888c00a11f4ce241ad7ec957e0cb4a9e19e93a3ff87ea
    volumes:
      - ./app:/usr/share/nginx/html:ro
    ports:
      - "8080:80"


================================================
FILE: faro-frontend-observability/docker-compose.yml
================================================
services:
  # Nginx serving the demo frontend page from ./app (mounted read-only) on
  # host port 8080.
  web:
    image: nginx:latest@sha256:1881968aff6f7cdcc4b888c00a11f4ce241ad7ec957e0cb4a9e19e93a3ff87ea
    volumes:
      - ./app:/usr/share/nginx/html:ro
    ports:
      - "8080:80"

  # Alloy telemetry pipeline — receives Faro Web SDK telemetry and forwards
  # logs to Loki. Started with --stability.level=experimental.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data --stability.level=experimental /etc/alloy/config.alloy
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    ports:
      - "12345:12345"  # Alloy HTTP server
      - "12347:12347"  # faro.receiver listen port (see config.alloy)
    depends_on:
      - loki

  # Loki for log aggregation, started with the scenario's loki-config.yaml
  # and published on port 3100.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    ports:
      - "3100:3100/tcp"

  # Grafana for visualization, published on port 3000 with a Loki datasource
  # provisioned at container start.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      # Anonymous access is enabled with the Admin org role; basic auth is
      # disabled.
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # The entrypoint writes the datasource provisioning file (pointing at
    # http://loki:3100) and then runs /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh
    depends_on:
      - loki


================================================
FILE: faro-frontend-observability/loki-config.yaml
================================================
# Loki configuration for the Faro scenario: single instance, filesystem
# storage under /tmp/loki, authentication turned off.
auth_enabled: false

# Structured metadata and the volume feature are both enabled.
limits_config:
  allow_structured_metadata: true
  volume_enabled: true

# HTTP API port.
server:
  http_listen_port: 3100

# In-memory ring key-value store, replication factor 1.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# TSDB index with filesystem object storage, schema v13, 24h index period.
schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

# Index and chunk locations, all under /tmp/loki.
storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

ingester:
  max_chunk_age: 2h


================================================
FILE: game-of-tracing/AGENTS.md
================================================
# Game of Tracing — Agent Guide

> Canonical guide for any AI coding agent working inside this scenario. Tool-agnostic (Cursor, Codex, Cline, Aider, Claude Code). Claude-specific dispatch lives in `CLAUDE.md`.

## What this scenario is

**Game of Tracing** (titled *War of Kingdoms* in the UI) is a distributed-tracing tutorial game in the `alloy-scenarios` repository. It is substantially more elaborate than other scenarios in the repo: 10 Python/Flask services, two kingdoms competing over 8 territories, an algorithmic AI opponent, and the full LGMT stack (Loki, Grafana, Metrics/Prometheus, Tempo) sitting behind Grafana Alloy.

The **headline feature** is **span-link-driven game replay**: every player and AI action stores its `trace_id`/`span_id` in SQLite; the next action creates an OpenTelemetry `trace.Link` to the previous one, producing a causal chain of traces that can be replayed from Tempo. See `SPAN_LINKS.md` for the full spec and `README.md` for the player-facing tutorial narrative.

## Architecture at a glance

```
 Players ──► war-map (8080) ──┐
                              │
 AI Opponent (8081) ──────────┤──► 8 Location Services (5001-5008)
                              │       southern-capital, northern-capital,
                              │       village-1 … village-6
                              │
 All services ──OTLP──► Alloy (4317 gRPC / 4318 HTTP) ──► Tempo (3200)
                                                      ├─► Loki  (3100)
                                                      └─► Prom  (9090)
                                                          │
 Grafana (3000) ──datasources──► Tempo (default), Loki, Prometheus
```

All services push OTLP to Alloy; Alloy fans out by signal (traces→Tempo, logs→Loki, metrics→Prometheus). Grafana is auto-provisioned with all three datasources plus traces↔logs↔metrics correlation.

## Services and ports

| Service | Port(s) | Build context | Image version env | Purpose |
|---|---|---|---|---|
| `loki` | 3100 | — | `GRAFANA_LOKI_VERSION` (default 3.6.7) | Log storage |
| `prometheus` | 9090 | — | `PROMETHEUS_VERSION` (default v3.10.0) | Metrics storage + OTLP receiver |
| `tempo` | 3200 | — | `GRAFANA_TEMPO_VERSION` (default 2.10.1) | Trace storage + metrics generator |
| `grafana` | 3000 | — | `GRAFANA_VERSION` (default 12.4.0) | Visualization (anonymous admin) |
| `alloy` | 12345, 4317, 4318 | — | `GRAFANA_ALLOY_VERSION` (default v1.14.0) | Telemetry pipeline |
| `southern-capital` | 5001 | `./app` | — | Capital location service |
| `northern-capital` | 5002 | `./app` | — | Capital location service |
| `village-1` … `village-6` | 5003-5008 | `./app` | — | Village location services |
| `war-map` | 8080 | `./war_map` | — | Game UI + span-link broker |
| `ai-opponent` | 8081 | `./ai_opponent` | — | Algorithmic AI opponent |

Image versions are centralized in `image-versions.env` at the repository root (`../image-versions.env` relative to this scenario) — edit that file, not the compose files (they use `${VAR:-default}` syntax).

## Submodules (each has its own CLAUDE.md)

- **`app/`** — the 8 location Flask services. See [`app/CLAUDE.md`](app/CLAUDE.md).
- **`ai_opponent/`** — the algorithmic strategic AI (not LLM). See [`ai_opponent/CLAUDE.md`](ai_opponent/CLAUDE.md).
- **`war_map/`** — the Flask UI and the owner of span-link reconstruction logic. See [`war_map/CLAUDE.md`](war_map/CLAUDE.md).

## Shared state

One Docker volume, `game-data`, mounted at `/data`. **Two SQLite databases live under it, with different owners — do not confuse them:**

| File | Owner | Mode | Purpose |
|---|---|---|---|
| `game_state.db` | All 8 location services (shared) | WAL | Canonical game state: resources, armies, faction per location |
| `game_sessions.db` | `war_map/` only | default | `game_actions` table: per-action `trace_id`, `span_id`, `action_sequence`, `game_session_id` — drives span linking |

Overriding `DATABASE_FILE` (game_state) or `GAME_SESSIONS_DB` (game_sessions) env vars on `war_map` is supported.

### Extra tables added for multi-map support

`game_state.db` also holds:

- **`game_config`** — key/value store; the `active_map_id` row is authoritative at runtime. `war_map`'s `/select_map` route writes this; every location service reads it on boot and `/reload`.
- **`faction_economy`** — `(faction, corpses)`. Holds the White Walkers' corpse pool on the WWA map. Populated by the post-battle hook in `LocationServer.receive_army` and by the passive corpse tick at the WW fortress. Consumed by `LocationServer.create_army` when the faction's currency is `corpses`.
- **`wall_hold`** — `(map_id, faction, ticks, last_update)`. Written by `war_map`'s `_wall_tick_thread`. Non-zero rows mean that faction currently holds every wall keep on that map.

`game_sessions.db` has a `map_id` column added to the `game_actions` table so replay queries can filter by map. Existing databases gain the column through an additive `ALTER TABLE` migration that runs on first boot after the upgrade; legacy rows written before the upgrade keep `map_id=NULL`.

## Maps

`app/game_config.py` defines a `MAPS` dict with two entries:

| Map id | Players | Factions | Win | Notable rules |
|---|---|---|---|---|
| `war_of_kingdoms` (default) | 2 | `southern`, `northern`, `neutral` | Capture enemy capital | Classic — 30 resources per army, 20 resource/collect at capitals, village passive +10/15 s |
| `white_walkers_attack` | 1 (player is `nights_watch`) | `nights_watch`, `white_walkers`, `barbarian`, `neutral` | Hold every `wall` keep for 5 × 30 s ticks | `wall` settlement type doubles defenders; WW spends 5 corpses per army (no resources); barbarian villages grow +1 army every 30 s; WW fortress passively +1 corpse every 15 s |

Each map also defines a **slot assignments** dict (`slot_1` → logical location id) so the 8 physical containers can serve either map. See "Slot identity" below.

### Slot identity

Each location container has a fixed `SLOT_ID` env var (`slot_1` … `slot_8`). On boot, the container:

1. Reads the shared `active_map_id` from `game_state.db`'s `game_config` table.
2. Looks up `MAPS[active_map_id]["slot_assignments"][SLOT_ID]` → its logical `location_id`.
3. Loads config from `MAPS[active_map_id]["locations"][location_id]`.

The container's **SERVICE_NAME** (used by Grafana dashboards) stays stable (`southern-capital`, `village-1`, etc.) regardless of the map — the *logical* location id is published as the `location.id` span attribute, not the service name.

Runtime map switching: `war_map`'s `/select_map` route writes a new `active_map_id`, POSTs `/reset` to any one container to wipe the `locations` table, then POSTs `/reload` to every container so they rebind in place without a restart.

## Two Alloy configurations

### Default — Alloy configuration syntax (formerly River)
```bash
cd game-of-tracing && docker compose up -d
```
Uses `config.alloy`. Alloy runs with `run /etc/alloy/config.alloy`.

### Alternate — OTel Collector YAML
```bash
cd game-of-tracing && docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
```
Uses `config-otel.yaml`. Alloy runs with its OTel Engine mode: `otel --config=/etc/alloy/config-otel.yaml`. The pipeline is functionally identical; this variant demonstrates Alloy's ability to accept OTel Collector syntax.

## OpenTelemetry patterns you must respect

Every service has its own `telemetry.py` exposing a `GameTelemetry` class that wires up all three signals.

- **Traces** — OTLP gRPC → `alloy:4317`, `BatchSpanProcessor(max_export_batch_size=1)`. The batch size of 1 is **intentional** for demo timing; do not tune it.
- **Logs** — OTLP HTTP → `alloy:4318/v1/logs`, `BatchLogRecordProcessor(max_queue_size=30, max_export_batch_size=5)`.
- **Metrics** — OTLP HTTP → `alloy:4318/v1/metrics`, `PeriodicExportingMetricReader(export_interval_millis=10000)`, `TraceBasedExemplarFilter` (so metric exemplars link to trace IDs).

### Context propagation is manual

Incoming requests extract W3C trace context from headers; outgoing requests inject it:

```python
# Incoming (every route handler):
ctx = extract(request.headers)
with tracer.start_as_current_span("name", context=ctx, ...) as span:

# Outgoing (canonical helper at app/location_server.py:327-352):
inject(headers)
requests.post(url, headers=headers, ...)
```

### Background threads MUST capture context explicitly

Python threads do not inherit OpenTelemetry context. The scenario's canonical pattern is to capture before spawning and attach inside the thread:

```python
# app/location_server.py:209-271 (_continue_army_movement) — canonical example:
ctx = get_current()

def move():
    token = attach(ctx)
    try:
        with self.tracer.start_as_current_span("army_movement", ...):
            ...
    finally:
        detach(token)

Thread(target=move).start()
```

The same pattern appears in `_transfer_resources_along_path` at `app/location_server.py:273-325`. If a background span shows up with a missing or different `trace_id`, the `get_current()` / `attach` / `detach` pair is the first thing to check.

## Span links — the headline feature

Span links are the mechanism that turns a sequence of discrete player actions into a replayable narrative. See `SPAN_LINKS.md` for the full design.

**Flow:**
1. Player selects a faction → `war_map/app.py` creates a `game_session_id` (UUID).
2. Every action handler (`/api/collect_resources`, `/api/create_army`, `/api/move_army`) does:
   - Looks up the previous action for this session via `get_previous_action_context()` at `war_map/app.py:130-170`. That function reads `trace_id` and `span_id` from the `game_actions` SQLite table and rebuilds a `trace.SpanContext(..., is_remote=True, trace_flags=TraceFlags.SAMPLED)`.
   - Wraps the context in a link via `create_span_link_from_context()` at `war_map/app.py:172-189`, attaching `link.type="game_sequence"`, `link.relation="follows"`, `game.sequence="true"`.
   - Starts its own action span with that link, then calls `store_game_action()` to record its own `trace_id`/`span_id` for the next action to link back to.
3. The AI opponent uses the same primitive with a different link type — `link.type="ai_decision_trigger"` — to link its decision span to the action execution span it spawns (see `ai_opponent/ai_server.py`).
4. The replay UI queries Tempo:
   - `GET /api/v2/search/tag/game.session.id/values` to enumerate sessions.
   - `GET /api/search?q={game.session.id="<id>"}` to pull every trace in a session.
   - SQLite `game_actions` is the fallback if Tempo is unavailable.

## Custom metrics reference

### From `app/telemetry.py`
| Metric | Type | Attributes | Notes |
|---|---|---|---|
| `game.resources` | observable gauge | `location`, `location_type` | Current resource pool per location |
| `game.army_size` | observable gauge | `location`, `location_type`, `faction` | Current army strength |
| `game.battles` | counter | `attacker_faction`, `defender_faction`, `result`, `location` | `result ∈ {attacker_victory, defender_victory, stalemate, reinforcement}` |
| `game.resource_transfer_cooldown` | observable gauge | `location` | Seconds remaining |
| `game.location_control` | observable gauge | `location`, `location_type`, `faction` | `northern=1, southern=2, neutral=0, unknown=-1` |

### From `ai_opponent/telemetry.py`
| Metric | Type | Attributes |
|---|---|---|
| `ai.decisions` | counter | `action_type`, `phase`, `reason` |
| `ai.plans_created` | counter | `goal` |
| `ai.plans_abandoned` | counter | `reason` |
| `ai.decision_cycle_duration_seconds` | histogram | `phase` |
| `ai.territory_count` | observable gauge | `faction` |
| `ai.total_army` | observable gauge | `faction` |

### Span attributes used by the provisioned Grafana dashboard
Preserve these when adding new spans — the dashboard's TraceQL filters depend on them:
- `span.resource.movement = true`
- `span.battle.occurred = true`
- `span.player.action = true`

## Common tasks

```bash
# Start everything
cd game-of-tracing && docker compose up -d

# Stop (preserves volume)
docker compose down

# Stop and wipe game state
docker compose down -v

# Rebuild only one service after code change
docker compose up -d --build war-map

# Switch to the OTel Engine variant
docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d

# Tail a trace end-to-end
# 1. Game UI:      http://localhost:8080
# 2. Grafana:      http://localhost:3000 (anonymous admin)
# 3. Alloy debug:  http://localhost:12345/debug/livedebugging
# 4. Tempo API:    http://localhost:3200
```

## Gotchas

- **Hyphens vs underscores.** Service names are hyphenated (`southern-capital`, set via `SERVICE_NAME` resource attribute); location IDs in game_config.py and DB rows are underscored (`southern_capital`). Code that bridges them uses `location_id.replace('_', '-')`. Do not cross them.
- **Two compose files — `docker-compose.yml` and `docker-compose.coda.yml`.** The coda variant redefines the same 10 app-layer services already defined in the main compose file, for use with the `coda` CLI. When editing app services, update both.
- **Image versions.** Live in `../image-versions.env` (the repository root). Compose files use `${VAR:-default}` — edit the env file, not the compose files.
- **Grafana is auto-provisioned** via `grafana/datasources/defaults.yml`. Tempo is the default datasource; service map, traces-to-logs (Loki `trace_id` label), traces-to-metrics, and exemplars are pre-wired. Do not add datasources via UI — edit the YAML.
- **Tempo metrics generator is enabled** in `tempo-config.yaml` with processors `service-graphs`, `span-metrics`, `local-blocks`, writing to `prometheus:9090/api/v1/write`. Ingester `max_block_duration: 5m` and 720h compactor retention are demo-tuned, not production values.
- **`grafana-traces-app` plugin** is installed via `GF_INSTALL_PLUGINS` at container start. If Grafana is slow on first boot, that is why.
- **`war-map` strips `X-Frame-Options`** in an `@app.after_request` hook (`war_map/app.py:191-194`) so the UI can be embedded in Grafana iframes. Intentional — do not remove.

## Keep docs current

**Any change to this scenario must land in the same work unit as a doc update.** Stale line-number anchors, removed symbols, or new services that nobody documents are treated as regressions, not cleanup tasks.

Files that must be checked whenever the scenario changes:
- `game-of-tracing/AGENTS.md` (this file)
- `game-of-tracing/CLAUDE.md`
- `game-of-tracing/app/CLAUDE.md`
- `game-of-tracing/ai_opponent/CLAUDE.md`
- `game-of-tracing/war_map/CLAUDE.md`
- `.claude/agents/game-of-tracing-expert.md` (cheat-sheet references)

Triggers that require a doc update: new service, renamed function, new/changed span attribute, new env var, added/removed metric, port change, dependency bump, new action type in the span-link chain, change to any cited line-number anchor.

The Claude sub-agent at `.claude/agents/game-of-tracing-expert.md` owns this responsibility end-to-end for Claude Code sessions. For non-Claude agents: before returning a response that involved a code edit, grep the six files above for any outdated references and update them.

## Verification

After any meaningful change, run through this sequence:

1. **Smoke the scenario.** `cd game-of-tracing && docker compose up -d`; wait ~20s for all 10 services to be healthy (`docker compose ps` — all should be `(healthy)` or `Up`).
2. **Confirm Alloy ingest.** Open `http://localhost:12345/debug/livedebugging`. Select the `otelcol.receiver.otlp.default` component and confirm non-zero signal counts for traces/logs/metrics.
3. **Trigger a player action.** Open `http://localhost:8080`, pick a faction, collect resources, create an army, move it to a neutral village.
4. **Inspect the resulting trace.** Grafana at `http://localhost:3000` → Explore → Tempo → Search by `game.session.id` tag. Verify:
   - Parent player-action span in `war-map`.
   - Child CLIENT span with propagated trace context.
   - SERVER span in the target location (`village-X` etc.).
   - Background `army_movement` span sharing the same `trace_id` (confirms `get_current()`/`attach` worked).
   - A span link back to the previous action span (the headline feature).
5. **Dashboard check.** Open the provisioned *War of Kingdoms* dashboard; TraceQL filters like `{span.resource.movement = true}` should return traces.
6. **Shutdown.** `docker compose down` (add `-v` to wipe volumes).

## Cross-references

- Full span-link design: [`SPAN_LINKS.md`](SPAN_LINKS.md)
- Player-facing tutorial: [`README.md`](README.md)
- Generic scenario conventions: [`../CLAUDE.md`](../CLAUDE.md)
- Submodule guides: [`app/CLAUDE.md`](app/CLAUDE.md), [`ai_opponent/CLAUDE.md`](ai_opponent/CLAUDE.md), [`war_map/CLAUDE.md`](war_map/CLAUDE.md)


================================================
FILE: game-of-tracing/CLAUDE.md
================================================
# CLAUDE.md — Game of Tracing (Claude Code)

> Claude-specific workflow for this scenario. For architecture, services, OpenTelemetry patterns, span-link mechanics, and gotchas, **read [`./AGENTS.md`](AGENTS.md) first**. This file only covers what's different when the agent is Claude Code.

## Start here

1. Read `./AGENTS.md` for the scenario overview — including the **Maps** and **Slot identity** sections.
2. Read the submodule `CLAUDE.md` matching the area you are touching: [`app/CLAUDE.md`](app/CLAUDE.md), [`ai_opponent/CLAUDE.md`](ai_opponent/CLAUDE.md), [`war_map/CLAUDE.md`](war_map/CLAUDE.md).
3. If the task involves span links, trace replay, cross-service context propagation, or AI decision logic — delegate to the sub-agent below.

### Two maps, one stack

The scenario ships **two maps** selected via an in-UI picker at game start: `war_of_kingdoms` (default 2-player) and `white_walkers_attack` (single-player Night's Watch vs AI White Walkers with `wall` keeps, corpse economy, and a 5-tick hold-to-win condition). Both reuse the same 8 location containers — each container has a constant `SLOT_ID` env and picks up its logical identity from `MAPS[active_map_id]["slot_assignments"][SLOT_ID]` in `app/game_config.py`. Changing maps writes a new `active_map_id` to the shared `game_config` table and POSTs `/reload` to every slot.

## Sub-agent dispatch

A specialized sub-agent lives at [`../.claude/agents/game-of-tracing-expert.md`](../.claude/agents/game-of-tracing-expert.md). Use it (via `Task` tool, `subagent_type: game-of-tracing-expert`) for any non-trivial question about:

- Reconstructing or debugging span contexts / span links
- Cross-service or cross-thread OpenTelemetry context propagation
- The `StrategicAI` priority cascade, game phases, or AI metric instrumentation
- Tempo TraceQL queries used by the replay UI
- Why a trace is orphaned, missing, or appears duplicated in Grafana

The sub-agent is read-only (no Write/Edit tools) — it reports; the parent agent does the writes. It **also owns keeping the docs in sync with the code** — see "Keep docs current" below.

## Tool preferences

- **Use `Read`, not `cat`**, for the large files in this scenario. Use `offset` / `limit` to target line ranges rather than reading the whole file:
  - `app/location_server.py` (~52 KB, ~1200 lines)
  - `ai_opponent/ai_server.py` (~46 KB)
  - `war_map/app.py` (~64 KB)
  - `war_map/templates/map.html` (~50 KB)
  - `war_map/templates/replay_session.html` (~28 KB)
  - `SPAN_LINKS.md` (~17 KB)
- **Use `Grep`, not `grep | head`** for pattern search across the scenario.
- For the Alloy pipeline debug UI (`http://localhost:12345`), the stack has to be running — either ask the user to `docker compose up -d` or check `docker compose ps` first.

## Read-before-edit checklist

Before editing any service, open these files to ground yourself:

| Change area | Open first |
|---|---|
| Location server behavior | `app/telemetry.py`, relevant route handler in `app/location_server.py`, `app/game_config.py`, the service block in `docker-compose.yml` |
| AI decision logic | `ai_opponent/telemetry.py`, `ai_opponent/ai_server.py`, `ai_opponent/README.md` |
| UI, sessions, or replay | `war_map/telemetry.py`, `war_map/app.py` (especially `:130-189` for span-link plumbing), relevant template under `war_map/templates/` |
| Telemetry pipeline | `config.alloy` (default) or `config-otel.yaml` (OTel variant), `tempo-config.yaml`, `loki-config.yaml`, `prom-config.yaml` |
| Datasources / dashboards | `grafana/datasources/defaults.yml`, `grafana/dashboards/*.json` |
| Image versions | `../image-versions.env` |

## Keep docs current

**Whenever a change to this scenario ships, the matching docs must ship in the same change.** The sub-agent (`game-of-tracing-expert`) enforces this during its work; Claude Code in the main loop is responsible whenever the sub-agent is not invoked.

Triggers that require a doc update in the same commit:

- New service, renamed function, relocated symbol (line-number anchors shift)
- New, removed, or renamed span attribute — especially the ones that feed the Grafana dashboard TraceQL (`span.resource.movement`, `span.battle.occurred`, `span.player.action`)
- New or removed env var
- New or removed metric
- Port change
- Dependency version bump (update `image-versions.env` *and* any docs that quote a version)
- New action type in the span-link chain (both `war_map/app.py` handler and `replay_session.html` renderer)

Files to sweep on every scenario change:

1. `game-of-tracing/AGENTS.md`
2. `game-of-tracing/CLAUDE.md` (this file)
3. `game-of-tracing/app/CLAUDE.md`
4. `game-of-tracing/ai_opponent/CLAUDE.md`
5. `game-of-tracing/war_map/CLAUDE.md`
6. `.claude/agents/game-of-tracing-expert.md`

Stale line-number anchors are treated as regressions, not cleanup tasks. If a cited `file:line` range no longer resolves to the referenced symbol, fix it.

## Relationship to the repo root

- [`../CLAUDE.md`](../CLAUDE.md) (the repository-root guide) covers the generic multi-scenario conventions (run commands, scenario directory layout, Alloy pipeline shape).
- This file overrides nothing; it extends the root with the patterns that are unique to this scenario (manual context propagation, background-thread context capture, span-link-driven replay, AI instrumentation).


================================================
FILE: game-of-tracing/README.md
================================================
---
title: A Game of Traces
menuTitle: A Game of Traces
description: A grand strategy game with distributed tracing
weight: 600
killercoda:
  title: A Game of Traces
  description: A grand strategy game with distributed tracing
  details:
      intro:
         foreground: docker-compose-update.sh
  backend:
    imageid: ubuntu
---


<!-- INTERACTIVE page intro.md START -->
# War of Kingdoms: A Distributed Tracing Tutorial Game

<!-- INTERACTIVE ignore START -->

<div align="center">
<img src="https://grafana.com/media/docs/alloy/game-of-tracing.jpeg" alt="Game of Tracing" width="200"/>
</div>

<!-- INTERACTIVE ignore END -->

This educational game demonstrates distributed tracing concepts through an interactive strategy game built with OpenTelemetry and Grafana Alloy. Players learn about trace sampling, service graphs, and observability while competing for territory control.

## Educational Goals

This game teaches several key concepts in distributed tracing:

1. **Distributed System Architecture**
   - Multiple microservices (locations) communicating via HTTP
   - Shared state management
   - Event-driven updates
   - Real-time data propagation

2. **OpenTelemetry Concepts**
   - Trace context propagation
   - Span creation and attributes
   - Service naming and resource attributes
   - Manual instrumentation techniques

3. **Observability Patterns**
   - Trace sampling strategies
   - Error tracking and monitoring
   - Performance measurement
   - Service dependencies visualization

## Game Overview

Open the scenario at `http://localhost:8080` and you land on a **map picker**. Two maps ship today:

### War of Kingdoms (default, 2-player)

Two rival kingdoms — Southern and Northern — race to capture the enemy capital. Players:

- Collect resources from their territories
- Build armies (30 resources per unit) to expand their influence
- Capture neutral villages (6 of them)
- Send resources back to their capital
- Launch strategic attacks on enemy territories

**Win condition:** capture the enemy capital.

### White Walkers Attack (single-player)

The Long Night has come. The human plays the **Night's Watch** (player faction); the AI opponent plays the **White Walkers**. A new **Barbarian** faction controls two villages on the flanks — passive, slowly accruing army units, good raid targets.

New mechanics:

- **Wall settlements** run across the middle of the map. Defenders count **2×** when a wall is attacked, making them hard to dislodge.
- **Corpse economy.** White Walkers spend **corpses** (not resources) to raise new armies at their fortress. Corpses come from winning battles (every unit killed on either side becomes a corpse) plus a slow passive tick at the fortress itself. Cost: 5 corpses per unit.
- **Barbarians** never attack. They accrue +1 army every 30 s — easy farm for White Walkers, but they also harass unguarded Night's Watch supply lines.

**Win condition:** hold *every* wall settlement continuously for **5 ticks** (150 s, since the tick is 30 s). Any wall changing hands resets the counter.

Both maps share the same 8 location containers — the active map lives in `game_state.db`, and the `/reload` endpoint on each service rebinds the slot's identity when the player switches maps via the picker.

Each action in the game generates traces that can be analyzed in Grafana Tempo, demonstrating how distributed tracing works in a real application.

## Technical Components

The application consists of:

- **Location Servers**: Python Flask microservices representing different map locations
- **War Map UI**: Web interface for game interaction
- **AI Opponent**: Intelligent computer player for single-player mode
- **Telemetry Pipeline**:
  - OpenTelemetry SDK for instrumentation
  - `pyroscope-otel` bridge for linking traces to CPU profiles
  - Grafana Alloy for trace/log/metric/profile processing
  - Tempo for trace storage
  - Prometheus for metrics
  - Loki for logs
  - Pyroscope for continuous profiling
  - Grafana for visualization

<!-- INTERACTIVE page intro.md END -->

<!-- INTERACTIVE page step1.md START -->

## Running the Demo

1. Clone the repository:
   ```bash
   git clone https://github.com/grafana/alloy-scenarios.git
   cd alloy-scenarios
   ```

2. Navigate to this example:
   ```bash
   cd game-of-tracing
   ```

3. Run using Docker Compose:
   ```bash
   docker compose up -d
   ```

4. Access the components:
   - Game UI: [http://localhost:8080](http://localhost:8080)
   - Grafana: [http://localhost:3000](http://localhost:3000)
   - Prometheus: [http://localhost:9090](http://localhost:9090)
   - Pyroscope: [http://localhost:4040](http://localhost:4040)
   - Alloy Debug: [http://localhost:12345/debug/livedebugging](http://localhost:12345/debug/livedebugging)

5. Multiplayer Access:
   - The game supports multiple players simultaneously
   - Players can join using:
     - `http://localhost:8080` from the same machine
     - `http://<host-ip>:8080` from other machines on the network
   - Each player can choose either the Southern or Northern faction
   - The game prevents multiple players from selecting the same faction

6. Single-Player Mode:
   - Toggle "Enable AI Opponent" in the game interface
   - The AI will automatically control the faction not chosen by the player
   - The AI provides a balanced challenge with adaptive strategies
   - For two-player games, keep the AI toggle disabled

<!-- INTERACTIVE page step1.md END -->

<!-- INTERACTIVE page step2.md START -->

## Setting Up the Dashboard

1. Open Grafana at http://localhost:3000 (anonymous admin auth is enabled, no login required).

2. The **War of Kingdoms** dashboard is auto-provisioned at startup — no manual import needed. Find it under Dashboards → Browse.

3. Data sources (Prometheus, Loki, Tempo, **Pyroscope**) are auto-provisioned too. The Tempo datasource is pre-wired to Loki (traces-to-logs), Prometheus (traces-to-metrics), and Pyroscope (traces-to-profiles), so every span in Explore gets a "View profile" link.

4. The dashboard provides:
   - Real-time army and resource metrics
   - Battle analytics
   - Territory control visualization
   - Service dependency mapping
   - Trace analytics for game events

### Viewing Profiles

With every player action the app emits CPU pprof samples via the `pyroscope-otel` bridge. Each span carries a `pyroscope.profile.id` attribute that Grafana uses to jump directly from a span to its flamegraph.

- Explore → **Pyroscope** datasource → pick a service (e.g. `war-map`) → flamegraph renders.
- Explore → **Tempo** → open a recent trace → right-click a span → **View Profile**.

> **OTel-engine variant note**: when running the alternate pipeline via `docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d`, Alloy's OTel-engine mode has no native Pyroscope receiver. The Python services still profile themselves, but the default profile endpoint (`http://alloy:9999`) won't exist. Override with `PYROSCOPE_SERVER_ADDRESS=http://pyroscope:4040` in the environment to push profiles straight to Pyroscope.

<!-- INTERACTIVE page step2.md END -->

<!-- INTERACTIVE page step3.md START -->

## Learning Through Play

### 1. Trace Context Propagation
Watch how actions propagate through the system:
- Resource collection triggers spans across services
- Army movements create trace chains
- Battle events generate nested spans

### 2. Service Graph Analysis
Learn how services interact:
- Village-to-capital resource flows
- Army movement paths
- Battle resolution chains

## Observability Features

### 1. Resource Movement Tracing
```console
{span.resource.movement = true}
```
Track resource transfers between locations with detailed timing and amounts.

### 2. Battle Analysis
```console
{span.battle.occurred = true}
```
Analyze combat events, outcomes, and participating forces.

### 3. Player Actions
```console
{span.player.action = true}
```
Monitor player interactions and their impact on the game state.

<!-- INTERACTIVE page step3.md END -->

<!-- INTERACTIVE page step4.md START -->

## Architecture Deep Dive

### Trace Flow Example: Army Movement

1. Player initiates move (UI span)
2. Source location processes request (source span)
3. Movement calculation (path span)
4. Target location receives army (target span)
5. Battle resolution if needed (battle span)
6. State updates propagate (update spans)

Each step generates spans with relevant attributes, demonstrating trace context propagation in a distributed system.

## Educational Use

This project is designed for educational purposes to teach:
- Distributed systems concepts
- Observability practices
- Microservice architecture
- Real-time data flow
- System instrumentation

<!-- INTERACTIVE page step4.md END -->

<!-- INTERACTIVE page finish.md START -->

## Contributing

We welcome contributions! Please see our [contribution guidelines](CONTRIBUTING.md) for details.

## License

This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.

## Disclaimer

This is an educational project focused on teaching distributed tracing concepts. Any resemblance to existing games or properties is coincidental and falls under fair use for educational purposes.

## Further Resources

- [OpenTelemetry Documentation](https://opentelemetry.io/docs/)
- [Grafana Alloy Documentation](https://grafana.com/docs/alloy/latest/)
- [Distributed Tracing Guide](https://opentelemetry.io/docs/concepts/observability-primer/#distributed-traces) 

<!-- INTERACTIVE page finish.md END -->

================================================
FILE: game-of-tracing/SPAN_LINKS.md
================================================
# Span Links Implementation in Game of Tracing

This document explains how span links are implemented in Game of Tracing to enable game replay functionality.

## What Are Span Links?

Span links allow you to create relationships between spans that aren't in a direct parent-child hierarchy. Unlike parent-child relationships (which are synchronous and hierarchical), links are more flexible and can connect spans across different traces or time periods.

## Implementation Overview

### Game Session Tracking

Each player gets a unique `game_session_id` when they select a faction. This ID is used to track all their actions throughout the game:

```python
# Generated when player selects faction
session['game_session_id'] = str(uuid.uuid4())
session['action_sequence'] = 0
```

### Action Storage

Every significant game action is stored in a SQLite database with its trace information:

```sql
CREATE TABLE game_actions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    game_session_id TEXT NOT NULL,
    action_sequence INTEGER NOT NULL,
    action_type TEXT NOT NULL,
    player_name TEXT,
    faction TEXT,
    trace_id TEXT NOT NULL,
    span_id TEXT NOT NULL,
    location_id TEXT,
    target_location_id TEXT,
    timestamp INTEGER NOT NULL,
    game_state_after TEXT
)
```

### Span Link Creation

Each new action creates a span link to the previous action in the sequence:

```python
# Get previous action's span context
previous_span_context = get_previous_action_context(game_session_id, current_sequence)

# Create span link using official OpenTelemetry API
if previous_span_context:
    link = trace.Link(
        previous_span_context,
        attributes={
            "link.type": "game_sequence",
            "link.relation": "follows",
            "game.sequence": "true"
        }
    )
    links.append(link)

# Create new span with links
with tracer.start_as_current_span(
    "move_army",
    kind=SpanKind.SERVER,
    links=links,  # Links to previous actions
    attributes={
        "game.session.id": game_session_id,
        "game.action.type": "move_army",
        "game.action.sequence": current_sequence + 1
    }
) as span:
    # ... action logic ...
```

## Supported Actions

The following game actions create span links:

1. **collect_resources** - Collecting resources at a location
2. **create_army** - Creating armies at capitals
3. **move_army** - Moving armies between locations
4. **all_out_attack** - Launching all-out attacks

## Battle Mechanics

The game uses simple but effective battle calculations:

### Combat Rules

1. **Same Faction**: Reinforcement
   - Armies combine: `final_army = attacking_army + defending_army`
   - Used for friendly army movements and reinforcements

2. **Different Factions**: Combat
   - **Attacker Victory**: `remaining_army = attacking_army - defending_army`
   - **Defender Victory**: `remaining_army = defending_army - attacking_army`
   - **Stalemate**: `remaining_army = 0` (equal armies destroy each other)

### All-Out Attack Special Rules

- All-out attacks automatically collect armies from friendly villages along the path
- This simulates gathering reinforcements during the march to enemy territory
- Example: 5 armies + 2 village armies = 7 armies continuing to target

### Battle Calculation Code

```python
def _handle_battle(self, attacking_army: int, attacking_faction: str, 
                  defending_army: int, defending_faction: str) -> tuple[str, int, str]:
    # Same faction = reinforcement
    if attacking_faction == defending_faction:
        return "reinforcement", attacking_army + defending_army, attacking_faction
    
    # Actual combat
    if attacking_army > defending_army:
        remaining = attacking_army - defending_army
        return "attacker_victory", remaining, attacking_faction
    elif defending_army > attacking_army:
        remaining = defending_army - attacking_army
        return "defender_victory", remaining, defending_faction
    else:
        return "stalemate", 0, defending_faction
```

## Game Restart Functionality

The restart system ensures complete game state reset:

### What Gets Reset

1. **Game State Variables**
   - `GAME_OVER`, `WINNER`, `VICTORY_MESSAGE` flags
   - Global game state in war map

2. **Span Links Database**
   - All game action records cleared
   - Fresh start for span link chains

3. **Faction Assignments**
   - Player faction selections cleared
   - All factions become available

4. **AI Opponent**
   - AI automatically deactivated
   - Prevents ghost AI actions

5. **Location Database**
   - All locations reset to initial state
   - Resources, armies, and factions restored

### Restart Process

```python
def reset_game_data():
    # Reset local game state
    reset_game_state()
    
    # Deactivate AI
    requests.post(f"{AI_SERVICE_URL}/deactivate")
    
    # Clear faction assignments
    release_all_factions()
    
    # Clear span links database
    cursor.execute("DELETE FROM game_actions")
    
    # Reset location database
    make_api_request('southern_capital', 'reset', method='POST')
```

### Verification

Use the debug endpoint to verify complete reset:

```bash
curl http://localhost:8080/api/debug/restart_verification
```

Expected response:
```json
{
  "success": true,
  "all_systems_reset": true,
  "details": {
    "game_state_reset": true,
    "span_links_cleared": true,
    "faction_assignments_cleared": true,
    "ai_deactivated": true,
    "database_reset": true
  }
}
```

## Game Replay Chain

With span links, you can trace the complete game narrative:

```
Game Start → Collect Resources → Create Army → Move Army → Battle → Victory
     ↑              ↑               ↑           ↑         ↑        ↑
  [trace_1]     [trace_2]       [trace_3]   [trace_4] [trace_5] [trace_6]
                    ↑               ↑           ↑         ↑        ↑
               [links to]      [links to]  [links to] [links to] [links to]
               trace_1         trace_2     trace_3    trace_4   trace_5
```

## Game Replay Through Tempo

### TraceQL Queries for Replay

#### 1. Find All Game Sessions
```traceql
{span.game.session.id!=""}
```

#### 2. Get Specific Game Session
```traceql
{span.game.session.id="abc-123-def"}
```

#### 3. Find Actions with Span Links
```traceql
{link.type="game_sequence"}
```

#### 4. Find Game Actions by Type
```traceql
{span.game.action.type="move_army"}
```

#### 5. Find Actions by Player
```traceql
{.player.name="Alice" && span.game.session.id!=""}
```

#### 6. Find Battle Outcomes
```traceql
{span.battle.occurred=true}
```

### Tempo API Integration

The replay system uses Tempo's HTTP API:

```python
# 1. Search for game sessions
GET /api/search?q={game.session.id!=""}

# 2. Get specific session traces  
GET /api/search?q={game.session.id="session-id"}

# 3. Get full trace details
GET /api/traces/{trace-id}

# 4. Extract span links from trace data
for span in trace['batches'][0]['spans']:
    for ref in span.get('references', []):
        if ref.get('refType') == 'FOLLOWS_FROM':
            # This is a span link
            linked_span_id = ref.get('spanID')
```

### Replay Engine Architecture

```python
class GameReplayEngine:
    def find_game_sessions(self) -> List[str]:
        """Query Tempo for all game sessions"""
        
    def get_session_traces(self, session_id: str) -> List[Dict]:
        """Get all traces for a specific session"""
        
    def extract_game_actions(self, traces: List[Dict]) -> List[GameAction]:
        """Parse traces into game actions"""
        
    def verify_span_links(self, actions: List[GameAction]) -> None:
        """Verify span link chain integrity"""
        
    def replay_session_step_by_step(self, session: GameSession) -> None:
        """Replay game session action by action"""
```

### Web UI Replay

The game includes web endpoints for replay:

- `GET /api/replay/sessions` - List available game sessions
- `GET /api/replay/session/{id}` - Get detailed replay data
- `GET /replay` - Replay dashboard page
- `GET /replay/{session-id}` - Specific session replay

### Replay Data Structure

```json
{
  "session_id": "abc-123-def",
  "player_name": "Alice",
  "faction": "southern",
  "actions": [
    {
      "sequence": 1,
      "action_type": "collect_resources",
      "trace_id": "trace-1",
      "span_id": "span-1",
      "span_links": [],  // First action has no links
      "timestamp": "2024-01-01T10:00:00Z",
      "location_id": "southern_capital"
    },
    {
      "sequence": 2,
      "action_type": "create_army", 
      "trace_id": "trace-2",
      "span_id": "span-2",
      "span_links": ["span-1"],  // Links to previous action
      "timestamp": "2024-01-01T10:01:00Z",
      "location_id": "southern_capital"
    }
  ],
  "span_link_chain": [
    {"sequence": 1, "valid_chain": true, "note": "First action"},
    {"sequence": 2, "valid_chain": true, "note": "Correctly links to action 1"}
  ]
}
```

## Querying Span Links

### In Grafana Tempo

Search for traces with game session information:
```
{span.game.session.id!=""}
```

Find spans with links:
```
{link.type="game_sequence"}
```

### Trace Attributes

Each span includes these attributes for game replay:
- `game.session.id` - Unique session identifier
- `game.action.type` - Type of action (move_army, create_army, etc.)
- `game.action.sequence` - Sequence number in the game
- `link.type` - Type of link (game_sequence)
- `link.relation` - Relationship (follows)

## Testing

Run the test script to verify span links are working:

```bash
cd game-of-tracing
python debug_span_links.py
```

This will:
1. Select a faction
2. Perform a sequence of actions
3. Each action will link to the previous one
4. Provide instructions for viewing the links in Grafana
5. Test battle calculation mechanics
6. Verify restart functionality

Test the replay functionality:

```bash
cd game-of-tracing
python war_map/replay.py
```

## Educational Value

Span links demonstrate:
- **Cross-trace relationships** - Actions in different traces can be related
- **Historical context** - Each action knows what came before it
- **Game narrative** - Complete story of how the game unfolded
- **Advanced OpenTelemetry** - Real-world use of span links feature
- **Tempo integration** - How to query and reconstruct trace relationships

## Game Replay Benefits

1. **Debugging** - Understand what led to game outcomes
2. **Analytics** - Analyze player behavior patterns
3. **Education** - Show distributed tracing concepts in action
4. **Auditing** - Verify game logic and fairness
5. **Entertainment** - Watch epic games unfold step by step

## Future Enhancements

Potential additions:
- AI action links to player actions that triggered them
- Battle outcome links to the actions that led to the battle
- Resource transfer chains across multiple locations
- Victory condition traces showing the sequence that led to game end
- Interactive replay UI with game map visualization
- Export replay data for external analysis 

## Troubleshooting Replay Functionality

### Tempo API Query Strategy

The replay system uses a **two-step approach** to work reliably with Tempo:

#### **Step 1: Discover Game Sessions**
Uses Tempo's tag values API to find all available game session IDs:
```bash
GET /api/v2/search/tag/game.session.id/values?start=<timestamp>&end=<timestamp>&limit=50
```

This returns all unique values for the `game.session.id` tag, giving us a list of available sessions.

#### **Step 2: Query Each Session**
For each discovered session ID, queries for its traces:
```bash
GET /api/search?q={game.session.id="specific-session-id"}&limit=100
```

This approach avoids complex TraceQL queries that might fail with 400 errors.

### Common Issues and Solutions

#### 1. Tempo Query Errors (400 Bad Request)

**Problem**: Getting 400 errors when querying Tempo with complex TraceQL

**Solutions**:
- **New approach**: Use tag values API first, then simple session-specific queries
- **Fallback**: System automatically falls back to local SQLite database
- **Logging**: Enhanced logging shows exactly which queries are being attempted

#### 2. Missing Span Attributes

**Problem**: Custom span attributes like `game.session.id` may not be indexed in Tempo

**Solutions**:
- **Attribute verification**: Check that spans are being created with correct attributes
- **Hybrid approach**: Local database stores action sequence as backup
- **Index configuration**: Ensure Tempo is configured to index custom attributes

#### 3. Time Range Issues

**Solutions**:
- **4-hour window**: System now uses 4-hour time windows for discovery
- **Unix timestamps**: Uses seconds-based timestamps for better compatibility
- **Configurable ranges**: Time ranges can be adjusted based on game session length

### Data Source Fallbacks

The replay system has multiple data sources in order of preference:

1. **`tempo_tag_values`** - Primary approach using tag values API
2. **`tempo_search_only`** - Basic span data from search results only  
3. **`local_db_fallback`** - SQLite database as final fallback

### Debug Tools

#### 1. Replay Debug Script
```bash
cd game-of-tracing
python debug_replay.py
```

This comprehensive script tests:
- Tempo connection and version
- Basic TraceQL query functionality  
- Game-specific attribute queries
- Replay API endpoints
- Local database fallback

#### 2. Manual Tempo Queries

Test Tempo directly using curl:

```bash
# Basic connectivity
curl http://localhost:3200/ready

# Simple trace search
curl "http://localhost:3200/api/search?q={span.name!=\"\"}&limit=5"

# Game-specific search
curl "http://localhost:3200/api/search?q={span.name=\"collect_resources\"}&limit=10"
```

#### 3. Replay API Testing

```bash
# Get available sessions
curl http://localhost:8080/api/replay/sessions

# Get specific session
curl http://localhost:8080/api/replay/session/your-session-id

# Check local database health
curl http://localhost:8080/api/debug/health
```

### Replay System Architecture

The improved replay system uses a **hybrid approach**:

#### 1. Primary Data Source: Tempo
- Queries Tempo using multiple TraceQL approaches
- Extracts complete span information including links
- Provides full distributed tracing context

#### 2. Fallback Data Source: Local SQLite
- Stores essential game action metadata
- Always available even if Tempo queries fail
- Enables replay functionality regardless of Tempo state

#### 3. Query Strategy
```python
# Multiple query attempts with increasing specificity
queries = [
    '{span.name="collect_resources" || span.name="create_army" || span.name="move_army" || span.name="all_out_attack"}',
    '{resource.service.name="war_map"}', 
    '{game.action.type!=""}',
    '{span.name!=""}'  # Fallback to any spans
]
```

### Performance Optimizations

#### 1. Time Window Optimization
- **Before**: 24-hour windows with nanosecond precision
- **After**: 1-hour windows with Unix second precision
- **Result**: Faster queries, reduced timeout errors

#### 2. Query Prioritization
- Try specific game queries first
- Fall back to broader queries if needed
- Use local database if all Tempo queries fail

#### 3. Response Caching
- Session metadata cached in local database
- Reduces repeated Tempo queries
- Improves UI responsiveness

### Access After Game Reset

The replay page is now accessible from the faction selection screen:

**Location**: [http://localhost:8080](http://localhost:8080) → "View Game Replays" button

**Benefits**:
- No need to be in an active game session
- Available immediately after game reset
- Persistent access to historical game data

### Expected Response Format

#### Successful Tempo Response
```json
{
  "success": true,
  "sessions": [
    {
      "session_id": "abc-123-def",
      "player_name": "Alice", 
      "faction": "southern",
      "start_time": 1234567890000000000,
      "action_count": 5,
      "last_action": "move_army"
    }
  ],
  "query_method": "tempo",
  "total_sessions": 1
}
```

#### Fallback Local Database Response
```json
{
  "success": true,
  "sessions": [...],
  "query_method": "local_db_fallback",
  "warning": "Tempo query failed: connection timeout"
}
```

### Tempo Configuration Requirements

For optimal replay functionality, ensure Tempo is configured with:

```yaml
# tempo-config.yaml
query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09

stream_over_http_enabled: true
```

And in docker-compose.yml:
```yaml
environment:
  - TEMPO_URL=http://tempo:3200
```

### TraceQL Query Examples

Based on the [Tempo API documentation](https://grafana.com/docs/tempo/latest/api_docs/), these queries should work:

#### Basic Queries
```traceql
# Find any spans with duration
{duration>1ms}

# Find spans by name
{name="collect_resources"}

# Find spans by service
{resource.service.name="war_map"}
```

#### Game-Specific Queries
```traceql
# Find game actions (if attributes are indexed)
{span.game.action.type!=""}

# Find player actions (if attributes are indexed)
{.player.name!=""}

# Combine conditions
{name="move_army" && .player.faction="southern"}
```

### Integration with Grafana

Once the replay data is accessible, you can:

1. **View in Grafana Tempo**: Search for game session traces directly
2. **Create dashboards**: Visualize game progression over time
3. **Set up alerts**: Monitor for specific game events
4. **Analyze patterns**: Study player behavior across multiple games 

================================================
FILE: game-of-tracing/ai_opponent/CLAUDE.md
================================================
# ai_opponent/ — Strategic AI Decision Engine

> Algorithmic opponent (not LLM-based) that plays the faction not chosen by a human player. This doc is read by any AI coding agent. For scenario-wide context read [`../AGENTS.md`](../AGENTS.md) first.

## Purpose

`ai-opponent` is a Flask service on port **8081** that takes control of a faction and makes strategic decisions on a recurring loop. It is activated by `war_map` via `POST /activate` with JSON body `{"faction": ..., "map_id": ...}` — on the WoK map the player toggles it on manually; on WWA it auto-activates as `white_walkers` the moment the player picks the map.

Two AI variants dispatch off the `faction` field at activation time:

- **`StrategicAI`** — classic WoK opponent (southern / northern). 6-step priority cascade: capital defense → zero-risk captures → resource transfers → plan execution → plan creation → fallback.
- **`WhiteWalkerAI(StrategicAI)`** — single-player WWA opponent. Different cascade: defend fortress → capture unowned wall → reinforce weakest wall (non-capital neighbours preferred; capital is a fallback when no other source has spare army, since `move_army` empties the source) → raid barbarian village (for corpses) → raise army from corpses at the fortress (only requires the capital to still belong to the AI; no minimum garrison) → idle. Reads its corpse pool via `GET /faction_economy?faction=white_walkers` on any location service; spends 5 corpses per army unit instead of 30 resources.

Common to both variants, the AI:

- Fetches the state of all 8 locations.
- Runs a priority cascade of checks to decide the next action (defend, capture, transfer, plan, fallback).
- Executes the action via the same HTTP API the player uses (against the location services on 5001-5008).
- Emits fully-linked traces so the replay UI can narrate the AI's reasoning alongside the human player's.
- Adapts its loop cadence (2-15 s) to the current game phase.

**This is deterministic code, not an LLM.** No `anthropic`, `openai`, or other model SDKs are imported.

## File map

| File | Size | Purpose |
|---|---|---|
| `ai_server.py` | ~46 KB | Main decision engine: `StrategicAI`, `PhaseDetector`, `Planner`, `MapAnalyzer`, Flask routes, decision loop. |
| `telemetry.py` | ~7.7 KB | `AITelemetry` class for `ai-opponent` — traces, logs, AI-specific metrics, plus Pyroscope profiling with OTel span-profile linkage. |
| `README.md` | ~2.6 KB | Feature doc. |
| `Dockerfile` | small | `python:3.11-slim`, `pip install -r requirements.txt`, runs `flask run --host=0.0.0.0 --port=8081` with `FLASK_APP=ai_server.py`. |
| `requirements.txt` | small | Flask 3.1.3, requests 2.33.1, OpenTelemetry SDK/API + exporters, `pyroscope-io` + `pyroscope-otel` for profiling. |

## Decision model

### Priority cascade — `StrategicAI.decide()`

Executed every cycle; returns the first non-null action:

1. **Capital defense.** If the capital is under threat (enemy army adjacent with path-army-estimate exceeding capital garrison), react: build army, pull army back, or preempt.
2. **Zero-risk captures.** Grab any neutral village reachable with overwhelming numerical advantage.
3. **Resource transfers.** Move resources from villages to the capital when the capital is running low.
4. **Plan execution.** If a multi-step plan is active and valid, advance to the next step.
5. **Plan creation.** Propose a new plan targeting the most valuable enemy territory.
6. **Fallback.** Collect resources at the capital.

### Phase detection — `PhaseDetector.detect()` at `ai_server.py:195-212`

Five phases drive cadence and aggressiveness:

| Phase | Condition | Cadence (seconds) |
|---|---|---|
| `READY_TO_ATTACK` | `total_army >= 8` | 3-8 |
| `DESPERATE` | `my_count <= 1` | 2-5 |
| `DEFENSIVE` | `my_count < enemy_count` | medium |
| `DOMINATING` | `my_count > enemy_count + 1` | 5-15 |
| `BALANCED` | everything else | 5-15 |

Cadence is set by `StrategicAI.get_pause_time()`; faster in crisis, slower in stability.

### Supporting classes

- **`MapAnalyzer`** (`ai_server.py:64-135`) — precomputes BFS distances between all location pairs at startup. Used by `path_army_estimate()` to sum enemy armies along shortest path to a target — enabling threat assessment.
- **`Planner`** (`ai_server.py:216+`) — multi-step goal sequences like `[create_army, create_army, create_army, move_army(target)]`. Validated every cycle via `Planner.validate()`; abandoned if preconditions break (e.g., capital lost, source location flipped).
- **`GameMemory`** — tracks territory-loss history, failed attacks, enemy push directions; used by `territory_lost_recently()` etc. at `ai_server.py:180-191` to adjust reactive behavior.

## Custom metrics

| Metric | Type | Attributes | Emitter |
|---|---|---|---|
| `ai.decisions` | counter | `action_type`, `phase`, `reason` | `decide()` / `execute_strategic_action()` |
| `ai.plans_created` | counter | `goal` | `Planner.set_plan` |
| `ai.plans_abandoned` | counter | `reason` | `Planner.abandon` |
| `ai.decision_cycle_duration_seconds` | histogram | `phase` | Each decision cycle |
| `ai.territory_count` | observable gauge | `faction` | Callback into live state |
| `ai.total_army` | observable gauge | `faction` | Callback into live state |

## Span events

Significant state transitions are emitted as events on the active decision span (rather than as standalone spans):

- `phase_transition` — with `from_phase`, `to_phase` attributes
- `territory_change` — with `gained` / `lost` territory lists
- `plan_abandoned` — with `reason` and `original_goal`
- `threat_detected` — with `threat_source`, `threat_army`, `target`

Locations: `ai_server.py:299-327`.

## Span links unique to `ai_opponent/`

The AI opponent instruments its own causal chain **inside a single decision cycle**:

- `ai_decision_cycle` span (SpanKind.INTERNAL) wraps the whole cycle.
- `ai_decision` span (child, INTERNAL) captures the cascade evaluation and chosen action.
- `execute_ai_action` span (INTERNAL) is the action execution — it starts with a `Link` back to the `ai_decision` span's context, with `link.type="ai_decision_trigger"`. This allows the replay UI to jump from the executed action back to the reasoning that produced it.

The linking logic lives around `ai_server.py:888-901`. The AI does **not** participate in the cross-session `game_sequence` chain that `war_map` builds — that is player-only.

## Environment

| Var | Default | Purpose |
|---|---|---|
| `PORT` | `8081` | Flask listen port |
| `IN_DOCKER` | unset | When set, location URLs resolve via container DNS (`southern-capital:5001`) instead of `localhost:5001` |

Telemetry endpoints are hard-coded in `telemetry.py` to `alloy:4317` (gRPC traces) and `alloy:4318` (HTTP logs + metrics). The service resource is registered with `SERVICE_NAME="ai-opponent"`.

## Activation flow

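1. `war_map` calls `POST http://ai-opponent:8081/activate` with JSON body `{"faction": "northern", "map_id": "war_of_kingdoms"}` (on WWA: `{"faction": "white_walkers", "map_id": "white_walkers_attack"}`).
2. The handler constructs the AI variant matching the faction (`StrategicAI`, or `WhiteWalkerAI` for `white_walkers`) and starts `ai_decision_loop()` in a daemon thread.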
3. The loop runs until `/deactivate` is called or the game is marked over.
4. Each cycle captures a span, logs, and increments the appropriate metrics.

## Common edits

**Tune aggressiveness.**
Adjust thresholds in `PhaseDetector.detect()` at `ai_server.py:195-212`, or the cadence ranges in `get_pause_time()`.

**Change the priority cascade.**
Edit `StrategicAI.decide()`. Each priority is its own helper (`_check_capital_defense`, `_find_zero_risk_captures`, `_do_resource_transfers`, plan steps). Reorder by reshuffling the cascade.

**Add a new AI metric.**
Mirror the observable-gauge pattern in `telemetry.py` and wire a callback that reads from `StrategicAI` live state (via a registered state accessor, same pattern as `app/telemetry.py`).

**Add a new span event.**
Call `span.add_event("event_name", attributes={...})` inside the decision span. Keep the existing four event names stable — they feed replay UI rendering.

## Keep this doc current

Per the sub-agent rule, any change to the priority cascade, phase thresholds, metric set, env vars, or the line-number anchors above must land in the same work unit. Before returning a response that touched `ai_opponent/`, grep this file for references to anything you changed.

## Cross-references

- [`../AGENTS.md`](../AGENTS.md) — scenario-wide architecture and patterns
- [`../app/CLAUDE.md`](../app/CLAUDE.md) — the location-server HTTP API this AI calls
- [`../war_map/CLAUDE.md`](../war_map/CLAUDE.md) — the orchestrator that activates/deactivates this service
- [`../SPAN_LINKS.md`](../SPAN_LINKS.md) — span-link design, including the `ai_decision_trigger` link type


================================================
FILE: game-of-tracing/ai_opponent/Dockerfile
================================================
# Image for the ai-opponent Flask service.
# The base image is pinned by digest so rebuilds resolve to the same python:3.11-slim layers.
FROM python:3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2

WORKDIR /app

# Install dependencies before copying the sources so this layer can be reused
# from the build cache when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# FLASK_APP names the module that `flask run` loads; FLASK_DEBUG=0 keeps debug mode off.
ENV FLASK_APP=ai_server.py
ENV FLASK_DEBUG=0
# Per ai_opponent/CLAUDE.md, setting IN_DOCKER makes the service resolve location
# URLs via container DNS names instead of localhost.
ENV IN_DOCKER=1

EXPOSE 8081

# Listen on all interfaces on the exposed port.
CMD ["flask", "run", "--host=0.0.0.0", "--port=8081"] 

================================================
FILE: game-of-tracing/ai_opponent/README.md
================================================
# AI Opponent for War of Kingdoms

This Flask-based AI service provides an intelligent opponent for single-player games in the War of Kingdoms distributed tracing tutorial.

## Features

### Adaptive Strategy
The AI adapts its strategy based on the game phase:
- **Early Game (0-5 minutes)**: Focuses on resource collection and capturing neutral villages
- **Mid Game (5-15 minutes)**: Balances expansion with army building and defense
- **Late Game (15+ minutes)**: Shifts to aggressive tactics and all-out attacks

### Natural Behavior
- Takes 15-45 second pauses between actions to simulate human thinking time
- Uses weighted random decisions to avoid predictable patterns
- Reacts to player threats by reinforcing endangered locations
- Manages resources by transferring them from villages to capitals

### Decision Making
The AI analyzes the game state to make intelligent decisions:
1. **Threat Analysis**: Identifies enemy armies near its territories
2. **Expansion Targets**: Finds neutral villages and weak enemy locations
3. **Resource Management**: Collects resources and creates armies when needed
4. **Strategic Movement**: Reinforces threatened locations and attacks vulnerable targets

### OpenTelemetry Integration
All AI actions are fully instrumented with OpenTelemetry:
- Traces show decision-making process
- Spans include game phase, threats, and chosen actions
- Integrates with the game's distributed tracing pipeline

## API Endpoints

- `POST /activate` - Activate the AI for a specific faction
- `POST /deactivate` - Deactivate the AI
- `GET /status` - Get current AI status
- `GET /health` - Health check endpoint

## How It Works

1. When activated, the AI starts a background thread that runs the decision loop
2. Every 15-45 seconds, it:
   - Fetches the current game state from all locations
   - Analyzes threats and opportunities
   - Makes a weighted random decision based on the game phase
   - Executes the chosen action via location server APIs
3. The AI automatically stops when it detects game over

## Configuration

The AI difficulty is set to "normal" and provides a balanced challenge. Decision weights can be adjusted in the `DECISION_WEIGHTS` dictionary to make the AI more aggressive or defensive.

## Usage

The AI is integrated with the War Map UI:
1. Players can toggle "Enable AI Opponent" in the game interface
2. The AI automatically takes control of the faction not chosen by the player
3. For two-player games, keep the AI toggle off

## Observability

Monitor AI behavior through:
- **Traces**: View AI decision-making and action execution
- **Logs**: Track AI state changes and decisions
- **Service Map**: See AI interactions with location servers 

================================================
FILE: game-of-tracing/ai_opponent/ai_server.py
================================================
import os
import time
import random
import requests
import threading
import atexit
from collections import deque
from flask import Flask, jsonify, request
from telemetry import AITelemetry
from opentelemetry import trace, baggage
from opentelemetry.trace import SpanKind, Link
from opentelemetry.propagate import inject
from datetime import datetime, timedelta
from enum import Enum

# Flask application object for this module (the Dockerfile's FLASK_APP=ai_server.py
# points `flask run` at it).
app = Flask(__name__)

# Initialize telemetry
# One AITelemetry instance for the whole process; the module-level logger and
# tracer below are obtained from it.
telemetry = AITelemetry()
logger = telemetry.get_logger()
tracer = telemetry.get_tracer()
# Call telemetry.shutdown when the interpreter exits.
# NOTE(review): presumably flushes pending exports — confirm in telemetry.py.
atexit.register(telemetry.shutdown)

# ─── Constants ─────────────────────────────────────────────────────────────────

# Adjacency lists, one graph per map id: location id -> list of directly
# connected location ids. Keep keys in sync with
# game-of-tracing/app/game_config.py's MAPS[*]["locations"][*]["connections"].

# War of Kingdoms: two capitals joined through a mesh of six villages.
_WAR_OF_KINGDOMS_GRAPH = {
    "southern_capital": [
        "village_1",
        "village_3",
    ],
    "northern_capital": [
        "village_2",
        "village_6",
    ],
    "village_1": [
        "southern_capital",
        "village_2",
        "village_4",
    ],
    "village_2": [
        "northern_capital",
        "village_1",
        "village_5",
    ],
    "village_3": [
        "southern_capital",
        "village_5",
        "village_6",
    ],
    "village_4": [
        "village_1",
        "village_5",
    ],
    "village_5": [
        "village_2",
        "village_3",
        "village_4",
        "village_6",
    ],
    "village_6": [
        "northern_capital",
        "village_3",
        "village_5",
    ],
}

# White Walkers Attack: both fortresses border every wall segment; each wall
# segment also touches its neighbouring segment(s), and the two outer segments
# lead to a barbarian village.
_WHITE_WALKERS_ATTACK_GRAPH = {
    "nights_watch_fortress": ["wall_west", "wall_center_west", "wall_center_east", "wall_east"],
    "white_walker_fortress": ["wall_west", "wall_center_west", "wall_center_east", "wall_east"],
    "wall_west": ["nights_watch_fortress", "white_walker_fortress", "wall_center_west", "barbarian_village_west"],
    "wall_center_west": ["nights_watch_fortress", "white_walker_fortress", "wall_west", "wall_center_east"],
    "wall_center_east": ["nights_watch_fortress", "white_walker_fortress", "wall_center_west", "wall_east"],
    "wall_east": ["nights_watch_fortress", "white_walker_fortress", "wall_center_east", "barbarian_village_east"],
    "barbarian_village_west": ["wall_west"],
    "barbarian_village_east": ["wall_east"],
}

MAP_GRAPHS_BY_MAP = {
    "war_of_kingdoms": _WAR_OF_KINGDOMS_GRAPH,
    "white_walkers_attack": _WHITE_WALKERS_ATTACK_GRAPH,
}

# Capital lookup per map: faction name -> location id of that faction's capital.
CAPITALS_BY_MAP = {}
CAPITALS_BY_MAP["war_of_kingdoms"] = {
    "southern": "southern_capital",
    "northern": "northern_capital",
}
CAPITALS_BY_MAP["white_walkers_attack"] = {
    "nights_watch": "nights_watch_fortress",
    "white_walkers": "white_walker_fortress",
}

# Location kind per map: location id -> "capital", "village" or "wall".
# Entries are grouped by kind; insertion order matches the slot order
# (capitals first, then the remaining six locations).
LOCATION_TYPES_BY_MAP = {
    "war_of_kingdoms": {
        **dict.fromkeys(("southern_capital", "northern_capital"), "capital"),
        **dict.fromkeys(
            (
                "village_1", "village_2", "village_3",
                "village_4", "village_5", "village_6",
            ),
            "village",
        ),
    },
    "white_walkers_attack": {
        **dict.fromkeys(("nights_watch_fortress", "white_walker_fortress"), "capital"),
        **dict.fromkeys(
            ("wall_west", "wall_center_west", "wall_center_east", "wall_east"),
            "wall",
        ),
        **dict.fromkeys(("barbarian_village_west", "barbarian_village_east"), "village"),
    },
}

# Starting owner of every location, per map. This is the static initial
# ownership the AI reasons about (walls start neutral, barbarian villages
# start as "barbarian", and so on).
INITIAL_FACTIONS_BY_MAP = {
    "war_of_kingdoms": {
        "southern_capital": "southern",
        "northern_capital": "northern",
        **dict.fromkeys(
            (
                "village_1", "village_2", "village_3",
                "village_4", "village_5", "village_6",
            ),
            "neutral",
        ),
    },
    "white_walkers_attack": {
        "nights_watch_fortress": "nights_watch",
        "white_walker_fortress": "white_walkers",
        **dict.fromkeys(
            ("wall_west", "wall_center_west", "wall_center_east", "wall_east"),
            "neutral",
        ),
        **dict.fromkeys(
            ("barbarian_village_west", "barbarian_village_east"),
            "barbarian",
        ),
    },
}

# Cost of one army unit, per map and faction. The "default" entry applies to
# any faction without its own key. Matches app/game_config.py's rules.army_cost.
ARMY_COST_BY_MAP = {
    "war_of_kingdoms": {
        "default": 30,
    },
    "white_walkers_attack": {
        "default": 30,
        "white_walkers": 5,
    },
}

# Backward-compat alias: legacy code that references MAP_GRAPH still sees WoK.
# This binds the same dict object as MAP_GRAPHS_BY_MAP["war_of_kingdoms"] (not a
# copy); get_map_graph() and MapAnalyzer also fall back to it as their default.
MAP_GRAPH = MAP_GRAPHS_BY_MAP["war_of_kingdoms"]

# Scalar tuning constants.
ARMY_COST = 30
# Roughly 10 resources per 15-second tick, i.e. four ticks per minute.
VILLAGE_INCOME_PER_MIN = 4 * 10
RESOURCE_TRANSFER_THRESHOLD = 30

# One port table for every location id on every map. A slot's port is fixed;
# each map only renames the slot, so ids from different maps share ports.
LOCATION_PORTS = {
    "southern_capital": 5001, "northern_capital": 5002,
    "village_1": 5003, "village_2": 5004, "village_3": 5005,
    "village_4": 5006, "village_5": 5007, "village_6": 5008,
}
# White Walkers Attack aliases (same physical slot -> same port).
LOCATION_PORTS.update({
    "nights_watch_fortress": 5001, "white_walker_fortress": 5002,
    "wall_west": 5003, "wall_center_west": 5004,
    "wall_center_east": 5005, "wall_east": 5006,
    "barbarian_village_west": 5007, "barbarian_village_east": 5008,
})

# Container hostname for each logical location id (used to build HTTP URLs
# when running in docker).
# WoK ids map to the hyphenated form of their own name.
CONTAINER_FOR_LOCATION_ID = {
    "southern_capital": "southern-capital", "northern_capital": "northern-capital",
    "village_1": "village-1", "village_2": "village-2", "village_3": "village-3",
    "village_4": "village-4", "village_5": "village-5", "village_6": "village-6",
}
# WWA ids share the container of the WoK slot they rename.
CONTAINER_FOR_LOCATION_ID.update({
    wwa_id: CONTAINER_FOR_LOCATION_ID[wok_slot]
    for wwa_id, wok_slot in (
        ("nights_watch_fortress", "southern_capital"),
        ("white_walker_fortress", "northern_capital"),
        ("wall_west", "village_1"),
        ("wall_center_west", "village_2"),
        ("wall_center_east", "village_3"),
        ("wall_east", "village_4"),
        ("barbarian_village_west", "village_5"),
        ("barbarian_village_east", "village_6"),
    )
})


def get_map_graph(map_id):
    """Return the adjacency mapping for ``map_id``; unknown ids get the WoK graph."""
    fallback_graph = MAP_GRAPH
    return MAP_GRAPHS_BY_MAP.get(map_id, fallback_graph)


def get_capitals(map_id):
    """Return the capitals mapping for ``map_id``; unknown ids get the WoK entry."""
    wok_capitals = CAPITALS_BY_MAP["war_of_kingdoms"]
    return CAPITALS_BY_MAP.get(map_id, wok_capitals)


def get_location_types(map_id):
    """Return the location-type table for ``map_id``; unknown ids get the WoK entry."""
    wok_types = LOCATION_TYPES_BY_MAP["war_of_kingdoms"]
    return LOCATION_TYPES_BY_MAP.get(map_id, wok_types)


def get_initial_factions(map_id):
    """Return the initial-faction table for ``map_id``; unknown ids get the WoK entry."""
    wok_factions = INITIAL_FACTIONS_BY_MAP["war_of_kingdoms"]
    return INITIAL_FACTIONS_BY_MAP.get(map_id, wok_factions)


def get_army_cost_for(map_id, faction):
    """Return the army cost for ``faction`` on ``map_id``.

    Unknown maps use the WoK cost table; factions without their own entry
    pay that table's ``"default"`` cost.
    """
    wok_costs = ARMY_COST_BY_MAP["war_of_kingdoms"]
    cost_table = ARMY_COST_BY_MAP.get(map_id, wok_costs)
    default_cost = cost_table["default"]
    return cost_table.get(faction, default_cost)

# ─── Game Phase ────────────────────────────────────────────────────────────────

class GamePhase(Enum):
    """Coarse strategic posture of a faction, as assigned by ``PhaseDetector.detect``."""

    # Holding at most one territory (and fewer than 8 armies in total).
    DESPERATE = "desperate"
    # Holding fewer territories than the enemy.
    DEFENSIVE = "defensive"
    # Neither behind nor more than one territory ahead of the enemy.
    BALANCED = "balanced"
    # Holding at least two territories more than the enemy.
    DOMINATING = "dominating"
    # Total army of 8 or more; checked before any territory comparison.
    READY_TO_ATTACK = "ready_to_attack"

# ─── Map Analyzer ──────────────────────────────────────────────────────────────

class MapAnalyzer:
    """Precomputed map analysis: BFS distances, strategic values, path army estimation.

    All-pairs hop distances and per-location strategic scores are computed
    once in ``__init__``. Locations that are missing from the adjacency
    mapping are treated as unreachable (distance 99, path estimate 999)
    rather than raising ``KeyError``, which matches what ``neighbors``
    already does for unknown ids.
    """

    def __init__(self, graph=None, capitals=None):
        """Build the analyzer.

        Args:
            graph: adjacency mapping ``{location_id: [neighbor_id, ...]}``.
                Defaults to the WoK graph.
            capitals: ``{faction: capital_location_id}`` for the map.
                Defaults to the WoK capitals.
        """
        # ``graph`` defaults to WoK to preserve legacy behaviour; new callers
        # pass the active map's adjacency list. ``capitals`` is the map's
        # faction→capital dict (needed for the strategic-value heuristic).
        self.graph = graph if graph is not None else MAP_GRAPH
        self.capitals = capitals if capitals is not None else CAPITALS_BY_MAP["war_of_kingdoms"]
        self.distances = self._compute_all_distances()
        self.strategic_values = self._compute_strategic_values()

    def _bfs_distances(self, start):
        """BFS from start node, returns dict {node: distance}."""
        visited = {start: 0}
        queue = deque([start])
        while queue:
            node = queue.popleft()
            # A neighbor that has no adjacency entry of its own is a dead end,
            # not an error.
            for neighbor in self.graph.get(node, ()):
                if neighbor not in visited:
                    visited[neighbor] = visited[node] + 1
                    queue.append(neighbor)
        return visited

    def _compute_all_distances(self):
        """Precompute all-pairs BFS distances."""
        return {loc: self._bfs_distances(loc) for loc in self.graph}

    def _compute_strategic_values(self):
        """Score each location by connectivity + centrality.

        High connectivity or short distance to either capital = valuable.
        Works identically across maps because it reads capitals from the
        per-map mapping rather than hardcoding WoK's capital names.
        """
        values = {}
        capital_ids = list(self.capitals.values())
        for loc in self.graph:
            connections = len(self.graph[loc])
            if capital_ids:
                # Unreachable capitals count as 99 hops away.
                avg_capital_dist = sum(
                    self.distances[loc].get(cap, 99) for cap in capital_ids
                ) / float(len(capital_ids))
            else:
                avg_capital_dist = 99
            # max(..., 1) avoids dividing by a sub-1 average distance.
            values[loc] = connections + (4.0 / max(avg_capital_dist, 1))
        return values

    def distance(self, a, b):
        """Return the hop count from ``a`` to ``b``, or 99 if no path is known."""
        return self.distances.get(a, {}).get(b, 99)

    def neighbors(self, loc):
        """Return the locations adjacent to ``loc`` (empty list if unknown)."""
        return self.graph.get(loc, [])

    def path_army_estimate(self, game_state, from_loc, to_loc, my_faction):
        """Estimate total enemy army along BFS shortest path from from_loc to to_loc.

        Sums the ``army`` of every location on the path (excluding
        ``from_loc``) whose ``faction`` differs from ``my_faction``.
        Returns 999 when ``to_loc`` cannot be reached from ``from_loc``.
        """
        parent = {from_loc: None}
        queue = deque([from_loc])
        while queue:
            node = queue.popleft()
            if node == to_loc:
                break
            for neighbor in self.graph.get(node, ()):
                if neighbor not in parent:
                    parent[neighbor] = node
                    queue.append(neighbor)

        if to_loc not in parent:
            return 999  # unreachable

        # Walk path and sum enemy armies (excluding from_loc)
        path = []
        node = to_loc
        while node is not None:
            path.append(node)
            node = parent[node]
        path.reverse()

        enemy_army = 0
        for loc in path[1:]:  # skip from_loc
            loc_data = game_state.get(loc, {})
            if loc_data.get('faction') != my_faction:
                enemy_army += loc_data.get('army', 0)
        return enemy_army

# ─── Game Memory ───────────────────────────────────────────────────────────────

class GameMemory:
    """Tracks territory changes, failed attacks, and enemy push direction."""

    def __init__(self):
        self.territory_history = []  # list of (timestamp, my_territories set)
        self.failed_attacks = {}     # {target_loc: last_failure_time}
        self.enemy_push_direction = None
        self.last_enemy_territories = set()

    def update(self, game_state, my_faction):
        """Record the current ownership snapshot.

        Args:
            game_state: mapping ``{location_id: data}`` where ``data`` may
                carry a ``'faction'`` key.
            my_faction: the faction this memory belongs to.

        Returns:
            ``(my_territories, enemy_territories)`` as two sets of location
            ids. Locations whose faction is ``'neutral'`` are in neither set.
        """
        now = time.time()
        my_territories = set()
        enemy_territories = set()

        for loc_id, data in game_state.items():
            if data.get('faction') == my_faction:
                my_territories.add(loc_id)
            elif data.get('faction') not in (my_faction, 'neutral'):
                enemy_territories.add(loc_id)

        # Keep only the 20 most recent snapshots.
        self.territory_history.append((now, my_territories.copy()))
        if len(self.territory_history) > 20:
            self.territory_history = self.territory_history[-20:]

        # Detect enemy push direction: remember one location the enemy holds
        # now but did not hold at the previous update. This class has no
        # distance data, so it cannot rank candidates by proximity to our
        # capital; take the smallest id so the choice is reproducible instead
        # of depending on set iteration order.
        new_enemy = enemy_territories - self.last_enemy_territories
        if new_enemy:
            self.enemy_push_direction = min(new_enemy, key=str)
        self.last_enemy_territories = enemy_territories

        return my_territories, enemy_territories

    def record_failed_attack(self, target):
        """Remember that an attack on ``target`` failed just now."""
        self.failed_attacks[target] = time.time()

    def recently_failed(self, target, cooldown=60):
        """Return True if an attack on ``target`` failed less than ``cooldown`` seconds ago."""
        last = self.failed_attacks.get(target)
        if last is None:
            return False
        return (time.time() - last) < cooldown

    def territory_lost_recently(self, seconds=30):
        """Check if we lost territory in the last N seconds.

        Compares territory *counts*: returns True when any snapshot taken
        within the window held more locations than the latest snapshot.
        """
        if len(self.territory_history) < 2:
            return False
        now = time.time()
        current = self.territory_history[-1][1]
        # Walk backwards from the second-newest snapshot until we leave the window.
        for ts, territories in reversed(self.territory_history[:-1]):
            if now - ts > seconds:
                break
            if len(territories) > len(current):
                return True
        return False

# ─── Phase Detector ────────────────────────────────────────────────────────────

class PhaseDetector:
    """Derives the current ``GamePhase`` from territory counts and total army."""

    @staticmethod
    def detect(my_territories, enemy_territories, total_army):
        """Classify the situation.

        Args:
            my_territories: sized collection of locations we hold.
            enemy_territories: sized collection of locations the enemy holds.
            total_army: total number of our armies.

        Returns:
            The matching ``GamePhase`` member.
        """
        held = len(my_territories)
        opposing = len(enemy_territories)

        # Army strength is checked first and overrides every territory rule.
        if total_army >= 8:
            return GamePhase.READY_TO_ATTACK
        if held <= 1:
            return GamePhase.DESPERATE
        if held < opposing:
            return GamePhase.DEFENSIVE
        if held > opposing + 1:
            return GamePhase.DOMINATING
        return GamePhase.BALANCED

# ─── Planner ───────────────────────────────────────────────────────────────────

class Planner:
    """Holds one goal and its ordered queue of step dicts.

    A plan looks like ``[create_army x3, move_army(target)]``; steps are
    consumed from the front.
    """

    def __init__(self):
        self.steps = []
        self.goal = None

    @property
    def active(self):
        """True while at least one step remains."""
        return bool(self.steps)

    def set_plan(self, goal, steps):
        """Replace the current plan with ``goal`` and a copy of ``steps``."""
        self.goal = goal
        self.steps = list(steps)

    def next_step(self):
        """Return the step at the front of the queue without removing it, or None."""
        return self.steps[0] if self.steps else None

    def advance(self):
        """Drop the step at the front of the queue, if any."""
        if self.steps:
            del self.steps[0]

    def abandon(self, reason=""):
        """Clear the plan. ``reason`` is accepted but not stored."""
        self.steps, self.goal = [], None

    def validate(self, game_state, my_faction, my_capital):
        """Abandon the plan when its next step can no longer be carried out.

        Only the step at the front of the queue is inspected; steps with an
        unrecognised ``action`` are left alone.
        """
        if not self.active:
            return

        current = self.steps[0]
        kind = current.get("action")

        if kind == "create_army":
            # Armies are raised at the capital, so we must still own it.
            capital = game_state.get(my_capital, {})
            if capital.get('faction') != my_faction:
                self.abandon("lost capital")
            return

        if kind == "move_army":
            origin = game_state.get(current.get("from"), {})
            still_ours = origin.get('faction') == my_faction
            if not still_ours or origin.get('army', 0) == 0:
                self.abandon("lost staging location or no army")
            return

        if kind == "all_out_attack":
            capital = game_state.get(my_capital, {})
            still_ours = capital.get('faction') == my_faction
            if not still_ours or capital.get('army', 0) < 3:
                self.abandon("insufficient army for all-out attack")

# ─── Strategic AI ──────────────────────────────────────────────────────────────

class StrategicAI:
    """Main decision engine with priority cascade."""

    def __init__(self, faction, map_id="war_of_kingdoms"):
        """Set up the AI for ``faction`` on the map identified by ``map_id``.

        ``my_capital`` is ``None`` when the faction has no capital on this
        map; ``enemy_capital`` is the first capital belonging to any other
        faction, or ``None`` when there is none.
        """
        self.faction = faction
        self.map_id = map_id

        capitals = get_capitals(map_id)
        self.my_capital = capitals.get(faction)
        self.enemy_capital = next(
            (capital for owner, capital in capitals.items() if owner != faction),
            None,
        )

        # Map analysis, memory and planner are all per-instance.
        self.map = MapAnalyzer(graph=get_map_graph(map_id), capitals=capitals)
        self.memory = GameMemory()
        self.planner = Planner()

        # Situation snapshot, refreshed on every decide() call.
        self.phase = GamePhase.BALANCED
        self.my_territories = set()
        self.enemy_territories = set()
        self.total_army = 0

        # Previous-cycle values used to detect changes, plus the cascade log.
        self._previous_phase = None
        self._previous_territories = set()
        self._last_evaluated = []

        # Army cost for this faction on this map.
        self.army_cost = get_army_cost_for(map_id, faction)

    def decide(self, game_state):
        """Run the priority cascade and return an action dict or None.

        Refreshes memory, total army and phase from ``game_state``, emits
        span events for phase/territory changes and abandoned plans, then
        tries each priority rule in order and returns the first action
        produced. ``self._last_evaluated`` records which rules were skipped
        or triggered.
        """
        # Refresh memory, army total and phase.
        self.my_territories, self.enemy_territories = self.memory.update(game_state, self.faction)
        self.total_army = sum(
            info.get('army', 0)
            for info in game_state.values()
            if info.get('faction') == self.faction
        )
        self.phase = PhaseDetector.detect(self.my_territories, self.enemy_territories, self.total_army)

        span = trace.get_current_span()

        # Span event: phase transition (never fires on the first call).
        phase_changed = self._previous_phase is not None and self.phase != self._previous_phase
        if phase_changed:
            span.add_event("phase_transition", attributes={
                "previous_phase": self._previous_phase.value,
                "new_phase": self.phase.value,
                "territory_count": len(self.my_territories),
                "total_army": self.total_army,
            })
        self._previous_phase = self.phase

        # Span event: territory gained or lost since the previous call.
        held_now = set(self.my_territories)
        gained = held_now - self._previous_territories
        lost = self._previous_territories - held_now
        if gained or lost:
            span.add_event("territory_change", attributes={
                "territories_gained": str(list(gained)),
                "territories_lost": str(list(lost)),
                "current_count": len(held_now),
            })
        self._previous_territories = held_now

        # Validate the active plan and report it if validation dropped it.
        plan_was_active = self.planner.active
        goal_before = self.planner.goal
        self.planner.validate(game_state, self.faction, self.my_capital)
        if plan_was_active and not self.planner.active:
            span.add_event("plan_abandoned", attributes={
                "previous_goal": goal_before or "unknown",
                "reason": "validation_failed",
            })
            telemetry.record_plan_abandoned("validation_failed")

        # Priority cascade: first rule that yields an action wins.
        cascade = (
            ("capital_defense", self._check_capital_defense),
            ("zero_risk_capture", self._find_zero_risk_captures),
            ("resource_transfer", self._do_resource_transfers),
            ("execute_plan", self._execute_plan_step),
            ("create_plan", self._create_new_plan),
        )
        evaluated = []
        for label, rule in cascade:
            action = rule(game_state)
            if action:
                evaluated.append(f"{label}: TRIGGERED ({action.get('reason', '')})")
                self._last_evaluated = evaluated
                return action
            evaluated.append(f"{label}: skipped")

        evaluated.append("fallback: TRIGGERED")
        self._last_evaluated = evaluated
        return self._fallback(game_state)

    # ── Priority 1: Capital Defense ────────────────────────────────────────────

    def _check_capital_defense(self, game_state):
        """Respond to armed enemies standing next to our capital.

        Returns a ``create_army`` action when the capital can afford it, a
        reinforcement move otherwise, or ``None`` when we do not hold the
        capital, nothing threatens it, or the garrison is already at least
        two stronger than the largest adjacent enemy army.
        """
        capital = game_state.get(self.my_capital, {})
        if not capital or capital.get('faction') != self.faction:
            return None

        garrison = capital.get('army', 0)

        # Collect every adjacent location held by an armed enemy.
        threats = []
        for adjacent in self.map.neighbors(self.my_capital):
            adj_data = game_state.get(adjacent, {})
            hostile = adj_data.get('faction') not in (self.faction, 'neutral')
            if hostile and adj_data.get('army', 0) > 0:
                threats.append((adj_data['army'], adjacent))
        if not threats:
            return None

        # max() keeps the first entry on ties, i.e. the earliest neighbor.
        max_threat, threat_loc = max(threats, key=lambda entry: entry[0])

        needed = max_threat + 2
        trace.get_current_span().add_event("threat_detected", attributes={
            "threat_location": threat_loc,
            "threat_army": max_threat,
            "capital_army": garrison,
            "armies_needed": needed,
        })

        if garrison >= needed:
            return None

        # NOTE(review): this uses the global ARMY_COST rather than
        # self.army_cost — confirm that is intended for non-WoK maps.
        if capital.get('resources', 0) < ARMY_COST:
            return self._reinforce_capital(game_state)

        affordable = capital['resources'] // ARMY_COST
        armies_to_create = min(needed - garrison, affordable)
        return {
            "action": "create_army",
            "location": self.my_capital,
            "count": max(1, armies_to_create),
            "reason": f"capital_defense against {max_threat} at {threat_loc}"
        }

    def _reinforce_capital(self, game_state):
        """Move friendly armies within 2 hops toward capital."""
        best_source = None
        best_army = 0
        for loc in MAP_GRAPH:
            if loc == self.my_capital:
                continue
            loc_data = game_state.get(loc, {})
            if loc_data.get('faction') == self.faction and loc_data.get('army', 0) > 0:
                dist = self.map.distance(loc, self.my_capital)
                if dist <= 2 and loc_data['army'] > best_army:
                    best_army = loc_data['army']
                    best_source = loc

        if best_source:
            target = self._step_toward(best_source, self.my_capital)
            if target:
                return {
                    "action": "move_army",
                    "from": best_source,
                    "to": target,
                    "reason": f"reinforce capital from {best_source}"
                }
        return None

    def _step_toward(self, from_loc, toward_loc):
        """Return the neighbor of from_loc that is closest to toward_loc."""
        # Must consult the *active map's* adjacency, not the global
        # ``MAP_GRAPH`` (which is hard-coded to WoK). On WWA the from_loc is
        # e.g. ``white_walker_fortress`` — absent from the WoK graph and
        # raises ``KeyError`` mid-cascade, leaving the AI stuck.
        best = None
        best_dist = 99
        for n in self.map.graph[from_loc]:
            d = self.map.distance(n, toward_loc)
            if d < best_dist:
                best_dist = d
                best = n
        return best

    # ── Priority 2: Zero-Risk Captures ─────────────────────────────────────────

    def _find_zero_risk_captures(self, game_state):
        """Capture locations where our army > target army + 1, sorted by strategic value."""
        candidates = []
        for loc in MAP_GRAPH:
            loc_data = game_state.get(loc, {})
            if loc_data.get('faction') == self.faction:
                continue
            target_army = loc_data.get('army', 0)

            for neighbor in MAP_GRAPH[loc]:
                n_data = game_state.get(neighbor, {})
                if n_data.get('faction') == self.faction and n_data.get('army', 0) > target_army + 1:
                    # Don't attack from capital if it would leave it defenseless
                    if neighbor == self.my_capital:
                        cap_threatened = False
                        for cap_n in MAP_GRAPH[self.my_capital]:
                            cn_data = game_state.get(cap_n, {})
                            if cn_data.get('faction') not in (self.faction, 'neutral') and cn_data.get('army', 0) > 0:
                                cap_threatened = True
                                break
                        if cap_threatened:
                            continue

                    if self.memory.recently_failed(loc):
                        continue

                    candidates.append({
                        "target": loc,
                        "from": neighbor,
                        "our_army": n_data['army'],
                        "their_army": target_army,
                        "strategic_value": self.map.strategic_values.get(loc, 0),
                        "is_neutral": loc_data.get('faction') == 'neutral',
                    })

        if not candidates:
            return None

        candidates.sort(key=lambda c: (-c['is_neutral'], -c['strategic_value']))
        best = candidates[0]
        return {
            "action": "move_army",
            "from": best["from"],
            "to": best["target"],
            "reason": f"zero_risk_capture {best['target']} (our {best['our_army']} vs {best['their_army']})"
        }

    # ── Priority 3: Resource Transfers ─────────────────────────────────────────

    def _do_resource_transfers(self, game_state):
        """Transfer resources from ALL villages above threshold to capital, every cycle."""
        transfer_targets = []
        for loc in MAP_GRAPH:
            if loc == self.my_capital:
                continue
            loc_data = game_state.get(loc, {})
            if (loc_data.get('faction') == self.faction and
                'village' in loc and
                loc_data.get('resources', 0) > RESOURCE_TRANSFER_THRESHOLD):
                transfer_targets.append(loc)

        if not transfer_targets:
            return None

        return {
            "action": "resource_transfer",
            "locations": transfer_targets,
            "reason": f"transfer resources from {len(transfer_targets)} villages"
        }

    # ── Priority 4: Execute Active Plan Step ───────────────────────────────────

    def _execute_plan_step(self, game_state):
        """Execute next step of active plan."""
        if not self.planner.active:
            return None

        step = self.planner.next_step()
        if not step:
            return None

        action = step.get("action")

        if action == "create_army":
            cap_data = game_state.get(self.my_capital, {})
            if cap_data.get('resources', 0) >= ARMY_COST:
                self.planner.advance()
                return {
                    "action": "create_army",
                    "location": self.my_capital,
                    "count": 1,
                    "reason": f"plan step: {self.planner.goal}"
                }
            else:
                return {
                    "action": "collect_resources",
                    "location": self.my_capital,
                    "reason": "waiting for resources for plan"
                }

        elif action == "move_army":
            from_loc = step.get("from")
            to_loc = step.get("to")
            loc_data = game_state.get(from_loc, {})
            if loc_data.get('faction') == self.faction and loc_data.get('army', 0) > 0:
                self.planner.advance()
                return {
                    "action": "move_army",
                    "from": from_loc,
                    "to": to_loc,
                    "reason": f"plan step: {self.planner.goal}"
                }
            else:
                reason = "can't execute move step"
                self.planner.abandon(reason)
                trace.get_current_span().add_event("plan_abandoned", attributes={
                    "reason": reason,
                })
                telemetry.record_plan_abandoned(reason)
                return None

        elif action == "all_out_attack":
            self.planner.advance()
            return {
                "action": "all_out_attack",
                "location": self.my_capital,
                "reason": f"plan step: {self.planner.goal}"
            }

        self.planner.advance()
        return None

    # ── Priority 5: Create New Plan ────────────────────────────────────────────

    def _create_new_plan(self, game_state):
        """Pick a new plan for the current situation and run its first step.

        Tried in order: build up to three armies, plan a capture, launch an
        all-out attack (only in READY_TO_ATTACK), emergency army build (only
        in DESPERATE with enough resources), concentrate isolated armies.
        Returns the resulting action, or ``None`` when nothing applies.
        """
        def adopt(goal, steps):
            # Install the plan, report it, and execute its first step.
            self.planner.set_plan(goal, steps)
            trace.get_current_span().add_event("plan_created", attributes={
                "goal": goal, "step_count": len(steps),
            })
            telemetry.record_plan_created(goal)
            return self._execute_plan_step(game_state)

        # Below three armies, always build first.
        if self.total_army < 3:
            shortfall = 3 - self.total_army
            build_steps = [{"action": "create_army"} for _ in range(shortfall)]
            return adopt(f"build {shortfall} armies", build_steps)

        # A capturable target exists -> plan its capture.
        planned = self._plan_capture(game_state)
        if planned:
            return planned

        # Strong enough for an all-out attack, if it is feasible.
        if self.phase == GamePhase.READY_TO_ATTACK:
            planned = self._plan_all_out_attack(game_state)
            if planned:
                return planned

        # Nearly wiped out: raise one army as soon as the capital can pay.
        if (self.phase == GamePhase.DESPERATE
                and game_state.get(self.my_capital, {}).get('resources', 0) >= ARMY_COST):
            return adopt("emergency army build", [{"action": "create_army"}])

        # Otherwise pull isolated armies together.
        return self._concentrate_forces(game_state) or None

    def _plan_capture(self, game_state):
        """Plan the capture of the best-ranked target and run the first step.

        The plan builds enough armies to exceed the defender by three (at
        most five builds) and then moves one hop from the capital toward the
        target. Returns ``None`` when there is no target or no step to take.
        """
        ranked = self._find_capturable_targets(game_state)
        if not ranked:
            return None

        target_loc = ranked[0]["target"]
        defenders = game_state.get(target_loc, {}).get('army', 0)

        # Build whatever is missing, capped at 5 to avoid over-planning.
        shortfall = max(0, (defenders + 3) - self.total_army)
        steps = [{"action": "create_army"} for _ in range(min(shortfall, 5))]

        # Then advance a single hop from the capital toward the target.
        first_hop = self._step_toward(self.my_capital, target_loc)
        if first_hop:
            steps.append({"action": "move_army", "from": self.my_capital, "to": first_hop})

        if not steps:
            return None

        goal = f"capture {target_loc}"
        self.planner.set_plan(goal, steps)
        trace.get_current_span().add_event("plan_created", attributes={
            "goal": goal, "step_count": len(steps),
        })
        telemetry.record_plan_created(goal)
        return self._execute_plan_step(game_state)

    def _find_capturable_targets(self, game_state):
        """Find targets we could capture, prioritizing low-defense neutrals for income."""
        targets = []
        for loc in MAP_GRAPH:
            loc_data = game_state.get(loc, {})
            if loc_data.get('faction') == self.faction:
                continue
            if self.memory.recently_failed(loc):
                continue

            target_army = loc_data.get('army', 0)
            is_neutral = loc_data.get('faction') == 'neutral'
            strat_value = self.map.strategic_values.get(loc, 0)

            # Find best staging location (closest of our territories)
            best_staging = None
            best_staging_dist = 99
            for our_loc in self.my_territories:
                dist = self.map.distance(our_loc, loc)
                if dist < best_staging_dist:
                    best_staging_dist = dist
                    best_staging = our_loc

            path_enemy = self.map.path_army_estimate(
                game_state, best_staging, loc, self.faction
            ) if best_staging else 999

            targets.append({
                "target": loc,
                "staging": best_staging,
                "target_army": target_army,
                "path_enemy": path_enemy,
                "is_neutral": is_neutral,
                "strategic_value": strat_value,
                "distance": best_staging_dist,
            })

        # Sort: neutrals first, then by lowest defense, then by strategic value
        targets.sort(key=lambda t: (
            not t['is_neutral'],
            t['target_army'],
            -t['strategic_value'],
        ))

        return targets

    def _plan_all_out_attack(self, game_state):
        """Plan an all-out attack if feasible (expected remaining army > 2)."""
        path_enemy = self.map.path_army_estimate(
            game_state, self.my_capital, self.enemy_capital, self.faction
        )
        expected_remaining = self.total_army - path_enemy
        if expected_remaining > 2:
            goal = "all-out attack on enemy capital"
            steps = [{"action": "all_out_attack"}]
            self.planner.set_plan(goal, steps)
            trace.get_current_span().add_event("plan_created", attributes={
                "goal": goal, "step_count": len(steps),
            })
            telemetry.record_plan_created(goal)
            return self._execute_plan_step(game_state)
        return None

    def _concentrate_forces(self, game_state):
        """Move isolated friendly armies toward threats or strategic hub (V5)."""
        target_loc = self.memory.enemy_push_direction or "village_5"

        for loc in MAP_GRAPH:
            if loc == self.my_capital:
                continue
            loc_data = game_state.get(loc, {})
            if loc_data.get('faction') == self.faction and loc_data.get('army', 0) > 0:
                # Check if this army is isolated (no enemy neighbors)
                has_enemy_neighbor = False
                for n in MAP_GRAPH[loc]:
                    n_data = game_state.get(n, {})
                    if n_data.get('faction') not in (self.faction, 'neutral'):
                        has_enemy_neighbor = True
                        break

                if not has_enemy_neighbor:
                    next_hop = self._step_toward(loc, target_loc)
                    if next_hop and next_hop != loc:
                        n_data = game_state.get(next_hop, {})
                        if n_data.get('faction') == self.faction or n_data.get('army', 0) < loc_data['army']:
                            return {
                                "action": "move_army",
                                "from": loc,
                                "to": next_hop,
                                "reason": f"concentrate forces from {loc} toward {target_loc}"
                            }
        return None

    # ── Priority 6: Fallback ───────────────────────────────────────────────────

    def _fallback(self, game_state):
        """Collect resources at capital."""
        return {
            "action": "collect_resources",
            "location": self.my_capital,
            "reason": "fallback: collect resources"
        }

    # ── Adaptive Timing ────────────────────────────────────────────────────────

    def get_pause_time(self):
        """Return a random number of seconds to wait before the next decision.

        The range is 2-5 when desperate or after a recent territory loss,
        3-8 when ready to attack, and 5-15 otherwise.
        """
        under_pressure = (
            self.phase == GamePhase.DESPERATE
            or self.memory.territory_lost_recently()
        )
        if under_pressure:
            low, high = 2, 5
        elif self.phase == GamePhase.READY_TO_ATTACK:
            low, high = 3, 8
        else:
            low, high = 5, 15
        return random.randint(low, high)


# ─── White Walkers AI ─────────────────────────────────────────────────────────

class WhiteWalkerAI(StrategicAI):
    """Single-player opponent on the White Walkers Attack map.

    Economy: corpses, not resources. Corpses come from winning battles and
    passive generation at the fortress. Army units cost
    ``ARMY_COST_BY_MAP["white_walkers_attack"]["white_walkers"]`` corpses.

    Priority cascade (replaces ``StrategicAI.decide``):

      1. Defend the fortress when a hostile (non-barbarian) neighbour
         outnumbers the garrison, by pulling in the strongest owned wall
         that is adjacent to the fortress.
      2. Capture any wall that the White Walkers do not already control,
         preferring the wall that needs the fewest attacking troops to beat
         its 2× defender multiplier.
      3. Reinforce the weakest White Walker-held wall.
      4. Raid a still-barbarian village that some White Walker garrison
         strictly outnumbers, preferring the target with the largest troop
         margin — a clean harvest for corpses.
      5. If corpses are at or above the army cost and the fortress is still
         ours, raise a new undead unit (the garrison may be zero).
      6. No-op fallback (corpse stream keeps flowing via the passive tick).
    """

    def decide(self, game_state):
        """Run the priority cascade and return the first action it yields.

        Refreshes the bookkeeping the decision loop reports on
        (``my_territories``, ``enemy_territories``, ``total_army``,
        ``phase``), tags the current span, then tries each cascade helper in
        order. ``self._last_evaluated`` records which helpers were skipped
        and which one triggered.

        Args:
            game_state: mapping of location id -> location data dict (the
                helpers read its ``faction`` and ``army`` keys).

        Returns:
            An action dict; the ``noop`` fallback when no helper triggers.
        """
        self.my_territories, self.enemy_territories = self.memory.update(
            game_state, self.faction
        )
        self.total_army = sum(
            data.get('army', 0) for loc, data in game_state.items()
            if data.get('faction') == self.faction
        )
        self.phase = PhaseDetector.detect(
            self.my_territories, self.enemy_territories, self.total_army
        )

        span = trace.get_current_span()
        span.set_attribute("ai.variant", "white_walkers")
        span.set_attribute("game.map.id", self.map_id)

        corpses = fetch_faction_corpses(self.faction)
        span.set_attribute("ai.corpse_pool", corpses)

        # (label, thunk) pairs in priority order. Thunks keep evaluation
        # lazy: a helper only runs when every earlier one declined.
        cascade = (
            ("defend_fortress", lambda: self._defend_fortress(game_state)),
            ("capture_wall", lambda: self._capture_unowned_wall(game_state)),
            ("reinforce_wall", lambda: self._reinforce_weakest_wall(game_state)),
            ("raid_barbarian", lambda: self._raid_barbarian(game_state)),
            ("raise_army", lambda: self._raise_army_from_corpses(game_state, corpses)),
        )

        evaluated = []
        chosen = None
        for label, attempt in cascade:
            chosen = attempt()
            if chosen:
                evaluated.append(f"{label}: TRIGGERED ({chosen.get('reason', '')})")
                break
            evaluated.append(f"{label}: skipped")

        # Only published once the cascade finished, so an exception in a
        # helper leaves the previous cycle's record untouched.
        self._last_evaluated = evaluated
        return chosen if chosen else self._passive_fallback()

    # ── Cascade helpers ───────────────────────────────────────────────────────

    def _defend_fortress(self, game_state):
        """Pull an adjacent wall garrison back when the fortress is outnumbered.

        Returns a ``move_army`` action from the strongest owned wall that
        neighbours the fortress, or ``None`` when the fortress is not ours,
        no hostile neighbour outnumbers the garrison, or no adjacent owned
        wall has troops.
        """
        cap_data = game_state.get(self.my_capital, {})
        if not cap_data or cap_data.get('faction') != self.faction:
            return None

        garrison = cap_data.get('army', 0)
        # Materialised because the neighbour set is consulted twice below.
        capital_neighbours = list(self.map.neighbors(self.my_capital))

        max_threat = 0
        threat_loc = None
        for n in capital_neighbours:
            n_data = game_state.get(n, {})
            n_faction = n_data.get('faction')
            # Barbarians are not counted as a threat to the fortress.
            if n_faction and n_faction != self.faction and n_faction != 'barbarian':
                if n_data.get('army', 0) > max_threat:
                    max_threat = n_data['army']
                    threat_loc = n
        if max_threat == 0 or max_threat <= garrison:
            return None

        # Pull back from the strongest adjacent wall we own (if any). Walls
        # that do not neighbour the fortress are excluded: every other move
        # in this AI is issued one hop at a time, so a non-adjacent source
        # would produce a move that cannot land on the fortress.
        adjacent = set(capital_neighbours)
        best_source = None
        best_army = 0
        for wall in self._walls():
            if wall not in adjacent:
                continue
            w_data = game_state.get(wall, {})
            if w_data.get('faction') == self.faction and w_data.get('army', 0) > best_army:
                best_source = wall
                best_army = w_data['army']
        if best_source:
            return {
                "action": "move_army",
                "from": best_source,
                "to": self.my_capital,
                "reason": f"defend fortress vs {threat_loc} ({max_threat} army)",
            }
        return None

    def _capture_unowned_wall(self, game_state):
        """Advance toward the cheapest wall we do not hold.

        A wall is a candidate only when some friendly location already has
        the ``2 * defender + 1`` troops needed and a usable next hop toward
        the wall exists. Returns a ``move_army`` action for the candidate
        with the lowest troop requirement, or ``None``.
        """
        best = None
        best_cost = float("inf")
        for wall in self._walls():
            w_data = game_state.get(wall, {})
            if w_data.get('faction') == self.faction:
                continue
            defender = w_data.get('army', 0)
            # Wall multiplier = 2 — must exceed 2 * defender to take it.
            needed = 2 * defender + 1
            source, _source_army = self._nearest_source_with_army(game_state, wall, needed)
            if source is None:
                continue
            next_hop = self._step_toward(source, wall)
            # Same guard the concentrate-forces logic applies: skip when no
            # hop is found or the hop would be a self-move.
            if not next_hop or next_hop == source:
                continue
            if needed < best_cost:
                best_cost = needed
                best = (source, next_hop, wall, defender)
        if best is None:
            return None
        source, next_hop, wall, defender = best
        return {
            "action": "move_army",
            "from": source,
            "to": next_hop,
            "reason": f"capture {wall} (defender {defender}, needed {best_cost})",
        }

    def _reinforce_weakest_wall(self, game_state):
        """Move a neighbouring garrison onto our weakest wall.

        Non-capital friendly neighbours with more than one troop are used
        first; the capital is only used when it holds more than the wall's
        army plus one. Returns a ``move_army`` action or ``None``.
        """
        mine = [
            (w, game_state.get(w, {}).get('army', 0))
            for w in self._walls()
            if game_state.get(w, {}).get('faction') == self.faction
        ]
        if not mine:
            return None
        weakest, weakest_army = min(mine, key=lambda item: item[1])

        # Prefer non-capital neighbours so corpse-driven army production at
        # the capital isn't drained on every tick. Capital is a fallback
        # below — without it the AI gets stuck post-capture, since
        # ``move_army`` moves *all* army, leaving walls at 0 and capital as
        # the only source.
        capital_neighbour = None
        for n in self.map.neighbors(weakest):
            n_data = game_state.get(n, {})
            if n_data.get('faction') != self.faction:
                continue
            n_army = n_data.get('army', 0)
            if n_army <= 1:
                continue
            if n == self.my_capital:
                capital_neighbour = (n, n_army)
                continue
            return {
                "action": "move_army",
                "from": n,
                "to": weakest,
                "reason": f"reinforce {weakest} from {n}",
            }

        # Capital fallback. Only fire if (a) the capital has more than the
        # weakest wall (otherwise it's not really reinforcing) and (b) the
        # capital has enough to spare — leaving 0 garrison is fine because
        # ``_raise_army_from_corpses`` does not require a non-zero garrison.
        if capital_neighbour is not None:
            cap_loc, cap_army = capital_neighbour
            if cap_army > weakest_army + 1:
                return {
                    "action": "move_army",
                    "from": cap_loc,
                    "to": weakest,
                    "reason": f"reinforce {weakest} from capital ({cap_army} → wall {weakest_army})",
                }
        return None

    def _raid_barbarian(self, game_state):
        """Advance on the barbarian village we outnumber by the widest margin.

        Targets are villages that started as barbarian and are still
        barbarian-held. A target qualifies only when some friendly location
        holds at least ``defender + 1`` troops and a usable next hop toward
        it exists. Returns a ``move_army`` action or ``None``.
        """
        # Hoisted: the initial-faction table does not change per location.
        initial_factions = get_initial_factions(self.map_id)
        targets = [
            loc for loc, t in get_location_types(self.map_id).items()
            if t == "village"
            and initial_factions.get(loc) == "barbarian"
            and game_state.get(loc, {}).get('faction') == "barbarian"
        ]
        if not targets:
            return None

        best = None
        best_margin = -1
        for target in targets:
            defender = game_state.get(target, {}).get('army', 0)
            source, source_army = self._nearest_source_with_army(
                game_state, target, defender + 1
            )
            if source is None:
                continue
            next_hop = self._step_toward(source, target)
            # Skip targets we cannot actually step toward.
            if not next_hop or next_hop == source:
                continue
            margin = source_army - defender
            if margin > best_margin:
                best_margin = margin
                best = (source, next_hop, target, defender)
        if best is None:
            return None
        source, next_hop, target, defender = best
        return {
            "action": "move_army",
            "from": source,
            "to": next_hop,
            "reason": f"raid {target} (defender {defender}) for corpses",
        }

    def _raise_army_from_corpses(self, game_state, corpses):
        """Raise one undead unit at the fortress when the corpse pool allows.

        Returns a ``create_army`` action when the fortress is still ours and
        ``corpses`` is at least ``self.army_cost``; otherwise ``None``.
        """
        # Capital must still belong to us — if NW captured it the AI has
        # soft-lost. The earlier `army >= 1` gate has been dropped: it
        # blocked the AI's primary economic loop after every capital→wall
        # reinforcement (move_army drains the source to 0), leaving the AI
        # idle until corpses overflowed.
        cap_data = game_state.get(self.my_capital, {})
        if cap_data.get('faction') != self.faction:
            return None
        if corpses < self.army_cost:
            return None
        return {
            "action": "create_army",
            "location": self.my_capital,
            "count": 1,
            "reason": f"raise undead ({corpses} corpses, cost {self.army_cost})",
        }

    def _passive_fallback(self):
        """Return the ``noop`` action used when no cascade helper triggers."""
        # No-op for White Walkers: the passive corpse tick handles "idle".
        return {
            "action": "noop",
            "reason": "passive: corpses accumulate at fortress",
        }

    # ── Utility ───────────────────────────────────────────────────────────────

    def _walls(self):
        """Return the ids of every location typed ``"wall"`` on this map."""
        types = get_location_types(self.map_id)
        return [loc for loc, t in types.items() if t == "wall"]

    def _nearest_source_with_army(self, game_state, target, needed):
        """Return the (location_id, army) of the closest friendly node with
        at least ``needed`` troops, or ``(None, 0)`` if nothing qualifies.

        Distance comes from ``self.map.distance``; on ties the first
        qualifying location in ``game_state`` iteration order wins.
        """
        best = (None, 0)
        best_dist = float("inf")
        for loc, data in game_state.items():
            if data.get('faction') != self.faction:
                continue
            if data.get('army', 0) < needed:
                continue
            dist = self.map.distance(loc, target)
            if dist < best_dist:
                best = (loc, data.get('army', 0))
                best_dist = dist
        return best


# ─── AI State ──────────────────────────────────────────────────────────────────

class AIState:
    """Mutable record of the AI session hosted by this process.

    A fresh instance represents "no AI running": no faction chosen, the
    default map id, no strategy object, no worker thread, and a stop flag
    that is not set.
    """

    def __init__(self):
        # Worker-thread plumbing.
        self.stop_flag = threading.Event()
        self.decision_thread = None

        # Strategy object driving decisions (None until one is assigned).
        self.strategic_ai = None

        # Session timestamps.
        self.game_start_time = None
        self.last_action_time = None

        # Session identity / lifecycle.
        self.active = False
        self.map_id = "war_of_kingdoms"
        self.faction = None


# Module-level singleton shared by the decision loop and the HTTP handlers.
ai_state = AIState()

# ─── Preserved Helpers ─────────────────────────────────────────────────────────

def get_location_url(location_id):
    """Build the base URL of the service that hosts ``location_id``.

    Container hostnames in docker-compose are the stable WoK names
    (``southern-capital``, ``village-1`` …). On WWA the *logical* location id
    differs (``wall_west`` → still lives on container ``village-1``), so
    inside Docker the host is looked up in ``CONTAINER_FOR_LOCATION_ID`` and
    only falls back to the hyphenated location id when there is no entry.
    Outside Docker (``IN_DOCKER`` unset or empty) the host is ``localhost``.

    Raises:
        KeyError: if ``location_id`` has no entry in ``LOCATION_PORTS``.
    """
    host = 'localhost'
    if os.environ.get('IN_DOCKER'):
        hyphenated = location_id.replace('_', '-')
        host = CONTAINER_FOR_LOCATION_ID.get(location_id, hyphenated)

    return f"http://{host}:{LOCATION_PORTS[location_id]}"


def fetch_faction_corpses(faction):
    """Return the faction's corpse pool as an int, or 0 if anything fails.

    Queries ``/faction_economy`` on the southern-capital service (port 5001);
    the original author notes any container would do because the DB is
    shared. Every failure (connection, HTTP status, bad payload) is
    deliberately swallowed and reported as 0.
    """
    try:
        # Inside Docker the service is reached by container name.
        host = "southern-capital" if os.environ.get('IN_DOCKER') else "localhost"
        reply = requests.get(
            f"http://{host}:5001/faction_economy",
            params={"faction": faction},
            timeout=2,
        )
        reply.raise_for_status()
        payload = reply.json()
        return int(payload.get("corpses", 0))
    except Exception:
        return 0

def make_api_request(location_id, endpoint, method='GET', data=None, timeout=10):
    """Call a location server inside a CLIENT span, propagating trace context.

    Args:
        location_id: logical location id; resolved via ``get_location_url``.
        endpoint: path appended to the location's base URL (may be ``''``).
        method: ``'GET'`` issues a GET; any other value issues a POST.
        data: JSON body for POST requests.
        timeout: seconds passed to ``requests`` as the request timeout.
            ``requests`` never times out by default, so without this a hung
            location service would block the calling thread indefinitely.

    Returns:
        The decoded JSON response on success, or ``{"error": <message>}``
        when the request raised a ``requests.RequestException`` (including
        timeouts and non-2xx statuses).
    """
    url = f"{get_location_url(location_id)}/{endpoint}"
    headers = {"Content-Type": "application/json"}

    with tracer.start_as_current_span(
        "ai_api_request",
        kind=SpanKind.CLIENT,
        attributes={
            "location.id": location_id,
            "location.endpoint": endpoint,
            "http.method": method
        }
    ) as span:
        inject(headers)  # Inject trace context

        try:
            if method == 'GET':
                response = requests.get(url, headers=headers, timeout=timeout)
            else:  # POST
                response = requests.post(url, json=data, headers=headers, timeout=timeout)

            span.set_attribute("http.status_code", response.status_code)
            response.raise_for_status()
            result = response.json()

            # A 2xx reply can still carry an application-level failure.
            if not result.get("success", True):
                span.set_status(trace.StatusCode.ERROR, result.get("message", "Unknown error"))

            return result
        except requests.RequestException as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            logger.error("API request failed", extra={"error": str(e)})
            return {"error": str(e)}

def get_game_state(parent_ctx):
    """Fetch every location on the active map and return ``{id: data}``.

    The location ids come from the active AI's map graph, or from the
    default ``MAP_GRAPH`` when no AI has been initialised yet. Locations
    whose request returned an ``error`` payload are left out of the result
    and recorded as span events; the span is marked ERROR if any failed.
    """
    active_ai = ai_state.strategic_ai
    if active_ai is None:
        graph = MAP_GRAPH
    else:
        graph = get_map_graph(active_ai.map_id)
    location_ids = list(graph.keys())

    with tracer.start_as_current_span(
        "get_game_state",
        kind=SpanKind.INTERNAL,
        context=parent_ctx,
        attributes={"location_count": len(location_ids)}
    ) as span:
        game_state = {}
        failures = 0

        for location_id in location_ids:
            payload = make_api_request(location_id, '')
            if 'error' in payload:
                failures += 1
                span.add_event(
                    "location_fetch_error",
                    attributes={
                        "location": location_id,
                        "error": str(payload.get('error', 'Unknown error'))
                    }
                )
                continue
            game_state[location_id] = payload

        span.set_attribute("locations_retrieved", len(game_state))
        span.set_attribute("errors", failures)

        if failures > 0:
            span.set_status(trace.StatusCode.ERROR, f"Failed to fetch {failures} locations")

        return game_state

# ─── Action Executor ───────────────────────────────────────────────────────────

def execute_strategic_action(action, game_state, parent_ctx, decision_link=None):
    """Execute an action returned by StrategicAI.decide().

    Args:
        action: action dict with an ``"action"`` type and type-specific
            keys (``location``/``count``, ``from``/``to``, ``locations``).
            A falsy value is ignored.
        game_state: current game state; not consulted here, kept for
            interface stability.
        parent_ctx: OpenTelemetry context used as the span's parent.
        decision_link: optional span context of the decision that produced
            the action; attached as a span link when given.

    Any exception raised while executing is recorded on the span and
    logged, never propagated.
    """
    if not action:
        return

    action_type = action.get("action")
    reason = action.get("reason", "")

    links = []
    if decision_link:
        links = [Link(decision_link, attributes={"link.type": "ai_decision_trigger"})]

    with tracer.start_as_current_span(
        "execute_ai_action",
        kind=SpanKind.INTERNAL,
        context=parent_ctx,
        links=links,
        attributes={
            "action_type": action_type,
            "reason": reason,
        }
    ) as span:
        try:
            if action_type == "create_army":
                location = action.get("location", ai_state.strategic_ai.my_capital)
                count = action.get("count", 1)
                armies_created = 0
                for i in range(count):
                    result = make_api_request(location, 'create_army', method='POST')
                    if result.get('success'):
                        armies_created += 1
                        logger.info("AI created army", extra={"army_number": armies_created, "total_requested": count, "reason": reason})
                    else:
                        # The key must not be "message": logging reserves
                        # that name on LogRecord and raises KeyError, which
                        # would turn every failed creation into an
                        # "Error executing AI action" and skip the span
                        # attributes below.
                        logger.warning("Failed to create army", extra={"failure_message": result.get('message', 'unknown')})
                        break
                    # Brief gap between consecutive creations (not after the last).
                    if i < count - 1:
                        time.sleep(0.5)
                span.set_attribute("armies_created", armies_created)
                span.set_attribute("armies_requested", count)

            elif action_type == "move_army":
                from_loc = action["from"]
                to_loc = action["to"]
                result = make_api_request(
                    from_loc,
                    'move_army',
                    method='POST',
                    data={"target_location": to_loc}
                )
                success = result.get('success', False)
                span.set_attribute("from_location", from_loc)
                span.set_attribute("target_location", to_loc)
                span.set_attribute("move_success", success)
                logger.info("AI move army", extra={"from_location": from_loc, "to_location": to_loc, "reason": reason, "success": success})
                if not success:
                    ai_state.strategic_ai.memory.record_failed_attack(to_loc)

            elif action_type == "all_out_attack":
                location = action.get("location", ai_state.strategic_ai.my_capital)
                result = make_api_request(location, 'all_out_attack', method='POST')
                span.set_attribute("all_out_attack", True)
                logger.info("AI all-out attack", extra={"location": location, "reason": reason})

            elif action_type == "collect_resources":
                location = action.get("location", ai_state.strategic_ai.my_capital)
                result = make_api_request(location, 'collect_resources', method='POST')
                logger.info("AI collected resources", extra={"location": location, "reason": reason})

            elif action_type == "resource_transfer":
                locations = action.get("locations", [])
                for loc in locations:
                    result = make_api_request(loc, 'send_resources_to_capital', method='POST')
                    logger.info("AI transferred resources", extra={"from_location": loc})
                span.set_attribute("transfers_count", len(locations))

            elif action_type == "noop":
                # WhiteWalkerAI uses ``noop`` as a quiet-tick fallback when
                # corpses are accruing but no actionable move exists. Still
                # emit a span so replay shows the AI was awake but chose not
                # to act.
                span.set_attribute("ai.cycle.idle", True)
                logger.debug("AI idle cycle", extra={"reason": reason})

            else:
                # Previously an unrecognised type fell through silently;
                # surface it so a typo in a strategy is visible.
                span.set_attribute("ai.action.unknown", True)
                logger.warning("Unknown AI action type", extra={"action_type": str(action_type), "reason": reason})

        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            logger.error("Error executing AI action", extra={"error": str(e), "action_type": action_type})

# ─── Decision Loop ─────────────────────────────────────────────────────────────

def ai_decision_loop():
    """Main AI decision loop that runs in a separate thread.

    Each cycle, inside an ``ai_decision_cycle`` span: fetch the game state,
    stop if the capital is missing from it or no longer ours, ask the
    strategy object for an action, execute it, then wait for the
    strategy's adaptive pause. The loop ends when ``ai_state.active`` is
    cleared or ``ai_state.stop_flag`` is set. Exceptions inside a cycle are
    recorded and followed by a 5-second back-off.
    """
    logger.info("AI decision loop started", extra={"faction": ai_state.faction})

    decision_count = 0

    while ai_state.active and not ai_state.stop_flag.is_set():
        decision_count += 1

        cycle_attributes = {
            "faction": ai_state.faction,
            "game_phase": ai_state.strategic_ai.phase.value if ai_state.strategic_ai else "unknown",
            "cycle_number": decision_count,
            "cycle_start": datetime.now().isoformat(),
        }
        # Span attribute values may not be None, so the key is only added
        # when a session start time exists.
        if ai_state.game_start_time:
            cycle_attributes["session_start"] = ai_state.game_start_time.isoformat()

        with tracer.start_as_current_span(
            "ai_decision_cycle",
            kind=SpanKind.INTERNAL,
            attributes=cycle_attributes
        ) as cycle_span:
            parent_ctx = baggage.set_baggage("context", "parent")
            cycle_start_time = time.time()
            try:
                # Get current game state
                game_state = get_game_state(parent_ctx)
                my_capital = ai_state.strategic_ai.my_capital

                # Check if game is over
                if my_capital not in game_state or game_state[my_capital].get('faction') != ai_state.faction:
                    logger.info("AI detected game over", extra={"faction": ai_state.faction, "cycle_number": decision_count})
                    cycle_span.set_attribute("game_over_detected", True)
                    cycle_span.set_attribute("final_cycle", True)
                    ai_state.active = False
                    break

                # Make decision using StrategicAI
                decision_context = None
                with tracer.start_as_current_span(
                    "ai_decision",
                    kind=SpanKind.INTERNAL,
                    context=parent_ctx,
                    attributes={"game_phase": ai_state.strategic_ai.phase.value}
                ) as decision_span:
                    action = ai_state.strategic_ai.decide(game_state)
                    decision_context = decision_span.get_span_context()

                    if action:
                        decision_span.set_attribute("chosen_action", action.get("action", "none"))
                        decision_span.set_attribute("reason", action.get("reason", ""))

                    # Strategic context on spans
                    decision_span.set_attribute("my_territories", str(list(ai_state.strategic_ai.my_territories)))
                    decision_span.set_attribute("enemy_territories", str(list(ai_state.strategic_ai.enemy_territories)))
                    decision_span.set_attribute("total_army", ai_state.strategic_ai.total_army)
                    decision_span.set_attribute("game_phase", ai_state.strategic_ai.phase.value)
                    decision_span.set_attribute("priorities_evaluated", str(ai_state.strategic_ai._last_evaluated))

                if action:
                    action_type = action.get("action", "none")
                    telemetry.record_decision(action_type, ai_state.strategic_ai.phase.value)
                    execute_strategic_action(action, game_state, parent_ctx, decision_link=decision_context)
                    ai_state.last_action_time = datetime.now()
                    cycle_span.set_attribute("action_executed", True)
                    cycle_span.set_attribute("action_type", action_type)
                else:
                    cycle_span.set_attribute("no_action_taken", True)

                cycle_span.set_attribute("cycle_complete", True)

                # Session metrics
                if ai_state.game_start_time:
                    elapsed_time = (datetime.now() - ai_state.game_start_time).total_seconds()
                    cycle_span.set_attribute("session_elapsed_seconds", elapsed_time)

                # Record cycle duration
                telemetry.record_cycle_duration(time.time() - cycle_start_time)

                # Adaptive pause
                pause_time = ai_state.strategic_ai.get_pause_time()
                cycle_span.set_attribute("pause_duration_seconds", pause_time)
                logger.info("AI waiting", extra={"pause_seconds": pause_time, "phase": ai_state.strategic_ai.phase.value})

                # Event.wait returns True as soon as the stop flag is set,
                # so deactivation interrupts the pause immediately.
                if ai_state.stop_flag.wait(pause_time):
                    cycle_span.set_attribute("interrupted", True)
                    break

                if not ai_state.active:
                    cycle_span.set_attribute("ai_deactivated", True)
                    break

            except Exception as e:
                cycle_span.record_exception(e)
                cycle_span.set_status(trace.StatusCode.ERROR, str(e))
                logger.error("Error in AI decision cycle", extra={"error": str(e), "cycle_number": decision_count})
                # Back off before retrying, but stay interruptible: a plain
                # sleep would ignore the stop flag and could outlast the
                # 5-second join in the deactivate handler.
                if ai_state.stop_flag.wait(5):
                    cycle_span.set_attribute("interrupted", True)
                    break

# ─── Flask Endpoints ───────────────────────────────────────────────────────────

@app.route('/activate', methods=['POST'])
def activate_ai():
    """Activate the AI for a specific faction on a specific map.

    Accepts ``{"faction": ..., "map_id": ...}``. Defaults to
    War of Kingdoms when ``map_id`` is omitted (backward compat).
    Dispatches to ``WhiteWalkerAI`` when the requested faction is
    ``white_walkers``; otherwise uses the classic ``StrategicAI``.

    Responds 400 when the faction or map is unknown, the faction does not
    exist on the map, an AI is already active, or the previous session's
    decision thread has not finished shutting down yet.
    """
    data = request.get_json() or {}
    faction = data.get('faction')
    map_id = data.get('map_id', 'war_of_kingdoms')

    valid_factions = set()
    for m in CAPITALS_BY_MAP.values():
        valid_factions.update(m.keys())
    if faction not in valid_factions:
        return jsonify({"success": False, "message": "Invalid faction"}), 400

    if map_id not in MAP_GRAPHS_BY_MAP:
        return jsonify({"success": False, "message": f"Unknown map_id: {map_id}"}), 400

    if faction not in get_capitals(map_id):
        return jsonify({
            "success": False,
            "message": f"Faction {faction} is not valid on map {map_id}"
        }), 400

    if ai_state.active:
        return jsonify({"success": False, "message": "AI already active"}), 400

    # A previous loop may outlive deactivate's bounded join. Clearing the
    # stop flag while it is still alive would let it resume alongside the
    # new thread, so refuse until it has exited.
    previous_thread = ai_state.decision_thread
    if previous_thread is not None and previous_thread.is_alive():
        return jsonify({
            "success": False,
            "message": "Previous AI session is still shutting down"
        }), 400

    # Build the strategy object before touching shared state: if the
    # constructor raises, ai_state must not be left marked active with no
    # decision thread behind it.
    ai_class = WhiteWalkerAI if faction == "white_walkers" else StrategicAI
    strategic_ai = ai_class(faction, map_id=map_id)

    ai_state.faction = faction
    ai_state.map_id = map_id
    ai_state.strategic_ai = strategic_ai
    ai_state.game_start_time = datetime.now()
    ai_state.stop_flag.clear()
    ai_state.active = True

    # Register state callback for observable gauges
    telemetry.set_state_callback(lambda: {
        "territory_count": len(ai_state.strategic_ai.my_territories),
        "total_army": ai_state.strategic_ai.total_army,
        "faction": ai_state.faction or "unknown",
    } if ai_state.strategic_ai else None)

    # Corpse-pool gauge: only meaningful for White Walkers. For other AIs
    # the callback returns None so the gauge stays unobserved.
    def _corpse_cb():
        if ai_state.faction == "white_walkers":
            return ("white_walkers", fetch_faction_corpses("white_walkers"))
        return None
    telemetry.set_corpse_callback(_corpse_cb)

    # Start AI decision thread
    ai_state.decision_thread = threading.Thread(target=ai_decision_loop, daemon=True)
    ai_state.decision_thread.start()

    logger.info(
        "AI activated",
        extra={"faction": faction, "map_id": map_id, "variant": type(ai_state.strategic_ai).__name__},
    )
    return jsonify({
        "success": True,
        "message": f"AI activated for {faction} faction on {map_id}",
        "map_id": map_id,
        "variant": type(ai_state.strategic_ai).__name__,
    })

@app.route('/deactivate', methods=['POST'])
def deactivate_ai():
    """Stop the running AI session.

    Responds 400 when no AI is active. Otherwise clears the active flag,
    sets the stop flag, and waits up to 5 seconds for the decision thread
    to finish before reporting success.
    """
    if ai_state.active:
        ai_state.active = False
        ai_state.stop_flag.set()

        # Bounded join: the response is not held up if the worker is slow.
        if ai_state.decision_thread:
            ai_state.decision_thread.join(timeout=5)

        logger.info("AI deactivated", extra={"faction": ai_state.faction})
        return jsonify({"success": True, "message": "AI deactivated"})

    return jsonify({"success": False, "message": "AI not active"}), 400

@app.route('/status', methods=['GET'])
def ai_status():
    """Report whether the AI is active, its faction, last action time and phase.

    ``last_action`` is an ISO-8601 string or null. ``game_phase`` is only
    populated while the AI is active and a strategy object exists.
    """
    last_action = None
    if ai_state.last_action_time:
        last_action = ai_state.last_action_time.isoformat()

    game_phase = None
    if ai_state.active and ai_state.strategic_ai:
        game_phase = ai_state.strategic_ai.phase.value

    return jsonify({
        "active": ai_state.active,
        "faction": ai_state.faction,
        "last_action": last_action,
        "game_phase": game_phase
    })

@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: always answers with a static ``healthy`` payload."""
    payload = {"status": "healthy"}
    return jsonify(payload)

if __name__ == '__main__':
    # Listen on every interface; the port comes from $PORT (default 8081).
    port = int(os.environ.get('PORT', 8081))
    app.run(debug=False, port=port, host='0.0.0.0')


================================================
FILE: game-of-tracing/ai_opponent/requirements.txt
================================================
flask==3.1.3
requests==2.33.1
opentelemetry-api==1.41.1
opentelemetry-sdk==1.41.1
opentelemetry-exporter-otlp==1.41.1
pyroscope-io==1.0.6
pyroscope-otel==1.0.0


================================================
FILE: game-of-tracing/ai_opponent/telemetry.py
================================================
import os

from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry import trace

# Logging setup
import logging
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry._logs import set_logger_provider

# Metrics setup
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.metrics import TraceBasedExemplarFilter
from opentelemetry.metrics import CallbackOptions, Observation
from typing import Iterable

# Profiling setup (Pyroscope v2 + OTel span-profile linking)
import pyroscope
from pyroscope.otel import PyroscopeSpanProcessor

class AITelemetry:
    """OpenTelemetry and Pyroscope wiring for the AI opponent service.

    Constructing an instance configures, in order: OTLP log export, OTLP trace
    export, OTLP metric export (including the AI-specific instruments) and
    Pyroscope profiling. The logger, tracer and meter providers are registered
    globally through the OpenTelemetry API.

    Args:
        service_name: Value of the ``SERVICE_NAME`` resource attribute; also
            used as the Python logger name and the Pyroscope application name.
        logging_endpoint: Base URL for the OTLP/HTTP log exporter;
            ``/v1/logs`` is appended.
        tracing_endpoint: Base URL for the OTLP gRPC span exporter;
            ``/v1/traces`` is appended.
        metrics_endpoint: Base URL for the OTLP/HTTP metric exporter;
            ``/v1/metrics`` is appended.
    """

    def __init__(
        self,
        service_name="ai-opponent",
        logging_endpoint="http://alloy:4318",
        tracing_endpoint="http://alloy:4317",
        metrics_endpoint="http://alloy:4318",
    ):
        self.service_name = service_name
        self.logging_endpoint = logging_endpoint
        self.tracing_endpoint = tracing_endpoint
        self.metrics_endpoint = metrics_endpoint
        # Data sources for the observable gauges. Both start unset and are
        # wired later via set_state_callback() / set_corpse_callback(); they
        # must exist before _setup_metrics() registers the gauge callbacks.
        self._state_callback = None
        self._corpse_callback = None
        self.resource = Resource.create(attributes={
            SERVICE_NAME: service_name,
            "ai.difficulty": "normal",
            "ai.version": "1.0"
        })

        self._setup_logging()
        self._setup_tracing()
        self._setup_metrics()
        # Profiling goes last among the trace-related steps: it attaches a
        # span processor to the tracer provider installed by _setup_tracing().
        self._setup_profiling()

    @staticmethod
    def _log_suppressed(what):
        """Log the exception currently being handled without re-raising it.

        Must be called from inside an ``except`` block. The record is emitted
        at DEBUG level, which is below the INFO level ``_setup_logging`` sets
        on the root logger, so telemetry stays best-effort and quiet by
        default while failures become visible once the level is lowered.
        """
        logging.getLogger(__name__).debug(
            "%s failed; continuing", what, exc_info=True
        )

    def _setup_logging(self):
        """Configure OpenTelemetry logging and attach it to the root logger."""
        self.logger_provider = LoggerProvider(resource=self.resource)
        set_logger_provider(self.logger_provider)

        log_exporter = OTLPLogExporter(
            endpoint=f"{self.logging_endpoint}/v1/logs"
        )

        self.logger_provider.add_log_record_processor(
            BatchLogRecordProcessor(
                exporter=log_exporter,
                max_queue_size=30,
                max_export_batch_size=5
            )
        )

        # Route every record that reaches the root logger (INFO and above)
        # through the OTLP handler.
        handler = LoggingHandler(
            level=logging.NOTSET,
            logger_provider=self.logger_provider
        )
        root_logger = logging.getLogger()
        root_logger.addHandler(handler)
        root_logger.setLevel(logging.INFO)

        self.logger = logging.getLogger(self.service_name)

    def _setup_tracing(self):
        """Install the global tracer provider and its OTLP gRPC exporter."""
        trace.set_tracer_provider(TracerProvider(resource=self.resource))

        # NOTE(review): this is the gRPC exporter; confirm whether the
        # "/v1/traces" path suffix has any effect on a gRPC endpoint.
        otlp_exporter = OTLPSpanExporter(
            endpoint=f"{self.tracing_endpoint}/v1/traces",
            insecure=True
        )

        # Export batches are capped at a single span.
        span_processor = BatchSpanProcessor(
            span_exporter=otlp_exporter,
            max_export_batch_size=1
        )

        trace.get_tracer_provider().add_span_processor(span_processor)
        self.tracer = trace.get_tracer(__name__)

    def _setup_profiling(self):
        """Configure Pyroscope profiling + OTel span-profile linkage.

        The server address comes from ``PYROSCOPE_SERVER_ADDRESS`` and falls
        back to ``http://alloy:9999``.
        """
        pyroscope.configure(
            application_name=self.service_name,
            server_address=os.getenv("PYROSCOPE_SERVER_ADDRESS", "http://alloy:9999"),
            tags={"service_name": self.service_name},
            oncpu=True,
            gil_only=True,
        )
        trace.get_tracer_provider().add_span_processor(PyroscopeSpanProcessor())

    def _setup_metrics(self):
        """Install the global meter provider and create all AI instruments."""
        self.metric_exporter = OTLPMetricExporter(
            endpoint=f"{self.metrics_endpoint}/v1/metrics"
        )

        # Periodic export every 10 s; collect_metrics() can force one sooner.
        self.metric_reader = PeriodicExportingMetricReader(
            self.metric_exporter,
            export_interval_millis=10000
        )

        self.meter_provider = MeterProvider(
            metric_readers=[self.metric_reader],
            resource=self.resource,
            exemplar_filter=TraceBasedExemplarFilter()
        )
        metrics.set_meter_provider(self.meter_provider)

        self.meter = metrics.get_meter(__name__)

        # Counters
        self._decisions_counter = self.meter.create_counter(
            name="ai.decisions",
            description="Number of AI decisions made",
            unit="1"
        )
        self._plans_created_counter = self.meter.create_counter(
            name="ai.plans_created",
            description="Number of plans created",
            unit="1"
        )
        self._plans_abandoned_counter = self.meter.create_counter(
            name="ai.plans_abandoned",
            description="Number of plans abandoned",
            unit="1"
        )

        # Histogram
        self._cycle_duration_histogram = self.meter.create_histogram(
            name="ai.decision_cycle_duration_seconds",
            description="Duration of AI decision cycles",
            unit="s"
        )

        # Observable gauges
        self.meter.create_observable_gauge(
            name="ai.territory_count",
            description="Number of territories controlled by faction",
            callbacks=[self._observe_territory_count],
            unit="1"
        )
        self.meter.create_observable_gauge(
            name="ai.total_army",
            description="Total army size for faction",
            callbacks=[self._observe_total_army],
            unit="1"
        )

        # White Walkers Attack metrics (additive; only populate when the
        # relevant callback is wired).
        self._walls_captured_counter = self.meter.create_counter(
            name="ai.walls_captured",
            description="Number of wall keeps captured by this AI variant",
            unit="1",
        )
        self.meter.create_observable_gauge(
            name="ai.corpse_pool",
            description="White Walker corpse pool (cost pool for raising armies)",
            callbacks=[self._observe_corpse_pool],
            unit="1",
        )

    def _observe_state_value(self, key) -> Iterable[Observation]:
        """Yield one observation for ``state[key]``, tagged with the faction.

        Yields nothing when no state callback is registered, when the callback
        returns a falsy state, or when reading the state raises (the failure
        is logged at DEBUG level rather than propagated into the SDK).
        """
        if not self._state_callback:
            return
        try:
            state = self._state_callback()
            if state:
                yield Observation(
                    value=state[key],
                    attributes={"faction": state["faction"]}
                )
        except Exception:
            self._log_suppressed(f"gauge callback for state key '{key}'")

    def _observe_territory_count(self, options: CallbackOptions) -> Iterable[Observation]:
        """Callback for territory count observable gauge"""
        yield from self._observe_state_value("territory_count")

    def _observe_total_army(self, options: CallbackOptions) -> Iterable[Observation]:
        """Callback for total army observable gauge"""
        yield from self._observe_state_value("total_army")

    def set_state_callback(self, fn):
        """Register a callback that returns current AI state for observable gauges.

        The gauges read the ``territory_count``, ``total_army`` and
        ``faction`` keys from the mapping ``fn`` returns.
        """
        self._state_callback = fn

    def set_corpse_callback(self, fn):
        """Register a callback that returns ``(faction, corpses)`` for the
        ``ai.corpse_pool`` gauge. ``fn`` should return ``None`` when the
        current AI variant does not use the corpse economy.
        """
        self._corpse_callback = fn

    def _observe_corpse_pool(self, options: CallbackOptions) -> Iterable[Observation]:
        """Callback for the ``ai.corpse_pool`` observable gauge.

        Yields nothing when no corpse callback is registered, when it returns
        a falsy result, or when it raises (logged at DEBUG level).
        """
        if not self._corpse_callback:
            return
        try:
            result = self._corpse_callback()
            if not result:
                return
            faction, corpses = result
            yield Observation(value=int(corpses), attributes={"faction": faction})
        except Exception:
            self._log_suppressed("gauge callback for corpse pool")

    def record_wall_captured(self, wall_id, source):
        """Increment the walls-captured counter. ``source`` is the AI variant name."""
        self._walls_captured_counter.add(
            1, {"wall_id": wall_id, "variant": source}
        )

    def record_decision(self, action_type, phase):
        """Record an AI decision metric"""
        self._decisions_counter.add(1, {"action_type": action_type, "phase": phase})

    def record_plan_created(self, goal):
        """Record a plan creation metric"""
        self._plans_created_counter.add(1, {"goal": goal})

    def record_plan_abandoned(self, reason):
        """Record a plan abandonment metric"""
        self._plans_abandoned_counter.add(1, {"reason": reason})

    def record_cycle_duration(self, seconds):
        """Record decision cycle duration"""
        self._cycle_duration_histogram.record(seconds)

    def collect_metrics(self):
        """Force collection and export of all metrics.

        Best-effort: any failure is logged at DEBUG level and otherwise
        ignored so callers never fail because of telemetry.
        """
        try:
            # NOTE(review): force_flush() may itself trigger a collection on
            # the reader; confirm whether the explicit collect() is needed.
            self.metric_reader.collect()
            self.meter_provider.force_flush()
        except Exception:
            self._log_suppressed("forced metric collection")

    def get_tracer(self):
        """Get the configured tracer"""
        return self.tracer

    def get_logger(self):
        """Get the configured logger"""
        return self.logger

    def shutdown(self):
        """Flush and shutdown all telemetry providers.

        Providers are shut down in the order tracer, meter, logger. Each step
        is isolated so one failing provider does not prevent the others from
        shutting down; failures are logged at DEBUG level.
        """
        # Lambdas defer attribute lookup into the try block, so a provider
        # that was never created is handled like any other shutdown failure.
        steps = (
            ("tracer provider shutdown", lambda: trace.get_tracer_provider().shutdown()),
            ("meter provider shutdown", lambda: self.meter_provider.shutdown()),
            ("logger provider shutdown", lambda: self.logger_provider.shutdown()),
        )
        for description, run_step in steps:
            try:
                run_step()
            except Exception:
                self._log_suppressed(description)

================================================
FILE: game-of-tracing/app/CLAUDE.md
================================================
# app/ — Location Servers

> 8 Flask microservices representing map territories in the *War of Kingdoms* game. This doc is read by any AI coding agent (Claude, Cursor, Codex, Cline). For scenario-wide context read [`../AGENTS.md`](../AGENTS.md) first.

## Purpose

All 8 locations run the same codebase. A container's **slot** (set via `SLOT_ID` env var, `slot_1` … `slot_8`) is fixed at build time; the **logical identity** it serves (`southern_capital`, `wall_west`, `barbarian_village_east`, …) is resolved at boot and on `/reload` from the active map in `game_state.db`. Each location:

- Owns a row in the shared `game_state.db` (resources, army, faction).
- Exposes an HTTP API for collecting resources, creating armies, moving armies, and launching attacks.
- Instruments every route with OpenTelemetry traces, logs, and five custom game metrics.
- Runs passive resource generation for villages (every 15 s) and handles cooldowns for capitals.
- On the White Walkers Attack map, also runs: passive barbarian army growth (every 30 s at barbarian villages), passive corpse generation (every 15 s at the White Walker fortress), passive resource generation at the Night's Watch capital (+5 every 10 s — WWA has no friendly villages, so this replaces the click-only economy), and the wall multiplier (defenders count 2× at `wall`-type locations).

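Ports 5001-5008 (the location IDs below are the War of Kingdoms slot assignments; on White Walkers Attack the same slots and ports serve the locations defined in `game_config.MAPS["white_walkers_attack"]`):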

| Location ID | Service name | Port | Type |
|---|---|---|---|
| `southern_capital` | `southern-capital` | 5001 | capital |
| `northern_capital` | `northern-capital` | 5002 | capital |
| `village_1` | `village-1` | 5003 | village |
| `village_2` | `village-2` | 5004 | village |
| `village_3` | `village-3` | 5005 | village |
| `village_4` | `village-4` | 5006 | village |
| `village_5` | `village-5` | 5007 | village |
| `village_6` | `village-6` | 5008 | village |

Service names (hyphenated) match the `SERVICE_NAME` resource attribute used in traces. Location IDs (underscored) are what DB rows and `game_config.py` use. Bridge: `location_id.replace('_', '-')`.

## File map

| File | Size | Purpose |
|---|---|---|
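| `game_config.py` | ~3 KB | `MAPS` dict: per-map slot assignments, locations (type, connections, initial resources/army/faction, port) and rules (costs, currencies, passive rates), plus lookup helpers. `LOCATIONS`, `RESOURCE_GENERATION` and `COSTS` remain as backward-compat exports of the War of Kingdoms defaults. |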
| `telemetry.py` | ~11 KB | `GameTelemetry` class — traces, logs, metrics (5 observable gauges + 1 counter for game state), plus Pyroscope profiling with OTel span-profile linkage. |
| `location_server.py` | ~52 KB (~1200 lines) | `LocationServer` class — Flask app, routes, DB access, pathfinding, battle resolution, background-thread movement. |
| `run_game.py` | — | CLI to run all 8 services as separate local processes (non-Docker). |
| `Dockerfile` | small | `python:3.11-slim`, `pip install -r requirements.txt`, default `CMD` runs `python run_game.py` (with `FLASK_APP=location_server.py` set). |
| `requirements.txt` | small | Flask 3.1.3, requests 2.33.1, OpenTelemetry SDK/API + OTLP gRPC/HTTP exporters, `pyroscope-io` + `pyroscope-otel` for profiling. |

## Routes

| Method | Path | Handler span name | Purpose |
|---|---|---|---|
| `GET` | `/` | `get_location_info` | Location state + optional cooldown |
| `POST` | `/collect_resources` | `collect_resources` | Capital-only; 5 s cooldown; +20 resources |
| `POST` | `/create_army` | `create_army` | Capital-only; costs 30 resources → +1 army unit |
| `POST` | `/move_army` | `move_army_request` | Move army to adjacent location; spawns background movement thread |
| `POST` | `/all_out_attack` | `all_out_attack` | Capital-to-capital attack via `_find_path(target, ATTACK)` |
| `POST` | `/receive_army` | `receive_army` | Target of `_continue_army_movement`; resolves battle via `_handle_battle` |
| `POST` | `/receive_resources` | `receive_resources` | Target of `_transfer_resources_along_path` |
| `GET` | `/health` | — | Docker health check; returns `{"status":"ok"}` |
| `POST` | `/send_resources_to_capital` | — | Village → friendly capital resource forwarding (used by AI) |
| `POST` | `/reload` | — | Re-read `active_map_id` + rebind slot identity in place (war_map calls this after `/select_map`) |
| `GET` | `/faction_economy?faction=...` | — | Read a faction's corpse pool (AI uses it) |

## Key algorithms

### Dijkstra pathfinding — `_find_path()` at `location_server.py:128-182`

Faction-aware edge weights:

| Mode | Friendly | Neutral | Enemy |
|---|---|---|---|
| `PathType.RESOURCE` | 1 | 2 | ∞ (unreachable) |
| `PathType.ATTACK` | 1 | 2 | 3 |

Resource routing only returns a path if the source is a capital of a known faction. Attack routing allows crossing enemy terrain at a cost.

### Battle resolution — `_handle_battle()` at `location_server.py:184-207`

| Case | Outcome | New army | New faction |
|---|---|---|---|
| Same faction | `reinforcement` | `attacking + defending` | defender's |
| `attacking > defending` | `attacker_victory` | `attacking - defending` | attacker's |
| `defending > attacking` | `defender_victory` | `defending - attacking` | defender's |
| equal | `stalemate` | `0` | defender's (territory held by default) |

Every outcome calls `telemetry.record_battle(attacker_faction, defender_faction, result)`, which increments the `game.battles` counter and force-flushes metrics.

### Atomic state updates — `_update_location_state()`

Forces metric collection at `location_server.py:124` on important changes (`faction`, `resources`, or `army` mutated), so the dashboard reflects state within ~1 s of the mutating request instead of waiting for the 10 s `PeriodicExportingMetricReader` cycle.

## OpenTelemetry patterns specific to `app/`

### HTTP clients go through one helper

`_make_request_with_trace()` at `location_server.py:327-352` is the only place outbound HTTP happens. It wraps every call in a CLIENT span, sets `http.url` and `http.status_code` attributes, and calls `inject(headers)` to propagate W3C trace context downstream. If you add a new outbound call, use this helper — do not call `requests.post` directly.

### Background threads capture context explicitly

Two methods spawn background threads for delayed operations:

- `_continue_army_movement()` at `location_server.py:209-271` — 5 s delay before the army arrives at the next location.
- `_transfer_resources_along_path()` at `location_server.py:273-325` — 5 s delay before the resources arrive.

Both follow the canonical pattern:

```python
ctx = get_current()              # capture before Thread().start()

def work():
    token = attach(ctx)          # re-attach inside the thread
    try:
        with tracer.start_as_current_span("..."):
            ...                  # span now belongs to the captured trace
    finally:
        detach(token)

Thread(target=work).start()
```

If you add a new background thread, replicate this pattern. Python threads will **not** inherit OTel context on their own — the span will be orphaned with a fresh trace_id.

### Span attributes that feed the Grafana dashboard

Preserve these when adding or modifying spans (the provisioned dashboard's TraceQL filters depend on them):

- `span.resource.movement = true` — any resource transfer span
- `span.battle.occurred = true` — any span that triggers `_handle_battle`
- `span.player.action = true` — any span caused by a human player action

## Custom metrics — `telemetry.py`

See `AGENTS.md` for the full cross-service metrics table. `app/`-specific:

| Metric | Type | Callback location in `telemetry.py` |
|---|---|---|
| `game.resources` | observable gauge | `_observe_resources` at `:176-193` |
| `game.army_size` | observable gauge | `_observe_army_size` at `:195-213` |
| `game.battles` | counter | `record_battle` at `:274-290` |
| `game.resource_transfer_cooldown` | observable gauge | `_observe_resource_cooldown` at `:215-233` |
| `game.location_control` | observable gauge | `_observe_location_control` at `:235-260` (values: `northern=1`, `southern=2`, `neutral=0`, unknown=`-1`) |

The gauge callbacks read from live server state via `_get_location_state()`, which the `LocationServer` registers on the telemetry instance at construction time.

## New mechanics (White Walkers Attack)

All defined in `app/game_config.py`'s `MAPS["white_walkers_attack"]["rules"]`. All behave as no-ops on `war_of_kingdoms`.

- **Wall defender multiplier** — `_handle_battle` accepts a `location_type` argument and scales `defending_army` by `rules["wall_multiplier"]` (2.0 on WWA, 1.0 on WoK) when the location type is `wall`. Remaining defender count is converted back to physical units after the fight.
- **Corpse economy** — when the battle winner is `white_walkers`, the post-battle hook in `receive_army` calls `self._add_corpses(attacking + defending - remaining, "white_walkers")`. `create_army` reads `get_army_currency(map_id, faction)` and, for `currency == "corpses"`, atomically decrements via `_spend_corpses` instead of touching `resources`. The corpse pool lives in `faction_economy` (persistent) so a `/reload` doesn't wipe it.
- **Barbarian passive growth** — `_start_barbarian_growth(interval_s)` runs when `faction == "barbarian"`; adds +1 army every `rules["barbarian_army_growth_interval_s"]` (30 s). Guards each iteration against identity changes via `/reload`.
- **Captured-camp resource generation** — `_start_passive_generation()` is launched for *every* `type == "village"` slot at boot (including barbarian Free Folk camps). The per-iteration `faction != "barbarian"` guard keeps it a no-op while the camp is still barbarian, then it starts producing the standard village amount the moment the player captures it. Without this fallthrough, captured camps stayed unproductive because the thread was never started on barbarian slots.
- **White Walker passive corpses** — `_start_white_walker_corpse_tick(interval_s)` runs at the WW fortress, +1 corpse every `rules["white_walker_passive_corpse_interval_s"]` (15 s).
- **Night's Watch passive resources** — `_start_nights_watch_capital_resource_tick(interval_s, amount)` runs at Castle Black on WWA (`faction == "nights_watch"`, `loc_type == "capital"`), adding `rules["nights_watch_capital_passive_amount"]` resources every `rules["nights_watch_capital_passive_interval_s"]` (5 per 10 s). Manual `/collect_resources` (+20, 5 s cooldown) still works alongside.

## DB additions (live in `game_state.db`)

- **`game_config`** — `(key, value)` key/value store. The `active_map_id` row is authoritative; containers re-read it on boot and on `/reload`.
- **`faction_economy`** — `(faction, corpses)`. Updated by `_add_corpses` / `_spend_corpses`. Read by the AI via `/faction_economy?faction=white_walkers`.

## Environment

| Var | Default | Purpose |
|---|---|---|
| `SLOT_ID` | — (required, `slot_1` … `slot_8`) | Fixed physical slot this container occupies |
| `LOCATION_ID` | — (legacy; no longer authoritative) | Kept for backward-compat with `run_game.py` local dev |
| `PORT` | derived from `LOCATION_ID` | HTTP listen port |
| `IN_DOCKER` | unset | When set, location URLs resolve via container DNS (`village-2:5004`) instead of `localhost:5004` |
| `DATABASE_FILE` | `/data/game_state.db` (Docker) / `./game_state.db` (local) | SQLite WAL-mode DB |

## Common edits

**Add a new location.**
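1. Add an entry to the target map's `locations` dict inside `MAPS` in `game_config.py` (connections list, initial resources/army/faction, port) and point a slot at it in that map's `slot_assignments`. (`LOCATIONS` is only a backward-compat alias for the War of Kingdoms `locations` dict.)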
2. Add a `village-N` service in both `docker-compose.yml` and `docker-compose.coda.yml`.
3. Add to the `LOCATION_PORTS` dict in `war_map/app.py` and `ai_opponent/ai_server.py`.
4. Update the services-and-ports table in `../AGENTS.md` and the location table at the top of this file.

**Add a new metric.**
1. Add an observable gauge (or counter) in `telemetry.py` next to the existing ones.
2. If it reads from location state, register a callback that calls `self._get_location_state(...)`.
3. Add a row to the metrics table in this doc and in `../AGENTS.md`.

**Add a new route.**
1. Wrap the handler in `tracer.start_as_current_span(..., context=extract(request.headers), ...)`.
2. Add `"span.player.action": True` (if triggered by a player) so the dashboard picks it up.
3. If the route spawns a background thread, follow the `get_current()` / `attach` / `detach` pattern from `:209-271`.

## Keep this doc current

Per the sub-agent rule, any change to routes, metrics, span attributes, env vars, or the line-number anchors above must land in the same work unit. Before returning a response that touched `app/`, grep this file for references to anything you changed.

## Cross-references

- [`../AGENTS.md`](../AGENTS.md) — scenario-wide architecture and patterns
- [`../war_map/CLAUDE.md`](../war_map/CLAUDE.md) — the consumer of this service's HTTP API on behalf of the player
- [`../ai_opponent/CLAUDE.md`](../ai_opponent/CLAUDE.md) — the other consumer of this API (autonomous)
- [`../SPAN_LINKS.md`](../SPAN_LINKS.md) — how action spans chain across services


================================================
FILE: game-of-tracing/app/Dockerfile
================================================
# Base image pinned by digest so rebuilds resolve to the same image.
FROM python:3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2

WORKDIR /app

# Install dependencies before copying the sources so the pip layer can be
# reused from cache when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# NOTE(review): FLASK_APP is normally read by the `flask` CLI, which the CMD
# below does not invoke; confirm whether anything still depends on it.
ENV FLASK_APP=location_server.py

# Default to running the main server script
CMD ["python", "run_game.py"] 

================================================
FILE: game-of-tracing/app/game_config.py
================================================
"""Game configuration for all maps in the game-of-tracing scenario.

Each entry in ``MAPS`` describes a playable map. A map has:

- ``display_name`` / ``description`` — surfaced by the map picker UI.
- ``single_player`` + ``player_faction`` / ``ai_faction`` — the map picker uses
  these to skip faction selection and auto-activate the AI when appropriate.
- ``factions`` — the valid faction strings for this map.
- ``slot_assignments`` — maps the fixed container slot ids (``slot_1`` …
  ``slot_8``) to the logical location id that slot serves on this map. The 8
  location containers carry only their ``SLOT_ID`` — their in-game identity
  is resolved at boot (and on ``/reload``) via this table.
- ``locations`` — per-location config (name, type, faction, connections,
  initial resources/army, port).
- ``rules`` — map-wide game rules (army costs and currency per faction, wall
  multiplier, tick interval, hold-to-win ticks, passive growth intervals).

The active map id is stored at runtime in the shared ``game_state.db`` in the
``game_config`` key-value table (written by ``war_map`` on ``/select_map``).
Both ``location_server`` and ``war_map`` read it to resolve per-service state.
"""

from __future__ import annotations

# File name of the shared game-state database (see the module docstring).
DATABASE_FILE = "game_state.db"

# Map whose settings back the backward-compat exports defined after MAPS.
DEFAULT_MAP_ID = "war_of_kingdoms"

# The 8 location containers each carry a fixed SLOT_ID env var
# (slot_1 .. slot_8). The active map's slot_assignments table turns that slot
# into an in-game identity, so one container can be "village_1" on War of
# Kingdoms and "wall_west" on White Walkers Attack.
SLOT_IDS = tuple([f"slot_{i}" for i in range(1, 9)])


# Registry of playable maps, keyed by map id. Every entry carries the keys
# described in the module docstring: display_name, description, single_player
# (plus player_faction / ai_faction on single-player maps), factions,
# slot_assignments, locations and rules.
MAPS = {
    # Two-faction map: each side starts with one capital and all six villages
    # start neutral.
    "war_of_kingdoms": {
        "display_name": "War of Kingdoms",
        "description": (
            "Northern and Southern kingdoms clash for dominance. "
            "Capture the enemy capital to win."
        ),
        "single_player": False,
        "factions": ["northern", "southern"],
        "slot_assignments": {
            "slot_1": "southern_capital",
            "slot_2": "northern_capital",
            "slot_3": "village_1",
            "slot_4": "village_2",
            "slot_5": "village_3",
            "slot_6": "village_4",
            "slot_7": "village_5",
            "slot_8": "village_6",
        },
        "locations": {
            "southern_capital": {
                "name": "Southern Capital",
                "type": "capital",
                "faction": "southern",
                "connections": ["village_1", "village_3"],
                "initial_resources": 100,
                "initial_army": 1,
                "port": 5001,
            },
            "northern_capital": {
                "name": "Northern Capital",
                "type": "capital",
                "faction": "northern",
                "connections": ["village_2", "village_6"],
                "initial_resources": 100,
                "initial_army": 1,
                "port": 5002,
            },
            "village_1": {
                "name": "Village 1",
                "type": "village",
                "faction": "neutral",
                "connections": ["southern_capital", "village_2", "village_4"],
                "initial_resources": 50,
                "initial_army": 2,
                "port": 5003,
            },
            "village_2": {
                "name": "Village 2",
                "type": "village",
                "faction": "neutral",
                "connections": ["northern_capital", "village_1", "village_5"],
                "initial_resources": 50,
                "initial_army": 3,
                "port": 5004,
            },
            "village_3": {
                "name": "Village 3",
                "type": "village",
                "faction": "neutral",
                "connections": ["southern_capital", "village_5", "village_6"],
                "initial_resources": 50,
                "initial_army": 2,
                "port": 5005,
            },
            "village_4": {
                "name": "Village 4",
                "type": "village",
                "faction": "neutral",
                "connections": ["village_1", "village_5"],
                "initial_resources": 50,
                "initial_army": 1,
                "port": 5006,
            },
            "village_5": {
                "name": "Village 5",
                "type": "village",
                "faction": "neutral",
                "connections": ["village_2", "village_3", "village_4", "village_6"],
                "initial_resources": 50,
                "initial_army": 4,
                "port": 5007,
            },
            "village_6": {
                "name": "Village 6",
                "type": "village",
                "faction": "neutral",
                "connections": ["northern_capital", "village_3", "village_5"],
                "initial_resources": 50,
                "initial_army": 2,
                "port": 5008,
            },
        },
        # NOTE(review): the 0-valued interval/tick entries presumably switch
        # off the White Walkers Attack mechanics on this map; confirm against
        # the consumers of these rules.
        "rules": {
            "resource_generation": {"capital": 20, "village": 10},
            "army_cost": {"default": 30},
            "army_currency": {"default": "resources"},
            "wall_multiplier": 1.0,
            "barbarian_army_growth_interval_s": 0,
            "white_walker_passive_corpse_interval_s": 0,
            "tick_interval_s": 0,
            "win_hold_ticks": 0,
        },
    },
    # Single-player map: the player is the Night's Watch, the AI plays the
    # White Walkers, and "barbarian" is a third faction that holds the two
    # Free Folk camps at the start.
    "white_walkers_attack": {
        "display_name": "White Walkers Attack",
        "description": (
            "The Long Night has come. As the Night's Watch, hold every Wall "
            "keep for 5 ticks (150 s) before the White Walkers do. Single-player."
        ),
        "single_player": True,
        "player_faction": "nights_watch",
        "ai_faction": "white_walkers",
        "factions": ["nights_watch", "white_walkers", "barbarian"],
        # Same eight slots (and ports 5001-5008) as War of Kingdoms, mapped to
        # this map's location ids.
        "slot_assignments": {
            "slot_1": "nights_watch_fortress",
            "slot_2": "white_walker_fortress",
            "slot_3": "wall_west",
            "slot_4": "wall_center_west",
            "slot_5": "wall_center_east",
            "slot_6": "wall_east",
            "slot_7": "barbarian_village_west",
            "slot_8": "barbarian_village_east",
        },
        "locations": {
            "nights_watch_fortress": {
                "name": "Castle Black",
                "type": "capital",
                "faction": "nights_watch",
                "connections": [
                    "wall_west",
                    "wall_center_west",
                    "wall_center_east",
                    "wall_east",
                ],
                "initial_resources": 150,
                "initial_army": 3,
                "port": 5001,
            },
            "white_walker_fortress": {
                "name": "The Lands of Always Winter",
                "type": "capital",
                "faction": "white_walkers",
                "connections": [
                    "wall_west",
                    "wall_center_west",
                    "wall_center_east",
                    "wall_east",
                ],
                # White Walkers spend corpses, not resources. Keep the column
                # populated so the DB row shape stays uniform; the create_army
                # handler reads currency from the map rules.
                "initial_resources": 0,
                "initial_army": 2,
                "port": 5002,
            },
            "wall_west": {
                "name": "Westwatch",
                "type": "wall",
                "faction": "neutral",
                "connections": [
                    "nights_watch_fortress",
                    "white_walker_fortress",
                    "wall_center_west",
                    "barbarian_village_west",
                ],
                "initial_resources": 0,
                "initial_army": 1,
                "port": 5003,
            },
            "wall_center_west": {
                "name": "Queensgate",
                "type": "wall",
                "faction": "neutral",
                "connections": [
                    "nights_watch_fortress",
                    "white_walker_fortress",
                    "wall_west",
                    "wall_center_east",
                ],
                "initial_resources": 0,
                "initial_army": 1,
                "port": 5004,
            },
            "wall_center_east": {
                "name": "Deep Lake",
                "type": "wall",
                "faction": "neutral",
                "connections": [
                    "nights_watch_fortress",
                    "white_walker_fortress",
                    "wall_center_west",
                    "wall_east",
                ],
                "initial_resources": 0,
                "initial_army": 1,
                "port": 5005,
            },
            "wall_east": {
                "name": "Eastwatch-by-the-Sea",
                "type": "wall",
                "faction": "neutral",
                "connections": [
                    "nights_watch_fortress",
                    "white_walker_fortress",
                    "wall_center_east",
                    "barbarian_village_east",
                ],
                "initial_resources": 0,
                "initial_army": 1,
                "port": 5006,
            },
            # Each Free Folk camp connects only to the wall keep on its side.
            "barbarian_village_west": {
                "name": "Free Folk Camp (West)",
                "type": "village",
                "faction": "barbarian",
                "connections": ["wall_west"],
                "initial_resources": 0,
                "initial_army": 2,
                "port": 5007,
            },
            "barbarian_village_east": {
                "name": "Free Folk Camp (East)",
                "type": "village",
                "faction": "barbarian",
                "connections": ["wall_east"],
                "initial_resources": 0,
                "initial_army": 2,
                "port": 5008,
            },
        },
        "rules": {
            # Night's Watch capital collects resources on the classic schedule.
            # White Walker fortress ignores resource_generation (uses corpses).
            "resource_generation": {"capital": 20, "village": 10},
            "army_cost": {
                "default": 30,
                "white_walkers": 5,
            },
            "army_currency": {
                "default": "resources",
                "white_walkers": "corpses",
            },
            "wall_multiplier": 2.0,
            "barbarian_army_growth_interval_s": 30,
            "white_walker_passive_corpse_interval_s": 15,
            # WWA gives the Night's Watch no friendly villages, so its only
            # income source is /collect_resources at Castle Black. Add a slow
            # passive trickle so the resource HUD ticks up without click-spam.
            # Keep it well below the click rate (+20 per 5 s) — passive should
            # supplement, not replace, active play.
            "nights_watch_capital_passive_amount": 5,
            "nights_watch_capital_passive_interval_s": 10,
            # 5 ticks x 30 s gives the 150 s hold time quoted in the
            # description above.
            "tick_interval_s": 30,
            "win_hold_ticks": 5,
        },
    },
}

# Backward-compat exports: unchanged shape for callers that don't know about
# maps yet. These always reflect the War of Kingdoms defaults.
# LOCATIONS and RESOURCE_GENERATION are references to the dicts stored inside
# MAPS (not copies), so mutating one mutates the other.
LOCATIONS = MAPS[DEFAULT_MAP_ID]["locations"]
RESOURCE_GENERATION = MAPS[DEFAULT_MAP_ID]["rules"]["resource_generation"]
# COSTS is a fresh dict holding only the default army-creation cost.
COSTS = {"create_army": MAPS[DEFAULT_MAP_ID]["rules"]["army_cost"]["default"]}


def get_map(map_id):
    """Look up the complete configuration dict for one map.

    Raises ``KeyError`` naming the offending id when ``map_id`` is not a key
    of ``MAPS``.
    """
    if map_id in MAPS:
        return MAPS[map_id]
    raise KeyError(f"Unknown map_id: {map_id}")


def resolve_slot(map_id, slot_id):
    """Translate a slot id into the location id it serves on ``map_id``.

    An unknown ``map_id`` or ``slot_id`` surfaces as the plain ``KeyError``
    of the failed dict lookup.
    """
    assignments = MAPS[map_id]["slot_assignments"]
    return assignments[slot_id]


def get_location_config(map_id, location_id):
    """Fetch the config dict of ``location_id`` as defined on ``map_id``.

    Unknown ids propagate as ``KeyError`` from the underlying lookups.
    """
    locations = MAPS[map_id]["locations"]
    return locations[location_id]


def get_rules(map_id):
    """Fetch the ``rules`` section of the map identified by ``map_id``."""
    map_config = MAPS[map_id]
    return map_config["rules"]


def get_army_cost(map_id, faction):
    """Price of creating one army for ``faction`` under ``map_id``'s rules.

    Factions without their own entry in ``army_cost`` pay the ``"default"``
    price; the ``"default"`` key itself must always be present.
    """
    price_table = MAPS[map_id]["rules"]["army_cost"]
    fallback = price_table["default"]
    if faction in price_table:
        return price_table[faction]
    return fallback


def get_army_currency(map_id, faction):
    """Name of the currency ``faction`` spends on armies on ``map_id``.

    The value comes from the map's ``army_currency`` rule table; factions
    without their own entry use the table's ``"default"`` value, which must
    always be present.
    """
    currency_table = MAPS[map_id]["rules"]["army_currency"]
    fallback = currency_table["default"]
    if faction in currency_table:
        return currency_table[faction]
    return fallback


def locations_by_type(map_id, type_name):
    """Collect the ids of every location on ``map_id`` whose ``type`` equals ``type_name``.

    Ids are returned in the iteration order of the map's ``locations`` dict;
    the list is empty when nothing matches.
    """
    matching = []
    for location_id, config in MAPS[map_id]["locations"].items():
        if config["type"] == type_name:
            matching.append(location_id)
    return matching


================================================
FILE: game-of-tracing/app/location_server.py
================================================
"""Location server implementation.

Each of the 8 location containers has a constant ``SLOT_ID`` env var
(``slot_1`` … ``slot_8``). The in-game identity a slot serves (e.g.
``southern_capital`` in War of Kingdoms, ``wall_west`` in White Walkers
Attack) is resolved at boot and on ``/reload`` via the active map stored
in the shared ``game_config`` key-value table. See ``game_config.MAPS``.

The per-container SERVICE_NAME (used by Grafana dashboards) stays stable
regardless of map — it's derived from ``LOCATION_NAME`` env / slot id, not
from the logical location id.
"""
import os, sqlite3, requests, random, time, threading, atexit
from threading import Thread, Lock
from datetime import datetime, timedelta
from flask import Flask, jsonify, request
from game_config import (
    MAPS,
    COSTS,
    DATABASE_FILE,
    DEFAULT_MAP_ID,
    LOCATIONS,
    RESOURCE_GENERATION,
    SLOT_IDS,
    get_army_cost,
    get_army_currency,
    get_location_config,
    get_map,
    get_rules,
    resolve_slot,
)
from telemetry import GameTelemetry
from opentelemetry.propagate import extract, inject
from opentelemetry import trace
from opentelemetry.trace import SpanKind
from opentelemetry.context import get_current, attach, detach
from enum import Enum
from typing import Optional, List, Tuple, Dict

class PathType(Enum):
    """Routing mode passed to ``LocationServer._find_path``."""
    # Resource routing: locations held by another faction are impassable.
    RESOURCE = 'resource'
    # Attack routing: every location is passable; hostile ones just cost more.
    ATTACK = 'attack'

class LocationServer:
    def __init__(self, slot_or_location=None):
        """Build the Flask app, telemetry and DB state for one location slot.

        Args:
            slot_or_location: Either a slot id from ``SLOT_IDS`` or a location
                id of the default map. When falsy, the ``SLOT_ID`` environment
                variable is used instead.

        Raises:
            ValueError: If neither the argument nor ``SLOT_ID`` names a known
                slot or a default-map location.
        """
        # Accept either a slot id (new, preferred) or a legacy location id
        # (for backward compat with local dev scripts). Falls back to env.
        raw = slot_or_location or os.environ.get('SLOT_ID')
        if raw in SLOT_IDS:
            self.slot_id = raw
        elif raw in MAPS[DEFAULT_MAP_ID]["locations"]:
            # Legacy: caller passed a War of Kingdoms location id; resolve to
            # its slot via the reverse map.
            inverse = {v: k for k, v in MAPS[DEFAULT_MAP_ID]["slot_assignments"].items()}
            self.slot_id = inverse[raw]
        else:
            raise ValueError(
                f"Cannot determine SLOT_ID from {raw!r}; expected one of {SLOT_IDS} "
                f"or a War of Kingdoms location id."
            )

        self.app = Flask(__name__)
        # Both dicts are keyed by location id and hold datetimes; ``lock``
        # guards them (see _can_collect_resources / _start_resource_cooldown).
        self.last_resource_collection = {}
        self.resource_cooldown = {}
        self.lock = Lock()

        # SERVICE_NAME must stay stable across map switches so Grafana
        # dashboards keep their series. Prefer the explicit LOCATION_NAME env
        # (matches container name in docker-compose); else synthesise from the
        # slot id.
        service_name = os.environ.get('LOCATION_NAME') or self.slot_id.replace('_', '-')
        self.telemetry = GameTelemetry(service_name=service_name)
        self.logger = self.telemetry.get_logger()
        self.tracer = self.telemetry.get_tracer()

        # Give telemetry access to location state
        self.telemetry._get_location_state = self._get_location_state
        # And access to faction-scoped economy (for the corpse gauge).
        self.telemetry._get_corpse_count = self._get_corpses

        # DATABASE_FILE env var overrides the path imported from game_config.
        self.db_path = os.environ.get('DATABASE_FILE', DATABASE_FILE)

        # Populated by _load_identity().
        self.map_id = DEFAULT_MAP_ID
        self.location_id = None
        self.location_info = None
        # One flag per background loop so each is started at most once.
        self._passive_thread_started = False
        self._barbarian_thread_started = False
        self._corpse_thread_started = False
        self._nw_capital_thread_started = False

        # Order matters: the tables must exist before _load_identity() reads
        # game_config and seeds this slot's row.
        self._initialize_database()
        self._load_identity()
        self.setup_routes()

        atexit.register(self.telemetry.shutdown)

    # ----------------------------------------------------------------
    # Map / slot identity resolution
    # ----------------------------------------------------------------

    def _current_locations(self) -> Dict:
        """Return the active map's ``location_id → config`` dict."""
        return MAPS[self.map_id]["locations"]

    def _current_rules(self) -> Dict:
        return MAPS[self.map_id]["rules"]

    def _read_active_map_id(self) -> str:
        conn = self._get_db_connection()
        try:
            row = conn.execute(
                "SELECT value FROM game_config WHERE key = 'active_map_id'"
            ).fetchone()
        finally:
            conn.close()
        return row['value'] if row else DEFAULT_MAP_ID

    def _load_identity(self):
        """Work out which location this slot serves and make sure its DB row exists.

        Reads the active map id from the database, maps ``self.slot_id`` to a
        location id and config on that map, mirrors the identity onto the
        telemetry object, seeds the ``locations`` row if it is missing, starts
        any background loops the identity calls for, and logs the result.
        """
        self.map_id = self._read_active_map_id()
        self.location_id = resolve_slot(self.map_id, self.slot_id)
        info = get_location_config(self.map_id, self.location_id)
        self.location_info = info

        # The observable gauges must report the id being served right now,
        # not whatever was derived from the container's SERVICE_NAME at boot.
        self.telemetry._location_id = self.location_id
        self.telemetry._location_type = info["type"]

        # INSERT OR IGNORE keeps this idempotent: an already-present row
        # (for example one re-seeded by war_map) is left untouched.
        seed_sql = (
            "INSERT OR IGNORE INTO locations (id, resources, army, faction) VALUES (?, ?, ?, ?)"
        )
        conn = self._get_db_connection()
        try:
            seed_values = (
                self.location_id,
                info["initial_resources"],
                info["initial_army"],
                info["faction"],
            )
            conn.execute(seed_sql, seed_values)
            conn.commit()
        finally:
            conn.close()

        self._start_passive_threads_if_needed()

        self.logger.info(
            f"Identity loaded: slot={self.slot_id} map={self.map_id} "
            f"location_id={self.location_id} type={info['type']} "
            f"faction={info['faction']}"
        )

    def _start_passive_threads_if_needed(self):
        """Kick off whichever passive loop matches this slot's identity.

        Threads are started at most once per process lifetime. If a slot's
        identity changes through ``/reload``, the *old* thread keeps running
        but becomes a no-op because it guards each iteration against the
        current location type/faction.
        """
        loc_type = self.location_info["type"]
        faction = self.location_info["faction"]
        rules = self._current_rules()

        # Launch the village resource thread for *every* village, including
        # barbarian-faction slots (Free Folk camps). The thread guards each
        # iteration on ``faction != "barbarian"``, so it stays a no-op while
        # the camp is still barbarian and starts producing for the player
        # the moment they capture it. Without this fallthrough, captured
        # camps stay unproductive because the thread was never started.
        if loc_type == "village" and not self._passive_thread_started:
            self._start_passive_generation()
            self._passive_thread_started = True

        if faction == "barbarian" and not self._barbarian_thread_started:
            interval = rules.get("barbarian_army_growth_interval_s", 0) or 0
            if interval > 0:
                self._start_barbarian_growth(interval)
                self._barbarian_thread_started = True

        if (
            loc_type == "capital"
            and faction == "white_walkers"
            and not self._corpse_thread_started
        ):
            interval = rules.get("white_walker_passive_corpse_interval_s", 0) or 0
            if interval > 0:
                self._start_white_walker_corpse_tick(interval)
                self._corpse_thread_started = True

        if (
            loc_type == "capital"
            and faction == "nights_watch"
            and not self._nw_capital_thread_started
        ):
            interval = rules.get("nights_watch_capital_passive_interval_s", 0) or 0
            amount = rules.get("nights_watch_capital_passive_amount", 0) or 0
            if interval > 0 and amount > 0:
                self._start_nights_watch_capital_resource_tick(interval, amount)
                self._nw_capital_thread_started = True

    # ----------------------------------------------------------------
    # Corpse economy (faction-scoped; lives in faction_economy table)
    # ----------------------------------------------------------------

    def _get_corpses(self, faction: str = "white_walkers") -> int:
        conn = self._get_db_connection()
        try:
            row = conn.execute(
                "SELECT corpses FROM faction_economy WHERE faction = ?", (faction,)
            ).fetchone()
        finally:
            conn.close()
        return int(row['corpses']) if row else 0

    def _add_corpses(self, delta: int, faction: str = "white_walkers"):
        if delta <= 0:
            return
        conn = self._get_db_connection()
        try:
            conn.execute(
                "INSERT INTO faction_economy (faction, corpses) VALUES (?, ?) "
                "ON CONFLICT(faction) DO UPDATE SET corpses = corpses + excluded.corpses",
                (faction, delta),
            )
            conn.commit()
        finally:
            conn.close()

    def _spend_corpses(self, amount: int, faction: str = "white_walkers") -> bool:
        """Atomically decrement ``faction``'s corpse pool. Returns True on success."""
        conn = self._get_db_connection()
        try:
            cursor = conn.execute(
                "UPDATE faction_economy SET corpses = corpses - ? "
                "WHERE faction = ? AND corpses >= ?",
                (amount, faction, amount),
            )
            conn.commit()
            return cursor.rowcount > 0
        finally:
            conn.close()

    def _find_capital(self, faction: str) -> Optional[str]:
        """Return the location_id of the capital with the given faction in the active map, by static config."""
        for loc_id, cfg in self._current_locations().items():
            if cfg["type"] == "capital" and cfg["faction"] == faction:
                return loc_id
        return None

    def _find_enemy_capital(self, faction: str) -> Optional[str]:
        """Return the location_id of a capital not belonging to ``faction`` (and not barbarian), by static config."""
        for loc_id, cfg in self._current_locations().items():
            if cfg["type"] == "capital" and cfg["faction"] not in (faction, "barbarian"):
                return loc_id
        return None

    def _get_db_connection(self):
        # ``timeout`` applies before the first PRAGMA runs, so concurrent
        # boot of all 8 containers doesn't race on ``PRAGMA journal_mode=WAL``
        # (which briefly acquires an exclusive lock to switch modes).
        conn = sqlite3.connect(self.db_path, timeout=15)
        conn.execute("PRAGMA busy_timeout=5000")
        conn.execute("PRAGMA journal_mode=WAL")
        conn.row_factory = sqlite3.Row
        return conn

    def _initialize_database(self):
        conn = self._get_db_connection()
        cursor = conn.cursor()

        # Canonical per-location state.
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS locations (
            id TEXT PRIMARY KEY,
            resources INTEGER NOT NULL,
            army INTEGER NOT NULL,
            faction TEXT NOT NULL
        )
        ''')

        # Key/value game-wide config; holds active_map_id (authoritative at
        # runtime; overrides whatever the process started with).
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS game_config (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL
        )
        ''')
        cursor.execute(
            "INSERT OR IGNORE INTO game_config (key, value) VALUES ('active_map_id', ?)",
            (DEFAULT_MAP_ID,),
        )

        # Faction-scoped economy (White Walkers' corpse pool today; room for
        # additional faction-level currencies later).
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS faction_economy (
            faction TEXT PRIMARY KEY,
            corpses INTEGER NOT NULL DEFAULT 0
        )
        ''')

        conn.commit()
        conn.close()

    def _get_location_state(self, location_id):
        conn = self._get_db_connection()
        cursor = conn.cursor()
        
        cursor.execute("SELECT * FROM locations WHERE id = ?", (location_id,))
        row = cursor.fetchone()
        
        state = None
        if row:
            state = {
                "resources": row['resources'],
                "army": row['army'],
                "faction": row['faction']
            }
        conn.close()
        return state

    def _update_location_state(self, location_id, resources=None, army=None, faction=None):
        set_clauses = []
        params = []
        
        if resources is not None:
            set_clauses.append("resources = ?")
            params.append(resources)
        if army is not None:
            set_clauses.append("army = ?")
            params.append(army)
        if faction is not None:
            set_clauses.append("faction = ?")
            params.append(faction)
        
        if not set_clauses:
            return False
        
        params.append(location_id)
        
        conn = self._get_db_connection()
        cursor = conn.cursor()
        cursor.execute(
            f"UPDATE locations SET {', '.join(set_clauses)} WHERE id = ?",
            params
        )
        conn.commit()
        conn.close()

        # Force metric collection on important state changes
        if faction is not None or resources is not None or army is not None:
            self.telemetry.collect_metrics()
            
        return True

    def _find_path(self, target: str, path_type: PathType) -> Optional[List[str]]:
        """Find the cheapest route from this location to ``target`` on the active map.

        Edge cost is decided by the live owner of the location being entered:
        own faction 1, no DB row ("neutral") 2, any other faction 3 for
        attack routing and impassable for resource routing.

        Args:
            target: Location id to reach.
            path_type: ``PathType.RESOURCE`` or ``PathType.ATTACK``.

        Returns:
            The list of location ids from this location to ``target``
            (both included), or ``None`` when there is no route. ``None`` is
            also returned when ``target`` or this location is not on the
            active map, when this location has no DB row, when ``target`` is
            this location itself, or when a faction outside
            southern/northern/nights_watch asks for a resource route.
        """
        locations = self._current_locations()
        # An endpoint that is not on the active map can never be routed;
        # report "no path" instead of failing on a dict lookup further down.
        if target not in locations or self.location_id not in locations:
            return None

        location_state = self._get_location_state(self.location_id)
        if location_state is None:
            # No row for this location yet, so the routing faction is unknown.
            return None
        faction = location_state["faction"]

        # Resource routing only makes sense for factions that have a resource
        # economy. ``barbarian`` and ``white_walkers`` don't send resources.
        resource_factions = {"southern", "northern", "nights_watch"}
        if path_type == PathType.RESOURCE and faction not in resource_factions:
            return None

        infinity = float('infinity')
        distances = {loc: infinity for loc in locations}
        distances[self.location_id] = 0
        previous = {loc: None for loc in locations}
        unvisited = set(locations)
        # Each weight needs a DB read; remember it for the rest of this search
        # so a location reachable over several edges is only queried once.
        weights = {}

        def get_weight(loc_id: str) -> float:
            if loc_id not in weights:
                state = self._get_location_state(loc_id)
                loc_faction = state["faction"] if state else "neutral"
                if loc_faction == faction:
                    weights[loc_id] = 1
                elif loc_faction == "neutral":
                    weights[loc_id] = 2
                elif path_type == PathType.RESOURCE:
                    weights[loc_id] = infinity
                else:  # attack routing may pass through hostile locations
                    weights[loc_id] = 3
            return weights[loc_id]

        while unvisited:
            current = min(unvisited, key=distances.__getitem__)
            # Stop at the target, or as soon as only unreachable locations
            # remain (relaxing from them could never improve anything).
            if current == target or distances[current] == infinity:
                break

            unvisited.remove(current)
            for neighbor in locations[current]["connections"]:
                if neighbor in unvisited:
                    distance = distances[current] + get_weight(neighbor)
                    if distance < distances[neighbor]:
                        distances[neighbor] = distance
                        previous[neighbor] = current

        if previous[target] is None:
            return None

        # Walk the predecessor chain back to the start, then flip it.
        path = []
        current = target
        while current is not None:
            path.append(current)
            current = previous[current]

        return list(reversed(path))

    def _handle_battle(self, attacking_army: int, attacking_faction: str,
                      defending_army: int, defending_faction: str,
                      location_type: Optional[str] = None) -> tuple[str, int, str]:
        """Handle battle between armies and return ``(result, remaining_army, new_faction)``.

        ``location_type`` lets the active map's rules modify the fight. For
        ``wall`` settlements on a map with ``wall_multiplier`` > 1 the defender's
        effective strength is scaled up — the physical garrison plays harder to
        dislodge, but the ``remaining_army`` reported back is converted back to
        physical units so DB rows stay honest.
        """
        # Same faction = reinforcement. Multiplier never applies.
        if attacking_faction == defending_faction:
            self.logger.info(f"Reinforcement battle between {attacking_faction} armies")
            self.telemetry.record_battle(attacking_faction, defending_faction, "reinforcement")
            return "reinforcement", attacking_army + defending_army, attacking_faction

        multiplier = 1.0
        if location_type == "wall":
            multiplier = float(self._current_rules().get("wall_multiplier", 1.0) or 1.0)
        effective_defender = int(defending_army * multiplier)

        if attacking_army > effective_defender:
            remaining = attacking_army - effective_defender
            self.logger.info(
                f"Attacker victory: {attacking_army} vs {defending_army} "
                f"(effective {effective_defender}, mult {multiplier}) -> {remaining}"
            )
            self.telemetry.record_battle(attacking_faction, defending_faction, "attacker_victory")
            return "attacker_victory", remaining, attacking_faction
        elif effective_defender > attacking_army:
            # Convert defender's surviving *effective* strength back to physical.
            effective_remaining = effective_defender - attacking_army
            remaining = max(1, int(effective_remaining / multiplier)) if multiplier > 0 else effective_remaining
            self.logger.info(
                f"Defender victory: {defending_army} vs {attacking_army} "
                f"(effective {effective_defender}, mult {multiplier}) -> {remaining}"
            )
            self.telemetry.record_battle(attacking_faction, defending_faction, "defender_victory")
            return "defender_victory", remaining, defending_faction
        else:
            self.logger.info(
                f"Stalemate: {attacking_army} vs {defending_army} "
                f"(effective {effective_defender}, mult {multiplier})"
            )
            self.telemetry.record_battle(attacking_faction, defending_faction, "stalemate")
            return "stalemate", 0, defending_faction

    def _continue_army_movement(self, army_size: int, faction: str, current_loc: str,
                              next_loc: str, remaining_path: List[str], is_attack_move: bool = False) -> Dict:
        """Hand an army over to ``next_loc`` in the background and report that it left.

        A worker thread waits 5 seconds, then POSTs the army to
        ``next_loc``'s ``/receive_army`` endpoint inside an ``army_movement``
        span that continues the caller's trace. The returned dict only
        confirms the move was *started*; delivery failures are logged and
        recorded on the span by the worker.
        """
        # The worker runs on another thread, so the trace context has to be
        # captured here and re-attached there.
        parent_ctx = get_current()

        def move():
            token = attach(parent_ctx)
            try:
                time.sleep(5)  # travel time before the army arrives

                span_attributes = {
                    "source_location": current_loc,
                    "target_location": next_loc,
                    "army_size": army_size,
                    "is_attack_move": is_attack_move
                }
                with self.tracer.start_as_current_span(
                    "army_movement",
                    kind=SpanKind.SERVER,
                    attributes=span_attributes,
                ) as movement_span:
                    target_url = f"{self.get_location_url(next_loc)}/receive_army"
                    self.logger.info(f"Moving army from {current_loc} to {next_loc}")

                    payload = {
                        "army_size": army_size,
                        "faction": faction,
                        "source_location": current_loc,
                        "remaining_path": remaining_path,
                        "is_attack_move": is_attack_move
                    }
                    result = self._make_request_with_trace(
                        'post',
                        target_url,
                        payload,
                        span_name="http_request.move_army"
                    )

                    if result.get("success", False):
                        # Force metric collection after successful army movement
                        self.telemetry.collect_metrics()
                    else:
                        failure = result.get("message", "Unknown error")
                        movement_span.set_status(trace.StatusCode.ERROR, "Army movement failed")
                        movement_span.set_attribute("error", failure)
                        self.logger.error(f"Army movement failed: {failure}")

            except Exception as e:
                self.logger.error(f"Failed to move army to {next_loc}: {e}")
                raise
            finally:
                detach(token)

        worker = Thread(target=move)
        worker.start()

        # Force metric collection at the start of movement
        self.telemetry.collect_metrics()

        # Respond right away; the move itself completes asynchronously.
        return {
            "success": True,
            "message": f"Army movement started from {current_loc} to {next_loc}",
            "is_attack_move": is_attack_move
        }

    def _transfer_resources_along_path(self, resources: int, path: List[str]) -> bool:
        """Transfer resources along a path with delays."""
        if not path or len(path) < 2:
            return False
            
        # Capture the full context before spawning the thread
        ctx = get_current()

        def transfer():
            current_loc = path[0]
            next_loc = path[1]

            token = attach(ctx)
            try:
                time.sleep(5)  # Wait before starting transfer

                with self.tracer.start_as_current_span(
                    "resource_movement",
                    kind=SpanKind.SERVER,
                    attributes={
                        "source_location": current_loc,
                        "target_location": next_loc,
                        "resources_amount": resources
                    }
                ) as movement_span:
                    target_url = f"{self.get_location_url(next_loc)}/receive_resources"
                    result = self._make_request_with_trace(
                        'post',
                        target_url,
                        {
                            "resources": resources,
                            "source_location": current_loc,
                            "remaining_path": path[1:],
                            "faction": self._get_location_state(self.location_id)["faction"]
                        },
                        span_name="http_request.transfer_resources"
                    )

                    if result.get("success", False):
                        current_loc_resources = self._get_location_state(current_loc)['resources']
                        self._update_location_state(current_loc, resources=current_loc_resources - resources)
                        # Force metric collection after successful resource transfer
                        self.telemetry.collect_metrics()
                    else:
                        movement_span.set_status(trace.StatusCode.ERROR, "Resource transfer failed")

            except Exception as e:
                self.logger.error(f"Failed to send resources to {next_loc} from {current_loc}: {str(e)}")
            finally:
                detach(token)

        Thread(target=transfer).start()
        return True

    def _make_request_with_trace(self, method: str, url: str, json_data: Optional[Dict] = None,
                                 span_name: str = "http_request", timeout: float = 30.0) -> Dict:
        """Make an HTTP request with the trace context propagated in its headers.

        Args:
            method: ``'get'`` or ``'post'`` (case-insensitive).
            url: Full URL to call.
            json_data: JSON body for POST requests; ignored for GET.
            span_name: Name of the CLIENT span wrapped around the call.
            timeout: Seconds to wait for the peer before giving up. Without
                one, ``requests`` waits indefinitely and a stalled peer would
                hang the calling thread.

        Returns:
            The decoded JSON body of the response.

        Raises:
            ValueError: For any other ``method``.
            requests.RequestException: On connection errors, timeouts or
                non-2xx responses; the span is marked as errored first.
        """
        headers = {"Content-Type": "application/json"}

        with self.tracer.start_as_current_span(
            span_name,
            kind=SpanKind.CLIENT,
            attributes={"http.url": url}
        ) as request_span:
            inject(headers)  # This will now inject the current request_span's context

            try:
                verb = method.lower()
                if verb == 'get':
                    response = requests.get(url, headers=headers, timeout=timeout)
                elif verb == 'post':
                    response = requests.post(url, json=json_data, headers=headers, timeout=timeout)
                else:
                    raise ValueError(f"Unsupported method: {method}")

                request_span.set_attribute("http.status_code", response.status_code)
                response.raise_for_status()
                return response.json()
            except requests.RequestException as e:
                request_span.set_status(trace.StatusCode.ERROR, str(e))
                self.logger.error(f"Request failed: {str(e)}")
                raise

    def _can_collect_resources(self) -> tuple[bool, Optional[str], Optional[int]]:
        """Check if location can collect resources.
        Returns:
            tuple: (can_collect, message, cooldown_seconds)
        """
        with self.lock:
            if self.location_info["type"] != "capital":
                return False, "Only capitals can manually collect resources", None
            
            now = datetime.now()
            
            # Check resource sending cooldown
            if self.location_id in self.resource_cooldown:
                cooldown_end = self.resource_cooldown[self.location_id]
                if now < cooldown_end:
                    remaining = (cooldown_end - now).seconds
                    return False, f"Resource generation on cooldown for {remaining} seconds", remaining
            
            # Check collection cooldown
            last_time = self.last_resource_collection.get(self.location_id, datetime.min)
            wait_time = timedelta(seconds=5)
            
            if now - last_time < wait_time:
                remaining = wait_time - (now - last_time)
                return False, f"Must wait {remaining.seconds} seconds to collect resources", remaining.seconds
            
            return True, None, None

    def _start_resource_cooldown(self):
        with self.lock:
            self.resource_cooldown[self.location_id] = datetime.now() + timedelta(seconds=5)

    def get_location_url(self, location_id):
        """Return the HTTP base URL for reaching another location service.

        Uses the active map's port assignment; falls back to WoK's port for a
        legacy id if the location isn't on the current map (shouldn't happen
        during a coherent game but guards against transition races).
        """
        locations = self._current_locations()
        if location_id in locations:
            port = locations[location_id]["port"]
        else:
            port = MAPS[DEFAULT_MAP_ID]["locations"][location_id]["port"]
        if os.environ.get('IN_DOCKER') or os.environ.get('LOCATION_ID'):
            docker_service_name = self._container_for(location_id)
            return f"http://{docker_service_name}:{port}"
        return f"http://localhost:{port}"

    def _container_for(self, location_id: str) -> str:
        """Return the stable container hostname for another location id.

        Containers are named after their *slot* (slot_1 → southern-capital in
        docker-compose, which is slot_1's stable identity). We reverse-look up
        the slot that currently serves ``location_id`` on the active map, then
        translate that slot back to its container hostname using the WoK
        default slot assignments (which match docker-compose service names).
        """
        active = MAPS[self.map_id]["slot_assignments"]
        wok = MAPS[DEFAULT_MAP_ID]["slot_assignments"]
        for slot, active_loc in active.items():
            if active_loc == location_id:
                return wok[slot].replace('_', '-')
        # Unknown id — best-effort: use the hyphenated form.
        return location_id.replace('_', '-')

    def _start_passive_generation(self):
        """Start the daemon thread that adds village income every 15 seconds.

        Each round is skipped unless this slot is configured as a village and
        its live DB row exists with an owner other than ``"barbarian"``.
        Otherwise the map's ``resource_generation["village"]`` amount is added
        to the row inside a ``passive_resource_generation`` span.
        """
        def generate_resources():
            while True:
                time.sleep(15)

                # Configured type: /reload may have moved this slot to a
                # location that is not a village at all.
                if self.location_info["type"] != "village":
                    continue

                # Ownership is read from the database on every round, because
                # ``self.location_info["faction"]`` comes from map config and
                # does not change when a battle flips the row. A captured
                # camp therefore starts producing as soon as its row changes.
                live = self._get_location_state(self.location_id)
                if live is None or live["faction"] == "barbarian":
                    continue

                gain = self._current_rules()["resource_generation"]["village"]
                span_attributes = {
                    "location.id": self.location_id,
                    "resources_gained": gain,
                    "game.map.id": self.map_id,
                    "owner.faction": live["faction"],
                }
                with self.tracer.start_as_current_span(
                    "passive_resource_generation",
                    attributes=span_attributes,
                ):
                    self._update_location_state(
                        self.location_id, resources=live["resources"] + gain
                    )
                    self.telemetry.collect_metrics()

        worker = Thread(target=generate_resources, daemon=True)
        worker.start()

    def _start_barbarian_growth(self, interval_s: int):
        """Start the daemon thread that grows a barbarian garrison over time.

        Every ``interval_s`` seconds the location gains +1 army, provided
        both the configured faction of this slot and the live owner in the
        database are ``"barbarian"``. Rounds that fail either check change
        nothing, so the loop goes quiet after /reload re-assigns the slot or
        after the location is captured.
        """
        def grow():
            while True:
                time.sleep(interval_s)

                # Configured identity first: no span for non-barbarian slots.
                if self.location_info["faction"] != "barbarian":
                    continue

                span_attributes = {
                    "location.id": self.location_id,
                    "game.map.id": self.map_id,
                    "army_gained": 1,
                }
                with self.tracer.start_as_current_span(
                    "barbarian_passive_growth",
                    attributes=span_attributes,
                ):
                    live = self._get_location_state(self.location_id)
                    # Only grow while still barbarian-controlled.
                    if live is None or live["faction"] != "barbarian":
                        continue
                    self._update_location_state(self.location_id, army=live["army"] + 1)
                    self.telemetry.collect_metrics()

        worker = Thread(target=grow, daemon=True)
        worker.start()

    def _start_nights_watch_capital_resource_tick(self, interval_s: int, amount: int):
        """Passive resource generation at the Night's Watch capital (WWA only).

        WWA gives the player no friendly villages, so /collect_resources at
        Castle Black is the only income source — leading to click-spam UX. A
        slow passive tick supplements that without removing the incentive to
        actively collect (manual is +20 per 5 s; passive is +amount per
        interval_s, configured well below that).
        """
        def tick():
            while True:
                time.sleep(interval_s)
                if (self.location_info["faction"] != "nights_watch"
                    or self.location_info["type"] != "capital"):
                    continue
                with self.tracer.start_as_current_span(
                    "nights_watch_passive_resource",
                    attributes={
                        "location.id": self.location_id,
                        "game.map.id": self.map_id,
                        "resources_gained": amount,
                    }
                ):
                    state = self._get_location_state(self.location_id)
                    if state is None:
                        continue
                    if state["faction"] != "nights_watch":
                        continue
                    self._update_location_state(
                        self.location_id, resources=state["resources"] + amount
                    )
                    self.telemetry.collect_metrics()

        Thread(target=tick, daemon=True).start()

    def _start_white_walker_corpse_tick(self, interval_s: int):
        """Passive corpse generation at the White Walker fortress.

        Simulates the undead slowly rising — keeps the WW economy nonzero even
        when no battles are happening. Corpses accrue to the faction pool.
        """
        def tick():
            while True:
                time.sleep(interval_s)
                if self.location_info["faction"] != "white_walkers" or self.location_info["type"] != "capital":
                    continue
                with self.tracer.start_as_current_span(
                    "white_walker_corpse_tick",
                    attributes={
                        "location.id": self.location_id,
                        "game.map.id": self.map_id,
                        "game.corpses.harvested": 1,
                        "corpse.source": "passive",
                    }
                ):
                    self._add_corpses(1, "white_walkers")
                    self.telemetry.collect_metrics()

        Thread(target=tick, daemon=True).start()

    def reset_database(self):
        """Reset every location row + the corpse pool to the active map's initial state.

        Replaces the contents of ``locations`` with one row per entry of
        ``self._current_locations()`` and empties ``faction_economy``, all in
        a single transaction. If any step fails the transaction is rolled
        back, so the previous rows are kept rather than leaving the tables
        half-wiped, and the connection is closed on every path.

        Raises:
            Exception: Whatever the location config lookup or the database
                raised; database errors are re-raised after the rollback.
        """
        # Build the rows before touching the database so a malformed location
        # entry (e.g. a missing key) fails before anything is deleted.
        rows = [
            (
                loc_id,
                loc_info["initial_resources"],
                loc_info["initial_army"],
                loc_info["faction"],
            )
            for loc_id, loc_info in self._current_locations().items()
        ]

        conn = self._get_db_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("DELETE FROM locations")
            # Positional insert: the values follow the table's column order.
            cursor.executemany("INSERT INTO locations VALUES (?, ?, ?, ?)", rows)
            cursor.execute("DELETE FROM faction_economy")
            conn.commit()
        except Exception:
            # Undo the partial wipe so the write lock is not left held.
            conn.rollback()
            raise
        finally:
            conn.close()
        self.logger.info(f"Database reset to initial state for map {self.map_id}")

    def setup_routes(self):
        @self.app.route('/', methods=['GET'])
        def info():
            context = extract(request.headers)
            with self.tracer.start_as_current_span(
                "get_location_info",
                context=context,
                kind=SpanKind.SERVER,
                attributes={
                    "location.id": self.location_id,
                    "location.name": self.location_info["name"],
                    "location.type": self.location_info["type"]
                }
            ):
                location_state = self._get_location_state(self.location_id)

                cooldown_info = None
                with self.lock:
                    now = datetime.now()
                    last_time = self.last_resource_collection.get(self.location_id, datetime.min)
                    wait_time = timedelta(seconds=15 if self.location_info["type"] == "village" else 5)

                    if now - last_time < wait_time:
                        remaining = wait_time - (now - last_time)
                        cooldown_info = remaining.seconds

                return jsonify({
                    "location_id": self.location_id,
                    "name": self.location_info["name"],
                    "faction": location_state["faction"],
                    "connections": self.location_info["connections"],
                    "resources": location_state["resources"],
                    "army": location_state["army"],
                    "resource_cooldown": cooldown_info
                })

        @self.app.route('/health', methods=['GET'])
        def health():
            return jsonify({"status": "ok"})

        @self.app.route('/collect_resources', methods=['POST'])
        def collect_resources():
            """Collect resources from a location"""
            # Extract trace context from request headers
            context = extract(request.headers)
            
            with self.tracer.start_as_current_span(
                "collect_resources",
                context=context,
                kind=SpanKind.SERVER,
                attributes={
                    "location_name": self.location_info["name"],
                    "location_type": self.location_info["type"]
                }
            ) as span:
                can_collect, message, cooldown_seconds = self._can_collect_resources()
                if not can_collect:
                    span.set_status(trace.StatusCode.ERROR, message)
                    span.set_attribute("cooldown_seconds", cooldown_seconds or 0)
                    return jsonify({
                        "success": False,
                        "message": message,
                        "cooldown": True,
                        "cooldown_seconds": cooldown_seconds
                    }), 200  # Return 200 for cooldown, as it's an expected state
                
                location_type = self.location_info["type"]
                resources_gained = self._current_rules()["resource_generation"].get(location_type, 0)

                location_state = self._get_location_state(self.location_id)
                new_resources = location_state["resources"] + resources_gained
                self._update_location_state(self.location_id, resources=new_resources)
                
                span.set_attribute("resources_gained", resources_gained)
                span.set_attribute("new_resources_total", new_resources)
                
                with self.lock:
                    self.last_resource_collection[self.location_id] = datetime.now()
                
                # Force metric collection after resource update
                self.telemetry.collect_metrics()
                
                return jsonify({
                    "success": True,
                    "message": f"Collected {resources_gained} resources",
                    "current_resources": new_resources,
                    "cooldown": False
                })
        
        @self.app.route('/create_army', methods=['POST'])
        def create_army():
            # Extract trace context from request headers
            context = extract(request.headers)

            with self.tracer.start_as_current_span(
                "create_army",
                context=context,
                kind=SpanKind.SERVER,
                attributes={
                    "location_name": self.location_info["name"],
                    "location_type": self.location_info["type"],
                    "game.map.id": self.map_id,
                }
            ) as span:
                if self.location_info["type"] != "capital":
                    span.set_status(trace.StatusCode.ERROR, "Only capitals can create armies")
                    return jsonify({
                        "success": False,
                        "message": "Only capitals can create armies"
                    }), 403

                location_state = self._get_location_state(self.location_id)
                current_resources = location_state["resources"]
                current_army = location_state["army"]
                faction = location_state["faction"]
                currency = get_army_currency(self.map_id, faction)
                cost = get_army_cost(self.map_id, faction)

                span.set_attribute("current_resources", current_resources)
                span.set_attribute("current_army", current_army)
                span.set_attribute("army_cost", cost)
                span.set_attribute("army_currency", currency)
                span.set_attribute("faction", faction)

                if currency == "corpses":
                    # White Walkers spend corpses from the faction pool, not
                    # resources from the location.
                    if not self._spend_corpses(cost, faction):
                        available = self._get_corpses(faction)
                        span.set_status(trace.StatusCode.ERROR, "Insufficient corpses")
                        return jsonify({
                            "success": False,
                            "message": f"Not enough corpses. Need {cost}, have {available}"
                        }), 400
                    new_resources = current_resources
                    new_army = current_army + 1
                    self._update_location_state(self.location_id, army=new_army)
                    span.set_attribute("game.corpses.spent", cost)
                    span.set_attribute("corpses_remaining", self._get_corpses(faction))
                else:
                    if current_resources < cost:
                        span.set_status(trace.StatusCode.ERROR, "Insufficient resources")
                        return jsonify({
                            "success": False,
                            "message": f"Not enough resources. Need {cost}, have {current_resources}"
                        }), 400

                    new_resources = current_resources - cost
                    new_army = current_army + 1

                    self._update_location_state(
                        self.location_id,
                        resources=new_resources,
                        army=new_army
                    )

                span.set_attribute("new_resources", new_resources)
                span.set_attribute("new_army", new_army)

                self.telemetry.collect_metrics()

                return jsonify({
                    "success": True,
                    "message": "Army created",
                    "current_army": new_army,
                    "current_resources": new_resources,
                    "currency": currency,
                })
        
        @self.app.route('/move_army', methods=['POST'])
        def move_army():
            # Extract trace context from request headers
            context = extract(request.headers)
            
            with self.tracer.start_as_current_span(
                "move_army_request",
                context=context,
                kind=SpanKind.SERVER,
                attributes={
                    "location_name": self.location_info["name"],
                    "location_type": self.location_info["type"]
                }
            ) as move_span:
                data = request.get_json()
                if not data or 'target_location' not in data:
                    move_span.set_status(trace.StatusCode.ERROR, "Target location not specified")
                    return jsonify({"success": False, "message": "Target location not specified"}), 400
                
                target_location = data['target_location']
                remaining_path = data.get('remaining_path', [])
                is_attack_move = data.get('is_attack_move', False)
                
                move_span.set_attribute("target_location", target_location)
                move_span.set_attribute("is_attack_move", is_attack_move)
                
                if target_location not in self.location_info["connections"]:
                    move_span.set_status(trace.StatusCode.ERROR, f"Cannot move to {target_location}")
                    return jsonify({
                        "success": False,
                        "message": f"Cannot move to {target_location}. Not connected to {self.location_id}"
                    }), 400
                
                location_state = self._get_location_state(self.location_id)
                if location_state["army"] <= 0:
                    move_span.set_status(trace.StatusCode.ERROR, "No army to move")
                    return jsonify({
                        "success": False,
                        "message": "No army to move"
                    }), 400
                
                try:
                    army_size = location_state["army"]
                    current_faction = location_state["faction"]
                    
                    move_span.set_attribute("army_size", army_size)
                    move_span.set_attribute("faction", current_faction)
                    
                    # Update the source location's army to 0
                    self._update_location_state(self.location_id, army=0)
                    
                    # Force metric collection after army leaves the location
                    self.telemetry.collect_metrics()
                    
                    result = self._continue_army_movement(
                        army_size,
                        current_faction,
                        self.location_id,
                        target_location,
                        remaining_path,
                        is_attack_move
                    )
                    
                    if not result.get("success", True):
                        move_span.set_status(trace.StatusCode.ERROR, result.get("message", "Unknown error"))
                    
                    return jsonify(result)
                except Exception as e:
                    move_span.record_exception(e)
                    move_span.set_status(trace.StatusCode.ERROR, str(e))
                    return jsonify({
                        "success": False,
                        "message": f"Failed to move army: {str(e)}"
                    }), 500
        
        @self.app.route('/all_out_attack', methods=['POST'])
        def all_out_attack():
            """Launch an all-out attack from a capital to the enemy capital"""
            context = extract(request.headers)
            
            with self.tracer.start_as_current_span(
                "all_out_attack",
                context=context,
                kind=SpanKind.SERVER,
                attributes={
                    "location_name": self.location_info["name"],
                    "location_type": self.location_info["type"]
                }
            ) as attack_span:
                try:
                    if self.location_info["type"] != "capital":
                        attack_span.set_status(trace.StatusCode.ERROR, "Only capitals can launch all-out attacks")
                        return jsonify({
                            "success": False,
                            "message": "Only capitals can launch all-out attacks"
                        }), 403
                    
                    location_state = self._get_location_state(self.location_id)
                    army_size = location_state["army"]
                    faction = location_state["faction"]
                    
                    if army_size <= 0:
                        attack_span.set_status(trace.StatusCode.ERROR, "No army available for attack")
                        return jsonify({
                            "success": False,
                            "message": "No army available for attack"
                        }), 400
                    
                    # Determine enemy capital based on the active map's config.
                    target_capital = self._find_enemy_capital(faction)
                    if not target_capital:
                        attack_span.set_status(trace.StatusCode.ERROR, "No enemy capital on this map")
                        return jsonify({
                            "success": False,
                            "message": "No enemy capital to attack on this map"
                        }), 400
                    attack_span.set_attribute("target_capital", target_capital)
                    
                    attack_path = self._find_path(target_capital, PathType.ATTACK)
                    
                    if not attack_path:
                        attack_span.set_status(trace.StatusCode.ERROR, "No valid path to enemy capital")
                        return jsonify({
                            "success": False,
                            "message": "No valid path to enemy capital"
                        }), 400
                    
                    attack_span.set_attribute("attack_path", str(attack_path))
                    attack_span.set_attribute("initial_army_size", army_size)
                    
                    # Set army to 0 before starting the attack
                    self._update_location_state(self.location_id, army=0)
                    
                    if len(attack_path) > 1:
                        next_loc = attack_path[1]
                        result = self._continue_army_movement(
                            army_size,
                            faction,
                            self.location_id,
                            next_loc,
                            attack_path[1:],
                            is_attack_move=True
                        )
                        
                        if not result.get("success", False):
                            # If movement fails, restore the army
                            self._update_location_state(self.location_id, army=army_size)
                            attack_span.set_status(trace.StatusCode.ERROR, "Failed to start attack")
                            return jsonify({
                                "success": False,
                                "message": f"Failed to start attack: {result.get('message', 'Unknown error')}"
                            }), 400
                        
                        return jsonify({
                            "success": True,
                            "message": f"All-out attack started with {army_size} troops",
                            "path": attack_path,
                            "army_size": army_size
                        })
                    
                    return jsonify({
                        "success": False,
                        "message": "Invalid attack path"
                    }), 400
                    
                except Exception as e:
                    attack_span.record_exception(e)
                    attack_span.set_status(trace.StatusCode.ERROR, str(e))
                    raise
        
        @self.app.route('/receive_army', methods=['POST'])
        def receive_army():
            try:
                data = request.get_json()
                self.logger.info(f"Received army at {self.location_id}: {data}")
                
                if not data or 'army_size' not in data or 'faction' not in data:
                    return jsonify({"success": False, "message": "Invalid army data"}), 400
                
                context = extract(request.headers)
                
                with self.tracer.start_as_current_span(
                    "receive_army",
                    context=context,
                    kind=SpanKind.SERVER,
                    attributes={
                        "location_name": self.location_info["name"],
                        "location_type": self.location_info["type"]
                    }
                ) as battle_span:
                    attacking_army = data['army_size']
                    attacking_faction = data['faction']
                    source_location = data.get('source_location', 'unknown')
                    remaining_path = data.get('remaining_path', [])
                    is_attack_move = data.get('is_attack_move', False)
                    
                    location_state = self._get_location_state(self.location_id)
                    defending_army = location_state["army"]
                    defending_faction = location_state["faction"]
                    
                    battle_span.set_attribute("source_location", source_location)
                    battle_span.set_attribute("attacking_army", attacking_army)
                    battle_span.set_attribute("defending_army", defending_army)
                    battle_span.set_attribute("remaining_path", str(remaining_path))
                    battle_span.set_attribute("is_attack_move", is_attack_move)

                    self.logger.info(f"Received army at {self.location_id}: {data}")
                    self.logger.info(f"Remaining path: {remaining_path}, is_attack_move: {is_attack_move}")
                    
                    if attacking_faction == defending_faction:
                        # For all-out attacks, combine armies with friendly villages
                        if is_attack_move and self.location_info["type"] == "village":
                            # Add village's army to the attacking force
                            attacking_army += defending_army
                            # Set village's army to 0
                            self._update_location_state(self.location_id, army=0)
                            battle_span.set_attribute("combined_army_size", attacking_army)
                            self.logger.info(f"Combined armies at {self.location_id}: {attacking_army} (village army was {defending_army})")
                        
                        # Continue movement if there's a path remaining
                        if is_attack_move and remaining_path:
                            next_location = remaining_path[0]
                            new_remaining_path = remaining_path[1:] if len(remaining_path) > 1 else []
                            self.logger.info(f"Continuing attack from {self.location_id} to {next_location}, new path: {new_remaining_path}")
                            
                            result = self._continue_army_movement(
                                attacking_army,  # Use the potentially increased army size
                                attacking_faction,
                                self.location_id,
                                next_location,
                                new_remaining_path,
                                is_attack_move
                            )
                            battle_span.set_attribute("result", "friendly_passage")
                            self.logger.info(f"Friendly passage result: {result}")
                            # Force metric collection after friendly passage
                            self.telemetry.collect_metrics()
                            return jsonify(result)
                        elif not is_attack_move:
                            # Normal army movement - combine armies
                            new_army = defending_army + attacking_army
                            self._update_location_state(self.location_id, army=new_army)
                            battle_span.set_attribute("result", "armies_combined")
                            self.logger.info(f"Armies combined at {self.location_info['name']}: {new_army}")
                            # Force metric collection after combining armies
                            self.telemetry.collect_metrics()
                            return jsonify({
                                "success": True,
                                "message": f"Armies combined at {self.location_info['name']}",
                                "current_army": new_army,
                                "faction": defending_faction
                            })
                        else:
                            # All-out attack reached friendly location with no remaining path
                            # This shouldn't normally happen, but handle it gracefully
                            if self.location_info["type"] == "capital":
                                # If it's our own capital, stop here
                                self._update_location_state(self.location_id, army=attacking_army)
                                battle_span.set_attribute("result", "returned_to_capital")
                                self.logger.warning(f"All-out attack returned to own capital with {attacking_army} troops")
                            else:
                                # For villages, the army should already be zeroed out above
                                battle_span.set_attribute("result", "attack_ended_at_village")
                                self.logger.warning(f"All-out attack ended at friendly village {self.location_id}")
                            
                            self.telemetry.collect_metrics()
                            return jsonify({
                                "success": True,
                                "message": f"Army movement ended at {self.location_info['name']}",
                                "current_army": self._get_location_state(self.location_id)["army"],
                                "faction": defending_faction
                            })
                    
                    battle_result, remaining_army, new_faction = self._handle_battle(
                        attacking_army,
                        attacking_faction,
                        defending_army,
                        defending_faction,
                        location_type=self.location_info["type"],
                    )

                    # Corpse harvesting: the White Walkers reap from any battle
                    # they win (either as attacker or defender). Corpses equal
                    # the total physical units that died on both sides.
                    if new_faction == "white_walkers":
                        dead = max(0, attacking_army + defending_army - remaining_army)
                        if dead > 0:
                            self._add_corpses(dead, "white_walkers")
                            battle_span.set_attribute("game.corpses.harvested", dead)
                            battle_span.set_attribute("corpse.source", "battle")

                    self._update_location_state(
                        self.location_id,
                        army=remaining_army,
                        faction=new_faction
                    )

                    battle_span.set_attribute("result", battle_result)
                    battle_span.set_attribute("remaining_army", remaining_army)
                    battle_span.set_attribute("game.map.id", self.map_id)
                    if self.location_info["type"] == "wall":
                        battle_span.set_attribute("game.wall.held", new_faction != "neutral")
                        battle_span.set_attribute("span.wall.battle", True)
                    
                    if battle_result == "attacker_victory" and is_attack_move and remaining_path:
                        self.logger.info(f"Continuing army movement at {self.location_id}: {remaining_army}")
                        self.logger.info(f"Battle victory - continuing to {remaining_path[0]}, path: {remaining_path[1:]}")
                        result = self._continue_army_movement(
                            remaining_army,
                            attacking_faction,
                            self.location_id,
                            remaining_path[0],
                            remaining_path[1:] if len(remaining_path) > 1 else [],
                            is_attack_move
                        )
                        return jsonify(result)
                    
                    if battle_result != "attacker_victory":
                        self.logger.warning(f"Battle result: {battle_result}")
                        battle_span.add_event("battle_result", attributes={
                            "outcome": battle_result,
                            "attacker_faction": attacking_faction,
                            "defender_faction": defending_faction,
                            "remaining_army": remaining_army,
                        })
                    
                    # Force metric collection after battle resolution
                    self.telemetry.collect_metrics()
                    
                    return jsonify({
                        "success": battle_result == "attacker_victory",
                        "message": f"Battle at {self.location_info['name']}: {battle_result}",
                        "current_army": remaining_army,
                        "faction": new_faction
                    })
                    
            except Exception as e:
                self.logger.error(f"Error in receive_army: {str(e)}")
                return jsonify({"success": False, "message": f"Error: {str(e)}"}), 500
        
        @self.app.route('/reset', methods=['POST'])
        def reset():
            self.reset_database()
            return jsonify({"success": True, "message": "Game state reset to initial values"})

        @self.app.route('/reload', methods=['POST'])
        def reload_identity():
            """Refresh this slot's identity and echo the result back as JSON.

            Per the original author, ``war_map`` calls this after
            ``/select_map``: the slot's port and telemetry service name stay
            fixed while the logical ``location_id``, ``name``, ``type``,
            ``faction``, connections and rules-scoped behaviour are rebound.

            The response reports ``slot_id``, ``map_id``, ``location_id`` and
            the ``faction`` / ``type`` found in ``location_info`` once
            ``_load_identity()`` has run.
            """
            self._load_identity()
            identity = dict(
                success=True,
                slot_id=self.slot_id,
                map_id=self.map_id,
                location_id=self.location_id,
                faction=self.location_info["faction"],
                type=self.location_info["type"],
            )
            return jsonify(identity)

        @self.app.route('/faction_economy', methods=['GET'])
        def faction_economy():
            """Return the corpse pool of the requested faction (consumed by the AI).

            The faction is read from the ``faction`` query parameter and
            defaults to ``white_walkers`` when the parameter is absent.
            """
            requested_faction = request.args.get('faction', 'white_walkers')
            corpse_pool = self._get_corpses(requested_faction)
            return jsonify({"faction": requested_faction, "corpses": corpse_pool})
        
        @self.app.route('/send_resources_to_capital', methods=['POST'])
        def send_resources_to_capital():
            """Send this village's current resource stock toward its faction's capital.

            The handler runs inside a SERVER span that continues the trace
            context carried in the request headers. Every failure branch marks
            the span as ERROR and logs the reason before responding.

            Responses:
                200: transfer started; the body carries ``path`` and ``amount``.
                403: this location is not a village, or its faction is not one
                    of ``southern`` / ``northern`` / ``nights_watch``.
                400: no capital was found for the faction, or no path leads to it.
                500: the transfer could not be started, or an exception was raised.
            """
            # Extract trace context from request headers
            context = extract(request.headers)

            with self.tracer.start_as_current_span(
                "send_resources_to_capital",
                context=context,  # Use the extracted context
                kind=SpanKind.SERVER,
                attributes={
                    "location_name": self.location_info["name"],
                    "location_type": self.location_info["type"]
                }
            ) as span:
                try:
                    location_state = self._get_location_state(self.location_id)
                    current_resources = location_state["resources"]
                    faction = location_state["faction"]

                    span.set_attribute("resources_amount", current_resources)
                    span.set_attribute("faction", faction)

                    if self.location_info["type"] != "village":
                        span.set_status(trace.StatusCode.ERROR, "Only villages can send resources")
                        self.logger.error("Only villages can send resources to capital")
                        return jsonify({
                            "success": False,
                            "message": "Only villages can send resources to capital"
                        }), 403

                    resource_factions = {"southern", "northern", "nights_watch"}
                    if faction not in resource_factions:
                        span.set_status(trace.StatusCode.ERROR, "Faction has no resource economy")
                        self.logger.error(
                            f"Faction {faction!r} has no resource economy; cannot send to capital"
                        )
                        return jsonify({
                            "success": False,
                            "message": "This faction does not send resources",
                        }), 403

                    # Target this faction's capital on the active map.
                    target_capital = self._find_capital(faction)
                    if not target_capital:
                        span.set_status(trace.StatusCode.ERROR, "No friendly capital on this map")
                        # Logged like every other failure branch so the rejection
                        # is visible in the log stream as well as on the span.
                        self.logger.error("No friendly capital to send resources to")
                        return jsonify({
                            "success": False,
                            "message": "No friendly capital to send resources to"
                        }), 400
                    path = self._find_path(target_capital, PathType.RESOURCE)
                    if not path:
                        span.set_status(trace.StatusCode.ERROR, "No valid path to capital")
                        self.logger.error("No valid path to capital found")
                        return jsonify({
                            "success": False,
                            "message": "No valid path to capital found"
                        }), 400

                    span.set_attribute("path_to_capital", str(path))

                    if self._transfer_resources_along_path(current_resources, path):
                        # The cooldown only starts once the transfer was accepted.
                        self._start_resource_cooldown()
                        self.logger.info(f"Resources sent to capital via {path}")
                        # Force metric collection after initiating resource transfer
                        self.telemetry.collect_metrics()
                        return jsonify({
                            "success": True,
                            "message": f"Sending {current_resources} resources to capital via {' -> '.join(path)}",
                            "path": path,
                            "amount": current_resources
                        })
                    else:
                        span.set_status(trace.StatusCode.ERROR, "Failed to start resource transfer")
                        self.logger.error("Failed to start resource transfer")
                        return jsonify({
                            "success": False,
                            "message": "Failed to start resource transfer"
                        }), 500
                except Exception as e:
                    # Any unexpected failure is recorded on the span and reported as a 500.
                    span.record_exception(e)
                    span.set_status(trace.StatusCode.ERROR, str(e))
                    self.logger.error(f"Error in send_resources_to_capital: {str(e)}")
                    return jsonify({
                        "success": False,
                        "message": f"Error: {str(e)}"
                    }), 500
        
        @self.app.route('/receive_resources', methods=['POST'])
        def receive_resources():
            """Accept a resource shipment and relay it onward if the route continues.

            The JSON body must contain ``resources`` and ``faction``; otherwise a
            400 is returned. ``source_location`` and ``remaining_path`` are
            optional and default to ``'unknown'`` and ``[]``.

            * If this location's current faction differs from the sender's, the
              shipment is added to the local stock ("captured"), the span is
              marked ERROR and ``success: False`` is returned without an
              explicit status code.
            * Otherwise the shipment is added to the local stock. When
              ``remaining_path`` holds more than one entry, a background thread
              forwards the shipment to ``remaining_path[1]`` and then deducts it
              locally. The HTTP response is returned without waiting for that
              thread.
            """
            data = request.get_json()
            if not data or 'resources' not in data or 'faction' not in data:
                return jsonify({"success": False, "message": "Invalid resource data"}), 400
            
            # Continue the caller's trace using the context carried in the headers.
            context = extract(request.headers)
            
            with self.tracer.start_as_current_span(
                "receive_resources",
                context=context,
                attributes={
                    "location": self.location_id,
                    "location_type": self.location_info["type"],
                    "sending_faction": data['faction'],
                    "receiving_faction": self._get_location_state(self.location_id)["faction"],
                    "resources_amount": data['resources']
                }
            ) as transfer_span:
                incoming_resources = data['resources']
                source_location = data.get('source_location', 'unknown')
                remaining_path = data.get('remaining_path', [])
                faction = data['faction']
                
                transfer_span.set_attribute("source_location", source_location)
                
                # State is read again here; the value used for the span attribute
                # above came from a separate call.
                location_state = self._get_location_state(self.location_id)
                current_resources = location_state["resources"]
                current_faction = location_state["faction"]
                
                if current_faction != faction:
                    # The location is held by a different faction: the shipment is
                    # kept here and not forwarded any further.
                    transfer_span.set_status(trace.Status(trace.StatusCode.ERROR, f"Resources captured by {current_faction}"))
                    self._update_location_state(self.location_id, resources=current_resources + incoming_resources)
                    # Force metric collection after resource capture
                    self.telemetry.collect_metrics()
                    self.logger.error(f"Resources captured by {current_faction}")
                    return jsonify({
                        "success": False,
                        "message": f"Resources captured by {current_faction}!",
                        "current_resources": current_resources + incoming_resources
                    })
                
                # NOTE(review): this is a read-then-write on the stored value; no
                # locking is visible here — confirm how concurrent requests are
                # handled by _update_location_state.
                new_resources = current_resources + incoming_resources
                self._update_location_state(self.location_id, resources=new_resources)
                # Force metric collection after receiving resources
                self.telemetry.collect_metrics()
                self.logger.info(f"Resources updated to {new_resources}")
                
                # remaining_path[0] is skipped and remaining_path[1] is the next hop;
                # presumably index 0 is this location — confirm against the sender.
                if len(remaining_path) > 1:
                    next_loc = remaining_path[1]
                    
                    def continue_transfer():
                        """Forward the shipment to ``next_loc`` after a 5 second delay."""
                        with self._start_movement_trace(
                            "resource_movement",
                            self.location_id,
                            next_loc,
                            resources=incoming_resources
                        ) as movement_span:
                            try:
                                time.sleep(5)
                                target_url = f"{self.get_location_url(next_loc)}/receive_resources"
                                self.logger.info(f"Sending resources to {next_loc} with target URL: {target_url}")
                                result = self._make_request_with_trace('post', target_url, {
                                    "resources": incoming_resources,
                                    "source_location": self.location_id,
                                    "remaining_path": remaining_path[1:],
                                    "faction": faction
                                }, span_name="http_request.forward_resources")
                                
                                if not result.get("success", False):
                                    movement_span.set_status(trace.Status(trace.StatusCode.ERROR, "Resource transfer failed"))
                                
                                # The deduction below runs whether or not the next hop
                                # reported success; only an exception skips it.
                                current_state = self._get_location_state(self.location_id)
                                self._update_location_state(self.location_id, 
                                    resources=current_state["resources"] - incoming_resources)
                                # Force metric collection after forwarding resources
                                self.telemetry.collect_metrics()
                                self.logger.info(f"Resources updated to {current_state['resources'] - incoming_resources}")
                            except Exception as e:
                                # On failure the shipment stays in this location's stock.
                                movement_span.set_status(trace.Status(trace.StatusCode.ERROR, str(e)))
                                self.logger.error(f"Failed to forward resources to {next_loc}: {str(e)}")
                    
                    # Fire-and-forget: the thread is neither joined nor tracked.
                    Thread(target=continue_transfer).start()
                
                # "final_resources" is the stock right after receipt, before any
                # onward forwarding deducts the shipment again.
                transfer_span.set_attribute("final_resources", new_resources)
                if self.location_info["type"] == "capital":
                    transfer_span.set_attribute("resources_reached_capital", True)
                
                self.logger.info(f"Resources received at {self.location_info['name']}")
                return jsonify({
                    "success": True,
                    "message": f"Resources received at {self.location_info['name']}",
                    "current_resources": new_resources
                })
    
    def run(self):
        """Serve this location's Flask app on all interfaces.

        The port is taken from ``self.location_info["port"]``. The startup
        message is logged *before* ``app.run`` is called: that call blocks
        until the server stops, so a log statement placed after it would only
        be emitted at shutdown.
        """
        port = self.location_info["port"]
        self.logger.info(f"Location server running on port {port}")
        self.app.run(host='0.0.0.0', port=port)


if __name__ == '__main__':
    # Docker entrypoint. Per the original author: the SLOT_ID env var is read,
    # identity is resolved from the shared active_map_id, and SERVICE_NAME
    # comes from LOCATION_NAME (set per-container in docker-compose.yml) or is
    # synthesised from the slot.
    server = LocationServer()
    server.run()

================================================
FILE: game-of-tracing/app/requirements.txt
================================================
flask==3.1.3
requests==2.33.1
opentelemetry-api==1.41.1
opentelemetry-sdk==1.41.1
opentelemetry-exporter-otlp==1.41.1
pyroscope-io==1.0.6
pyroscope-otel==1.0.0


================================================
FILE: game-of-tracing/app/run_game.py
================================================
import os
import sys
import json
import sqlite3
import argparse
import multiprocessing
from game_config import LOCATIONS, DATABASE_FILE
from location_server import LocationServer

def reset_game():
    """Reset the ``locations`` table to the initial values from ``LOCATIONS``.

    The database path comes from the ``DATABASE_FILE`` environment variable,
    falling back to the ``DATABASE_FILE`` constant. When no file exists at
    that path nothing is modified and a notice is printed instead.

    All existing rows are deleted and one row per ``LOCATIONS`` entry is
    inserted as ``(id, initial_resources, initial_army, faction)``. The
    delete and the inserts are committed together; if any statement fails the
    connection is closed without committing and the exception propagates.

    Raises:
        sqlite3.Error: if the database cannot be opened or a statement fails.
        KeyError: if a ``LOCATIONS`` entry lacks one of the initial-value keys.
    """
    db_path = os.environ.get('DATABASE_FILE', DATABASE_FILE)

    if not os.path.exists(db_path):
        print("Database not found. It will be created when the game starts.")
        return

    # Build the rows up front so a malformed LOCATIONS entry fails before the
    # database is touched at all.
    initial_rows = [
        (
            loc_id,
            loc_info["initial_resources"],
            loc_info["initial_army"],
            loc_info["faction"],
        )
        for loc_id, loc_info in LOCATIONS.items()
    ]

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Delete all data
        cursor.execute("DELETE FROM locations")

        # Reinitialize locations
        cursor.executemany("INSERT INTO locations VALUES (?, ?, ?, ?)", initial_rows)

        conn.commit()
    finally:
        # Always release the file handle, including when a statement raised.
        conn.close()

    print(f"Game reset successfully. Database {db_path} reset to initial state.")

def run_location(location_id):
    """Announce one location and serve it; used as a ``multiprocessing`` target.

    Args:
        location_id: key into ``LOCATIONS`` naming the location to serve.
    """
    info = LOCATIONS[location_id]
    print(f"Starting {info['name']} (Port: {info['port']})")
    LocationServer(location_id).run()

def run_single_location():
    """Serve the single location named by the ``LOCATION_ID`` environment variable.

    Exits with status 1 after printing an error when the variable is unset or
    empty, or when it names a location that is not present in ``LOCATIONS``.
    """
    location_id = os.environ.get('LOCATION_ID')

    # Work out which validation failed, if any, then bail out in one place.
    problem = None
    if not location_id:
        problem = "Error: LOCATION_ID environment variable not set"
    elif location_id not in LOCATIONS:
        problem = f"Error: Invalid location_id '{location_id}'"

    if problem is not None:
        print(problem)
        sys.exit(1)

    info = LOCATIONS[location_id]
    print(f"Starting {info['name']} server (Port: {info['port']})")
    LocationServer(location_id).run()

def show_game_state():
    """Print faction, army and resources for every row of the ``locations`` table.

    The database path comes from the ``DATABASE_FILE`` environment variable,
    falling back to the ``DATABASE_FILE`` constant. A missing file, an empty
    table or a SQLite error each print a short notice instead of raising.

    Rows whose ``id`` is not a key of ``LOCATIONS`` (for example a database
    left over from a different location set) are shown with the id in place
    of the display name rather than aborting the listing.
    """
    db_path = os.environ.get('DATABASE_FILE', DATABASE_FILE)

    if not os.path.exists(db_path):
        print("Database not found. Starting a new game...")
        return

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # Row objects allow the column-name lookups used below.
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        cursor.execute("SELECT * FROM locations")
        rows = cursor.fetchall()

        if not rows:
            print("No game state found. Starting a new game...")
            return

        print("Current Game State:")
        for row in rows:
            loc_id = row['id']
            # Fall back to the raw id when the row is unknown to LOCATIONS.
            name = LOCATIONS.get(loc_id, {}).get('name', loc_id)
            print(f"{name} ({loc_id}): Faction={row['faction']}, Army={row['army']}, Resources={row['resources']}")
    except sqlite3.Error as e:
        print(f"Error accessing database: {e}")
        print("Starting a new game...")
    finally:
        # Runs for the early return and the error path as well.
        if conn is not None:
            conn.close()

def run_game(reset=False):
    """Start the game: one location when ``LOCATION_ID`` is set, otherwise all.

    Args:
        reset: when true, ``reset_game()`` is called before anything starts.

    With ``LOCATION_ID`` present in the environment only that location is
    served in the current process. Otherwise the stored state is printed, one
    process per ``LOCATIONS`` entry is started, usage hints are printed, and
    the call waits on the processes until they exit or Ctrl-C is pressed.
    """
    if reset:
        reset_game()

    # Container mode: a single location per process.
    if os.environ.get('LOCATION_ID'):
        run_single_location()
        return

    # Show initial game state
    show_game_state()

    # One child process per configured location.
    workers = []
    for location_id in LOCATIONS:
        worker = multiprocessing.Process(target=run_location, args=(location_id,))
        worker.start()
        workers.append(worker)

    banner = (
        "\nAll locations are running!",
        "Game Instructions:",
        "1. Each location is running a Flask server at its designated port",
        "2. Use HTTP requests to interact with locations",
        "3. Example commands:",
        "   - Get location info: curl http://localhost:[PORT]/",
        "   - Collect resources: curl -X POST http://localhost:[PORT]/collect_resources",
        "   - Create army: curl -X POST http://localhost:[PORT]/create_army",
        "   - Move army: curl -X POST -H \"Content-Type: application/json\" -d '{\"target_location\":\"village_1\"}' http://localhost:[PORT]/move_army",
        "   - Reset game: curl -X POST http://localhost:[PORT]/reset",
        "4. Or use the game client: python game_client.py map",
    )
    for line in banner:
        print(line)

    try:
        # Blocks until every child exits (they won't unless terminated).
        for worker in workers:
            worker.join()
    except KeyboardInterrupt:
        print("\nShutting down all servers...")
        for worker in workers:
            worker.terminate()
        print("Game ended.")

if __name__ == "__main__":
    # Command-line entry point: the only option is --reset, passed through
    # to run_game() as its ``reset`` flag.
    cli = argparse.ArgumentParser(description="War of Westeros Game")
    cli.add_argument("--reset", action="store_true", help="Reset the game state")
    run_game(cli.parse_args().reset)

================================================
FILE: game-of-tracing/app/telemetry.py
================================================
import os

from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry import trace

# Logging setup
import logging
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry._logs import set_logger_provider

# Metrics setup
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.metrics import TraceBasedExemplarFilter
from opentelemetry.metrics import CallbackOptions, Observation
from typing import Iterable

# Profiling setup (Pyroscope v2 + OTel span-profile linking)
import pyroscope
from pyroscope.otel import PyroscopeSpanProcessor

class GameTelemetry:
    """OpenTelemetry logging, tracing and metrics plus Pyroscope profiling for one service.

    Constructing an instance installs global logger, tracer and meter
    providers tagged with ``service_name`` and registers the game gauges.

    The gauge callbacks read game state through attributes that are expected
    to be attached to the instance from outside this class:
    ``_get_location_state`` (callable taking a location id),
    ``resource_cooldown`` (mapping of location id to a ``datetime``) and the
    optional ``_location_id`` / ``_location_type`` overrides. A callback whose
    attribute is absent simply yields nothing.
    """

    def __init__(self, service_name, logging_endpoint="http://alloy:4318", tracing_endpoint="http://alloy:4317", metrics_endpoint="http://alloy:4318"):
        """Configure all telemetry signals for ``service_name``.

        Args:
            service_name: value of the ``service.name`` resource attribute;
                also used as the logger name, the Pyroscope application name
                and the ``location`` metric label.
            logging_endpoint: base URL for OTLP/HTTP logs (``/v1/logs`` is appended).
            tracing_endpoint: endpoint for the OTLP gRPC span exporter.
            metrics_endpoint: base URL for OTLP/HTTP metrics (``/v1/metrics`` is appended).
        """
        self.service_name = service_name
        self.logging_endpoint = logging_endpoint
        self.tracing_endpoint = tracing_endpoint
        self.metrics_endpoint = metrics_endpoint
        self.resource = Resource.create(attributes={
            SERVICE_NAME: service_name
        })

        # Logging goes first: the later setup steps log through self.logger.
        self._setup_logging()
        self._setup_tracing()
        self._setup_metrics()
        # Profiling attaches a span processor to the tracer provider, so it
        # has to run after _setup_tracing().
        self._setup_profiling()

    def _setup_logging(self):
        """Configure OpenTelemetry logging.

        Installs a global ``LoggerProvider`` exporting over OTLP/HTTP in small
        batches, attaches an OTel handler to the root logger, sets the root
        level to INFO and stores a logger named after the service.
        """
        self.logger_provider = LoggerProvider(resource=self.resource)
        set_logger_provider(self.logger_provider)

        log_exporter = OTLPLogExporter(
            endpoint=f"{self.logging_endpoint}/v1/logs"
        )

        self.logger_provider.add_log_record_processor(
            BatchLogRecordProcessor(
                exporter=log_exporter,
                max_queue_size=30,
                max_export_batch_size=5
            )
        )

        # Setup root logger. The handler itself does not filter (NOTSET);
        # the root logger level below decides what reaches it.
        handler = LoggingHandler(
            level=logging.NOTSET,
            logger_provider=self.logger_provider
        )
        logging.getLogger().addHandler(handler)
        logging.getLogger().setLevel(logging.INFO)

        self.logger = logging.getLogger(self.service_name)

    def _setup_tracing(self):
        """Configure OpenTelemetry tracing with an OTLP gRPC exporter (no TLS)."""
        trace.set_tracer_provider(TracerProvider(resource=self.resource))

        # NOTE(review): "/v1/traces" is the OTLP/HTTP path convention; this is
        # the gRPC exporter — confirm the suffix is ignored rather than needed.
        otlp_exporter = OTLPSpanExporter(
            endpoint=f"{self.tracing_endpoint}/v1/traces",
            insecure=True
        )

        # A batch size of 1 exports each finished span in its own batch.
        span_processor = BatchSpanProcessor(
            span_exporter=otlp_exporter,
            max_export_batch_size=1
        )

        trace.get_tracer_provider().add_span_processor(span_processor)
        self.tracer = trace.get_tracer(__name__)

    def _setup_profiling(self):
        """Configure Pyroscope profiling + OTel span-profile linkage.

        Pyroscope collects CPU samples from this process and pushes pprof to
        the configured server (``PYROSCOPE_SERVER_ADDRESS``, defaulting to
        ``http://alloy:9999``). ``PyroscopeSpanProcessor`` attaches the current
        profile id to every span so the trace view in Grafana can link back
        to the flamegraph captured while each span was active.
        """
        pyroscope.configure(
            application_name=self.service_name,
            server_address=os.getenv("PYROSCOPE_SERVER_ADDRESS", "http://alloy:9999"),
            tags={"service_name": self.service_name},
            oncpu=True,
            gil_only=True,
        )
        trace.get_tracer_provider().add_span_processor(PyroscopeSpanProcessor())

    def _setup_metrics(self):
        """Configure OpenTelemetry metrics and register the game instruments."""
        # Create the metrics exporter
        self.metric_exporter = OTLPMetricExporter(
            endpoint=f"{self.metrics_endpoint}/v1/metrics"
        )

        # Set up periodic metric reader with manual collection capability
        self.metric_reader = PeriodicExportingMetricReader(
            self.metric_exporter,
            export_interval_millis=10000  # Export every 10 seconds
        )

        # Create and set meter provider with exemplar support
        self.meter_provider = MeterProvider(
            metric_readers=[self.metric_reader],
            resource=self.resource,
            exemplar_filter=TraceBasedExemplarFilter()
        )
        metrics.set_meter_provider(self.meter_provider)

        # Get meter for creating metrics
        self.meter = metrics.get_meter(__name__)

        # Create observable gauges for game metrics
        self._setup_game_gauges()

    def _setup_game_gauges(self):
        """Create the observable gauges and the battle counter."""
        # Resource gauge
        self.resource_gauge = self.meter.create_observable_gauge(
            name="game.resources",
            description="Current resources at location",
            callbacks=[self._observe_resources],
            unit="1"
        )

        # Army size gauge
        self.army_gauge = self.meter.create_observable_gauge(
            name="game.army_size",
            description="Current army size at location",
            callbacks=[self._observe_army_size],
            unit="1"
        )

        # Battle counter
        self.battle_counter = self.meter.create_counter(
            name="game.battles",
            description="Number of battles fought",
            unit="1"
        )

        # Resource transfer cooldown gauge (seconds remaining)
        self.cooldown_gauge = self.meter.create_observable_gauge(
            name="game.resource_transfer_cooldown",
            description="Resource transfer cooldown status",
            callbacks=[self._observe_resource_cooldown],
            unit="s"
        )

        # Location control gauge
        self.control_gauge = self.meter.create_observable_gauge(
            name="game.location_control",
            description="Current faction controlling the location",
            callbacks=[self._observe_location_control],
            unit="1"
        )

        # Log that metrics have been set up
        self.logger.info("Game metrics initialized")

    # Faction → numeric value for the ``game.location_control`` gauge.
    # Existing WoK values (0/1/2) preserved for dashboard backward compat;
    # new factions appended with fresh values. Unknown factions map to -1
    # in ``_observe_location_control``.
    _FACTION_VALUE = {
        "neutral": 0,
        "northern": 1,
        "southern": 2,
        "nights_watch": 3,
        "white_walkers": 4,
        "barbarian": 5,
    }

    def _active_location_id(self):
        """Return the currently served logical location id.

        ``LocationServer`` sets ``self._location_id`` on the telemetry instance
        at boot and refreshes it on ``/reload``. Fall back to the legacy
        ``service_name.replace('-', '_')`` pattern for non-slot deployments.
        """
        return getattr(self, "_location_id", None) or self.service_name.replace("-", "_")

    def _active_location_type(self):
        """Return ``self._location_type`` when set and truthy, else ``"village"``."""
        return getattr(self, "_location_type", None) or "village"

    def _observe_resources(self, options: CallbackOptions) -> Iterable[Observation]:
        """Gauge callback: yield the current resource count of the active location.

        Yields nothing when no ``_get_location_state`` is attached or it
        returns a falsy state. Errors are logged, never raised.
        """
        try:
            location_id = self._active_location_id()
            if hasattr(self, '_get_location_state'):
                state = self._get_location_state(location_id)
                if state:
                    self.logger.debug(f"Observing resources for {location_id}: {state['resources']}")
                    yield Observation(
                        value=state["resources"],
                        attributes={
                            "location": self.service_name,
                            "location_type": self._active_location_type(),
                        }
                    )
        except Exception as e:
            self.logger.error(f"Error observing resources: {e}")

    def _observe_army_size(self, options: CallbackOptions) -> Iterable[Observation]:
        """Gauge callback: yield the current army size, labelled with the faction.

        Yields nothing when no ``_get_location_state`` is attached or it
        returns a falsy state. Errors are logged, never raised.
        """
        try:
            location_id = self._active_location_id()
            if hasattr(self, '_get_location_state'):
                state = self._get_location_state(location_id)
                if state:
                    self.logger.debug(f"Observing army size for {location_id}: {state['army']}")
                    yield Observation(
                        value=state["army"],
                        attributes={
                            "location": self.service_name,
                            "location_type": self._active_location_type(),
                            "faction": state["faction"],
                        }
                    )
        except Exception as e:
            self.logger.error(f"Error observing army size: {e}")

    def _observe_resource_cooldown(self, options: CallbackOptions) -> Iterable[Observation]:
        """Gauge callback: yield the seconds left on the resource-transfer cooldown.

        Yields the remaining seconds while the cooldown is in the future and
        ``0`` once it has passed. Nothing is yielded when ``resource_cooldown``
        is not attached or has no entry for the active location.

        Both cases carry the same ``location`` label (the service name, as on
        every other gauge) so the active and the expired cooldown form a
        single time series.
        """
        try:
            from datetime import datetime
            location_id = self._active_location_id()
            if hasattr(self, 'resource_cooldown') and location_id in self.resource_cooldown:
                cooldown = self.resource_cooldown[location_id]
                now = datetime.now()
                if cooldown > now:
                    cooldown_value = (cooldown - now).total_seconds()
                    self.logger.debug(f"Observing cooldown for {location_id}: {cooldown_value}s")
                else:
                    cooldown_value = 0
                yield Observation(
                    value=cooldown_value,
                    attributes={"location": self.service_name}
                )
        except Exception as e:
            self.logger.error(f"Error observing resource cooldown: {e}")

    def _observe_location_control(self, options: CallbackOptions) -> Iterable[Observation]:
        """Gauge callback: yield the numeric code of the controlling faction.

        The code comes from ``_FACTION_VALUE``; a faction missing from that
        table is reported as ``-1``. Yields nothing when no
        ``_get_location_state`` is attached or it returns a falsy state.
        """
        try:
            location_id = self._active_location_id()
            if hasattr(self, '_get_location_state'):
                state = self._get_location_state(location_id)
                if state:
                    faction_value = self._FACTION_VALUE.get(state["faction"], -1)
                    self.logger.debug(
                        f"Observing control for {location_id}: {state['faction']} ({faction_value})"
                    )
                    yield Observation(
                        value=faction_value,
                        attributes={
                            "location": self.service_name,
                            "location_type": self._active_location_type(),
                            "faction": state["faction"],
                        }
                    )
        except Exception as e:
            self.logger.error(f"Error observing location control: {e}")

    def get_tracer(self):
        """Get the configured tracer"""
        return self.tracer

    def get_logger(self):
        """Get the configured logger"""
        return self.logger

    def get_meter(self):
        """Get the configured meter"""
        return self.meter

    def record_battle(self, attacker_faction: str, defender_faction: str, result: str):
        """Count one battle and force an immediate metrics export.

        Args:
            attacker_faction: faction label of the attacker.
            defender_faction: faction label of the defender.
            result: outcome label stored on the counter.

        Errors are logged and swallowed so a telemetry failure never breaks
        the caller.
        """
        try:
            self.battle_counter.add(
                1,
                {
                    "attacker_faction": attacker_faction,
                    "defender_faction": defender_faction,
                    "result": result,
                    "location": self.service_name
                }
            )
            self.logger.info(f"Battle recorded: {attacker_faction} vs {defender_faction} - {result}")
            # Force collection of all metrics
            self.collect_metrics()
        except Exception as e:
            self.logger.error(f"Error recording battle: {e}")

    def collect_metrics(self):
        """Force collection and export of all metrics; errors are logged only."""
        try:
            # Collect metrics immediately
            self.metric_reader.collect()
            # Force flush to ensure metrics are exported
            self.meter_provider.force_flush()
            self.logger.debug("Metrics collected and flushed")
        except Exception as e:
            self.logger.error(f"Error collecting metrics: {e}")

    def shutdown(self):
        """Flush and shutdown all telemetry providers.

        Each provider is shut down independently and failures are ignored on
        purpose, so one broken provider cannot prevent the others from
        flushing.
        """
        try:
            trace.get_tracer_provider().shutdown()
        except Exception:
            pass
        try:
            self.meter_provider.shutdown()
        except Exception:
            pass
        try:
            self.logger_provider.shutdown()
        except Exception:
            pass


================================================
FILE: game-of-tracing/config-otel.yaml
================================================
#
# OTel Collector YAML Configuration for Game of Tracing
#
# This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine.
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
#

# NOTE(review): config.alloy in this scenario also accepts pprof profiles on
# port 9999 and forwards them to Pyroscope, and the app's telemetry.py pushes
# profiles to http://alloy:9999 by default. Nothing in this file listens on
# 9999 — confirm whether profiles are expected to be dropped in OTel mode.

# OTLP ingest: gRPC on 4317 and HTTP on 4318, both bound to all interfaces.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

# Batch telemetry before export; the empty mapping keeps the processor defaults.
processors:
  batch: {}

exporters:
  # Traces go to Tempo over OTLP/gRPC with TLS verification disabled.
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  # Logs go to Loki's OTLP HTTP endpoint.
  otlphttp/logs:
    endpoint: http://loki:3100/otlp

  # Metrics go to Prometheus' OTLP HTTP endpoint.
  otlphttp/metrics:
    endpoint: http://prometheus:9090/api/v1/otlp

# One pipeline per signal: shared OTLP receiver and batch processor, with a
# signal-specific exporter.
service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [otlp/tempo]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [otlphttp/logs]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [otlphttp/metrics]


================================================
FILE: game-of-tracing/config.alloy
================================================
/*
 * Alloy Configuration for OpenTelemetry Trace Collection with Tail Sampling
 */

// Receive OpenTelemetry traces
otelcol.receiver.otlp "default" {
  http {}
  grpc {}

  output {
    metrics = [otelcol.processor.batch.default.input]
    logs = [otelcol.processor.batch.default.input]
    traces = [otelcol.processor.batch.default.input]
  }
}

// Batch processor to improve performance
otelcol.processor.batch "default" {
  output {
    traces = [otelcol.exporter.otlp.tempo.input]
    logs = [otelcol.exporter.otlphttp.logs.input]
    metrics = [otelcol.exporter.otlphttp.metrics.input]
  }
}

// Send sampled traces to Tempo
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo:4317"
    tls {
      insecure = true
    }
  }
} 

otelcol.exporter.otlphttp "logs" {
  client {
    endpoint = "http://loki:3100/otlp"
  }

}

otelcol.exporter.otlphttp "metrics" {
  client {
    endpoint = "http://prometheus:9090/api/v1/otlp"
  }
}

// Listen on 0.0.0.0:9999 for profiles pushed by the Python services and relay
// them to the Pyroscope writer below.
pyroscope.receive_http "default" {
  forward_to = [pyroscope.write.default.receiver]

  http {
    listen_port    = 9999
    listen_address = "0.0.0.0"
  }
}

// Write received profiles to the Pyroscope server at pyroscope:4040.
pyroscope.write "default" {
  endpoint {
    url = "http://pyroscope:4040"
  }
}

// Turn on Alloy's live debugging feature.
livedebugging {
  enabled = true
}

================================================
FILE: game-of-tracing/docker-compose-otel.yml
================================================
# OTel Engine Override
#
# Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config
# instead of the Alloy-syntax (formerly River) config.alloy file.
#
# Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
# Stop:  docker compose -f docker-compose.yml -f docker-compose-otel.yml down
#

services:
  # NOTE(review): config-otel.yaml defines no Pyroscope receiver, so profile pushes
  # to alloy:9999 presumably fail while this override is active - confirm.
  alloy:
    # Supplies the command that starts the OTel Engine with the YAML config.
    command: otel --config=/etc/alloy/config-otel.yaml
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
    ports:
      - '8888:8888'  # OTel Engine HTTP server


================================================
FILE: game-of-tracing/docker-compose.coda.yml
================================================
services:
  # Southern Capital location server, published on host port 5001.
  southern-capital:
    build:
      context: ./app
      dockerfile: Dockerfile
    # Instantiate LocationServer for this location inline and run it.
    command:
      - python
      - '-c'
      - "from location_server import LocationServer; server = LocationServer('southern_capital'); server.run()"
    environment:
      - LOCATION_ID=southern_capital
      - FLASK_APP=location_server.py
      - LOCATION_NAME=southern-capital
      - DATABASE_FILE=/data/game_state.db
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5001:5001'
    # Probe /health on port 5001 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]

  # Northern Capital location server, published on host port 5002.
  northern-capital:
    build:
      context: ./app
      dockerfile: Dockerfile
    # Instantiate LocationServer for this location inline and run it.
    command:
      - python
      - '-c'
      - "from location_server import LocationServer; server = LocationServer('northern_capital'); server.run()"
    environment:
      - LOCATION_ID=northern_capital
      - FLASK_APP=location_server.py
      - LOCATION_NAME=northern-capital
      - DATABASE_FILE=/data/game_state.db
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5002:5002'
    # Probe /health on port 5002 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"]

  # Village 1 location server, published on host port 5003.
  village-1:
    build:
      context: ./app
      dockerfile: Dockerfile
    # Instantiate LocationServer for this location inline and run it.
    command:
      - python
      - '-c'
      - "from location_server import LocationServer; server = LocationServer('village_1'); server.run()"
    environment:
      - LOCATION_ID=village_1
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-1
      - DATABASE_FILE=/data/game_state.db
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5003:5003'
    # Probe /health on port 5003 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5003/health')"]

  # Village 2 location server, published on host port 5004.
  village-2:
    build:
      context: ./app
      dockerfile: Dockerfile
    # Instantiate LocationServer for this location inline and run it.
    command:
      - python
      - '-c'
      - "from location_server import LocationServer; server = LocationServer('village_2'); server.run()"
    environment:
      - LOCATION_ID=village_2
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-2
      - DATABASE_FILE=/data/game_state.db
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5004:5004'
    # Probe /health on port 5004 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5004/health')"]

  # Village 3 location server, published on host port 5005.
  village-3:
    build:
      context: ./app
      dockerfile: Dockerfile
    # Instantiate LocationServer for this location inline and run it.
    command:
      - python
      - '-c'
      - "from location_server import LocationServer; server = LocationServer('village_3'); server.run()"
    environment:
      - LOCATION_ID=village_3
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-3
      - DATABASE_FILE=/data/game_state.db
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5005:5005'
    # Probe /health on port 5005 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5005/health')"]

  # Village 4 location server, published on host port 5006.
  village-4:
    build:
      context: ./app
      dockerfile: Dockerfile
    # Instantiate LocationServer for this location inline and run it.
    command:
      - python
      - '-c'
      - "from location_server import LocationServer; server = LocationServer('village_4'); server.run()"
    environment:
      - LOCATION_ID=village_4
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-4
      - DATABASE_FILE=/data/game_state.db
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5006:5006'
    # Probe /health on port 5006 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5006/health')"]

  # Village 5 location server, published on host port 5007.
  village-5:
    build:
      context: ./app
      dockerfile: Dockerfile
    # Instantiate LocationServer for this location inline and run it.
    command:
      - python
      - '-c'
      - "from location_server import LocationServer; server = LocationServer('village_5'); server.run()"
    environment:
      - LOCATION_ID=village_5
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-5
      - DATABASE_FILE=/data/game_state.db
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5007:5007'
    # Probe /health on port 5007 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5007/health')"]

  # Village 6 location server, published on host port 5008.
  village-6:
    build:
      context: ./app
      dockerfile: Dockerfile
    # Instantiate LocationServer for this location inline and run it.
    command:
      - python
      - '-c'
      - "from location_server import LocationServer; server = LocationServer('village_6'); server.run()"
    environment:
      - LOCATION_ID=village_6
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-6
      - DATABASE_FILE=/data/game_state.db
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5008:5008'
    # Probe /health on port 5008 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5008/health')"]

  # War map web application, published on host port 8080.
  war-map:
    build:
      context: ./war_map
      dockerfile: Dockerfile
    environment:
      - DATABASE_FILE=/data/game_state.db
      - LOCATION_NAME=war-map
      # Fixed demo value committed to the repository.
      - SECRET_KEY=war_of_westeros_secret_key
      - IN_DOCKER=1
      - AI_URL=http://ai-opponent:8081
      - TEMPO_URL=http://tempo:3200
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '8080:8080'
    # Probe /health on port 8080 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"]
    # Held back until every location server reports healthy.
    depends_on:
      southern-capital: {condition: service_healthy}
      northern-capital: {condition: service_healthy}
      village-1: {condition: service_healthy}
      village-2: {condition: service_healthy}
      village-3: {condition: service_healthy}
      village-4: {condition: service_healthy}
      village-5: {condition: service_healthy}
      village-6: {condition: service_healthy}

  # AI opponent service, published on host port 8081.
  ai-opponent:
    build:
      context: ./ai_opponent
      dockerfile: Dockerfile
    environment:
      - IN_DOCKER=1
      - LOCATION_NAME=ai-opponent
    ports:
      - '8081:8081'
    # Probe /health on port 8081 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')"]
    # Held back until every location server reports healthy.
    depends_on:
      southern-capital: {condition: service_healthy}
      northern-capital: {condition: service_healthy}
      village-1: {condition: service_healthy}
      village-2: {condition: service_healthy}
      village-3: {condition: service_healthy}
      village-4: {condition: service_healthy}
      village-5: {condition: service_healthy}
      village-6: {condition: service_healthy}

# Named volumes. game-data is mounted at /data by the location servers and war-map.
volumes:
  game-data:


================================================
FILE: game-of-tracing/docker-compose.yml
================================================
# Compose Specification file. The obsolete top-level `version` key is omitted:
# current Docker Compose ignores it and prints a warning, and this file already
# relies on spec features such as long-form `depends_on` conditions.

services:
  # Loki log store, published on host port 3100.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command: '-config.file=/etc/loki/local-config.yaml'
    volumes:
      # Local Loki config mounted at the path given to -config.file.
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    ports:
      - '3100:3100'

  # Prometheus metrics store, published on host port 9090.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    volumes:
      # Local config mounted at the path given to --config.file.
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - '9090:9090/tcp'
    # Receiver flags open the remote-write and OTLP ingestion endpoints;
    # the feature flags turn on native histograms and exemplar storage.
    command:
      - '--web.enable-remote-write-receiver'
      - '--web.enable-otlp-receiver'
      - '--enable-feature=native-histograms'
      - '--enable-feature=exemplar-storage'
      - '--config.file=/etc/prometheus/prometheus.yml'

  # Tempo trace store; only its HTTP port 3200 is published to the host.
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command:
      - '-config.file=/etc/tempo.yaml'
    volumes:
      # Local config mounted at the path given to -config.file.
      - ./tempo-config.yaml:/etc/tempo.yaml
    ports:
      - '3200:3200/tcp'  # tempo
    # Short form: only orders container start-up after prometheus.
    depends_on: [prometheus]

  # Pyroscope v2 continuous-profiling backend, published on host port 4040.
  pyroscope:
    image: grafana/pyroscope:${GRAFANA_PYROSCOPE_VERSION:-2.0.1}
    command:
      - '-config.file=/etc/pyroscope/config.yaml'
      - '-architecture.storage=v1-v2-dual'
    volumes:
      # Local config mounted at the path given to -config.file.
      - ./pyroscope-config.yaml:/etc/pyroscope/config.yaml
      # Named volume mounted at /data.
      - pyroscope-data:/data
    ports:
      - '4040:4040'

  # Grafana UI, published on host port 3000.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    volumes:
      # Provisioning content is read from the local ./grafana directory.
      - ./grafana:/etc/grafana/provisioning
    ports:
      - '3000:3000/tcp'
    environment:
      - GF_SECURITY_ALLOW_EMBEDDING=true
      - GF_SECURITY_DISABLE_SANITIZE_HTML=true
      - GF_FEATURE_TOGGLES_ENABLE=dashboardNewLayouts,kubernetesDashboards,provisioning
      - GF_PATHS_PERMITTED_PROVISIONING_PATHS=grafana/|/etc/grafana/provisioning/dashboards/
      - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app
    # Short form: only orders container start-up after these services.
    depends_on: [prometheus, tempo, pyroscope]

  # Alloy telemetry pipeline: receives OTLP and Pyroscope pushes and forwards
  # them according to config.alloy.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    # Folded scalar: the lines below join into a single space-separated command.
    command: >-
      run
      --server.http.listen-addr=0.0.0.0:12345
      --storage.path=/var/lib/alloy/data
      /etc/alloy/config.alloy
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    ports:
      - '12345:12345'     # Alloy HTTP server
      - '4317:4317/tcp'   # OTLP gRPC
      - '4318:4318/tcp'   # OTLP HTTP
      - '9999:9999/tcp'   # Pyroscope HTTP receiver
    # Short form: only orders container start-up after pyroscope.
    depends_on: [pyroscope]

  # Game of Kingdoms War Game Services

  # Southern Capital location server (slot_1), published on host port 5001.
  southern-capital:
    build:
      context: ./app
      dockerfile: Dockerfile
    command:
      - python
      - location_server.py
    environment:
      - SLOT_ID=slot_1
      - LOCATION_ID=southern_capital
      - FLASK_APP=location_server.py
      - LOCATION_NAME=southern-capital
      - DATABASE_FILE=/data/game_state.db
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
      - IN_DOCKER=1
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5001:5001'
    # Probe /health on port 5001 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
    # Short form: only orders container start-up after alloy.
    depends_on: [alloy]

  # Northern Capital location server (slot_2), published on host port 5002.
  northern-capital:
    build:
      context: ./app
      dockerfile: Dockerfile
    command:
      - python
      - location_server.py
    environment:
      - SLOT_ID=slot_2
      - LOCATION_ID=northern_capital
      - FLASK_APP=location_server.py
      - LOCATION_NAME=northern-capital
      - DATABASE_FILE=/data/game_state.db
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
      - IN_DOCKER=1
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5002:5002'
    # Probe /health on port 5002 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"]
    # Short form: only orders container start-up after alloy.
    depends_on: [alloy]

  # Village 1 location server (slot_3), published on host port 5003.
  village-1:
    build:
      context: ./app
      dockerfile: Dockerfile
    command:
      - python
      - location_server.py
    environment:
      - SLOT_ID=slot_3
      - LOCATION_ID=village_1
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-1
      - DATABASE_FILE=/data/game_state.db
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
      - IN_DOCKER=1
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5003:5003'
    # Probe /health on port 5003 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5003/health')"]
    # Short form: only orders container start-up after alloy.
    depends_on: [alloy]

  # Village 2 location server (slot_4), published on host port 5004.
  village-2:
    build:
      context: ./app
      dockerfile: Dockerfile
    command:
      - python
      - location_server.py
    environment:
      - SLOT_ID=slot_4
      - LOCATION_ID=village_2
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-2
      - DATABASE_FILE=/data/game_state.db
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
      - IN_DOCKER=1
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5004:5004'
    # Probe /health on port 5004 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5004/health')"]
    # Short form: only orders container start-up after alloy.
    depends_on: [alloy]

  # Village 3 location server (slot_5), published on host port 5005.
  village-3:
    build:
      context: ./app
      dockerfile: Dockerfile
    command:
      - python
      - location_server.py
    environment:
      - SLOT_ID=slot_5
      - LOCATION_ID=village_3
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-3
      - DATABASE_FILE=/data/game_state.db
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
      - IN_DOCKER=1
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5005:5005'
    # Probe /health on port 5005 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5005/health')"]
    # Short form: only orders container start-up after alloy.
    depends_on: [alloy]

  # Village 4 location server (slot_6), published on host port 5006.
  village-4:
    build:
      context: ./app
      dockerfile: Dockerfile
    command:
      - python
      - location_server.py
    environment:
      - SLOT_ID=slot_6
      - LOCATION_ID=village_4
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-4
      - DATABASE_FILE=/data/game_state.db
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
      - IN_DOCKER=1
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5006:5006'
    # Probe /health on port 5006 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5006/health')"]
    # Short form: only orders container start-up after alloy.
    depends_on: [alloy]

  # Village 5 location server (slot_7), published on host port 5007.
  village-5:
    build:
      context: ./app
      dockerfile: Dockerfile
    command:
      - python
      - location_server.py
    environment:
      - SLOT_ID=slot_7
      - LOCATION_ID=village_5
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-5
      - DATABASE_FILE=/data/game_state.db
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
      - IN_DOCKER=1
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5007:5007'
    # Probe /health on port 5007 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5007/health')"]
    # Short form: only orders container start-up after alloy.
    depends_on: [alloy]

  # Village 6 location server (slot_8), published on host port 5008.
  village-6:
    build:
      context: ./app
      dockerfile: Dockerfile
    command:
      - python
      - location_server.py
    environment:
      - SLOT_ID=slot_8
      - LOCATION_ID=village_6
      - FLASK_APP=location_server.py
      - LOCATION_NAME=village-6
      - DATABASE_FILE=/data/game_state.db
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
      - IN_DOCKER=1
    volumes:
      # Named volume backing the /data path used by DATABASE_FILE.
      - game-data:/data
    ports:
      - '5008:5008'
    # Probe /health on port 5008 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5008/health')"]
    # Short form: only orders container start-up after alloy.
    depends_on: [alloy]

  # War map web application, published on host port 8080.
  war-map:
    build:
      context: ./war_map
      dockerfile: Dockerfile
    environment:
      - DATABASE_FILE=/data/game_state.db
      - GAME_SESSIONS_DB=/data/game_sessions.db
      - LOCATION_NAME=war-map
      # Fixed demo value committed to the repository.
      - SECRET_KEY=war_of_westeros_secret_key
      - IN_DOCKER=1
      - AI_URL=http://ai-opponent:8081
      - TEMPO_URL=http://tempo:3200
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
    volumes:
      # Named volume backing the /data paths used by both database files.
      - game-data:/data
    ports:
      - '8080:8080'
    # Probe /health on port 8080 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"]
    # Held back until every location server is healthy and tempo has started.
    depends_on:
      southern-capital: {condition: service_healthy}
      northern-capital: {condition: service_healthy}
      village-1: {condition: service_healthy}
      village-2: {condition: service_healthy}
      village-3: {condition: service_healthy}
      village-4: {condition: service_healthy}
      village-5: {condition: service_healthy}
      village-6: {condition: service_healthy}
      tempo: {condition: service_started}

  # AI opponent service, published on host port 8081.
  ai-opponent:
    build:
      context: ./ai_opponent
      dockerfile: Dockerfile
    environment:
      - IN_DOCKER=1
      - LOCATION_NAME=ai-opponent
      # Points at port 9999 of the alloy service.
      - PYROSCOPE_SERVER_ADDRESS=http://alloy:9999
    ports:
      - '8081:8081'
    # Probe /health on port 8081 from inside the container.
    healthcheck:
      start_period: 10s
      interval: 5s
      timeout: 3s
      retries: 5
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')"]
    # Held back until every location server is healthy and alloy has started.
    depends_on:
      southern-capital: {condition: service_healthy}
      northern-capital: {condition: service_healthy}
      village-1: {condition: service_healthy}
      village-2: {condition: service_healthy}
      village-3: {condition: service_healthy}
      village-4: {condition: service_healthy}
      village-5: {condition: service_healthy}
      village-6: {condition: service_healthy}
      alloy: {condition: service_started}

# Named volumes. game-data is mounted at /data by the location servers and war-map;
# pyroscope-data is mounted at /data by the pyroscope service.
volumes:
  game-data:
  pyroscope-data:


================================================
FILE: game-of-tracing/grafana/dashboards/War of Kingdoms-1747821967780.json
================================================
{
  "apiVersion": "dashboard.grafana.app/v2beta1",
  "kind": "Dashboard",
  "metadata": {
    "name": "game-dashboard"
  },
  "spec": {
    "annotations": [
      {
        "kind": "AnnotationQuery",
        "spec": {
          "builtIn": true,
          "enable": true,
          "hide": true,
          "iconColor": "rgba(0, 211, 255, 1)",
          "name": "Annotations & Alerts",
          "query": {
            "datasource": {
              "name": "-- Grafana --"
            },
            "group": "grafana",
            "kind": "DataQuery",
            "spec": {},
            "version": "v0"
          }
        }
      }
    ],
    "cursorSync": "Off",
    "description": "",
    "editable": true,
    "elements": {
      "panel-1": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "filters": [
                          {
                            "id": "f905accb",
                            "operator": "=",
                            "scope": "span"
                          },
                          {
                            "id": "service-name",
                            "operator": "=",
                            "scope": "resource",
                            "tag": "service.name",
                            "value": [
                              "war_map"
                            ],
                            "valueType": "string"
                          }
                        ],
                        "limit": 20,
                        "metricsQueryType": "range",
                        "queryType": "traceqlSearch",
                        "tableType": "traces"
                      },
                      "group": "tempo",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 1,
          "links": [],
          "title": "Player Decisions",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "custom": {
                    "align": "auto",
                    "cellOptions": {
                      "type": "auto"
                    },
                    "inspect": false
                  },
                  "mappings": [],
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "green"
                      },
                      {
                        "color": "red",
                        "value": 80
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "cellHeight": "sm",
                "footer": {
                  "countRows": false,
                  "fields": "",
                  "reducer": [
                    "sum"
                  ],
                  "show": false
                },
                "showHeader": true
              },
              "pluginVersion": "12.0.0"
            },
            "group": "table",
            "version": "12.4.0"
          }
        }
      },
      "panel-10": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "builder",
                        "expr": "game_location_control_ratio{location=\"northern-capital\"}",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 10,
          "links": [],
          "title": "Location Allegiance",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "thresholds"
                  },
                  "max": 2,
                  "min": 0,
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "#370a4d",
                        "value": 0
                      },
                      {
                        "color": "super-light-blue",
                        "value": 1
                      },
                      {
                        "color": "dark-red",
                        "value": 2
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "displayMode": "gradient",
                "legend": {
                  "calcs": [],
                  "displayMode": "list",
                  "placement": "bottom",
                  "showLegend": false
                },
                "maxVizHeight": 300,
                "minVizHeight": 16,
                "minVizWidth": 8,
                "namePlacement": "auto",
                "orientation": "auto",
                "reduceOptions": {
                  "calcs": [
                    "lastNotNull"
                  ],
                  "fields": "",
                  "values": false
                },
                "showUnfilled": true,
                "sizing": "auto",
                "valueMode": "color"
              },
              "pluginVersion": "12.0.0"
            },
            "group": "bargauge",
            "version": "12.4.0"
          }
        }
      },
      "panel-11": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "direction": "backward",
                        "editorMode": "code",
                        "expr": "{service_name=\"northern-capital\"} | code_function_name != \"_log\"",
                        "queryType": "range"
                      },
                      "group": "loki",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 11,
          "links": [],
          "title": "Location History",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {},
                "overrides": []
              },
              "options": {
                "dedupStrategy": "none",
                "enableInfiniteScrolling": false,
                "enableLogDetails": true,
                "prettifyLogMessage": false,
                "showCommonLabels": false,
                "showLabels": false,
                "showTime": false,
                "sortOrder": "Descending",
                "wrapLogMessage": false
              },
              "pluginVersion": "12.0.0"
            },
            "group": "logs",
            "version": "12.4.0"
          }
        }
      },
      "panel-12": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "code",
                        "expr": "sum(game_army_size_ratio{job=\"$villages\"}) without (faction)",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 12,
          "links": [],
          "title": "Current Army",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "green",
                        "value": 0
                      },
                      {
                        "color": "red",
                        "value": 80
                      }
                    ]
                  },
                  "unit": "short"
                },
                "overrides": []
              },
              "options": {
                "colorMode": "background",
                "graphMode": "none",
                "justifyMode": "auto",
                "orientation": "auto",
                "percentChangeColorMode": "standard",
                "reduceOptions": {
                  "calcs": [
                    "lastNotNull"
                  ],
                  "fields": "",
                  "values": false
                },
                "showPercentChange": false,
                "textMode": "auto",
                "wideLayout": true
              },
              "pluginVersion": "12.0.0"
            },
            "group": "stat",
            "version": "12.4.0"
          }
        }
      },
      "panel-13": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "builder",
                        "expr": "game_resources_ratio{job=\"$villages\"}",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 13,
          "links": [],
          "title": "Current Resources",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "thresholds"
                  },
                  "max": 200,
                  "min": 0,
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "dark-red",
                        "value": 0
                      },
                      {
                        "color": "red",
                        "value": 30
                      },
                      {
                        "color": "#EAB839",
                        "value": 50
                      },
                      {
                        "color": "dark-green",
                        "value": 100
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "minVizHeight": 75,
                "minVizWidth": 75,
                "orientation": "auto",
                "reduceOptions": {
                  "calcs": [
                    "lastNotNull"
                  ],
                  "fields": "",
                  "values": false
                },
                "showThresholdLabels": false,
                "showThresholdMarkers": true,
                "sizing": "auto"
              },
              "pluginVersion": "12.0.0"
            },
            "group": "gauge",
            "version": "12.4.0"
          }
        }
      },
      "panel-14": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "code",
                        "expr": "sum(game_location_control_ratio{job=\"$villages\"}) without (faction)",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 14,
          "links": [],
          "title": "Location Allegiance",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "thresholds"
                  },
                  "max": 2,
                  "min": 0,
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "#370a4d",
                        "value": 0
                      },
                      {
                        "color": "super-light-blue",
                        "value": 1
                      },
                      {
                        "color": "dark-red",
                        "value": 2
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "displayMode": "gradient",
                "legend": {
                  "calcs": [],
                  "displayMode": "list",
                  "placement": "bottom",
                  "showLegend": false
                },
                "maxVizHeight": 300,
                "minVizHeight": 16,
                "minVizWidth": 8,
                "namePlacement": "auto",
                "orientation": "auto",
                "reduceOptions": {
                  "calcs": [
                    "lastNotNull"
                  ],
                  "fields": "",
                  "values": false
                },
                "showUnfilled": true,
                "sizing": "auto",
                "valueMode": "color"
              },
              "pluginVersion": "12.0.0"
            },
            "group": "bargauge",
            "version": "12.4.0"
          }
        }
      },
      "panel-15": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "direction": "backward",
                        "editorMode": "code",
                        "expr": "{service_name=\"$villages\"} | code_function_name !=\"_log\"",
                        "queryType": "range"
                      },
                      "group": "loki",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 15,
          "links": [],
          "title": "Location History",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {},
                "overrides": []
              },
              "options": {
                "dedupStrategy": "none",
                "enableInfiniteScrolling": false,
                "enableLogDetails": true,
                "prettifyLogMessage": false,
                "showCommonLabels": false,
                "showLabels": false,
                "showTime": false,
                "sortOrder": "Descending",
                "wrapLogMessage": false
              },
              "pluginVersion": "12.0.0"
            },
            "group": "logs",
            "version": "12.4.0"
          }
        }
      },
      "panel-16": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "filters": [
                          {
                            "id": "e020e714",
                            "operator": "=",
                            "scope": "span"
                          }
                        ],
                        "limit": 20,
                        "metricsQueryType": "range",
                        "queryType": "serviceMap",
                        "serviceMapQuery": "{}",
                        "tableType": "traces"
                      },
                      "group": "tempo",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 16,
          "links": [],
          "title": "War Map",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {},
                "overrides": []
              },
              "options": {
                "edges": {},
                "layoutAlgorithm": "layered",
                "nodes": {},
                "zoomMode": "cooperative"
              },
              "pluginVersion": "12.0.0"
            },
            "group": "nodeGraph",
            "version": "12.4.0"
          }
        }
      },
      "panel-17": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "filters": [
                          {
                            "id": "9aa2da84",
                            "operator": ">",
                            "scope": "span",
                            "tag": "army_size",
                            "value": [
                              "3"
                            ]
                          }
                        ],
                        "limit": 20,
                        "metricsQueryType": "range",
                        "query": "{span.army_size>3}",
                        "queryType": "traceqlSearch",
                        "tableType": "traces"
                      },
                      "group": "tempo",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 17,
          "links": [],
          "title": "Army Size Greater than 3",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "custom": {
                    "align": "auto",
                    "cellOptions": {
                      "type": "auto"
                    },
                    "inspect": false
                  },
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "green",
                        "value": 0
                      },
                      {
                        "color": "red",
                        "value": 80
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "cellHeight": "sm",
                "footer": {
                  "countRows": false,
                  "fields": "",
                  "reducer": [
                    "sum"
                  ],
                  "show": false
                },
                "showHeader": true
              },
              "pluginVersion": "12.0.0"
            },
            "group": "table",
            "version": "12.4.0"
          }
        }
      },
      "panel-18": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "builder",
                        "expr": "game_army_size_ratio{location=\"southern-capital\"}",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 18,
          "links": [],
          "title": "Current Army",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "thresholds"
                  },
                  "max": 10,
                  "min": 0,
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "dark-red",
                        "value": 0
                      },
                      {
                        "color": "red",
                        "value": 3
                      },
                      {
                        "color": "#EAB839",
                        "value": 5
                      },
                      {
                        "color": "dark-green",
                        "value": 15
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "minVizHeight": 75,
                "minVizWidth": 75,
                "orientation": "auto",
                "reduceOptions": {
                  "calcs": [
                    "lastNotNull"
                  ],
                  "fields": "",
                  "values": false
                },
                "showThresholdLabels": false,
                "showThresholdMarkers": true,
                "sizing": "auto"
              },
              "pluginVersion": "12.0.0"
            },
            "group": "gauge",
            "version": "12.4.0"
          }
        }
      },
      "panel-19": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "builder",
                        "exemplar": true,
                        "expr": "game_battles_total",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 19,
          "links": [],
          "title": "Battles",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "palette-classic"
                  },
                  "custom": {
                    "axisBorderShow": false,
                    "axisCenteredZero": false,
                    "axisColorMode": "text",
                    "axisLabel": "",
                    "axisPlacement": "auto",
                    "barAlignment": 0,
                    "barWidthFactor": 0.6,
                    "drawStyle": "line",
                    "fillOpacity": 17,
                    "gradientMode": "none",
                    "hideFrom": {
                      "legend": false,
                      "tooltip": false,
                      "viz": false
                    },
                    "insertNulls": false,
                    "lineInterpolation": "linear",
                    "lineStyle": {
                      "fill": "solid"
                    },
                    "lineWidth": 1,
                    "pointSize": 1,
                    "scaleDistribution": {
                      "type": "linear"
                    },
                    "showPoints": "auto",
                    "spanNulls": false,
                    "stacking": {
                      "group": "A",
                      "mode": "none"
                    },
                    "thresholdsStyle": {
                      "mode": "off"
                    }
                  },
                  "mappings": [],
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "green"
                      },
                      {
                        "color": "red",
                        "value": 80
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "legend": {
                  "calcs": [],
                  "displayMode": "list",
                  "placement": "bottom",
                  "showLegend": false
                },
                "tooltip": {
                  "hideZeros": false,
                  "mode": "single",
                  "sort": "none"
                }
              },
              "pluginVersion": "12.0.0"
            },
            "group": "timeseries",
            "version": "12.4.0"
          }
        }
      },
      "panel-2": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "code",
                        "exemplar": true,
                        "expr": "sum by (faction) (game_army_size_ratio)",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 2,
          "links": [],
          "title": "Current Army by Faction",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "palette-classic"
                  },
                  "custom": {
                    "axisBorderShow": false,
                    "axisCenteredZero": false,
                    "axisColorMode": "text",
                    "axisLabel": "",
                    "axisPlacement": "auto",
                    "barAlignment": 0,
                    "barWidthFactor": 0.6,
                    "drawStyle": "line",
                    "fillOpacity": 29,
                    "gradientMode": "none",
                    "hideFrom": {
                      "legend": false,
                      "tooltip": false,
                      "viz": false
                    },
                    "insertNulls": false,
                    "lineInterpolation": "linear",
                    "lineWidth": 1,
                    "pointSize": 5,
                    "scaleDistribution": {
                      "type": "linear"
                    },
                    "showPoints": "auto",
                    "spanNulls": false,
                    "stacking": {
                      "group": "A",
                      "mode": "none"
                    },
                    "thresholdsStyle": {
                      "mode": "off"
                    }
                  },
                  "mappings": [],
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "green"
                      },
                      {
                        "color": "red",
                        "value": 80
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "legend": {
                  "calcs": [],
                  "displayMode": "list",
                  "placement": "bottom",
                  "showLegend": true
                },
                "tooltip": {
                  "hideZeros": false,
                  "mode": "single",
                  "sort": "none"
                }
              },
              "pluginVersion": "12.0.0"
            },
            "group": "timeseries",
            "version": "12.4.0"
          }
        }
      },
      "panel-20": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "filters": [
                          {
                            "id": "f905accb",
                            "operator": "=",
                            "scope": "span"
                          },
                          {
                            "id": "service-name",
                            "operator": "=",
                            "scope": "resource",
                            "tag": "service.name",
                            "value": [
                              "ai-opponent"
                            ],
                            "valueType": "string"
                          }
                        ],
                        "limit": 20,
                        "metricsQueryType": "range",
                        "queryType": "traceqlSearch",
                        "tableType": "traces"
                      },
                      "group": "tempo",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 20,
          "links": [],
          "title": "AI Decisions",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "custom": {
                    "align": "auto",
                    "cellOptions": {
                      "type": "auto"
                    },
                    "inspect": false
                  },
                  "mappings": [],
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "green"
                      },
                      {
                        "color": "red",
                        "value": 80
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "cellHeight": "sm",
                "footer": {
                  "countRows": false,
                  "fields": "",
                  "reducer": [
                    "sum"
                  ],
                  "show": false
                },
                "showHeader": true
              },
              "pluginVersion": "12.0.0"
            },
            "group": "table",
            "version": "12.4.0"
          }
        }
      },
      "panel-3": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "code",
                        "exemplar": true,
                        "expr": "game_resources_ratio{location_type=\"capital\"}",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 3,
          "links": [],
          "title": "Current Resources (By Captital)",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "palette-classic-by-name"
                  },
                  "custom": {
                    "axisBorderShow": false,
                    "axisCenteredZero": false,
                    "axisColorMode": "text",
                    "axisLabel": "",
                    "axisPlacement": "auto",
                    "barAlignment": 0,
                    "barWidthFactor": 0.6,
                    "drawStyle": "line",
                    "fillOpacity": 26,
                    "gradientMode": "none",
                    "hideFrom": {
                      "legend": false,
                      "tooltip": false,
                      "viz": false
                    },
                    "insertNulls": false,
                    "lineInterpolation": "linear",
                    "lineWidth": 1,
                    "pointSize": 5,
                    "scaleDistribution": {
                      "type": "linear"
                    },
                    "showPoints": "auto",
                    "spanNulls": false,
                    "stacking": {
                      "group": "A",
                      "mode": "none"
                    },
                    "thresholdsStyle": {
                      "mode": "off"
                    }
                  },
                  "mappings": [],
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "green"
                      },
                      {
                        "color": "red",
                        "value": 80
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "legend": {
                  "calcs": [],
                  "displayMode": "list",
                  "placement": "bottom",
                  "showLegend": true
                },
                "tooltip": {
                  "hideZeros": false,
                  "mode": "single",
                  "sort": "none"
                }
              },
              "pluginVersion": "12.0.0"
            },
            "group": "timeseries",
            "version": "12.4.0"
          }
        }
      },
      "panel-4": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "direction": "backward",
                        "editorMode": "code",
                        "expr": "{service_name=\"southern-capital\"} | code_function_name != \"_log\"",
                        "queryType": "range"
                      },
                      "group": "loki",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 4,
          "links": [],
          "title": "Location History",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {},
                "overrides": []
              },
              "options": {
                "dedupStrategy": "none",
                "enableInfiniteScrolling": false,
                "enableLogDetails": true,
                "prettifyLogMessage": false,
                "showCommonLabels": false,
                "showLabels": false,
                "showTime": false,
                "sortOrder": "Descending",
                "wrapLogMessage": false
              },
              "pluginVersion": "12.0.0"
            },
            "group": "logs",
            "version": "12.4.0"
          }
        }
      },
      "panel-5": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "builder",
                        "exemplar": true,
                        "expr": "game_army_size_ratio{location=\"southern-capital\"}",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 5,
          "links": [],
          "title": "Current Army",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "fixedColor": "dark-red",
                    "mode": "fixed"
                  },
                  "custom": {
                    "axisBorderShow": false,
                    "axisCenteredZero": false,
                    "axisColorMode": "text",
                    "axisLabel": "",
                    "axisPlacement": "auto",
                    "barAlignment": 0,
                    "barWidthFactor": 0.6,
                    "drawStyle": "line",
                    "fillOpacity": 20,
                    "gradientMode": "scheme",
                    "hideFrom": {
                      "legend": false,
                      "tooltip": false,
                      "viz": false
                    },
                    "insertNulls": false,
                    "lineInterpolation": "smooth",
                    "lineWidth": 3,
                    "pointSize": 5,
                    "scaleDistribution": {
                      "type": "linear"
                    },
                    "showPoints": "auto",
                    "spanNulls": false,
                    "stacking": {
                      "group": "A",
                      "mode": "none"
                    },
                    "thresholdsStyle": {
                      "mode": "off"
                    }
                  },
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "green",
                        "value": 0
                      },
                      {
                        "color": "red",
                        "value": 80
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "legend": {
                  "calcs": [],
                  "displayMode": "hidden",
                  "placement": "right",
                  "showLegend": false
                },
                "tooltip": {
                  "hideZeros": false,
                  "mode": "single",
                  "sort": "none"
                }
              },
              "pluginVersion": "12.0.0"
            },
            "group": "timeseries",
            "version": "12.4.0"
          }
        }
      },
      "panel-6": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "builder",
                        "expr": "game_resources_ratio{location=\"southern-capital\"}",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 6,
          "links": [],
          "title": "Current Resources",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "thresholds"
                  },
                  "max": 200,
                  "min": 0,
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "dark-red",
                        "value": 0
                      },
                      {
                        "color": "red",
                        "value": 30
                      },
                      {
                        "color": "#EAB839",
                        "value": 50
                      },
                      {
                        "color": "dark-green",
                        "value": 100
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "minVizHeight": 75,
                "minVizWidth": 75,
                "orientation": "auto",
                "reduceOptions": {
                  "calcs": [
                    "lastNotNull"
                  ],
                  "fields": "",
                  "values": false
                },
                "showThresholdLabels": false,
                "showThresholdMarkers": true,
                "sizing": "auto"
              },
              "pluginVersion": "12.0.0"
            },
            "group": "gauge",
            "version": "12.4.0"
          }
        }
      },
      "panel-7": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "builder",
                        "expr": "game_location_control_ratio{location=\"southern-capital\"}",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 7,
          "links": [],
          "title": "Location Allegiance",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "thresholds"
                  },
                  "max": 2,
                  "min": 0,
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "#370a4d",
                        "value": 0
                      },
                      {
                        "color": "super-light-blue",
                        "value": 1
                      },
                      {
                        "color": "dark-red",
                        "value": 2
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "displayMode": "gradient",
                "legend": {
                  "calcs": [],
                  "displayMode": "list",
                  "placement": "bottom",
                  "showLegend": false
                },
                "maxVizHeight": 300,
                "minVizHeight": 16,
                "minVizWidth": 8,
                "namePlacement": "auto",
                "orientation": "auto",
                "reduceOptions": {
                  "calcs": [
                    "lastNotNull"
                  ],
                  "fields": "",
                  "values": false
                },
                "showUnfilled": true,
                "sizing": "auto",
                "valueMode": "color"
              },
              "pluginVersion": "12.0.0"
            },
            "group": "bargauge",
            "version": "12.4.0"
          }
        }
      },
      "panel-8": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "builder",
                        "expr": "game_army_size_ratio{location=\"northern-capital\"}",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 8,
          "links": [],
          "title": "Current Army",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "thresholds"
                  },
                  "max": 10,
                  "min": 0,
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "dark-red",
                        "value": 0
                      },
                      {
                        "color": "red",
                        "value": 3
                      },
                      {
                        "color": "#EAB839",
                        "value": 5
                      },
                      {
                        "color": "dark-green",
                        "value": 15
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "minVizHeight": 75,
                "minVizWidth": 75,
                "orientation": "auto",
                "reduceOptions": {
                  "calcs": [
                    "lastNotNull"
                  ],
                  "fields": "",
                  "values": false
                },
                "showThresholdLabels": false,
                "showThresholdMarkers": true,
                "sizing": "auto"
              },
              "pluginVersion": "12.0.0"
            },
            "group": "gauge",
            "version": "12.4.0"
          }
        }
      },
      "panel-9": {
        "kind": "Panel",
        "spec": {
          "data": {
            "kind": "QueryGroup",
            "spec": {
              "queries": [
                {
                  "kind": "PanelQuery",
                  "spec": {
                    "hidden": false,
                    "query": {
                      "kind": "DataQuery",
                      "spec": {
                        "disableTextWrap": false,
                        "editorMode": "builder",
                        "expr": "game_resources_ratio{location=\"northern-capital\"}",
                        "fullMetaSearch": false,
                        "includeNullMetadata": true,
                        "instant": false,
                        "legendFormat": "__auto",
                        "range": true,
                        "useBackend": false
                      },
                      "group": "prometheus",
                      "version": "v0"
                    },
                    "refId": "A"
                  }
                }
              ],
              "queryOptions": {},
              "transformations": []
            }
          },
          "description": "",
          "id": 9,
          "links": [],
          "title": "Current Resources",
          "vizConfig": {
            "kind": "VizConfig",
            "spec": {
              "fieldConfig": {
                "defaults": {
                  "color": {
                    "mode": "thresholds"
                  },
                  "max": 200,
                  "min": 0,
                  "thresholds": {
                    "mode": "absolute",
                    "steps": [
                      {
                        "color": "dark-red",
                        "value": 0
                      },
                      {
                        "color": "red",
                        "value": 30
                      },
                      {
                        "color": "#EAB839",
                        "value": 50
                      },
                      {
                        "color": "dark-green",
                        "value": 100
                      }
                    ]
                  }
                },
                "overrides": []
              },
              "options": {
                "minVizHeight": 75,
                "minVizWidth": 75,
                "orientation": "auto",
                "reduceOptions": {
                  "calcs": [
                    "lastNotNull"
                  ],
                  "fields": "",
                  "values": false
                },
                "showThresholdLabels": false,
                "showThresholdMarkers": true,
                "sizing": "auto"
              },
              "pluginVersion": "12.0.0"
            },
            "group": "gauge",
            "version": "12.4.0"
          }
        }
      }
    },
    "layout": {
      "kind": "TabsLayout",
      "spec": {
        "tabs": [
          {
            "kind": "TabsLayoutTab",
            "spec": {
              "layout": {
                "kind": "RowsLayout",
                "spec": {
                  "rows": [
                    {
                      "kind": "RowsLayoutRow",
                      "spec": {
                        "collapse": false,
                        "layout": {
                          "kind": "GridLayout",
                          "spec": {
                            "items": [
                              {
                                "kind": "GridLayoutItem",
                                "spec": {
                                  "element": {
                                    "kind": "ElementReference",
                                    "name": "panel-19"
                                  },
                                  "height": 7,
                                  "width": 24,
                                  "x": 0,
                                  "y": 0
                                }
                              },
                              {
                                "kind": "GridLayoutItem",
                                "spec": {
                                  "element": {
                                    "kind": "ElementReference",
                                    "name": "panel-1"
                                  },
                                  "height": 11,
                                  "width": 24,
                                  "x": 0,
                                  "y": 7
                                }
                              },
                              {
                                "kind": "GridLayoutItem",
                                "spec": {
                                  "element": {
                                    "kind": "ElementReference",
                                    "name": "panel-2"
                                  },
                                  "height": 6,
                                  "width": 12,
                                  "x": 0,
                                  "y": 18
                                }
                              },
                              {
                                "kind": "GridLayoutItem",
                                "spec": {
                                  "element": {
                                    "kind": "ElementReference",
                                    "name": "panel-3"
                                  },
                                  "height": 6,
                                  "width": 12,
                                  "x": 12,
                                  "y": 18
                                }
                              },
                              {
                                "kind": "GridLayoutItem",
                                "spec": {
                                  "element": {
                                    "kind": "ElementReference",
                                    "name": "panel-20"
                                  },
                                  "height": 11,
                                  "width": 24,
                                  "x": 0,
                                  "y": 24
                                }
                              }
                            ]
                          }
                        },
                        "title": "Current Overview"
                      }
                    },
                    {
                      "kind": "RowsLayoutRow",
                      "spec": {
                        "collapse": false,
                        "layout": {
                          "kind": "GridLayout",
                          "spec": {
                            "items": [
                              {
                                "kind": "GridLayoutItem",
                                "spec": {
                                  "element": {
                                    "kind": "ElementReference",
                                    "name": "panel-16"
                                  },
                                  "height": 11,
                                  "width": 24,
                                  "x": 0,
                                  "y": 0
                                }
                              },
                              {
                                "kind": "GridLayoutItem",
                                "spec": {
                                  "element": {
                                    "kind": "ElementReference",
                                    "name": "panel-17"
                                  },
                                  "height": 10,
                                  "width": 24,
                                  "x": 0,
                                  "y": 11
                                }
                              }
                            ]
                          }
                        },
                        "title": "Trace Analytics"
                      }
                    }
                  ]
                }
              },
              "title": "War Map"
            }
          },
          {
            "kind": "TabsLayoutTab",
            "spec": {
              "layout": {
                "kind": "GridLayout",
                "spec": {
                  "items": [
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-18"
                        },
                        "height": 6,
                        "width": 9,
                        "x": 0,
                        "y": 0
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-6"
                        },
                        "height": 6,
                        "width": 8,
                        "x": 9,
                        "y": 0
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-7"
                        },
                        "height": 6,
                        "width": 7,
                        "x": 17,
                        "y": 0
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-4"
                        },
                        "height": 10,
                        "width": 24,
                        "x": 0,
                        "y": 6
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-5"
                        },
                        "height": 6,
                        "width": 24,
                        "x": 0,
                        "y": 16
                      }
                    }
                  ]
                }
              },
              "title": "Southern Kingdom"
            }
          },
          {
            "kind": "TabsLayoutTab",
            "spec": {
              "layout": {
                "kind": "GridLayout",
                "spec": {
                  "items": [
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-8"
                        },
                        "height": 6,
                        "width": 9,
                        "x": 0,
                        "y": 0
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-9"
                        },
                        "height": 6,
                        "width": 8,
                        "x": 9,
                        "y": 0
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-10"
                        },
                        "height": 6,
                        "width": 7,
                        "x": 17,
                        "y": 0
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-11"
                        },
                        "height": 10,
                        "width": 24,
                        "x": 0,
                        "y": 6
                      }
                    }
                  ]
                }
              },
              "title": "Northern Kingdom"
            }
          },
          {
            "kind": "TabsLayoutTab",
            "spec": {
              "layout": {
                "kind": "GridLayout",
                "spec": {
                  "items": [
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-12"
                        },
                        "height": 6,
                        "width": 9,
                        "x": 0,
                        "y": 0
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-13"
                        },
                        "height": 6,
                        "width": 8,
                        "x": 9,
                        "y": 0
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-14"
                        },
                        "height": 6,
                        "width": 7,
                        "x": 17,
                        "y": 0
                      }
                    },
                    {
                      "kind": "GridLayoutItem",
                      "spec": {
                        "element": {
                          "kind": "ElementReference",
                          "name": "panel-15"
                        },
                        "height": 10,
                        "width": 24,
                        "x": 0,
                        "y": 6
                      }
                    }
                  ]
                }
              },
              "title": "Villages"
            }
          }
        ]
      }
    },
    "links": [],
    "liveNow": false,
    "preload": false,
    "tags": [],
    "timeSettings": {
      "autoRefresh": "",
      "autoRefreshIntervals": [
        "5s",
        "10s",
        "30s",
        "1m",
        "5m",
        "15m",
        "30m",
        "1h",
        "2h",
        "1d"
      ],
      "fiscalYearStartMonth": 0,
      "from": "now-30m",
      "hideTimepicker": false,
      "timezone": "browser",
      "to": "now"
    },
    "title": "Game Dashboard",
    "variables": [
      {
        "kind": "QueryVariable",
        "spec": {
          "hide": "dontHide",
          "includeAll": false,
          "label": "Villages",
          "multi": false,
          "name": "villages",
          "query": {
            "datasource": {
              "name": "prometheus"
            },
            "group": "prometheus",
            "kind": "DataQuery",
            "spec": {
              "qryType": 1,
              "query": "label_values(game_resources_ratio,job)",
              "refId": "PrometheusVariableQueryEditor-VariableQuery"
            },
            "version": "v0"
          },
          "refresh": "onDashboardLoad",
          "regex": "",
          "skipUrlSync": false,
          "sort": "alphabeticalAsc"
        }
      }
    ]
  }
}


================================================
FILE: game-of-tracing/grafana/dashboards/dashboards.yaml
================================================
# Grafana dashboard provisioning: registers one file-based dashboard provider.
apiVersion: 1
providers:
  # Unique provider name.
  - name: 'game-of-tracing'
    orgId: 1
    # Empty folder name: no named Grafana folder is requested.
    folder: ''
    # Dashboards are read from files on disk (see options.path).
    type: file
    # Per the Grafana provisioning docs, this disables dashboard deletion.
    disableDeletion: true
    # Interval, in seconds, at which the dashboard files are re-scanned.
    updateIntervalSeconds: 10
    # Provisioned dashboards may not be updated from the Grafana UI.
    allowUiUpdates: false
    options:
      # Directory that is scanned for dashboard files.
      path: /etc/grafana/provisioning/dashboards
      # Do not create Grafana folders from the directory layout under path.
      foldersFromFilesStructure: false


================================================
FILE: game-of-tracing/grafana/datasources/defaults.yml
================================================
# Grafana data source provisioning for the game-of-tracing stack.
#
# Declares four data sources (prometheus, tempo, loki, pyroscope) and links
# them to each other by uid. tempo is the only entry with isDefault: true.
apiVersion: 1
datasources:
  # Metrics. Exemplars link to the tempo data source through the `trace_id`
  # exemplar label.
  - name: prometheus
    uid: prometheus
    type: prometheus
    # Declared explicitly so every data source states its access mode.
    access: proxy
    orgId: 1
    url: http://prometheus:9090
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    jsonData:
      exemplarTraceIdDestinations:
        - datasourceUid: 'tempo'
          name: 'trace_id'

  # Traces. jsonData links traces to metrics (prometheus), logs (loki) and
  # profiles (pyroscope).
  - name: tempo
    uid: tempo
    type: tempo
    access: proxy
    orgId: 1
    url: http://tempo:3200
    basicAuth: false
    isDefault: true
    version: 1
    editable: false
    jsonData:
      serviceMap:
        datasourceUid: 'prometheus'
      nodeGraph:
        enabled: true
      tracesToLogsV2:
        datasourceUid: 'loki'
        # Narrow the linked log query to the selected span's ID.
        filterBySpanID: true
      tracesToMetrics:
        datasourceUid: 'prometheus'
      tracesToProfilesV2:
        datasourceUid: 'pyroscope'
        # Per the Grafana trace-to-profiles docs: `key` is the span attribute
        # and `value` is the profile label it is matched against.
        tags:
          - key: 'service.name'
            value: 'service_name'
        profileTypeId: 'process_cpu:cpu:nanoseconds:cpu:nanoseconds'

  # Logs. The derived field turns the `trace_id` label into a link that opens
  # the trace in the tempo data source.
  - name: loki
    uid: loki
    type: loki
    access: proxy
    orgId: 1
    url: http://loki:3100
    basicAuth: false
    isDefault: false
    # Declared explicitly to match the other data sources.
    editable: false
    jsonData:
      derivedFields:
        - datasourceUid: 'tempo'
          matcherRegex: 'trace_id'
          matcherType: 'label'
          name: 'trace_id'
          targetBlank: true
          # `$$` escapes `$` so provisioning-time variable expansion leaves
          # the literal `${__value.raw}` for Grafana to resolve at query time.
          url: '$${__value.raw}'
          urlDisplayLabel: ''

  # Profiles.
  - name: pyroscope
    uid: pyroscope
    type: grafana-pyroscope-datasource
    access: proxy
    orgId: 1
    url: http://pyroscope:4040
    basicAuth: false
    isDefault: false
    editable: false


================================================
FILE: game-of-tracing/loki-config.yaml
================================================
# Loki configuration for the game-of-tracing scenario: authentication is
# disabled, data is stored on the local filesystem under /tmp/storage, and the
# ring uses an in-memory key-value store with a replication factor of 1.
auth_enabled: false

server:
  # HTTP API port.
  http_listen_port: 3100
  grpc_listen_port: 9096
  # Most verbose of the usual log levels; produces a lot of output.
  log_level: debug
  grpc_server_max_concurrent_streams: 1000

common:
  instance_addr: 127.0.0.1
  # Everything is kept under /tmp/storage.
  path_prefix: /tmp/storage
  storage:
    filesystem:
      chunks_directory: /tmp/storage/chunks
      rules_directory: /tmp/storage/rules
  replication_factor: 1
  ring:
    kvstore:
      # Ring state is held in memory only.
      store: inmemory

query_range:
  results_cache:
    cache:
      # Query results are cached in-process, capped at 100 MB.
      embedded_cache:
        enabled: true
        max_size_mb: 100

limits_config:
  metric_aggregation_enabled: true

schema_config:
  configs:
    # One schema period: TSDB index (schema v13) on the filesystem object
    # store, with a new index table every 24h.
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

pattern_ingester:
  enabled: true
  metric_aggregation:
    # Points at this instance's own HTTP port (server.http_listen_port).
    loki_address: localhost:3100

ruler:
  # NOTE(review): no Alertmanager is visible in this part of the repository;
  # confirm whether anything listens on localhost:9093.
  alertmanager_url: http://localhost:9093

frontend:
  encoding: protobuf


# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
#
# Statistics help us better understand how Loki is used, and they show us performance
# levels for most users. This helps us prioritize features and documentation.
# For more information on what's sent, look at
# https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go
# Refer to the buildReport method to see what goes into a report.
#
# If you would like to disable reporting, uncomment the following lines:
#analytics:
#  reporting_enabled: false

================================================
FILE: game-of-tracing/prom-config.yaml
================================================
# Prometheus configuration for the demo stack.
# There are no scrape_configs in this file.
# NOTE(review): presumably all series arrive via remote write / OTLP push
# rather than scraping — confirm against the compose file.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Settings for the OTLP ingestion path.
otlp:
  keep_identifying_resource_attributes: true

================================================
FILE: game-of-tracing/pyroscope-config.yaml
================================================
---
# Minimal Pyroscope v2 config for local single-binary demo.
# v2 defaults (filesystem backend, v1-v2-dual storage) handle the rest.

server:
  # Must match the port used by the Grafana "pyroscope" datasource URL
  # (http://pyroscope:4040).
  http_listen_port: 4040

storage:
  backend: filesystem
  filesystem:
    # NOTE(review): assumes /data is a mounted volume — confirm in compose.
    dir: /data


================================================
FILE: game-of-tracing/tempo-config.yaml
================================================
---
stream_over_http_enabled: true

server:
  http_listen_port: 3200
  log_level: info

cache:
  background:
    writeback_goroutines: 5
  caches:
    - roles:
        - frontend-search
      memcached:
        addresses: dns+memcached:11211

query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
    metadata_slo:
      duration_slo: 5s
      throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 100ms
  metrics:
    # Maximum duration of a metrics query; increase for local setups.
    max_duration: 200h
    query_backend_after: 5m
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09

distributor:
  # This configuration will listen on all ports and protocols that Tempo is
  # capable of. The receivers all come from the OpenTelemetry collector; more
  # configuration information can be found there:
  # https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
  #
  # For a production deployment you should only enable the receivers you need!
  receivers:
    jaeger:
      protocols:
        thrift_http:
          endpoint: "tempo:14268"
        grpc:
          endpoint: "tempo:14250"
        thrift_binary:
          endpoint: "tempo:6832"
        thrift_compact:
          endpoint: "tempo:6831"
    zipkin:
      endpoint: "tempo:9411"
    otlp:
      protocols:
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
    opencensus:
      endpoint: "tempo:55678"

ingester:
  # Cut the head block when this much time passes. This is being set for demo
  # purposes and should probably be left alone normally.
  max_block_duration: 5m

compactor:
  compaction:
    # Overall Tempo trace retention. Set for demo purposes.
    block_retention: 720h

metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true

storage:
  trace:
    # Backend configuration to use.
    backend: local
    wal:
      # Where to store the WAL locally.
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      # Enables the metrics generator.
      processors:
        - service-graphs
        - span-metrics
        - local-blocks
      generate_native_histograms: both
      

================================================
FILE: game-of-tracing/war_map/CLAUDE.md
================================================
# war_map/ — UI + Span-Link Broker

> Flask web UI on port 8080, game session orchestrator, and **owner of the span-link reconstruction logic that drives game replay**. This doc is read by any AI coding agent. For scenario-wide context read [`../AGENTS.md`](../AGENTS.md) first.

## Purpose

`war-map` is the human-facing surface of the game and the coordination point for everything the player touches:

- Hosts the **map picker** (`/map_picker` + `/select_map`) that lets the user choose between `war_of_kingdoms` and `white_walkers_attack`, then renders the faction selection (or single-player auto-start) for the chosen map.
- Renders the interactive game map (territory ownership, army sizes, supply routes, wall-hold HUD for WWA).
- Manages faction selection, sessions, and the human player's identity.
- Is the **sole writer** of the `game_actions` SQLite table — the record of every action's trace/span IDs that makes span-link replay possible (rows carry a `map_id` column).
- Activates / deactivates the AI opponent on behalf of the player (auto-activates as `white_walkers` when the chosen map is WWA).
- Proxies trace-replay queries to Tempo and falls back to local SQLite when Tempo is unavailable.
- Instruments player actions as `SERVER` spans with `trace.Link`s chaining each action to the previous one in the session.
- Runs the **wall-hold tick thread** (`_wall_tick_thread`, 30 s cadence) that increments `wall_hold` when one faction owns every wall keep, and declares the WWA winner at 5 consecutive ticks.

## File map

| File | Size | Purpose |
|---|---|---|
| `app.py` | ~64 KB | Flask app, session/player management, span-link broker, Tempo proxy for replay, AI activation control. |
| `telemetry.py` | ~3 KB | `GameTelemetry` — traces + logs (no custom metrics), plus Pyroscope profiling with OTel span-profile linkage. |
| `templates/index.html` | ~7 KB | Faction selection screen. |
| `templates/map.html` | ~50 KB | Main SVG-based game map with real-time updates. |
| `templates/layout.html` | ~4 KB | Shared layout chrome. |
| `templates/replay.html` | ~6 KB | Replay session picker. |
| `templates/replay_session.html` | ~28 KB | Per-session trace-replay UI — the consumer of the span-link chain. |
| `static/css/style.css` | — | UI styling. |
| `Dockerfile` | small | `python:3.11-slim` (digest-pinned), sets `FLASK_APP=app.py` and `IN_DOCKER=1`, runs `flask run --host=0.0.0.0 --port=8080`. |
| `requirements.txt` | small | Flask 3.1.3, requests 2.33.1, python-dotenv 1.2.2, OpenTelemetry SDK/API + exporters, `pyroscope-io` + `pyroscope-otel` for profiling. |

## The span-link broker (the critical bit)

### Two SQLite databases — do not confuse

| File | Owner | Purpose |
|---|---|---|
| `game_state.db` | All 8 location services (WAL mode, shared) | Canonical game state |
| `game_sessions.db` | `war_map` **only** | `game_actions` table: `(game_session_id, action_sequence, action_type, player_name, faction, trace_id, span_id, location_id, target_location_id, timestamp, game_state_after, map_id)` |

`game_actions` schema is defined in `init_game_session_tracking()` at `app.py:60-96`. It carries a `UNIQUE(game_session_id, action_sequence)` constraint — the sequence is what lets "next action" look up "previous action" deterministically.

### Storing an action — `store_game_action()` at `app.py:101-128`

Called at the tail of every action handler. Reads the current max `action_sequence` for the session, inserts a new row with `next_sequence = max + 1`, returns the sequence number. Persists the active `map_id` (defaults to `get_active_map_id()` when callers don't pass one) so the replay UI can render the correct map layout for each session.

### Resolving a session's map — `get_session_map_id()`

Used by `replay_session_page` to pick the right layout. Reads the first non-NULL `map_id` from the session's actions (cheap — sessions don't switch maps mid-play), falls back to the active map, then to `DEFAULT_MAP_ID`. Without this, the replay template renders the WoK layout regardless of which map was actually played.

### Reconstructing a previous span context — `get_previous_action_context()` at `app.py:130-170`

Looks up `(trace_id, span_id)` for `(game_session_id, target_sequence)` in SQLite. Converts the hex strings to integers with `int(result[0], 16)` / `int(result[1], 16)` (this step has bitten agents in the past — the IDs are stored as hex strings, not raw bytes). Constructs a `trace.SpanContext(trace_id=..., span_id=..., is_remote=True, trace_flags=trace.TraceFlags.SAMPLED)` and returns it. The `SAMPLED` flag is required — without it, downstream processors may drop the link.

### Creating a link — `create_span_link_from_context()` at `app.py:172-189`

Wraps the reconstructed context in a `trace.Link(span_context, attributes={...})` with:

- `link.type` — caller-supplied (default `"game_sequence"`; AI opponent uses `"ai_decision_trigger"` in its own code).
- `link.relation` — always `"follows"`.
- `game.sequence` — always `"true"` (enables Tempo tag search).

### Per-action flow inside a player-action handler

```python
previous_span_context = get_previous_action_context(game_session_id, current_sequence)
links = [create_span_link_from_context(previous_span_context, "game_sequence")] if previous_span_context else []

with tracer.start_as_current_span(
    "move_army",
    kind=SpanKind.SERVER,
    links=links,
    attributes={
        "game.session.id": game_session_id,
        "game.action.sequence": current_sequence + 1,
        "span.player.action": True,
        "player.name": ...,
        "player.faction": ...,
    },
) as span:
    # ... do the work, call location_api_request, etc.
    store_game_action(
        game_session_id, "move_army", ...,
        trace_id=format(span.get_span_context().trace_id, '032x'),
        span_id=format(span.get_span_context().span_id, '016x'),
        ...
    )
```

The `format(..., '032x')` / `'016x'` pair is the inverse of the `int(..., 16)` step in `get_previous_action_context()` — always keep the two in sync.

## Replay endpoints

The replay UI (`replay_session.html`) is backed by Tempo. `app.py` serves as the proxy and cleans up the responses.

**Primary (Tempo):**
- Discover sessions — `GET {TEMPO_URL}/api/v2/search/tag/game.session.id/values`
- Pull a session's traces — `GET {TEMPO_URL}/api/search?q={game.session.id="<id>"}&limit=100`
- Pull a specific trace — `GET {TEMPO_URL}/api/traces/<trace_id>`

**Fallback (SQLite):** If Tempo returns an error or is unreachable, read the `game_actions` table directly. Replay renders a reduced view (without span payloads) but the session narrative is preserved.

## Environment

| Var | Default | Purpose |
|---|---|---|
| `SECRET_KEY` | `war_of_westeros_secret_key` | Flask session secret |
| `AI_URL` / `AI_SERVICE_URL` | `http://localhost:8081` | AI opponent base URL. Docker sets `http://ai-opponent:8081` |
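| `DATABASE_FILE` | `../app/game_state.db` | Shared game-state DB. war_map reads game state from it and also creates/updates its own bookkeeping tables there (`game_config`, `faction_economy`, `wall_hold`) |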
| `GAME_SESSIONS_DB` | `game_sessions.db` | `game_actions` DB. Docker sets `/data/game_sessions.db` |
| `API_BASE_URL` | `http://localhost` | Base URL for location server calls (host portion only; port comes from `LOCATION_PORTS`) |
| `TEMPO_URL` | `http://tempo:3200` | Replay-query target |
| `IN_DOCKER` | unset | Switches location URLs between `localhost:500X` and container DNS |

Location ports are hard-coded in `LOCATION_PORTS` at `app.py:201-210`; mirror any change here in `app/game_config.py`.

## `X-Frame-Options` stripped — intentional

`@app.after_request` at `app.py:191-194` removes `X-Frame-Options` from every response:

```python
@app.after_request
def remove_frame_options(response):
    response.headers.pop('X-Frame-Options', None)
    return response
```

This is deliberate — it lets the UI be embedded in Grafana iframes for the replay experience. Grafana's `GF_SECURITY_ALLOW_EMBEDDING=true` is the other half of this configuration. **Do not remove** unless you are also disabling Grafana embedding.

## Common edits

**Add a new action type to the span-link chain.**
1. Add the Flask handler in `app.py`, following the `move_army` / `create_army` pattern: look up previous context, build link, start a SERVER span with link + attributes, call `store_game_action()` at the tail.
2. Add a renderer case in `templates/replay_session.html` so the replay UI can visualize the new action.
3. Update the action-types table in [`../SPAN_LINKS.md`](../SPAN_LINKS.md).
4. Update this doc and [`../AGENTS.md`](../AGENTS.md) if the new action surfaces new span attributes.

**Tune the replay query.**
Edit the TraceQL strings in the replay endpoints (`app.py`). The `game.session.id` tag is required — Tempo uses it to group the session's traces.

**Add attributes to every player-action link.**
Edit `create_span_link_from_context()` at `app.py:172-189`. The current three (`link.type`, `link.relation`, `game.sequence`) are load-bearing — the replay UI reads them.

**Change session-tracking schema.**
Edit `init_game_session_tracking()` at `app.py:60-96`. Because the DB lives on a persistent Docker volume, a schema change requires either `docker compose down -v` before restart **or** a migration script. Flag to the user which one you recommend before changing columns.

## Keep this doc current

Per the sub-agent rule, any change to span-link fields, replay endpoints, env vars, action types, or the line-number anchors above must land in the same work unit. Before returning a response that touched `war_map/`, grep this file for references to anything you changed.

Particularly sensitive references:
- `app.py:130-170` — `get_previous_action_context`
- `app.py:172-189` — `create_span_link_from_context`
- `app.py:60-96` — `init_game_session_tracking`
- `app.py:101-128` — `store_game_action`
- `app.py:191-194` — `X-Frame-Options` strip
- `app.py:201-210` — `LOCATION_PORTS` dict

## Cross-references

- [`../AGENTS.md`](../AGENTS.md) — scenario-wide architecture and patterns
- [`../SPAN_LINKS.md`](../SPAN_LINKS.md) — full span-link design spec and replay flow
- [`../app/CLAUDE.md`](../app/CLAUDE.md) — location-server HTTP API this service calls
- [`../ai_opponent/CLAUDE.md`](../ai_opponent/CLAUDE.md) — AI service this one activates/deactivates


================================================
FILE: game-of-tracing/war_map/Dockerfile
================================================
# Image for the war_map Flask UI. The base image is pinned by digest so
# rebuilds are reproducible.
FROM python:3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2

WORKDIR /app

# Copy and install dependencies before the application code so the pip layer
# is reused when only source files change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# FLASK_APP tells `flask run` which module to load; IN_DOCKER is read by the
# application itself.
ENV FLASK_APP=app.py
ENV FLASK_DEBUG=0
ENV IN_DOCKER=1

EXPOSE 8080

# The app is started through the Flask CLI (not `python app.py`), listening on
# all interfaces on the exposed port.
CMD ["flask", "run", "--host=0.0.0.0", "--port=8080"] 

================================================
FILE: game-of-tracing/war_map/app.py
================================================
import os
import json
import sqlite3
import requests
import threading
import uuid
import time
import atexit
from flask import Flask, render_template, jsonify, request, redirect, url_for, session
from telemetry import GameTelemetry
from opentelemetry import trace
from opentelemetry.trace import SpanKind
from opentelemetry.propagate import inject

# Flask application object; route handlers and hooks below attach to it.
app = Flask(__name__)
# Session-signing key. The fallback is a hard-coded value, so deployments that
# care about session integrity must set SECRET_KEY in the environment.
app.secret_key = os.environ.get('SECRET_KEY', 'war_of_westeros_secret_key')

# AI Service configuration
# Note: the environment variable is AI_URL; AI_SERVICE_URL is only the name of
# the module-level constant it is stored in.
AI_SERVICE_URL = os.environ.get('AI_URL', 'http://localhost:8081')

# Initialize telemetry
# The logger and tracer obtained here are used by every helper in this module.
telemetry = GameTelemetry(service_name="war_map")
logger = telemetry.get_logger()
tracer = telemetry.get_tracer()
# Run telemetry.shutdown when the interpreter exits.
atexit.register(telemetry.shutdown)

# Game session tracking database
GAME_SESSIONS_DB = os.environ.get('GAME_SESSIONS_DB', 'game_sessions.db')  # Use local file for development

# Game state variables
# Process-local victory state; all three start out "no game finished yet".
GAME_OVER = False
WINNER = None
VICTORY_MESSAGE = None

# ----------------------------------------------------------------
# Maps — in-UI picker metadata.
# Full per-location config lives in app/game_config.py. This is a compact
# read-only duplicate of the fields war_map actually needs: layout for the
# canvas, tick rules for the hold-to-win loop, faction/AI wiring for the
# picker screen. Keep the map-id strings in sync with app/game_config.py.
# ----------------------------------------------------------------
# Map used whenever no (valid) active map can be resolved.
DEFAULT_MAP_ID = "war_of_kingdoms"

# Picker metadata, keyed by map id. Each entry carries the display strings,
# the faction wiring and the hold-to-win tick settings for that map.
MAPS_META = dict(
    war_of_kingdoms=dict(
        display_name="War of Kingdoms",
        description=(
            "Northern and Southern kingdoms clash for dominance. "
            "Capture the enemy capital to win."
        ),
        single_player=False,
        player_faction=None,
        ai_faction=None,
        factions=["northern", "southern"],
        tick_interval_s=0,
        win_hold_ticks=0,
        icon="fa-chess-knight",
    ),
    white_walkers_attack=dict(
        display_name="White Walkers Attack",
        description=(
            "The Long Night has come. As the Night's Watch, hold every Wall "
            "keep for 5 ticks before the White Walkers do. Single-player."
        ),
        single_player=True,
        player_faction="nights_watch",
        ai_faction="white_walkers",
        factions=["nights_watch", "white_walkers", "barbarian"],
        tick_interval_s=30,
        win_hold_ticks=5,
        icon="fa-icicles",
    ),
)

# Canvas layout per map: location id -> {"x", "y", "type", "name"}, where x/y
# are percentages of the canvas. The location ids of each map must match the
# ones in app/game_config.py's MAPS[map_id]["locations"].
# Rows below are (location id, x, y, type, display name).
LOCATION_POSITIONS_BY_MAP = {
    layout_map_id: {
        loc_id: {"x": x_pct, "y": y_pct, "type": loc_type, "name": label}
        for loc_id, x_pct, y_pct, loc_type, label in layout_rows
    }
    for layout_map_id, layout_rows in (
        ("war_of_kingdoms", (
            ("southern_capital", 20, 70, "capital", "Southern Capital"),
            ("northern_capital", 80, 20, "capital", "Northern Capital"),
            ("village_1", 35, 55, "village", "Village 1"),
            ("village_2", 65, 35, "village", "Village 2"),
            ("village_3", 30, 40, "village", "Village 3"),
            ("village_4", 45, 65, "village", "Village 4"),
            ("village_5", 50, 50, "village", "Village 5"),
            ("village_6", 70, 45, "village", "Village 6"),
        )),
        ("white_walkers_attack", (
            ("nights_watch_fortress", 50, 85, "capital", "Castle Black"),
            ("white_walker_fortress", 50, 15, "capital", "The Lands of Always Winter"),
            ("wall_west", 20, 50, "wall", "Westwatch"),
            ("wall_center_west", 40, 50, "wall", "Queensgate"),
            ("wall_center_east", 60, 50, "wall", "Deep Lake"),
            ("wall_east", 80, 50, "wall", "Eastwatch-by-the-Sea"),
            ("barbarian_village_west", 10, 72, "village", "Free Folk Camp (West)"),
            ("barbarian_village_east", 90, 72, "village", "Free Folk Camp (East)"),
        )),
    )
}

# Routes between locations, per map. Every route is a two-element list of
# location ids; the order of the routes is kept exactly as declared.
LOCATION_CONNECTIONS_BY_MAP = {
    route_map_id: [[start_id, end_id] for start_id, end_id in routes]
    for route_map_id, routes in (
        ("war_of_kingdoms", (
            ("southern_capital", "village_1"),
            ("southern_capital", "village_3"),
            ("northern_capital", "village_2"),
            ("northern_capital", "village_6"),
            ("village_1", "village_2"),
            ("village_1", "village_4"),
            ("village_2", "village_5"),
            ("village_3", "village_5"),
            ("village_3", "village_6"),
            ("village_4", "village_5"),
            ("village_5", "village_6"),
        )),
        ("white_walkers_attack", (
            ("nights_watch_fortress", "wall_west"),
            ("nights_watch_fortress", "wall_center_west"),
            ("nights_watch_fortress", "wall_center_east"),
            ("nights_watch_fortress", "wall_east"),
            ("white_walker_fortress", "wall_west"),
            ("white_walker_fortress", "wall_center_west"),
            ("white_walker_fortress", "wall_center_east"),
            ("white_walker_fortress", "wall_east"),
            ("wall_west", "wall_center_west"),
            ("wall_center_west", "wall_center_east"),
            ("wall_center_east", "wall_east"),
            ("wall_west", "barbarian_village_west"),
            ("wall_east", "barbarian_village_east"),
        )),
    )
}

# For every map, the ids of its "wall"-typed locations (in layout order);
# used by the hold-to-win check. Maps without walls get an empty list.
WALL_LOCATIONS_BY_MAP = dict(
    (
        map_id,
        [loc_id for loc_id in positions if positions[loc_id].get("type") == "wall"],
    )
    for map_id, positions in LOCATION_POSITIONS_BY_MAP.items()
)

# Module-level aliases retained for legacy call sites. They are bound once to
# the default (WoK) layout and never follow the active map — code that needs
# per-map behaviour should use _current_positions() / _current_connections().
LOCATION_POSITIONS = LOCATION_POSITIONS_BY_MAP[DEFAULT_MAP_ID]
LOCATION_CONNECTIONS = LOCATION_CONNECTIONS_BY_MAP[DEFAULT_MAP_ID]


def _current_positions():
    """Return the location layout of the active map.

    The active map id is resolved through ``get_active_map_id()``; an id with
    no known layout falls back to the ``DEFAULT_MAP_ID`` layout.
    """
    active_map = get_active_map_id()
    default_layout = LOCATION_POSITIONS_BY_MAP[DEFAULT_MAP_ID]
    return LOCATION_POSITIONS_BY_MAP.get(active_map, default_layout)


def _current_connections():
    """Return the route list of the active map.

    Falls back to the ``DEFAULT_MAP_ID`` routes when the active map id has no
    entry in ``LOCATION_CONNECTIONS_BY_MAP``.
    """
    active_map = get_active_map_id()
    default_routes = LOCATION_CONNECTIONS_BY_MAP[DEFAULT_MAP_ID]
    return LOCATION_CONNECTIONS_BY_MAP.get(active_map, default_routes)


def _current_walls():
    """Return the wall location ids of the active map (empty list if unknown)."""
    active_map = get_active_map_id()
    if active_map in WALL_LOCATIONS_BY_MAP:
        return WALL_LOCATIONS_BY_MAP[active_map]
    return []

def init_game_session_tracking():
    """Create (or migrate) the ``game_actions`` table in ``GAME_SESSIONS_DB``.

    The table stores one row per player action together with the trace/span
    ids of the span that handled it. ``UNIQUE(game_session_id,
    action_sequence)`` guarantees a single action per sequence number within
    a session.

    Never raises: any failure is logged and swallowed so that a broken
    tracking database does not prevent the app from starting.
    """
    conn = None
    try:
        # Ensure the database directory exists when the path contains one.
        # exist_ok=True already makes this a no-op for an existing directory.
        db_dir = os.path.dirname(GAME_SESSIONS_DB)
        if db_dir:
            os.makedirs(db_dir, exist_ok=True)

        conn = sqlite3.connect(GAME_SESSIONS_DB)
        cursor = conn.cursor()

        cursor.execute('''
        CREATE TABLE IF NOT EXISTS game_actions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            game_session_id TEXT NOT NULL,
            action_sequence INTEGER NOT NULL,
            action_type TEXT NOT NULL,
            player_name TEXT,
            faction TEXT,
            trace_id TEXT NOT NULL,
            span_id TEXT NOT NULL,
            location_id TEXT,
            target_location_id TEXT,
            timestamp INTEGER NOT NULL,
            game_state_after TEXT,
            map_id TEXT,
            UNIQUE(game_session_id, action_sequence)
        )
        ''')

        # Migration for game_sessions.db files created before the map_id
        # column existed. ALTER TABLE ... ADD COLUMN fails when the column is
        # already present, so look at the actual schema first instead of
        # swallowing every OperationalError (which would also hide real
        # problems such as a locked or read-only database).
        cursor.execute("PRAGMA table_info(game_actions)")
        existing_columns = {column[1] for column in cursor.fetchall()}  # index 1 = column name
        if "map_id" not in existing_columns:
            cursor.execute("ALTER TABLE game_actions ADD COLUMN map_id TEXT")

        conn.commit()
        logger.info(f"Game session tracking database initialized: {GAME_SESSIONS_DB}")

    except Exception as e:
        # Don't fail the app startup if database init fails.
        logger.error(f"Failed to initialize game session tracking database: {e}")
    finally:
        # Release the connection on both the success and the failure path.
        if conn is not None:
            conn.close()

# Initialize the game session tracking database immediately
init_game_session_tracking()
# Tables in game_state.db (game_config, wall_hold, faction_economy) are
# initialized lazily on first call to _ensure_game_config_tables() — see
# the in-process startup path later in this module.

def store_game_action(game_session_id, action_type, player_name, faction,
                     trace_id, span_id, location_id=None, target_location_id=None,
                     game_state=None, map_id=None):
    """Append one action row, with its trace information, to ``game_actions``.

    The row gets the next free ``action_sequence`` for the session (current
    maximum + 1, starting at 1) and the current Unix time as ``timestamp``.
    ``game_state`` is stored as JSON when truthy, otherwise as NULL.

    ``map_id`` is recorded so the replay page can render the correct map
    layout (positions/connections) for sessions played on non-default maps.
    It defaults to the currently active map when not supplied, and to
    ``DEFAULT_MAP_ID`` if the active map cannot be resolved.

    Returns:
        The sequence number assigned to the stored action.

    Raises:
        sqlite3.Error: if the lookup or the insert fails (for example a
            ``UNIQUE(game_session_id, action_sequence)`` violation). The
            connection is closed before the error propagates.
    """
    if map_id is None:
        try:
            map_id = get_active_map_id()
        except Exception:
            map_id = DEFAULT_MAP_ID

    conn = sqlite3.connect(GAME_SESSIONS_DB)
    try:
        cursor = conn.cursor()

        # Get next sequence number
        cursor.execute("SELECT MAX(action_sequence) FROM game_actions WHERE game_session_id = ?",
                       (game_session_id,))
        result = cursor.fetchone()
        # MAX() yields NULL for a session without rows -> start at 1.
        next_sequence = (result[0] or 0) + 1

        # Debug logging
        logger.info(f"Storing action: session={game_session_id}, sequence={next_sequence}, action={action_type}, trace_id={trace_id}, span_id={span_id}, map_id={map_id}")

        cursor.execute('''
    INSERT INTO game_actions
    (game_session_id, action_sequence, action_type, player_name, faction,
     trace_id, span_id, location_id, target_location_id, timestamp, game_state_after, map_id)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (game_session_id, next_sequence, action_type, player_name, faction,
              trace_id, span_id, location_id, target_location_id,
              int(time.time()), json.dumps(game_state) if game_state else None, map_id))

        conn.commit()
    finally:
        # Always release the connection; otherwise a failed insert would keep
        # an open write transaction on the database file.
        conn.close()
    return next_sequence


def get_session_map_id(session_id):
    """Work out which map a stored session was played on.

    Resolution order:

    1. the ``map_id`` of the session's earliest action that has one, provided
       a layout exists for it;
    2. the currently active map, provided a layout exists for it;
    3. ``DEFAULT_MAP_ID``.

    Lookup failures are logged (step 1) or ignored (step 2); the function
    always returns a map id the replay template can render.
    """
    query = (
        "SELECT map_id FROM game_actions "
        "WHERE game_session_id = ? AND map_id IS NOT NULL "
        "ORDER BY action_sequence LIMIT 1"
    )
    try:
        db = sqlite3.connect(GAME_SESSIONS_DB)
        try:
            first_row = db.execute(query, (session_id,)).fetchone()
        finally:
            db.close()
        if first_row and first_row[0] in LOCATION_POSITIONS_BY_MAP:
            return first_row[0]
    except Exception as e:
        logger.warning(f"get_session_map_id failed for {session_id}: {e}")

    # Nothing usable recorded for the session: try the active map next.
    try:
        fallback_map = get_active_map_id()
        fallback_is_known = fallback_map in LOCATION_POSITIONS_BY_MAP
    except Exception:
        fallback_is_known = False
    return fallback_map if fallback_is_known else DEFAULT_MAP_ID

def get_previous_action_context(game_session_id, target_sequence):
    """Rebuild the span context of a stored action so it can be linked to.

    Looks up the action with ``action_sequence == target_sequence`` in the
    given session and converts its stored hex ``trace_id`` / ``span_id``
    strings back into integers.

    Returns:
        A remote ``trace.SpanContext`` carrying the ``SAMPLED`` trace flag,
        or ``None`` when no such action exists or the stored ids cannot be
        parsed as hexadecimal.

    Raises:
        sqlite3.Error: if the lookup itself fails. The connection is closed
            before the error propagates.
    """
    conn = sqlite3.connect(GAME_SESSIONS_DB)
    try:
        cursor = conn.cursor()

        # Debug logging
        logger.info(f"Looking for action: session={game_session_id}, target_sequence={target_sequence}")

        cursor.execute('''
    SELECT trace_id, span_id FROM game_actions 
    WHERE game_session_id = ? AND action_sequence = ?
    ''', (game_session_id, target_sequence))

        result = cursor.fetchone()
    finally:
        # Close on every path, including a failing query.
        conn.close()

    if not result:
        logger.info(f"No action found for sequence {target_sequence}")
        return None

    try:
        # Debug logging
        logger.info(f"Found target action: trace_id={result[0]}, span_id={result[1]}")

        # The ids are stored as hex strings; SpanContext expects integers.
        trace_id = int(result[0], 16)
        span_id = int(result[1], 16)

        # Create span context with proper trace flags
        span_context = trace.SpanContext(
            trace_id=trace_id,
            span_id=span_id,
            is_remote=True,
            trace_flags=trace.TraceFlags.SAMPLED
        )

        logger.info(f"Created span context for linking: trace_id={trace_id:032x}, span_id={span_id:016x}")
        return span_context
    except (ValueError, TypeError) as e:
        # ValueError: not valid hex; TypeError: a stored id is not a string.
        logger.error(f"Failed to reconstruct span context: {e}")
        return None

def create_span_link_from_context(span_context, link_type="game_sequence"):
    """Wrap ``span_context`` in a ``trace.Link`` tagged as a game-sequence link.

    The link carries three attributes: ``link.type`` (the ``link_type``
    argument), ``link.relation`` (always ``"follows"``) and ``game.sequence``
    (always the string ``"true"``).

    Returns ``None`` when ``span_context`` is ``None`` or when building the
    link fails; a failure is logged rather than raised.
    """
    if span_context is None:
        return None

    link_attributes = {
        "link.type": link_type,
        "link.relation": "follows",
        "game.sequence": "true",
    }
    try:
        return trace.Link(span_context, attributes=link_attributes)
    except Exception as exc:
        logger.error(f"Failed to create span link: {exc}")
        return None

@app.after_request
def remove_frame_options(response):
    """Strip the ``X-Frame-Options`` header from every outgoing response.

    Registered as an ``after_request`` hook, so it runs for each response the
    app produces. A response without the header is returned untouched.
    """
    # The None default makes a missing header a no-op instead of an error.
    response.headers.pop('X-Frame-Options', None)
    return response

# Configuration
DATABASE_FILE = os.getenv('DATABASE_FILE', '../app/game_state.db')
API_BASE_URL = os.getenv('API_BASE_URL', 'http://localhost')  # Base URL for API calls

# Location server port per location id (mirrors game_config.py). Both maps
# occupy the same eight server slots, so the n-th location of either map is
# served on port 5000 + n; the White Walkers Attack ids are plain aliases of
# the War of Kingdoms ports.
LOCATION_PORTS = {
    location_id: port
    for slot_ordered_ids in (
        # War of Kingdoms, in slot order.
        (
            "southern_capital",
            "northern_capital",
            "village_1",
            "village_2",
            "village_3",
            "village_4",
            "village_5",
            "village_6",
        ),
        # White Walkers Attack, in the same slot order.
        (
            "nights_watch_fortress",
            "white_walker_fortress",
            "wall_west",
            "wall_center_west",
            "wall_center_east",
            "wall_east",
            "barbarian_village_west",
            "barbarian_village_east",
        ),
    )
    for port, location_id in enumerate(slot_ordered_ids, start=5001)
}

# Docker hostname serving each logical location id. White Walkers Attack runs
# on the same eight slot containers as War of Kingdoms, so its ids map onto
# the WoK container names position by position. Without this aliasing,
# ``location_id.replace('_', '-')`` yields hostnames such as
# ``nights-watch-fortress`` that don't exist in the docker network, and the
# /map render returns an empty locations dict (blank map).
CONTAINER_FOR_LOCATION_ID = {
    location_id: container
    for slot_ordered_ids in (
        # War of Kingdoms, in slot order.
        (
            "southern_capital",
            "northern_capital",
            "village_1",
            "village_2",
            "village_3",
            "village_4",
            "village_5",
            "village_6",
        ),
        # White Walkers Attack, in the same slot order.
        (
            "nights_watch_fortress",
            "white_walker_fortress",
            "wall_west",
            "wall_center_west",
            "wall_center_east",
            "wall_east",
            "barbarian_village_west",
            "barbarian_village_east",
        ),
    )
    for location_id, container in zip(
        slot_ordered_ids,
        (
            "southern-capital",
            "northern-capital",
            "village-1",
            "village-2",
            "village-3",
            "village-4",
            "village-5",
            "village-6",
        ),
    )
}

# Docker-compose hostname of each slot, slot_1 .. slot_8. Stable across maps.
SLOT_CONTAINER_NAMES = {
    f"slot_{slot_number}": hostname
    for slot_number, hostname in enumerate(
        (
            "southern-capital",
            "northern-capital",
            "village-1",
            "village-2",
            "village-3",
            "village-4",
            "village-5",
            "village-6",
        ),
        start=1,
    )
}

# Port of each slot: slot_N listens on 5000 + N.
SLOT_PORTS = {f"slot_{slot_number}": 5000 + slot_number for slot_number in range(1, 9)}


def _container_for_slot(slot_id):
    """Look up the docker-compose service name that hosts ``slot_id``.

    Unknown slot ids fall back to the id itself with underscores turned into
    hyphens.
    """
    hyphenated_id = slot_id.replace('_', '-')
    return SLOT_CONTAINER_NAMES.get(slot_id, hyphenated_id)


def _slot_port_pairs():
    """Return a new list of ``(slot_id, port)`` tuples for every slot in ``SLOT_PORTS``."""
    return [(slot_id, port) for slot_id, port in SLOT_PORTS.items()]

# LOCATION_POSITIONS and LOCATION_CONNECTIONS are defined earlier (as the
# WoK default slices of the LOCATION_*_BY_MAP dicts). Legacy call sites that
# still reference the unsuffixed names get the WoK layout; new code should
# go through _current_positions() / _current_connections().

# Game state - track victory conditions (local process cache; also read
# from wall_hold on map WWA).
# Note: GAME_OVER/WINNER/VICTORY_MESSAGE already declared near top of file.

def get_db_connection():
    """Open a new connection to ``DATABASE_FILE``.

    Rows come back as ``sqlite3.Row`` objects, so columns can be read by name
    as well as by index. The caller is responsible for closing the connection.
    """
    connection = sqlite3.connect(DATABASE_FILE)
    connection.row_factory = sqlite3.Row
    return connection


# ----------------------------------------------------------------
# Active map + wall-hold state (lives in game_state.db so location
# servers and war_map agree on the single source of truth).
# ----------------------------------------------------------------

def _ensure_game_config_tables():
    """Create ``game_config``, ``faction_economy`` and ``wall_hold`` if missing.

    Also seeds ``game_config.active_map_id`` with ``DEFAULT_MAP_ID`` when no
    row exists yet, and makes a best-effort attempt to add a ``map_id``
    column to ``war_map``.

    NOTE(review): this function does not touch the UNIQUE constraint on
    ``war_map.faction``; if single-player maps need it relaxed, that
    migration has to happen elsewhere.

    Any ``sqlite3.Error`` is logged and swallowed. The connection is closed
    on every path, including failures.
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS game_config (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL
        )
        ''')
        # INSERT OR IGNORE keeps an already-chosen map untouched.
        cursor.execute(
            "INSERT OR IGNORE INTO game_config (key, value) VALUES ('active_map_id', ?)",
            (DEFAULT_MAP_ID,),
        )
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS faction_economy (
            faction TEXT PRIMARY KEY,
            corpses INTEGER NOT NULL DEFAULT 0
        )
        ''')
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS wall_hold (
            map_id TEXT NOT NULL,
            faction TEXT NOT NULL,
            ticks INTEGER NOT NULL DEFAULT 0,
            last_update INTEGER NOT NULL,
            PRIMARY KEY (map_id, faction)
        )
        ''')
        # war_map table: additive map_id column for session-level bookkeeping.
        try:
            cursor.execute("ALTER TABLE war_map ADD COLUMN map_id TEXT")
        except sqlite3.OperationalError as e:
            # Expected when the column already exists or war_map has not
            # been created yet; deliberately non-fatal.
            logger.debug(f"Skipping war_map.map_id migration: {e}")
        conn.commit()
    except sqlite3.Error as e:
        logger.error(f"Failed to ensure game_config tables: {e}")
    finally:
        if conn is not None:
            conn.close()


def get_active_map_id():
    """Return the active map id stored in ``game_config``.

    Reads the ``active_map_id`` row on every call (there is no in-process
    cache). Falls back to ``DEFAULT_MAP_ID`` when the row is missing or any
    ``sqlite3.Error`` occurs, e.g. the table has not been created yet.
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute("SELECT value FROM game_config WHERE key = 'active_map_id'")
        row = cursor.fetchone()
        return row['value'] if row else DEFAULT_MAP_ID
    except sqlite3.Error:
        return DEFAULT_MAP_ID
    finally:
        # Close on the error path too so a failed query does not leak the handle.
        if conn is not None:
            conn.close()


def set_active_map_id(map_id):
    """Persist ``map_id`` as the ``active_map_id`` row of ``game_config``.

    Uses an upsert, so it works whether or not the row already exists.

    Returns:
        ``True`` when the write was committed, ``False`` when a
        ``sqlite3.Error`` occurred (the error is logged).
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO game_config (key, value) VALUES ('active_map_id', ?) "
            "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
            (map_id,),
        )
        conn.commit()
        return True
    except sqlite3.Error as e:
        logger.error(f"Failed to set active map id: {e}")
        return False
    finally:
        # Always release the connection, even when the write failed.
        if conn is not None:
            conn.close()


def reset_wall_hold(map_id):
    """Clear the wall-hold counters of every faction on ``map_id``.

    The rows are deleted rather than set to 0, so ``get_wall_hold`` returns
    an empty dict for the map afterwards. A ``sqlite3.Error`` is logged and
    swallowed.
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute("DELETE FROM wall_hold WHERE map_id = ?", (map_id,))
        conn.commit()
    except sqlite3.Error as e:
        logger.error(f"Failed to reset wall_hold for {map_id}: {e}")
    finally:
        # Always release the connection, even when the delete failed.
        if conn is not None:
            conn.close()


def bump_wall_hold(map_id, faction, reset_others=True):
    """Increment ``faction``'s tick count on ``map_id``.

    Args:
        map_id: Map whose counters are updated.
        faction: Faction credited with one more tick.
        reset_others: When true, every other faction on the same map is set
            back to 0 ticks before the increment.

    Returns:
        The faction's tick count after the increment, or 0 when a
        ``sqlite3.Error`` occurred (the error is logged).
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        now = int(time.time())
        if reset_others:
            cursor.execute(
                "UPDATE wall_hold SET ticks = 0 WHERE map_id = ? AND faction != ?",
                (map_id, faction),
            )
        # Upsert: first tick inserts the row at 1, later ticks add 1.
        cursor.execute(
            "INSERT INTO wall_hold (map_id, faction, ticks, last_update) "
            "VALUES (?, ?, 1, ?) "
            "ON CONFLICT(map_id, faction) DO UPDATE SET "
            "ticks = ticks + 1, last_update = excluded.last_update",
            (map_id, faction, now),
        )
        # Read back inside the same transaction so the value matches the write.
        cursor.execute(
            "SELECT ticks FROM wall_hold WHERE map_id = ? AND faction = ?",
            (map_id, faction),
        )
        row = cursor.fetchone()
        conn.commit()
        return int(row['ticks']) if row else 0
    except sqlite3.Error as e:
        logger.error(f"Failed to bump wall_hold: {e}")
        return 0
    finally:
        # Closing without a commit discards a half-applied update.
        if conn is not None:
            conn.close()


def get_wall_hold(map_id):
    """Return ``{faction: ticks}`` for every ``wall_hold`` row of ``map_id``.

    Returns an empty dict when the map has no rows or a ``sqlite3.Error``
    occurs.
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute("SELECT faction, ticks FROM wall_hold WHERE map_id = ?", (map_id,))
        return {r['faction']: int(r['ticks']) for r in cursor.fetchall()}
    except sqlite3.Error:
        return {}
    finally:
        # Close on the error path too so a failed query does not leak the handle.
        if conn is not None:
            conn.close()


def get_faction_corpses(faction):
    """Return the ``corpses`` value stored for ``faction`` in ``faction_economy``.

    Returns 0 when the faction has no row yet or a ``sqlite3.Error`` occurs.
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute("SELECT corpses FROM faction_economy WHERE faction = ?", (faction,))
        row = cursor.fetchone()
        return int(row['corpses']) if row else 0
    except sqlite3.Error:
        return 0
    finally:
        # Close on the error path too so a failed query does not leak the handle.
        if conn is not None:
            conn.close()


def check_faction_availability(faction):
    """Report whether ``faction`` is still unclaimed in ``war_map``.

    Creates the ``war_map`` table on first use. ``CREATE TABLE IF NOT
    EXISTS`` is used instead of a separate existence check so two callers
    racing on an empty database cannot both try to create it.

    Returns:
        ``True`` if no row claims the faction, ``False`` if it is taken or
        a ``sqlite3.Error`` occurred (fails closed; the error is logged).
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS war_map (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                faction TEXT UNIQUE NOT NULL,
                player_name TEXT,
                session_id TEXT UNIQUE
            )
            ''')
        conn.commit()

        # Only existence matters, so avoid fetching the whole row.
        cursor.execute("SELECT 1 FROM war_map WHERE faction = ? LIMIT 1", (faction,))
        available = cursor.fetchone() is None

        logger.info(f"Faction availability check: {available}")
        return available  # True if available, False if taken
    except sqlite3.Error as e:
        logger.error(f"Database error: {e}")
        return False
    finally:
        if conn is not None:
            conn.close()

def register_faction(faction, player_name, session_id):
    """Insert a ``war_map`` row claiming ``faction`` for a player session.

    Returns:
        ``True`` when the row was committed, ``False`` on any
        ``sqlite3.Error`` (logged). A constraint violation, such as the
        faction or session id already being registered, is reported the
        same way.
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()

        # Try to insert the new faction record
        cursor.execute(
            "INSERT INTO war_map (faction, player_name, session_id) VALUES (?, ?, ?)",
            (faction, player_name, session_id)
        )
        conn.commit()
        logger.info(f"Faction registered: {faction} for {player_name} with session ID {session_id}")
        return True
    except sqlite3.Error as e:
        logger.error(f"Database error when registering faction: {e}")
        return False
    finally:
        # A rejected insert is a routine outcome here, so the close must
        # not be skipped on the error path.
        if conn is not None:
            conn.close()

def get_player_faction(session_id):
    """Return the faction registered for ``session_id`` in ``war_map``.

    Returns ``None`` when the session has no row or a ``sqlite3.Error``
    occurs (the error is logged).
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()

        cursor.execute("SELECT faction FROM war_map WHERE session_id = ?", (session_id,))
        result = cursor.fetchone()

        faction = result['faction'] if result else None
        logger.info(f"Player faction retrieved: {faction}")
        return faction
    except sqlite3.Error as e:
        logger.error(f"Database error: {e}")
        return None
    finally:
        # Close on the error path too so a failed query does not leak the handle.
        if conn is not None:
            conn.close()

def release_faction(session_id):
    """Delete the ``war_map`` row belonging to ``session_id``.

    Returns:
        ``True`` when the delete was committed (even if no row matched),
        ``False`` on a ``sqlite3.Error`` (logged).
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()

        cursor.execute("DELETE FROM war_map WHERE session_id = ?", (session_id,))
        conn.commit()
        logger.info(f"Faction released for session ID: {session_id}")
        return True
    except sqlite3.Error as e:
        logger.error(f"Database error when releasing faction: {e}")
        return False
    finally:
        # Always release the connection, even when the delete failed.
        if conn is not None:
            conn.close()

def release_all_factions():
    """Delete every row of ``war_map``; used when the game is reset.

    Returns:
        ``True`` when the delete was committed, ``False`` on a
        ``sqlite3.Error`` (logged).
    """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()

        cursor.execute("DELETE FROM war_map")
        conn.commit()
        logger.info("All factions released")
        return True
    except sqlite3.Error as e:
        logger.error(f"Database error when releasing all factions: {e}")
        return False
    finally:
        # Always release the connection, even when the delete failed.
        if conn is not None:
            conn.close()

def get_location_url(location_id):
    """Build the base URL (``http://host:port``) of a location's API.

    When the ``IN_DOCKER`` environment variable is set, the host is the
    container name from ``CONTAINER_FOR_LOCATION_ID``, falling back to the
    location id with underscores replaced by hyphens. Otherwise the host is
    ``localhost``. The port always comes from ``LOCATION_PORTS``; an unknown
    ``location_id`` raises ``KeyError``.
    """
    if not os.environ.get('IN_DOCKER'):
        host = 'localhost'
    else:
        hyphenated = location_id.replace('_', '-')
        host = CONTAINER_FOR_LOCATION_ID.get(location_id, hyphenated)

    return "http://{}:{}".format(host, LOCATION_PORTS[location_id])

def _send_location_request(url, method, data, headers, timeout):
    """Send one HTTP request to a location server and return the response."""
    if method == 'GET':
        return requests.get(url, headers=headers, timeout=timeout)
    # Any non-GET method is sent as a POST with a JSON body.
    return requests.post(url, json=data, headers=headers, timeout=timeout)


def make_api_request(location_id, endpoint, method='GET', data=None, timeout=10):
    """Make an API request to a location server with trace context.

    Args:
        location_id: Location whose server is called (see ``get_location_url``).
        endpoint: Path appended to the location's base URL.
        method: ``'GET'`` for a GET; anything else is sent as a POST.
        data: JSON-serialisable body for POST requests.
        timeout: Seconds passed to ``requests`` as the timeout, so an
            unresponsive location server cannot block the caller forever.

    Returns:
        The decoded JSON body on success, or ``{"error": "<message>"}`` when
        the request failed, timed out, returned an HTTP error status, or
        returned a body that is not valid JSON.
    """
    url = f"{get_location_url(location_id)}/{endpoint}"

    # Only create spans for important operations, not for status checks
    important_endpoints = {'move_army', 'all_out_attack', 'send_resources_to_capital', 'receive_army', 'receive_resources', 'collect_resources', 'create_army'}

    headers = {"Content-Type": "application/json"}

    if endpoint not in important_endpoints:
        # Status checks and other low-value calls: plain request, no span.
        try:
            response = _send_location_request(url, method, data, headers, timeout)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException.
            return {"error": str(e)}

    with tracer.start_as_current_span(
        "location_api_request",
        kind=SpanKind.CLIENT,
        attributes={
            "location.id": location_id,
            "location.endpoint": endpoint,
            "http.method": method
        }
    ) as span:
        inject(headers)  # Inject trace context into headers
        try:
            response = _send_location_request(url, method, data, headers, timeout)
            span.set_attribute("http.status_code", response.status_code)
            response.raise_for_status()
            result = response.json()
        except (requests.RequestException, ValueError) as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return {"error": str(e)}

        # A 2xx reply can still report an application-level failure.
        if isinstance(result, dict) and not result.get("success", True):
            span.set_status(trace.StatusCode.ERROR, result.get("message", "Unknown error"))

        return result

def check_game_over(locations_data, map_id=None):
    """Run the win-condition check that belongs to the given (or active) map.

    ``map_id`` defaults to the value returned by ``get_active_map_id()``.
    Returns whatever the selected check returns.
    """
    active_map = get_active_map_id() if map_id is None else map_id

    if active_map != "white_walkers_attack":
        return check_capital_capture_win(locations_data)

    # WWA games end only via hold-the-walls. Capital captures do not end
    # the game (the capital can change hands mid-match).
    return check_wall_hold_win(locations_data, active_map)


def check_capital_capture_win(locations_data):
    """Classic WoK win check: a capital is held by the opposing faction.

    On a win, sets the module-level ``GAME_OVER``, ``WINNER`` and
    ``VICTORY_MESSAGE`` and returns ``True``; otherwise logs and returns
    ``False``. The southern capital is examined first.
    """
    global GAME_OVER, WINNER, VICTORY_MESSAGE

    # (capital location id, faction that wins by holding it, victory text)
    outcomes = (
        (
            'southern_capital',
            'northern',
            "The Northern Kingdom has conquered the Southern Capital! Victory through unity!",
        ),
        (
            'northern_capital',
            'southern',
            "The Southern Kingdom has conquered the Northern Capital! Glory to the South!",
        ),
    )

    for capital_id, conqueror, message in outcomes:
        holder = locations_data.get(capital_id, {}).get('faction')
        if holder == conqueror:
            GAME_OVER = True
            WINNER = conqueror
            VICTORY_MESSAGE = message
            return True

    logger.info("Game is not over")
    return False


def check_wall_hold_win(locations_data, map_id):
    """White Walkers Attack win check based on the stored wall-hold counters.

    Reads the map's ``win_hold_ticks`` threshold from ``MAPS_META`` and the
    per-faction counters from ``get_wall_hold``. The first faction whose
    count has reached the threshold is declared the winner via the
    module-level ``GAME_OVER`` / ``WINNER`` / ``VICTORY_MESSAGE``. This
    function only reads the counters; it never increments them.
    ``locations_data`` is accepted for signature parity and is not used.

    Returns ``True`` when a winner was declared, ``False`` otherwise
    (including when the threshold is missing or not positive).
    """
    global GAME_OVER, WINNER, VICTORY_MESSAGE

    threshold = MAPS_META.get(map_id, {}).get("win_hold_ticks", 0)
    if not threshold > 0:
        return False

    scripted_messages = {
        "nights_watch": "The Night's Watch held the Wall! The Long Night is broken.",
        "white_walkers": "The Wall has fallen. The Long Night has come for Westeros.",
    }

    holds = get_wall_hold(map_id)
    for faction, ticks in holds.items():
        if ticks < threshold:
            continue
        GAME_OVER = True
        WINNER = faction
        if faction in scripted_messages:
            VICTORY_MESSAGE = scripted_messages[faction]
        else:
            VICTORY_MESSAGE = f"{faction.title()} held every Wall keep for {threshold} ticks."
        return True

    logger.debug(f"Wall hold check: {holds} (threshold {threshold})")
    return False

def reset_game_state():
    """Clear the in-process victory flags (game over, winner, victory message)."""
    global GAME_OVER, WINNER, VICTORY_MESSAGE
    GAME_OVER, WINNER, VICTORY_MESSAGE = False, None, None

def reset_game_data():
    """Reset the whole game: local flags, AI, faction claims, action log, locations.

    Steps, in order:
      1. Clear the in-process victory flags.
      2. Ask the AI service to deactivate (best effort).
      3. Release every faction claim.
      4. Empty ``game_actions`` in the game-session database (best effort).
      5. POST ``reset`` to one location server.

    Returns:
        ``True`` only when the location reset call succeeded. ``False``
        when it raised, returned an ``{"error": ...}`` payload, or
        reported ``success`` as false.
    """
    # First, reset our local game state
    reset_game_state()

    # Deactivate AI if it's running
    try:
        requests.post(f"{AI_SERVICE_URL}/deactivate", timeout=5)
        logger.info("AI deactivated during game reset")
    except Exception as e:
        logger.warning(f"Failed to deactivate AI during reset: {e}")

    # Next, reset all faction assignments
    release_all_factions()

    # Clear the game session tracking database
    sessions_conn = None
    try:
        sessions_conn = sqlite3.connect(GAME_SESSIONS_DB)
        cursor = sessions_conn.cursor()
        cursor.execute("DELETE FROM game_actions")
        sessions_conn.commit()
        logger.info("Game session tracking database cleared")
    except Exception as e:
        logger.warning(f"Failed to clear game session database: {e}")
    finally:
        # Close even when the delete failed (e.g. the table does not exist).
        if sessions_conn is not None:
            sessions_conn.close()

    # Finally, reset one location to trigger a database reset
    # (Since they all share the same database, we only need to reset one)
    try:
        result = make_api_request('southern_capital', 'reset', method='POST')
    except Exception as e:
        logger.error(f"Error resetting game data: {e}")
        return False

    # make_api_request reports transport/HTTP failures as a payload, not an
    # exception, so the payload has to be inspected to detect a failed reset.
    if isinstance(result, dict):
        if 'error' in result:
            logger.error(f"Error resetting game data: {result['error']}")
            return False
        if not result.get('success', True):
            logger.error(f"Error resetting game data: {result.get('message', 'Unknown error')}")
            return False

    logger.info("Game data reset")
    return True

@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: always answers with ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return jsonify(payload)

@app.route('/')
def index():
    """Landing page.

    Redirects to the game map when the session already owns a faction, to
    the map picker when no map has been chosen, and otherwise renders
    ``index.html`` in either single-player or two-faction mode depending on
    the chosen map's metadata.
    """
    _ensure_game_config_tables()

    # A session that already holds a faction goes straight to the map.
    has_session = 'session_id' in session
    if has_session and get_player_faction(session['session_id']):
        return redirect(url_for('game_map'))

    # Without a chosen map, send the visitor to the map picker.
    if 'map_id' not in session:
        return redirect(url_for('map_picker'))

    map_id = session['map_id']
    meta = MAPS_META.get(map_id, MAPS_META[DEFAULT_MAP_ID])
    context = {'map_id': map_id, 'map_meta': meta}

    if not meta["single_player"]:
        # Classic WoK two-faction flow.
        southern_available = check_faction_availability('southern')
        northern_available = check_faction_availability('northern')
        logger.info(f"Southern available: {southern_available}, Northern available: {northern_available}")
        context.update(
            single_player=False,
            southern_available=southern_available,
            northern_available=northern_available,
        )
        return render_template('index.html', **context)

    # Single-player maps skip the faction cards; the page offers a single
    # button that posts back the preset player faction.
    player_faction = meta["player_faction"]
    context.update(
        single_player=True,
        player_faction=player_faction,
        player_available=check_faction_availability(player_faction),
        southern_available=False,
        northern_available=False,
    )
    return render_template('index.html', **context)


@app.route('/map_picker')
def map_picker():
    """Render the map selection page, passing ``MAPS_META`` to the template as ``maps``."""
    _ensure_game_config_tables()
    template_context = {'maps': MAPS_META}
    return render_template('map_picker.html', **template_context)


@app.route('/select_map', methods=['POST'])
def select_map():
    """Persist the chosen map as active + reload every location service.

    Steps:
      1. Write ``active_map_id`` to game_config and clear the wall-hold
         counters of every known map. If the write fails, nothing else is
         changed and the user is sent back to the map picker.
      2. Reset the locations table via one location's ``/reset`` (shared DB).
      3. POST ``/reload`` to every slot.
      4. Release all faction claims and clear per-game session keys, then
         remember the chosen map in the session.
      5. Reset the in-process game-over flags and redirect to the index
         page. AI activation is NOT done here; ``select_faction`` does it
         once the player commits to a faction on a single-player map.

    An unknown ``map_id`` redirects back to the map picker.
    """
    map_id = request.form.get('map_id') or DEFAULT_MAP_ID
    if map_id not in MAPS_META:
        logger.error(f"Unknown map_id: {map_id}")
        return redirect(url_for('map_picker'))

    in_docker = bool(os.environ.get('IN_DOCKER'))

    with tracer.start_as_current_span(
        "select_map",
        kind=SpanKind.SERVER,
        attributes={"game.map.id": map_id},
    ) as span:
        # 1. Persist the choice. Continuing after a failed write would put
        # this session on a map the location services never switched to.
        if not set_active_map_id(map_id):
            logger.error(f"Could not persist active map {map_id}; map switch aborted")
            span.set_status(trace.StatusCode.ERROR, "Failed to persist active map id")
            return redirect(url_for('map_picker'))

        # Clear all maps' old counters to avoid stale wins after switching
        # (map_id is in MAPS_META, so the chosen map is covered too).
        for mid in MAPS_META:
            reset_wall_hold(mid)

        # 2. Reset locations rows to match the new map.
        try:
            # Any one container will do — the DB is shared. Use the first
            # slot, whose service name and port are stable across maps.
            reset_slot = "slot_1"
            reset_host = _container_for_slot(reset_slot) if in_docker else "localhost"
            requests.post(
                f"http://{reset_host}:{SLOT_PORTS[reset_slot]}/reset",
                timeout=5,
            )
        except Exception as e:
            logger.warning(f"Failed to reset location rows during map switch: {e}")

        # 3. Kick every slot to reload identity.
        for slot_id, port in _slot_port_pairs():
            try:
                host = _container_for_slot(slot_id) if in_docker else "localhost"
                requests.post(f"http://{host}:{port}/reload", timeout=5)
            except Exception as e:
                logger.warning(f"Failed to /reload {slot_id}: {e}")

        # 4. Clear faction claims + session data so the new map starts clean.
        release_all_factions()
        session.pop('faction', None)
        session.pop('player_name', None)
        session.pop('game_session_id', None)
        session.pop('action_sequence', None)
        session.pop('session_id', None)
        session['map_id'] = map_id

        meta = MAPS_META[map_id]

        # 5. Single-player: AI activation is deferred until the player clicks
        # "Take the Black" on the index page (so the player always explicitly
        # starts the game). But we do pre-reset the game-over flags.
        reset_game_state()
        span.set_attribute("single_player", meta["single_player"])

    return redirect(url_for('index'))


@app.route('/select_faction', methods=['POST'])
def select_faction():
    """Handle the faction form (two-faction WoK maps or a single-player preset).

    Validates the posted faction against the map's ``factions`` list,
    checks it is still free, registers it for the session and redirects to
    the game map. On single-player maps with an ``ai_faction`` the AI
    service is asked to activate as that faction. Every failure re-renders
    ``index.html`` with an ``error`` message.
    """
    map_id = session.get('map_id', DEFAULT_MAP_ID)
    meta = MAPS_META.get(map_id, MAPS_META[DEFAULT_MAP_ID])

    def _reject(error, recheck_player=False):
        """Re-render the entry page with ``error`` and fresh availability flags."""
        single_player = meta["single_player"]
        preset_faction = meta.get("player_faction")
        southern_free = check_faction_availability('southern')
        northern_free = check_faction_availability('northern')
        player_free = False
        if recheck_player and meta["single_player"]:
            player_free = check_faction_availability(meta.get("player_faction"))
        return render_template(
            'index.html',
            map_id=map_id,
            map_meta=meta,
            single_player=single_player,
            player_faction=preset_faction,
            southern_available=southern_free,
            northern_available=northern_free,
            player_available=player_free,
            error=error,
        )

    faction = request.form.get('faction')
    player_name = request.form.get('player_name', 'Unknown Player')

    allowed = set(meta.get("factions", []))
    if not faction or faction not in allowed:
        return _reject("Invalid faction selected", recheck_player=True)

    # Someone else may already hold this faction.
    if not check_faction_availability(faction):
        logger.info(f"Faction {faction} is already taken")
        return _reject(f"The {faction.replace('_', ' ').title()} faction is already taken")

    # Make sure the browser session has an id to register under.
    if 'session_id' not in session:
        session['session_id'] = str(uuid.uuid4())

    # A separate game-session id drives span linking between actions.
    if 'game_session_id' not in session:
        session['game_session_id'] = str(uuid.uuid4())
        session['action_sequence'] = 0  # Initialize action sequence
        logger.info(f"Initialized game session: {session['game_session_id']}")

    if not register_faction(faction, player_name, session['session_id']):
        logger.error(f"Failed to register faction {faction}")
        return _reject(f"Failed to register {faction.replace('_', ' ').title()} faction")

    session['faction'] = faction
    session['player_name'] = player_name
    session['is_ai'] = False  # Human player by default
    logger.info(f"Player {player_name} selected faction {faction} on map {map_id}")

    # Single-player maps: bring the AI in as the preset enemy as soon as
    # the human has committed to playing.
    if meta["single_player"] and meta.get("ai_faction"):
        try:
            requests.post(
                f"{AI_SERVICE_URL}/activate",
                json={"faction": meta["ai_faction"], "map_id": map_id},
                timeout=5,
            )
            logger.info(f"Auto-activated AI as {meta['ai_faction']} for single-player map {map_id}")
        except Exception as e:
            logger.warning(f"Auto-activation of AI failed: {e}")

    return redirect(url_for('game_map'))

@app.route('/logout')
def logout():
    """Release the session's faction (if any), wipe the session, go home."""
    if 'session_id' in session:
        current_session_id = session['session_id']
        release_faction(current_session_id)
        logger.info(f"Faction released for session ID: {current_session_id}")

    # Drop everything stored for this browser session.
    session.clear()
    return redirect(url_for('index'))

@app.route('/restart-game')
def restart_game():
    """Reset the game, clear the caller's session and redirect to the home page.

    The redirect carries ``reset`` and ``message`` query parameters that
    describe whether ``reset_game_data`` reported success.
    """
    logger.info("Game restart initiated")

    # Capture who asked for the restart before the session is wiped.
    current_player = session.get('player_name', 'Unknown')
    current_faction = session.get('faction', 'Unknown')

    success = reset_game_data()

    # The initiating user's session is cleared regardless of the outcome.
    session.clear()

    if not success:
        logger.error(f"Game restart failed, initiated by {current_player} ({current_faction})")
        return redirect(url_for('index') + '?reset=failed&message=Game reset failed, please try again')

    logger.info(f"Game successfully restarted by {current_player} ({current_faction})")
    return redirect(url_for('index') + '?reset=success&message=Game has been reset successfully')

@app.route('/map')
def game_map():
    """Render ``map.html`` for the map stored in the session (or the active map).

    Visitors without a faction in their session are redirected to the index
    page. Location state is fetched from each location server; locations
    whose request returned an ``error`` payload are left out of the page.
    """
    if 'faction' not in session:
        return redirect(url_for('index'))

    map_id = session.get('map_id') or get_active_map_id()
    positions = LOCATION_POSITIONS_BY_MAP.get(map_id, LOCATION_POSITIONS_BY_MAP[DEFAULT_MAP_ID])
    connections = LOCATION_CONNECTIONS_BY_MAP.get(map_id, LOCATION_CONNECTIONS_BY_MAP[DEFAULT_MAP_ID])
    meta = MAPS_META.get(map_id, MAPS_META[DEFAULT_MAP_ID])

    faction = session['faction']
    player_name = session.get('player_name', 'Unknown Player')

    # Merge each location's static position with its live state.
    locations_data = {}
    for loc_id, position in positions.items():
        data = make_api_request(loc_id, '')
        if 'error' in data:
            continue
        entry = dict(position)
        entry['faction'] = data['faction']
        entry['resources'] = data['resources']
        entry['army'] = data['army']
        locations_data[loc_id] = entry

    # Updates the module-level victory flags for the current map.
    check_game_over(locations_data, map_id=map_id)

    # Only the White Walkers Attack map gets a wall-hold HUD payload.
    wall_hold_state = None
    if map_id == "white_walkers_attack":
        wall_hold_state = {
            "threshold": meta.get("win_hold_ticks", 0),
            "holds": get_wall_hold(map_id),
            "walls": WALL_LOCATIONS_BY_MAP.get(map_id, []),
        }

    template_context = dict(
        player_name=player_name,
        faction=faction,
        map_id=map_id,
        map_meta=meta,
        locations=locations_data,
        connections=connections,
        wall_hold=wall_hold_state,
        game_over=GAME_OVER,
        winner=WINNER,
        victory_message=VICTORY_MESSAGE,
    )
    return render_template('map.html', **template_context)

@app.route('/api/collect_resources', methods=['POST'])
def collect_resources():
    """API endpoint to collect resources at a location.

    Expects a JSON body with ``location_id``. Responds 400 when the body is
    missing, is not a JSON object, or has no ``location_id``. Otherwise the
    request is forwarded to the location server and its payload is returned
    as JSON.

    The action is recorded for span linking only when the forwarded call
    succeeded: no ``error`` key and ``success`` not false.
    """
    # Get game session info for span linking
    game_session_id = session.get('game_session_id')
    current_sequence = session.get('action_sequence', 0)

    # Get previous action context for linking
    links = []
    if game_session_id and current_sequence > 0:
        previous_span_context = get_previous_action_context(game_session_id, current_sequence)
        if previous_span_context:
            link = create_span_link_from_context(previous_span_context, "game_sequence")
            if link:
                links.append(link)

    with tracer.start_as_current_span(
        "collect_resources",
        kind=SpanKind.SERVER,
        links=links,
        attributes={
            "player.name": session.get('player_name', 'Unknown'),
            "player.faction": session.get('faction', 'Unknown'),
            "game.session.id": game_session_id,
            "game.action.type": "collect_resources",
            "game.action.sequence": current_sequence + 1
        }
    ) as span:
        # silent=True yields None instead of raising on a missing/invalid
        # body, so malformed requests get the 400 below rather than a crash.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict):
            payload = {}

        location_id = payload.get('location_id')
        if not location_id:
            logger.error("Location ID required")
            span.set_status(trace.StatusCode.ERROR, "Location ID required")
            return jsonify({"error": "Location ID required"}), 400

        span.set_attribute("location_id", location_id)

        result = make_api_request(location_id, 'collect_resources', method='POST')
        logger.info(f"Collect resources result: {result}")

        # make_api_request reports failures as {"error": ...}; those must
        # not be recorded as completed actions in the game sequence.
        succeeded = (
            isinstance(result, dict)
            and 'error' not in result
            and result.get('success', True)  # Assume success if not specified
        )

        # Store this action for future span linking
        if game_session_id and succeeded:
            try:
                next_sequence = store_game_action(
                    game_session_id=game_session_id,
                    action_type="collect_resources",
                    player_name=session.get('player_name'),
                    faction=session.get('faction'),
                    trace_id=format(span.get_span_context().trace_id, '032x'),
                    span_id=format(span.get_span_context().span_id, '016x'),
                    location_id=location_id
                )
                session['action_sequence'] = next_sequence
                logger.info(f"Stored game action {next_sequence} for session {game_session_id}")
            except Exception as e:
                logger.error(f"Failed to store game action: {e}")

        return jsonify(result)

@app.route('/api/create_army', methods=['POST'])
def create_army():
    """API endpoint to create an army at a location.

    Expects a JSON body with ``location_id``. Responds 400 when the body is
    missing, is not a JSON object, or has no ``location_id``. Otherwise the
    request is forwarded to the location server and its payload is returned
    as JSON.

    The action is recorded for span linking only when the forwarded call
    succeeded: no ``error`` key and ``success`` not false.
    """
    # Get game session info for span linking
    game_session_id = session.get('game_session_id')
    current_sequence = session.get('action_sequence', 0)

    # Get previous action context for linking
    links = []
    if game_session_id and current_sequence > 0:
        previous_span_context = get_previous_action_context(game_session_id, current_sequence)
        if previous_span_context:
            link = create_span_link_from_context(previous_span_context, "game_sequence")
            if link:
                links.append(link)

    with tracer.start_as_current_span(
        "create_army",
        kind=SpanKind.SERVER,
        links=links,
        attributes={
            "player.name": session.get('player_name', 'Unknown'),
            "player.faction": session.get('faction', 'Unknown'),
            "game.session.id": game_session_id,
            "game.action.type": "create_army",
            "game.action.sequence": current_sequence + 1
        }
    ) as span:
        # silent=True yields None instead of raising on a missing/invalid
        # body, so malformed requests get the 400 below rather than a crash.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict):
            payload = {}

        location_id = payload.get('location_id')
        if not location_id:
            logger.error("Location ID required")
            span.set_status(trace.StatusCode.ERROR, "Location ID required")
            return jsonify({"error": "Location ID required"}), 400

        span.set_attribute("location_id", location_id)

        result = make_api_request(location_id, 'create_army', method='POST')
        logger.info(f"Create army result: {result}")

        # make_api_request reports failures as {"error": ...}; those must
        # not be recorded as completed actions in the game sequence.
        succeeded = (
            isinstance(result, dict)
            and 'error' not in result
            and result.get('success', True)  # Assume success if not specified
        )

        # Store this action for future span linking
        if game_session_id and succeeded:
            try:
                next_sequence = store_game_action(
                    game_session_id=game_session_id,
                    action_type="create_army",
                    player_name=session.get('player_name'),
                    faction=session.get('faction'),
                    trace_id=format(span.get_span_context().trace_id, '032x'),
                    span_id=format(span.get_span_context().span_id, '016x'),
                    location_id=location_id
                )
                session['action_sequence'] = next_sequence
                logger.info(f"Stored game action {next_sequence} for session {game_session_id}")
            except Exception as e:
                logger.error(f"Failed to store game action: {e}")

        return jsonify(result)

@app.route('/api/move_army', methods=['POST'])
def move_army():
    """Handle a player's request to move an army between two locations.

    Reads ``source_id`` and ``target_id`` from the JSON body, checks that the
    source location reports the player's faction, forwards the move through
    ``make_api_request`` and, when a capital was the target of a successful
    move, runs a game-over check. The span is linked to the session's
    previously stored action when one can be resolved, and is itself stored
    so that a later action can link back to it.

    Returns:
        The forwarded result as JSON; 400 when either location ID is missing;
        403 when the source location does not belong to the player's faction.
    """
    game_session_id = session.get('game_session_id')
    current_sequence = session.get('action_sequence', 0)

    logger.info(f"move_army: session={game_session_id}, current_sequence={current_sequence}")

    # current_sequence is the last stored sequence number, so that is the
    # action this span should link back to.
    links = []
    if game_session_id and current_sequence > 0:
        previous_span_context = get_previous_action_context(game_session_id, current_sequence)
        link = (
            create_span_link_from_context(previous_span_context, "game_sequence")
            if previous_span_context
            else None
        )
        if link:
            links.append(link)
            logger.info(f"Created span link to previous action (sequence {current_sequence})")

    span_attributes = {
        "player.name": session.get('player_name', 'Unknown'),
        "player.faction": session.get('faction', 'Unknown'),
        "game.session.id": game_session_id,
        "game.action.type": "move_army",
        "game.action.sequence": current_sequence + 1
    }

    with tracer.start_as_current_span(
        "move_army",
        kind=SpanKind.SERVER,
        links=links,
        attributes=span_attributes
    ) as span:
        # Hex-encode the IDs once; they are logged now and persisted later.
        span_context = span.get_span_context()
        current_trace_id = f"{span_context.trace_id:032x}"
        current_span_id = f"{span_context.span_id:016x}"
        logger.info(f"Current span: trace_id={current_trace_id}, span_id={current_span_id}")

        payload = request.json
        source_id = payload.get('source_id')
        target_id = payload.get('target_id')

        if not (source_id and target_id):
            span.set_status(trace.StatusCode.ERROR, "Missing location IDs")
            return jsonify({"error": "Source and target location IDs required"}), 400

        span.set_attribute("source_location", source_id)
        span.set_attribute("target_location", target_id)

        # Only the faction holding the source location may move its army.
        source_info = make_api_request(source_id, '')
        player_faction = session.get('faction')
        source_owner = source_info.get('faction')

        if source_owner != player_faction:
            span.set_status(trace.StatusCode.ERROR, "Not player's location")
            return jsonify({
                "error": f"You cannot move armies from {source_id} because it belongs to {source_owner}"
            }), 403

        result = make_api_request(
            source_id,
            'move_army',
            method='POST',
            data={"target_location": target_id}
        )

        # A successful move onto a capital may end the game, so re-read every
        # location's faction and evaluate the victory condition.
        capital_ids = ('southern_capital', 'northern_capital')
        if target_id in capital_ids and result.get('success'):
            locations_data = {}
            for loc_id in _current_positions().keys():
                data = make_api_request(loc_id, '')
                if 'error' in data:
                    continue
                locations_data[loc_id] = {'faction': data['faction']}

            if check_game_over(locations_data):
                result['game_over'] = True
                result['winner'] = WINNER
                result['victory_message'] = VICTORY_MESSAGE
                span.set_attribute("game_over", True)
                span.set_attribute("winner", WINNER)

        # Persist this action so the next one can link to it.
        # NOTE(review): unlike all_out_attack, this stores the action even
        # when the result does not report success; confirm that is intended.
        if game_session_id:
            try:
                next_sequence = store_game_action(
                    game_session_id=game_session_id,
                    action_type="move_army",
                    player_name=session.get('player_name'),
                    faction=session.get('faction'),
                    trace_id=current_trace_id,
                    span_id=current_span_id,
                    location_id=source_id,
                    target_location_id=target_id
                )
                session['action_sequence'] = next_sequence
                logger.info(f"Stored game action {next_sequence} for session {game_session_id}, updated session sequence to {next_sequence}")
            except Exception as e:
                # Failing to record the link must not fail the move itself.
                logger.error(f"Failed to store game action: {e}")

        return jsonify(result)

@app.route('/api/location_info/<location_id>', methods=['GET'])
def location_info(location_id):
    """Return the data reported for one location.

    Responds with 400 and an error payload when ``location_id`` is not among
    the current map positions; otherwise returns the result of
    ``make_api_request`` for that location as JSON.
    """
    known_locations = _current_positions()
    if location_id in known_locations:
        result = make_api_request(location_id, '')
        logger.info(f"Location info result: {result}")
        return jsonify(result)

    return jsonify({"error": "Invalid location ID"}), 400

@app.route('/api/map_data', methods=['GET'])
def map_data():
    """Return everything the UI needs to redraw the map.

    The JSON response contains per-location data (static position entries
    merged with the live faction/resources/army values), the map's
    connections, the module-level game-over state, and the active map ID.
    When the active map's metadata has a positive ``win_hold_ticks``, a
    ``wall_hold`` section is added as well.

    Locations whose lookup returns an ``error`` key are left out of the
    response.
    """
    map_id = get_active_map_id()
    meta = MAPS_META.get(map_id, MAPS_META[DEFAULT_MAP_ID])

    # Snapshot the positions once: the lookup is loop-invariant, and using a
    # single snapshot keeps every entry in the response from the same view.
    positions = _current_positions()

    locations_data = {}
    for loc_id, position in positions.items():
        data = make_api_request(loc_id, '')
        if 'error' in data:
            continue
        locations_data[loc_id] = {
            **position,
            'faction': data['faction'],
            'resources': data['resources'],
            'army': data['army'],
            'type': position['type']
        }

    # Refresh the game-over state before it is read into the response below.
    check_game_over(locations_data, map_id=map_id)

    response = {
        "locations": locations_data,
        "connections": _current_connections(),
        "game_over": GAME_OVER,
        "winner": WINNER,
        "victory_message": VICTORY_MESSAGE,
        "map_id": map_id,
    }

    # Include wall-hold state when the active map uses the tick mechanic.
    if meta.get("win_hold_ticks", 0) > 0:
        response["wall_hold"] = {
            "threshold": meta["win_hold_ticks"],
            "holds": get_wall_hold(map_id),
            "walls": WALL_LOCATIONS_BY_MAP.get(map_id, []),
        }

    return jsonify(response)

@app.route('/api/game_status', methods=['GET'])
def game_status():
    """Report whether the game is over, who won, and the victory message.

    Every location's faction is re-read on each call and passed to
    ``check_game_over`` first, so victories that happened without a player
    request (for example by the AI) are picked up.
    """
    locations_data = {}
    for loc_id in _current_positions().keys():
        data = make_api_request(loc_id, '')
        if 'error' in data:
            # Skip locations that could not be queried.
            continue
        locations_data[loc_id] = {'faction': data['faction']}

    # Evaluate the game-over condition with the fresh data before the
    # module-level state is read below.
    check_game_over(locations_data)

    status = {
        "game_over": GAME_OVER,
        "winner": WINNER,
        "victory_message": VICTORY_MESSAGE
    }
    return jsonify(status)

@app.route('/api/reset_game', methods=['POST'])
def reset_game():
    """Reset the game state (for testing) and report the outcome.

    The ``success`` field carries whatever ``reset_game_data`` returned; the
    message text is the same either way.
    """
    return jsonify({"success": reset_game_data(), "message": "Game has been reset"})

@app.route('/api/send_resources_to_capital', methods=['POST'])
def send_resources_to_capital():
    """Forward a "send resources to capital" request for one location.

    Expects ``location_id`` in the JSON body. Returns the forwarded result as
    JSON, or 400 with an error payload when ``location_id`` is missing.
    """
    span_attributes = {
        "player.name": session.get('player_name', 'Unknown'),
        "player.faction": session.get('faction', 'Unknown')
    }

    with tracer.start_as_current_span(
        "send_resources_to_capital",
        kind=SpanKind.SERVER,
        attributes=span_attributes
    ) as span:
        location_id = request.json.get('location_id')

        if location_id:
            span.set_attribute("source_location", location_id)
            # Hand the request over to the location server.
            forwarded = make_api_request(location_id, 'send_resources_to_capital', method='POST')
            return jsonify(forwarded)

        span.set_status(trace.StatusCode.ERROR, "Missing location ID")
        return jsonify({"error": "Location ID required"}), 400

@app.route('/api/all_out_attack', methods=['POST'])
def all_out_attack():
    """Launch an all-out attack from the location named in the request.

    Expects ``location_id`` in the JSON body; the whole body is forwarded with
    the request. After a successful attack the game-over condition is
    re-evaluated and the action is stored so a later action's span can link
    to this one.

    Returns:
        The forwarded result as JSON; 400 when ``location_id`` is missing;
        500 when the result carries an ``error`` key or an exception is
        raised while processing.
    """
    game_session_id = session.get('game_session_id')
    current_sequence = session.get('action_sequence', 0)

    # Link this span to the previously stored action of the session, if any.
    links = []
    if game_session_id and current_sequence > 0:
        previous_span_context = get_previous_action_context(game_session_id, current_sequence)
        link = (
            create_span_link_from_context(previous_span_context, "game_sequence")
            if previous_span_context
            else None
        )
        if link:
            links.append(link)

    span_attributes = {
        "player.name": session.get('player_name', 'Unknown'),
        "player.faction": session.get('faction', 'Unknown'),
        "game.session.id": game_session_id,
        "game.action.type": "all_out_attack",
        "game.action.sequence": current_sequence + 1
    }

    with tracer.start_as_current_span(
        "all_out_attack",
        kind=SpanKind.SERVER,
        links=links,
        attributes=span_attributes
    ) as span:
        payload = request.json
        location_id = payload.get('location_id')
        if not location_id:
            span.set_status(trace.StatusCode.ERROR, "Location ID required")
            return jsonify({"error": "Location ID required"}), 400

        span.set_attribute("location_id", location_id)

        try:
            # Forward the request, body included, to the location server.
            result = make_api_request(location_id, 'all_out_attack', method='POST', data=payload)
            if 'error' in result:
                failure = f"Error from location server: {result['error']}"
                span.set_status(trace.StatusCode.ERROR, failure)
                return jsonify({"success": False, "message": failure}), 500

            if result.get('success'):
                # A successful attack may have ended the game: re-read every
                # location's faction and evaluate the victory condition.
                locations_data = {}
                for loc_id in _current_positions().keys():
                    data = make_api_request(loc_id, '')
                    if 'error' in data:
                        continue
                    locations_data[loc_id] = {'faction': data['faction']}

                if check_game_over(locations_data):
                    result['game_over'] = True
                    result['winner'] = WINNER
                    result['victory_message'] = VICTORY_MESSAGE
                    span.set_attribute("game_over", True)
                    span.set_attribute("winner", WINNER)

                # Only successful attacks are stored for future span linking.
                if game_session_id:
                    try:
                        span_context = span.get_span_context()
                        next_sequence = store_game_action(
                            game_session_id=game_session_id,
                            action_type="all_out_attack",
                            player_name=session.get('player_name'),
                            faction=session.get('faction'),
                            trace_id=f"{span_context.trace_id:032x}",
                            span_id=f"{span_context.span_id:016x}",
                            location_id=location_id
                        )
                        session['action_sequence'] = next_sequence
                        logger.info(f"Stored game action {next_sequence} for session {game_session_id}")
                    except Exception as e:
                        # Failing to record the link must not fail the attack.
                        logger.error(f"Failed to store game action: {e}")

            return jsonify(result)

        except Exception as e:
            span.set_status(trace.StatusCode.ERROR, f"Request failed: {str(e)}")
            logger.error(f"All out attack failed: {e}")
            return jsonify({"success": False, "message": f"Request failed: {str(e)}"}), 500

@app.route('/api/ai_toggle', methods=['POST'])
def toggle_ai():
    """Turn the AI opponent on or off.

    Expects a JSON object body with an optional boolean-like ``enable`` field
    (treated as false when absent). When enabling, the AI is activated for
    the faction opposite to the one stored in the player's session.

    Returns:
        ``{"success": ..., "message": ...}`` as JSON with status
        200 on success,
        400 when the body is not a JSON object or no player faction is set,
        500 when the AI service does not confirm the change, and
        503 when the AI service request raises ``requests.RequestException``.
    """
    data = request.get_json()
    # A body such as `null` or `[1]` parses as valid JSON but has no `.get`;
    # reject it explicitly instead of failing with an AttributeError.
    if not isinstance(data, dict):
        return jsonify({"success": False, "message": "Request body must be a JSON object"}), 400

    enable_ai = data.get('enable', False)

    if enable_ai:
        # The AI plays whichever faction the player did not pick.
        player_faction = session.get('faction')
        if not player_faction:
            return jsonify({"success": False, "message": "No player faction selected"}), 400

        ai_faction = 'northern' if player_faction == 'southern' else 'southern'

        try:
            response = requests.post(
                f"{AI_SERVICE_URL}/activate",
                json={"faction": ai_faction},
                timeout=5
            )
            if response.status_code == 200:
                result = response.json()
                if result.get('success'):
                    logger.info(f"AI activated for {ai_faction} faction")
                    return jsonify({
                        "success": True,
                        "message": f"AI opponent activated for {ai_faction} faction"
                    })

            # Non-200 status, or a 200 whose payload did not report success.
            return jsonify({
                "success": False,
                "message": "Failed to activate AI"
            }), 500

        except requests.RequestException as e:
            logger.error(f"Error communicating with AI service: {e}")
            return jsonify({
                "success": False,
                "message": "AI service unavailable"
            }), 503
    else:
        try:
            response = requests.post(
                f"{AI_SERVICE_URL}/deactivate",
                timeout=5
            )
            if response.status_code == 200:
                logger.info("AI deactivated")
                return jsonify({
                    "success": True,
                    "message": "AI opponent deactivated"
                })

            return jsonify({
                "success": False,
                "message": "Failed to deactivate AI"
            }), 500

        except requests.RequestException as e:
            logger.error(f"Error communicating with AI service: {e}")
            return jsonify({
                "success": False,
                "message": "AI service unavailable"
            }), 503

@app.route('/api/ai_status', methods=['GET'])
def get_ai_status():
    """Return the AI service's status payload.

    Falls back to ``{"active": False, "faction": None}`` when the service
    answers with a non-200 status or the request raises
    ``requests.RequestException``.
    """
    inactive = {"active": False, "faction": None}
    try:
        response = requests.get(f"{AI_SERVICE_URL}/status", timeout=5)
        payload = response.json() if response.status_code == 200 else inactive
        return jsonify(payload)
    except requests.RequestException:
        return jsonify(inactive)

@app.route('/api/replay/sessions', methods=['GET'])
def get_replay_sessions():
    """List game sessions available for replay.

    Session IDs are discovered through Tempo's tag-values endpoint for
    ``.game.session.id`` over the last 24 hours. Each session is returned
    with placeholder details only; the real details are fetched when a
    session is opened.

    Returns:
        JSON with ``sessions`` (sorted by session ID, descending),
        ``total_sessions`` and ``discovered_session_ids``. When Tempo answers
        with a non-200 status, that status is passed through with an error
        payload; any exception yields a 500 with an error payload.
    """
    tempo_url = os.environ.get('TEMPO_URL', 'http://localhost:3200')

    try:
        from datetime import datetime, timedelta

        # Look back over a 24-hour window ending now.
        window_end = datetime.now()
        window_start = window_end - timedelta(hours=24)

        response = requests.get(
            f"{tempo_url}/api/v2/search/tag/.game.session.id/values",
            params={
                'start': int(window_start.timestamp()),
                'end': int(window_end.timestamp()),
                'limit': 50
            },
            timeout=15
        )

        if response.status_code != 200:
            logger.error(f"Tag values API failed with status {response.status_code}")
            return jsonify({
                'success': False,
                'error': f'Tag values API failed with status {response.status_code}',
                'sessions': [],
                'total_sessions': 0
            }), response.status_code

        # Keep only non-empty values that Tempo typed as strings.
        session_ids = [
            entry.get('value', '')
            for entry in response.json().get('tagValues', [])
            if entry.get('type') == 'string' and entry.get('value', '')
        ]

        logger.info(f"Found {len(session_ids)} game sessions: {session_ids}")

        # Minimal placeholder entries; the details are filled in when a
        # session is opened.
        placeholders = [
            {
                'session_id': found_id,
                'player_name': 'Unknown',
                'faction': 'Unknown',
                'start_time': 0,
                'action_count': 0,
                'last_action': 'Unknown'
            }
            for found_id in session_ids
        ]

        # Sort by session_id (descending) for consistent ordering.
        session_list = sorted(
            placeholders,
            key=lambda entry: entry.get('session_id', ''),
            reverse=True
        )

        return jsonify({
            'success': True,
            'sessions': session_list,
            'total_sessions': len(session_list),
            'data_source': 'tempo_tag_values',
            'discovered_session_ids': session_ids
        })

    except Exception as e:
        logger.error(f"Error getting replay sessions: {e}")
        return jsonify({
            'success': False,
            'error': str(e),
            'sessions': [],
            'total_sessions': 0
        }), 500


@app.route('/replay')
def replay_page():
    """Render the page that lists game sessions available for replay."""
    template_name = 'replay.html'
    return render_template(template_name)

@app.route('/replay/<session_id>')
def replay_session_page(session_id):
    """Page to replay a specific game session.

    Renders with the layout of whichever map the session was played on (as
    resolved by ``get_session_map_id``), not the currently active map.
    """
    map_id = get_session_map_id(session_id)
    return render_template(
        'replay_session.html',
        session_id=session_id,
        map_id=map_id,
        location_positions=LOCATION_POSITIONS_BY_MAP[map_id],
        location_connections=LOCATION_CONNECTIONS_BY_MAP[map_id],
    )


def _verify_restart_state():
    """Debug check that a restart cleared all game data.

    NOTE(review): this logic used to sit *after* the ``return`` statement of
    ``replay_session_page`` and was therefore unreachable; its original
    ``def`` line and route decorator appear to have been lost. It is kept
    here as an unrouted helper so the checks are not lost. Re-attach the
    intended ``@app.route`` (path unknown from this file section) to expose it.

    Returns:
        A JSON response with ``all_systems_reset`` and the individual check
        results under ``details``; a 500 response with the partial results
        if one of the database checks raises.
    """
    verification_results = {
        'game_state_reset': False,
        'span_links_cleared': False,
        'faction_assignments_cleared': False,
        'ai_deactivated': False,
        'database_reset': False
    }

    try:
        # In-memory game state must be back to "no winner yet".
        verification_results['game_state_reset'] = not GAME_OVER and WINNER is None and VICTORY_MESSAGE is None

        # The span-link store must hold no recorded actions.
        conn = sqlite3.connect(GAME_SESSIONS_DB)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM game_actions")
            span_links_count = cursor.fetchone()[0]
        finally:
            conn.close()
        verification_results['span_links_cleared'] = span_links_count == 0

        # Faction assignments: a missing war_map table counts as cleared.
        db_conn = get_db_connection()
        try:
            cursor = db_conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='war_map'")
            table_exists = cursor.fetchone() is not None

            if table_exists:
                cursor.execute("SELECT COUNT(*) FROM war_map")
                faction_count = cursor.fetchone()[0]
                verification_results['faction_assignments_cleared'] = faction_count == 0
            else:
                verification_results['faction_assignments_cleared'] = True
        finally:
            db_conn.close()

        # AI status: an unreachable or failing AI service is assumed to be
        # deactivated.
        try:
            response = requests.get(f"{AI_SERVICE_URL}/status", timeout=5)
            if response.status_code == 200:
                ai_status = response.json()
                verification_results['ai_deactivated'] = not ai_status.get('active', False)
            else:
                verification_results['ai_deactivated'] = True
        except Exception:
            verification_results['ai_deactivated'] = True

        # Every configured location must report its initial faction, army
        # and resources.
        try:
            locations_data = {}
            for loc_id in _current_positions().keys():
                data = make_api_request(loc_id, '')
                if 'error' not in data:
                    locations_data[loc_id] = data

            from game_config import LOCATIONS
            database_reset = True
            for loc_id, expected in LOCATIONS.items():
                actual = locations_data.get(loc_id, {})
                if (actual.get('faction') != expected['faction'] or
                    actual.get('army') != expected['initial_army'] or
                    actual.get('resources') != expected['initial_resources']):
                    database_reset = False
                    break

            verification_results['database_reset'] = database_reset
        except Exception:
            verification_results['database_reset'] = False

        all_clear = all(verification_results.values())

        return jsonify({
            'success': True,
            'all_systems_reset': all_clear,
            'details': verification_results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e),
            'details': verification_results
        }), 500

@app.route('/api/replay/session/<session_id>', methods=['GET'])
def get_replay_session(session_id):
    """Get detailed replay data for a specific session.

    Searches Tempo for traces tagged with this ``game.session.id`` in the
    last 24 hours. For each trace the full trace detail is fetched and its
    spans are parsed into actions; if that yields nothing for a trace, the
    spans embedded in the search result are used instead. Actions are
    de-duplicated by span ID and sorted by ``(sequence, start_time)``.

    Args:
        session_id: Session ID taken from the URL path.

    Returns:
        JSON with ``actions``, ``session_metadata`` (taken from the first and
        last action), ``span_link_chain`` and ``total_actions``. A failed
        Tempo search still yields a success response with no actions; any
        exception yields a 500 with an error payload.
    """
    tempo_url = os.environ.get('TEMPO_URL', 'http://localhost:3200')

    try:
        logger.info(f"Getting replay data for session: {session_id}")

        # Query for this specific session with a 24-hour time window.
        from datetime import datetime, timedelta
        end_time = datetime.now()
        start_time = end_time - timedelta(hours=24)
        window_start = int(start_time.timestamp())
        window_end = int(end_time.timestamp())

        # session_id comes straight from the URL, so escape the characters
        # that are special inside a double-quoted TraceQL string; otherwise a
        # quote in the ID could break out of the literal and alter the query.
        escaped_session_id = session_id.replace('\\', '\\\\').replace('"', '\\"')

        params = {
            'q': f'{{.game.session.id="{escaped_session_id}"}}',
            'start': window_start,
            'end': window_end,
            'limit': 100
        }

        logger.info(f"Querying Tempo with: {params}")

        response = requests.get(
            f"{tempo_url}/api/search",
            params=params,
            timeout=15
        )

        logger.info(f"Tempo response status: {response.status_code}")

        actions = []
        seen_spans = set()  # Track span IDs to avoid duplicates

        if response.status_code == 200:
            search_results = response.json()
            traces = search_results.get('traces', [])

            logger.info(f"Found {len(traces)} traces for session {session_id}")

            # The loop variable is deliberately not called `trace`, which is
            # the name of a module used elsewhere in this file.
            for trace_summary in traces:
                trace_id = trace_summary.get('traceID')
                root_trace_name = trace_summary.get('rootTraceName', '')
                trace_details_success = False

                # Preferred path: the full trace detail carries rich attributes.
                try:
                    trace_response = requests.get(
                        f"{tempo_url}/api/traces/{trace_id}",
                        params={'start': window_start, 'end': window_end},
                        timeout=10
                    )

                    if trace_response.status_code == 200:
                        trace_detail = trace_response.json()

                        # Structure: batches -> scopeSpans -> spans
                        for batch in trace_detail.get('batches', []):
                            for scope_span in batch.get('scopeSpans', []):
                                for span in scope_span.get('spans', []):
                                    # Parse ALL spans for this session, don't filter by action type
                                    action = parse_span_to_action_from_detail(span, trace_id, root_trace_name)
                                    if action and action.get('session_id') == session_id:
                                        span_id = action.get('span_id')
                                        if span_id and span_id not in seen_spans:
                                            seen_spans.add(span_id)
                                            actions.append(action)
                                            trace_details_success = True
                    else:
                        logger.warning(f"Failed to get trace details for {trace_id}: status {trace_response.status_code}")

                except Exception as e:
                    logger.warning(f"Error getting trace details for {trace_id}: {e}")

                # Fallback: use the search-result spans only if the detail
                # path contributed no action for this trace.
                if not trace_details_success:
                    logger.info(f"Using search results fallback for trace {trace_id}")
                    for span_set in trace_summary.get('spanSets', []):
                        for span in span_set.get('spans', []):
                            action = parse_span_to_action_from_search(span, trace_id, root_trace_name, session_id)
                            if action:
                                span_id = action.get('span_id')
                                if span_id and span_id not in seen_spans:
                                    seen_spans.add(span_id)
                                    actions.append(action)
        else:
            logger.warning(f"Tempo search failed with status {response.status_code}")

        # Sort by sequence number, then start time.
        actions.sort(key=lambda x: (x.get('sequence', 0), x.get('start_time', 0)))

        logger.info(f"Returning {len(actions)} actions for session {session_id}")

        session_metadata = {
            'player_name': 'Unknown',
            'faction': 'Unknown',
            'start_time': 0,
            'end_time': 0
        }

        if actions:
            # Player details and start time come from the first action...
            first_action = actions[0]
            session_metadata['player_name'] = first_action.get('player_name', 'Unknown')
            session_metadata['faction'] = first_action.get('faction', 'Unknown')
            session_metadata['start_time'] = first_action.get('start_time', 0)

            # ...and the end time is the start of the last action.
            last_action = actions[-1]
            session_metadata['end_time'] = last_action.get('start_time', 0)

        # Verify span links
        span_link_chain = verify_action_links(actions)

        return jsonify({
            'success': True,
            'session_id': session_id,
            'session_metadata': session_metadata,
            'actions': actions,
            'span_link_chain': span_link_chain,
            'total_actions': len(actions),
            'data_source': 'tempo'
        })

    except Exception as e:
        logger.error(f"Error getting replay session {session_id}: {e}")
        return jsonify({
            'success': False,
            'error': str(e),
            'session_id': session_id,
            'actions': [],
            'total_actions': 0
        }), 500


def _span_id_to_hex(raw_id):
    """Normalise a span ID from a trace-detail payload to lowercase hex.

    A 16-character hex string is returned as-is (lowercased): an 8-byte span
    ID encodes to 12 base64 characters, so a 16-character all-hex value is
    already hex, and base64-decoding it would silently yield a wrong 12-byte
    ID. Anything else is base64-decoded; if decoding fails the original
    value is returned unchanged.
    """
    import base64

    if (isinstance(raw_id, str) and len(raw_id) == 16
            and all(ch in '0123456789abcdefABCDEF' for ch in raw_id)):
        return raw_id.lower()
    try:
        return base64.b64decode(raw_id).hex()
    except (ValueError, TypeError):
        # Not decodable as base64 (binascii.Error is a ValueError).
        return raw_id


def parse_span_to_action_from_detail(span, trace_id, root_trace_name):
    """Parse a span from trace details into a game action for replay.

    Args:
        span: Span dict from a trace-detail response. Keys read: ``spanId``,
            ``name``, ``startTimeUnixNano``, ``endTimeUnixNano``,
            ``attributes`` (list of ``{"key", "value"}``) and ``links``.
        trace_id: Trace ID the span belongs to; copied into the action.
        root_trace_name: Used as ``action_type`` unless the span carries a
            ``game.action.type`` attribute.

    Returns:
        A dict with ``trace_id``, ``span_id`` (hex), ``operation``,
        ``action_type``, ``start_time``, ``duration``, ``attributes``,
        ``span_links`` (hex IDs) and ``data_source``. ``sequence``,
        ``player_name``, ``faction``, ``session_id``, ``location_id``,
        ``source_location`` and ``target_location`` are present only when
        the matching attribute exists on the span.
    """
    raw_span_id = span.get('spanId', '')

    action = {
        'trace_id': trace_id,
        'span_id': _span_id_to_hex(raw_span_id) if raw_span_id else '',
        'operation': span.get('name', ''),
        'action_type': root_trace_name,
        'start_time': 0,
        'duration': 0,
        'attributes': {},
        'span_links': [],
        'data_source': 'tempo_detail'
    }

    start_time_raw = span.get('startTimeUnixNano', 0)
    end_time_raw = span.get('endTimeUnixNano', 0)

    # Timestamps may arrive as strings; fall back to 0 when unparseable.
    try:
        action['start_time'] = int(start_time_raw) if start_time_raw else 0
    except (ValueError, TypeError):
        action['start_time'] = 0

    # Duration is only meaningful when both endpoints are present.
    if start_time_raw and end_time_raw:
        try:
            start_time_int = int(start_time_raw) if isinstance(start_time_raw, str) else start_time_raw
            end_time_int = int(end_time_raw) if isinstance(end_time_raw, str) else end_time_raw
            action['duration'] = end_time_int - start_time_int
        except (ValueError, TypeError):
            action['duration'] = 0

    # Flatten the typed attribute values; other value types are ignored.
    # `or` guards against explicit nulls in the payload.
    for attr in span.get('attributes') or []:
        key = attr.get('key', '')
        value = attr.get('value') or {}

        if 'stringValue' in value:
            action['attributes'][key] = value['stringValue']
        elif 'intValue' in value:
            # Stored as received (may be a numeric string); converted on use.
            action['attributes'][key] = value['intValue']
        elif 'boolValue' in value:
            action['attributes'][key] = value['boolValue']

    # Collect linked span IDs in hex; links without a spanId are skipped.
    for link in span.get('links') or []:
        linked_span_id = link.get('spanId', '')
        if linked_span_id:
            action['span_links'].append(_span_id_to_hex(linked_span_id))

    attrs = action['attributes']

    # Sequence number as int; an unparseable value becomes 0.
    if 'game.action.sequence' in attrs:
        seq_val = attrs['game.action.sequence']
        if isinstance(seq_val, int):
            action['sequence'] = seq_val
        else:
            try:
                action['sequence'] = int(seq_val)
            except (ValueError, TypeError):
                action['sequence'] = 0

    # Promote the well-known game attributes to top-level string fields.
    promoted_fields = (
        ('game.action.type', 'action_type'),
        ('player.name', 'player_name'),
        ('player.faction', 'faction'),
        ('game.session.id', 'session_id'),
        ('location_id', 'location_id'),
        ('source_location', 'source_location'),
        ('target_location', 'target_location'),
    )
    for attr_key, action_key in promoted_fields:
        if attr_key in attrs:
            action[action_key] = str(attrs[attr_key])

    return action

def parse_span_to_action_from_search(span, trace_id, root_trace_name, session_id):
    """Parse a span from search results into a game action for replay.

    Args:
        span: Span dict from a search result. Reads ``spanID``,
            ``startTimeUnixNano``, ``durationNanos`` and ``attributes`` (a
            list of ``{'key': ..., 'value': {...}}`` entries).
        trace_id: Trace ID stored on the returned action.
        root_trace_name: Used as ``action_type`` unless the span carries a
            ``game.action.type`` string attribute.
        session_id: Session being replayed. It seeds the action's
            ``session_id`` and is compared against it before returning.

    Returns:
        The action dict, or ``None`` when the span's ``game.session.id``
        attribute names a different session than ``session_id``.
    """
    def _to_int(raw):
        # Missing, empty or malformed numbers become 0 rather than aborting
        # the parse of the whole span.
        try:
            return int(raw) if raw else 0
        except (ValueError, TypeError, OverflowError):
            return 0

    # Span attribute key -> action field, for attributes copied as strings.
    string_fields = {
        'player.name': 'player_name',
        'player.faction': 'faction',
        'game.session.id': 'session_id',
        'location_id': 'location_id',
        'source_location': 'source_location',
        'target_location': 'target_location',
    }

    action = {
        'trace_id': trace_id,
        'span_id': span.get('spanID', ''),
        'start_time': _to_int(span.get('startTimeUnixNano', 0)),
        'duration': _to_int(span.get('durationNanos', 0)),
        'action_type': root_trace_name,  # Overridden by game.action.type below
        'session_id': session_id,
        'span_links': [],
        'data_source': 'tempo_search',
        'attributes': {}
    }

    # `or` guards against an explicit null for the list or for a value.
    for attr in span.get('attributes') or []:
        key = attr.get('key', '')
        value = attr.get('value') or {}

        # Keep the raw value for later use; the first matching type wins.
        for value_type in ('stringValue', 'intValue', 'boolValue'):
            if value_type in value:
                action['attributes'][key] = value[value_type]
                break

        # Promote the well-known attributes to top-level action fields.
        if key == 'game.action.sequence':
            if 'intValue' in value:
                action['sequence'] = _to_int(value['intValue'])
            elif 'stringValue' in value:
                action['sequence'] = _to_int(value['stringValue'])
        elif key == 'game.action.type':
            action['action_type'] = value.get('stringValue', root_trace_name)
        elif key in string_fields:
            action[string_fields[key]] = value.get('stringValue', '')

    # Only return if this span belongs to our session
    if action.get('session_id') == session_id:
        return action

    return None

def verify_action_links(actions):
    """Report, per action, which earlier actions its span links point to.

    Args:
        actions: Ordered list of action dicts. Each may carry ``sequence``,
            ``action_type``, ``span_id``, ``span_links`` and ``data_source``.

    Returns:
        One report dict per action, in input order. ``links_to`` lists every
        earlier action whose ``span_id`` appears in this action's
        ``span_links``. ``valid_chain`` ends up ``True`` for every entry: the
        first action needs no link, and a missing link is tolerated and only
        described in ``note``.
    """
    report = []

    for position, current in enumerate(actions):
        own_links = current.get('span_links', [])
        entry = {
            'sequence': current.get('sequence', position + 1),
            'action_type': current.get('action_type', 'unknown'),
            'span_id': current.get('span_id', ''),
            'has_links': len(own_links) > 0,
            'links_to': [],
            'valid_chain': False,
            'data_source': current.get('data_source', 'unknown')
        }
        report.append(entry)

        # The opening action has nothing to link back to.
        if position == 0:
            entry['valid_chain'] = True
            entry['note'] = 'First action (no links expected)'
            continue

        # A link to any earlier action counts, not only the one just before.
        for earlier in actions[:position]:
            earlier_id = earlier.get('span_id', '')
            if earlier_id and earlier_id in own_links:
                entry['links_to'].append({
                    'sequence': earlier.get('sequence', 0),
                    'action_type': earlier.get('action_type', 'unknown'),
                    'span_id': earlier_id
                })

        # Lenient by design: an unlinked action is still accepted.
        entry['valid_chain'] = True
        if entry['links_to']:
            entry['note'] = 'Links to previous action(s)'
        else:
            entry['note'] = 'Missing link to previous action (may be due to data source limitations)'

    return report

# ----------------------------------------------------------------
# Wall-hold tick thread — WWA win condition.
# Runs every tick_interval_s, reads every wall-type location's faction from
# game_state.db, increments the hold counter for whoever owns them all, and
# resets the counter otherwise. When a faction's count reaches win_hold_ticks
# the global game-over flags flip and the map.html poll picks up the winner.
# ----------------------------------------------------------------

def _wall_tick_thread():
    """Background loop that advances the wall-hold counter for the active map.

    Each pass looks up the active map's metadata. Maps without a positive
    ``tick_interval_s`` are skipped and re-checked every 5 s. Otherwise the
    loop reads the faction of every wall location from the ``locations``
    table, then either bumps the hold counter for the single non-neutral
    faction owning all of them or resets the counters, recording the outcome
    on a ``wall_tick`` span.

    Never returns. Any exception in a pass is logged and the loop retries
    after 5 s.
    """
    _ensure_game_config_tables()
    logger.info("Wall-hold tick thread started")
    while True:
        try:
            map_id = get_active_map_id()
            meta = MAPS_META.get(map_id)
            interval = meta.get("tick_interval_s", 0) if meta else 0
            if not meta or interval <= 0:
                # WoK or any map that doesn't use the hold-to-win mechanic:
                # sleep in short slices so a map switch to WWA picks up
                # within 5 s rather than waiting out a long interval.
                time.sleep(5)
                continue

            # Measure wall ownership from game_state.db directly (faster and
            # more consistent than round-tripping through the HTTP API, and
            # avoids producing tracing noise every 30 s).
            wall_ids = WALL_LOCATIONS_BY_MAP.get(map_id, [])
            if not wall_ids:
                time.sleep(interval)
                continue

            conn = get_db_connection()
            try:
                cursor = conn.cursor()
                # One "?" per wall id; the ids themselves are bound as
                # parameters rather than interpolated into the SQL.
                placeholders = ",".join("?" for _ in wall_ids)
                cursor.execute(
                    f"SELECT id, faction FROM locations WHERE id IN ({placeholders})",
                    wall_ids,
                )
                rows = cursor.fetchall()
            finally:
                # Close on failure too: this loop retries every 5 s after an
                # error, so a connection left open per failed query would
                # accumulate.
                conn.close()

            factions = {r['faction'] for r in rows}
            playable = factions - {"neutral"}
            threshold = meta.get("win_hold_ticks", 0)

            with tracer.start_as_current_span(
                "wall_tick",
                kind=SpanKind.INTERNAL,
                attributes={
                    "game.map.id": map_id,
                    "wall.count": len(wall_ids),
                    "wall.factions": ",".join(sorted(factions)),
                },
            ) as tick_span:
                # A hold requires every wall row to be present and all of
                # them owned by the same non-neutral faction.
                if len(rows) == len(wall_ids) and len(playable) == 1 and "neutral" not in factions:
                    holder = playable.pop()
                    ticks = bump_wall_hold(map_id, holder, reset_others=True)
                    tick_span.set_attribute("wall.holder", holder)
                    tick_span.set_attribute("game.wall.hold_counter", ticks)
                    if threshold > 0 and ticks >= threshold:
                        tick_span.add_event(
                            "game.wall.hold_win",
                            attributes={"faction": holder, "ticks": ticks},
                        )
                        logger.info(f"Wall-hold win detected for {holder} on {map_id}")
                else:
                    reset_wall_hold(map_id)
                    tick_span.set_attribute("wall.holder", "contested")

            time.sleep(interval)
        except Exception as e:
            logger.error(f"Wall-tick thread error: {e}")
            time.sleep(5)


# Kick off the wall-tick thread once per process.
threading.Thread(target=_wall_tick_thread, daemon=True, name="wall-tick").start()


if __name__ == '__main__':
    port = int(os.environ.get('PORT', 8080))
    # Debug mode is opt-in (FLASK_DEBUG=1): the server binds to 0.0.0.0, and
    # the Werkzeug interactive debugger allows arbitrary code execution for
    # anyone who can reach it.
    debug = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    # The reloader re-executes this module in a child process, which would
    # start a second wall-tick thread alongside the one started above, so it
    # stays off even when debug is enabled.
    app.run(host='0.0.0.0', port=port, debug=debug, use_reloader=False)

================================================
FILE: game-of-tracing/war_map/requirements.txt
================================================
flask==3.1.3
requests==2.33.1
python-dotenv==1.2.2
opentelemetry-api==1.41.1
opentelemetry-sdk==1.41.1
opentelemetry-exporter-otlp==1.41.1
pyroscope-io==1.0.6
pyroscope-otel==1.0.0


================================================
FILE: game-of-tracing/war_map/static/css/style.css
================================================
/* ========================================
   Game of Traces - Dark Fantasy Theme
   ======================================== */

/* --- CSS Custom Properties --- */
:root {
    /* Background */
    --bg-primary: #0d1117;
    --bg-secondary: #161b22;
    --bg-card: rgba(22, 27, 34, 0.85);
    --bg-card-hover: rgba(30, 37, 48, 0.9);
    --bg-glass: rgba(13, 17, 23, 0.7);

    /* Southern Faction */
    --southern-gold: #FFD700;
    --southern-crimson: #DC143C;
    --southern-glow: rgba(255, 215, 0, 0.4);
    --southern-bg: linear-gradient(135deg, #8B0000, #DC143C);

    /* Northern Faction */
    --northern-blue: #4FC3F7;
    --northern-steel: #B0BEC5;
    --northern-glow: rgba(79, 195, 247, 0.4);
    --northern-bg: linear-gradient(135deg, #1a237e, #4FC3F7);

    /* Neutral */
    --neutral-silver: #78909C;
    --neutral-glow: rgba(120, 144, 156, 0.3);

    /* White Walkers Attack — Night's Watch (player on WWA) */
    --nights-watch-black: #141824;
    --nights-watch-accent: #d7e4f1;
    --nights-watch-glow: rgba(215, 228, 241, 0.45);
    --nights-watch-bg: linear-gradient(135deg, #0a0f1d, #2a3246);

    /* White Walkers (AI on WWA) */
    --white-walkers-blue: #88c4e6;
    --white-walkers-ice: #d6f1ff;
    --white-walkers-glow: rgba(136, 196, 230, 0.55);
    --white-walkers-bg: linear-gradient(135deg, #0f2d3f, #88c4e6);

    /* Barbarians (passive NPCs on WWA) */
    --barbarian-orange: #c1442e;
    --barbarian-glow: rgba(193, 68, 46, 0.4);
    --barbarian-bg: linear-gradient(135deg, #5a1a0d, #c1442e);

    /* Wall keeps (new settlement type) */
    --wall-stone: #8a8a95;
    --wall-stone-light: #b9b9c2;
    --wall-rune: rgba(200, 225, 255, 0.5);

    /* Text */
    --text-primary: #e6edf3;
    --text-secondary: #8b949e;
    --text-muted: #6e7681;

    /* Accents */
    --border-subtle: rgba(240, 246, 252, 0.1);
    --border-glow: rgba(255, 215, 0, 0.3);

    /* Misc */
    --glass-blur: 12px;
    --transition-speed: 0.3s;
}

/* --- General Styles --- */
body {
    background-color: var(--bg-primary);
    background-image:
        radial-gradient(ellipse at 20% 80%, rgba(220, 20, 60, 0.05) 0%, transparent 50%),
        radial-gradient(ellipse at 80% 20%, rgba(79, 195, 247, 0.05) 0%, transparent 50%);
    min-height: 100vh;
    display: flex;
    flex-direction: column;
    color: var(--text-primary);
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
}

h1, h2, h3, h4, h5, h6,
.navbar-brand {
    font-family: 'Cinzel', 'Georgia', serif;
}

.container-fluid {
    flex: 1;
}

/* --- Dark Card / Glass Effect --- */
.card {
    background: var(--bg-card);
    border: 1px solid var(--border-subtle);
    backdrop-filter: blur(var(--glass-blur));
    -webkit-backdrop-filter: blur(var(--glass-blur));
    color: var(--text-primary);
    border-radius: 12px;
    overflow: hidden;
}

.card-header {
    background: rgba(0, 0, 0, 0.3) !important;
    border-bottom: 1px solid var(--border-subtle);
    color: var(--text-primary) !important;
}

.card-body {
    color: var(--text-primary);
}

/* --- Navbar --- */
.navbar {
    background: rgba(13, 17, 23, 0.95) !important;
    backdrop-filter: blur(10px);
    /* Prefixed twin for older Safari, mirroring what .card declares */
    -webkit-backdrop-filter: blur(10px);
    border-bottom: 1px solid var(--border-subtle);
    box-shadow: 0 2px 20px rgba(0, 0, 0, 0.5);
}

.navbar-brand {
    font-size: 1.4rem;
    letter-spacing: 1px;
    background: linear-gradient(135deg, var(--southern-gold), var(--northern-blue));
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    font-weight: 700;
}

.navbar .nav-link {
    color: var(--text-secondary) !important;
    transition: color var(--transition-speed);
    font-family: 'Inter', sans-serif;
    font-size: 0.9rem;
}

.navbar .nav-link:hover {
    color: var(--text-primary) !important;
}

/* --- Footer --- */
footer {
    background: rgba(13, 17, 23, 0.95) !important;
    border-top: 1px solid var(--border-subtle);
    color: var(--text-secondary) !important;
    font-family: 'Cinzel', serif;
    letter-spacing: 1px;
    font-size: 0.85rem;
}

footer p {
    color: var(--text-secondary) !important;
}

/* --- Custom Scrollbar (dark) --- */
::-webkit-scrollbar {
    width: 8px;
}

::-webkit-scrollbar-track {
    background: var(--bg-secondary);
}

::-webkit-scrollbar-thumb {
    background: #30363d;
    border-radius: 4px;
}

::-webkit-scrollbar-thumb:hover {
    background: #484f58;
}

/* --- Faction Text Colors --- */
.southern-text {
    color: var(--southern-gold) !important;
}

.northern-text {
    color: var(--northern-blue) !important;
}

.neutral-text {
    color: var(--neutral-silver) !important;
}

/* --- Buttons --- */
.btn {
    border-radius: 8px;
    font-weight: 500;
    transition: all var(--transition-speed);
    font-family: 'Inter', sans-serif;
}

.btn-primary {
    background: linear-gradient(135deg, #6366f1, #8b5cf6);
    border: none;
    box-shadow: 0 0 15px rgba(99, 102, 241, 0.3);
}

.btn-primary:hover {
    box-shadow: 0 0 25px rgba(99, 102, 241, 0.5);
    transform: translateY(-1px);
}

.btn-warning {
    background: linear-gradient(135deg, #f59e0b, #d97706);
    border: none;
    color: #000;
}

.btn-warning:hover {
    box-shadow: 0 0 20px rgba(245, 158, 11, 0.4);
    transform: translateY(-1px);
}

.btn-danger {
    background: linear-gradient(135deg, #ef4444, #dc2626);
    border: none;
}

.btn-danger:hover {
    box-shadow: 0 0 20px rgba(239, 68, 68, 0.4);
    transform: translateY(-1px);
}

.btn-success {
    background: linear-gradient(135deg, #22c55e, #16a34a);
    border: none;
}

.btn-success:hover {
    box-shadow: 0 0 20px rgba(34, 197, 94, 0.4);
    transform: translateY(-1px);
}

.btn-info {
    background: linear-gradient(135deg, var(--northern-blue), #0288d1);
    border: none;
    color: #fff;
}

.btn-info:hover {
    box-shadow: 0 0 20px rgba(79, 195, 247, 0.4);
    transform: translateY(-1px);
    color: #fff;
}

.btn-outline-light {
    border-color: var(--border-subtle);
    color: var(--text-secondary);
}

.btn-outline-light:hover {
    background: rgba(255, 255, 255, 0.1);
    border-color: var(--text-secondary);
}

.btn-outline-info {
    border-color: var(--northern-blue);
    color: var(--northern-blue);
}

.btn-outline-info:hover {
    background: rgba(79, 195, 247, 0.15);
    color: var(--northern-blue);
    border-color: var(--northern-blue);
}

/* --- Progress Bars --- */
.progress {
    background: rgba(255, 255, 255, 0.08);
    border-radius: 6px;
    height: 10px;
    overflow: hidden;
}

.progress-bar.bg-warning {
    background: linear-gradient(90deg, #f59e0b, var(--southern-gold)) !important;
}

.progress-bar.bg-danger {
    background: linear-gradient(90deg, #ef4444, #f87171) !important;
}

.progress-bar.bg-success {
    background: linear-gradient(90deg, #22c55e, #4ade80) !important;
}

/* --- Alerts --- */
.alert {
    background: var(--bg-card);
    border: 1px solid var(--border-subtle);
    color: var(--text-primary);
    border-radius: 8px;
}

.alert-success {
    border-left: 4px solid #22c55e;
    background: rgba(34, 197, 94, 0.1);
}

.alert-danger {
    border-left: 4px solid #ef4444;
    background: rgba(239, 68, 68, 0.1);
}

.alert-warning {
    border-left: 4px solid #f59e0b;
    background: rgba(245, 158, 11, 0.1);
}

.alert-info {
    border-left: 4px solid var(--northern-blue);
    background: rgba(79, 195, 247, 0.1);
}

/* --- Form Controls (dark) --- */
.form-control,
.form-select {
    background: rgba(255, 255, 255, 0.05);
    border: 1px solid var(--border-subtle);
    color: var(--text-primary);
    border-radius: 8px;
}

.form-control:focus,
.form-select:focus {
    background: rgba(255, 255, 255, 0.08);
    border-color: var(--southern-gold);
    color: var(--text-primary);
    box-shadow: 0 0 0 3px rgba(255, 215, 0, 0.15);
}

.form-control::placeholder {
    color: var(--text-muted);
}

.form-label {
    color: var(--text-secondary);
    font-size: 0.9rem;
}

.form-check-input {
    background-color: rgba(255, 255, 255, 0.1);
    border-color: var(--border-subtle);
}

.form-check-input:checked {
    background-color: var(--southern-gold);
    border-color: var(--southern-gold);
}

.form-check-label {
    color: var(--text-secondary);
}

/* --- Tables (dark) --- */
.table {
    color: var(--text-primary);
    --bs-table-bg: transparent;
    --bs-table-striped-bg: rgba(255, 255, 255, 0.03);
}

.table thead th {
    border-bottom-color: var(--border-subtle);
    color: var(--text-secondary);
    font-family: 'Cinzel', serif;
    font-size: 0.85rem;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}

.table td {
    border-bottom-color: var(--border-subtle);
}

/* --- Badges --- */
.badge {
    font-family: 'Inter', sans-serif;
    font-weight: 600;
}

.badge.southern,
.badge.bg-southern {
    background: var(--southern-bg) !important;
    color: #fff;
}

.badge.northern,
.badge.bg-northern {
    background: var(--northern-bg) !important;
    color: #fff;
}

.badge.neutral,
.badge.bg-neutral {
    background-color: var(--neutral-silver) !important;
}

/* --- Code blocks --- */
code {
    background: rgba(255, 255, 255, 0.06);
    padding: 2px 6px;
    border-radius: 4px;
    color: var(--northern-blue);
    font-size: 0.85em;
}

/* --- Map Styles --- */
.map-background {
    background-color: #1a1f2e;
}

/* --- Location Marker Styles --- */
.location-marker {
    position: absolute;
    transform: translate(-50%, -50%);
    width: 44px;
    height: 44px;
    border-radius: 50%;
    display: flex;
    align-items: center;
    justify-content: center;
    color: white;
    font-weight: bold;
    cursor: pointer;
    border: 2px solid rgba(255, 255, 255, 0.6);
    transition: all 0.25s ease;
    z-index: 10;
}

.location-marker:hover {
    transform: translate(-50%, -50%) scale(1.15);
    z-index: 15;
}

.location-marker.selected {
    border-color: #fff;
}

.location-marker.capital {
    width: 56px;
    height: 56px;
    border-width: 3px;
}

.location-marker.southern {
    /* Same gradient as before, taken from the :root faction token */
    background: var(--southern-bg);
    box-shadow:
        0 0 12px var(--southern-glow),
        0 0 24px rgba(220, 20, 60, 0.2);
}

.location-marker.southern.selected {
    box-shadow: 0 0 20px var(--southern-gold), 0 0 40px rgba(255, 215, 0, 0.3);
    border-color: var(--southern-gold);
}

.location-marker.northern {
    /* Same gradient as before, taken from the :root faction token */
    background: var(--northern-bg);
    box-shadow:
        0 0 12px var(--northern-glow),
        0 0 24px rgba(79, 195, 247, 0.2);
}

.location-marker.northern.selected {
    box-shadow: 0 0 20px var(--northern-blue), 0 0 40px rgba(79, 195, 247, 0.3);
    border-color: var(--northern-blue);
}

.location-marker.neutral {
    background: linear-gradient(135deg, #455a64, #78909C);
    box-shadow: 0 0 8px var(--neutral-glow);
}

.location-marker.neutral.selected {
    box-shadow: 0 0 15px rgba(176, 190, 197, 0.4);
    border-color: var(--neutral-silver);
}

/* --- White Walkers Attack faction markers --- */
.location-marker.nights_watch {
    background: var(--nights-watch-bg);
    box-shadow: 0 0 12px var(--nights-watch-glow), 0 0 24px rgba(215, 228, 241, 0.2);
    border-color: var(--nights-watch-accent);
}
.location-marker.nights_watch.selected {
    box-shadow: 0 0 20px var(--nights-watch-accent), 0 0 40px rgba(215, 228, 241, 0.35);
}

.location-marker.white_walkers {
    background: var(--white-walkers-bg);
    box-shadow: 0 0 14px var(--white-walkers-glow), 0 0 30px rgba(136, 196, 230, 0.25);
    border-color: var(--white-walkers-ice);
}
.location-marker.white_walkers.selected {
    box-shadow: 0 0 22px var(--white-walkers-ice), 0 0 44px rgba(214, 241, 255, 0.45);
}

.location-marker.barbarian {
    background: var(--barbarian-bg);
    box-shadow: 0 0 12px var(--barbarian-glow), 0 0 24px rgba(193, 68, 46, 0.22);
    border-color: var(--barbarian-orange);
}
.location-marker.barbarian.selected {
    box-shadow: 0 0 18px var(--barbarian-orange), 0 0 36px rgba(193, 68, 46, 0.35);
}

/* --- Wall settlement type: rounded rectangle, stonework styling --- */
.location-marker.wall {
    border-radius: 6px !important;
    background: linear-gradient(135deg, #4a4a55, var(--wall-stone));
    box-shadow: 0 0 10px rgba(138, 138, 149, 0.35);
    border-color: var(--wall-stone-light);
}
.location-marker.wall.nights_watch {
    background: linear-gradient(135deg, var(--nights-watch-black), #3a4055);
    box-shadow: 0 0 16px var(--nights-watch-glow);
}
.location-marker.wall.white_walkers {
    background: linear-gradient(135deg, #0f2d3f, var(--white-walkers-blue));
    box-shadow: 0 0 16px var(--white-walkers-glow);
}

/* Capital crown effect */
.location-marker.capital::before {
    content: '';
    position: absolute;
    top: -8px;
    left: 50%;
    transform: translateX(-50%);
    width: 0;
    height: 0;
    border-left: 6px solid transparent;
    border-right: 6px solid transparent;
    border-bottom: 8px solid var(--southern-gold);
    opacity: 0.8;
}

.location-marker.capital.northern::before {
    border-bottom-color: var(--northern-blue);
}

.location-marker.capital.neutral::before {
    border-bottom-color: var(--neutral-silver);
}

.location-marker.capital.nights_watch::before {
    border-bottom-color: var(--nights-watch-accent);
}
.location-marker.capital.white_walkers::before {
    border-bottom-color: var(--white-walkers-ice);
}

/* Location label */
.location-label {
    position: absolute;
    bottom: -22px;
    left: 50%;
    transform: translateX(-50%);
    white-space: nowrap;
    font-size: 0.65rem;
    font-family: 'Inter', sans-serif;
    font-weight: 600;
    text-transform: uppercase;
    letter-spacing: 0.5px;
    color: var(--text-secondary);
    text-shadow: 0 1px 3px rgba(0, 0, 0, 0.8);
    pointer-events: none;
}

/* Pulsing animation */
@keyframes pulse {
    0% {
        box-shadow: 0 0 0 0 rgba(255, 255, 255, 0.5);
    }
    70% {
        box-shadow: 0 0 0 12px rgba(255, 255, 255, 0);
    }
    100% {
        box-shadow: 0 0 0 0 rgba(255, 255, 255, 0);
    }
}

.location-marker.pulsing {
    animation: pulse 1.5s infinite;
}

/* --- Faction Selection Page --- */
.faction-hero {
    min-height: calc(100vh - 76px);
    display: flex;
    align-items: center;
    justify-content: center;
    padding: 2rem 0;
}

.faction-hero-title {
    font-family: 'Cinzel', serif;
    font-size: 2.5rem;
    font-weight: 700;
    background: linear-gradient(135deg, var(--southern-gold), var(--northern-blue));
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    margin-bottom: 0.5rem;
}

.faction-hero-subtitle {
    color: var(--text-secondary);
    font-size: 1.1rem;
    margin-bottom: 2.5rem;
}

.faction-card {
    transition: all var(--transition-speed) ease;
    height: 100%;
    border: 2px solid var(--border-subtle);
    cursor: pointer;
    position: relative;
    overflow: visible;
}

.faction-card:not(.faction-unavailable):hover {
    transform: translateY(-8px);
}

.faction-card.faction-selected {
    transform: translateY(-5px) scale(1.02);
}

.faction-card.faction-selected.faction-southern {
    border-color: var(--southern-gold);
    box-shadow: 0 0 30px var(--southern-glow), 0 0 60px rgba(255, 215, 0, 0.15);
}

.faction-card.faction-selected.faction-northern {
    border-color: var(--northern-blue);
    box-shadow: 0 0 30px var(--northern-glow), 0 0 60px rgba(79, 195, 247, 0.15);
}

.faction-card.faction-selected.faction-nights-watch {
    border-color: var(--nights-watch-accent);
    box-shadow: 0 0 30px var(--nights-watch-glow), 0 0 60px rgba(215, 228, 241, 0.2);
}

.faction-card.faction-selected.faction-white-walkers {
    border-color: var(--white-walkers-ice);
    box-shadow: 0 0 30px var(--white-walkers-glow), 0 0 60px rgba(214, 241, 255, 0.2);
}

.faction-card.faction-selected.faction-barbarian {
    border-color: var(--barbarian-orange);
    box-shadow: 0 0 30px var(--barbarian-glow), 0 0 60px rgba(193, 68, 46, 0.18);
}

.faction-card.faction-selected.map-card {
    border-color: var(--northern-blue);
    box-shadow: 0 0 30px var(--northern-glow), 0 0 60px rgba(79, 195, 247, 0.15);
}

.faction-unavailable {
    opacity: 0.4;
    cursor: not-allowed;
}

.faction-icon {
    display: inline-flex;
    align-items: center;
    justify-content: center;
    font-size: 2.5rem;
    width: 90px;
    height: 90px;
    border-radius: 50%;
    margin-bottom: 1rem;
    transition: all var(--transition-speed);
}

.southern-icon {
    background: radial-gradient(circle, rgba(255, 215, 0, 0.2), transparent 70%);
    color: var(--southern-gold);
    border: 2px solid rgba(255, 215, 0, 0.3);
}

.faction-card:hover .southern-icon,
.faction-card.faction-selected .southern-icon {
    box-shadow: 0 0 30px var(--southern-glow);
    border-color: var(--southern-gold);
}

.northern-icon {
    background: radial-gradient(circle, rgba(79, 195, 247, 0.2), transparent 70%);
    color: var(--northern-blue);
    border: 2px solid rgba(79, 195, 247, 0.3);
}

.faction-card:hover .northern-icon,
.faction-card.faction-selected .northern-icon {
    box-shadow: 0 0 30px var(--northern-glow);
    border-color: var(--northern-blue);
}

/* --- WWA faction icons --- */
.nights-watch-icon {
    background: radial-gradient(circle, rgba(215, 228, 241, 0.18), transparent 70%);
    color: var(--nights-watch-accent);
    border: 2px solid rgba(215, 228, 241, 0.3);
}
.faction-card:hover .nights-watch-icon,
.faction-card.faction-selected .nights-watch-icon {
    box-shadow: 0 0 30px var(--nights-watch-glow);
    border-color: var(--nights-watch-accent);
}

.white-walkers-icon {
    background: radial-gradient(circle, rgba(136, 196, 230, 0.22), transparent 70%);
    color: var(--white-walkers-ice);
    border: 2px solid rgba(136, 196, 230, 0.35);
}
.faction-card:hover .white-walkers-icon,
.faction-card.faction-selected .white-walkers-icon {
    box-shadow: 0 0 30px var(--white-walkers-glow);
    border-color: var(--white-walkers-ice);
}

.barbarian-icon {
    background: radial-gradient(circle, rgba(193, 68, 46, 0.2), transparent 70%);
    color: var(--barbarian-orange);
    border: 2px solid rgba(193, 68, 46, 0.32);
}

/* --- Wall-hold HUD overlay for WWA --- */
.wall-hold-hud {
    position: absolute;
    top: 20px;
    right: 20px;
    z-index: 10;
    background: var(--bg-glass);
    backdrop-filter: blur(var(--glass-blur));
    /* Prefixed twin for older Safari, mirroring what .card declares */
    -webkit-backdrop-filter: blur(var(--glass-blur));
    border: 1px solid var(--border-subtle);
    border-radius: 10px;
    padding: 0.75rem 1rem;
    color: var(--text-primary);
    min-width: 220px;
    font-size: 0.9rem;
}
.wall-hold-hud h6 {
    margin: 0 0 0.35rem 0;
    color: var(--nights-watch-accent);
    font-size: 0.85rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}
.wall-hold-hud .hold-row {
    display: flex;
    justify-content: space-between;
    margin: 0.1rem 0;
}
.wall-hold-hud .hold-row .ticks {
    font-family: monospace;
    font-weight: 600;
}
.wall-hold-hud .hold-row.nights_watch .ticks { color: var(--nights-watch-accent); }
.wall-hold-hud .hold-row.white_walkers .ticks { color: var(--white-walkers-ice); }

@keyframes iconFloat {
    0%, 100% { transform: translateY(0); }
    50% { transform: translateY(-6px); }
}

.faction-card:hover .faction-icon,
.faction-card.faction-selected .faction-icon {
    animation: iconFloat 2s ease-in-out infinite;
}

.faction-card .card-body {
    text-align: center;
    padding: 2rem 1.5rem;
}

.faction-card h4 {
    font-family: 'Cinzel', serif;
    font-weight: 700;
    margin-bottom: 0.75rem;
}

.faction-card .faction-motto {
    color: var(--text-secondary);
    font-style: italic;
    font-size: 0.95rem;
}

.faction-card .faction-start {
    color: var(--text-muted);
    font-size: 0.85rem;
}

/* --- Game HUD --- */
.game-hud {
    background: var(--bg-card);
    backdrop-filter: blur(var(--glass-blur));
    /* Prefixed twin for older Safari, mirroring what .card declares */
    -webkit-backdrop-filter: blur(var(--glass-blur));
    border: 1px solid var(--border-subtle);
    border-radius: 10px;
    padding: 0.6rem 1.2rem;
    display: flex;
    align-items: center;
    gap: 1.5rem;
    font-size: 0.85rem;
    margin-bottom: 0.75rem;
}

.hud-item {
    display: flex;
    align-items: center;
    gap: 0.4rem;
    color: var(--text-secondary);
}

.hud-item i {
    font-size: 0.9rem;
}

.hud-value {
    font-weight: 700;
    color: var(--text-primary);
    font-family: 'Inter', sans-serif;
}

.hud-item.southern .hud-value { color: var(--southern-gold); }
.hud-item.northern .hud-value { color: var(--northern-blue); }

.hud-divider {
    width: 1px;
    height: 20px;
    background: var(--border-subtle);
}

/* --- Event Feed --- */
.event-feed {
    background: var(--bg-card);
    backdrop-filter: blur(var(--glass-blur));
    /* Prefixed twin for older Safari, mirroring what .card declares */
    -webkit-backdrop-filter: blur(var(--glass-blur));
    border: 1px solid var(--border-subtle);
    border-radius: 10px;
    max-height: 160px;
    overflow-y: auto;
}

.event-feed-header {
    padding: 0.5rem 1rem;
    border-bottom: 1px solid var(--border-subtle);
    font-family: 'Cinzel', serif;
    font-size: 0.8rem;
    color: var(--text-secondary);
    text-transform: uppercase;
    letter-spacing: 1px;
    position: sticky;
    top: 0;
    background: var(--bg-card);
    z-index: 1;
}

.event-item {
    padding: 0.4rem 1rem;
    border-bottom: 1px solid rgba(240, 246, 252, 0.04);
    display: flex;
    align-items: flex-start;
    gap: 0.6rem;
    font-size: 0.78rem;
    line-height: 1.4;
}

.event-item:last-child {
    border-bottom: none;
}

.event-time {
    color: var(--text-muted);
    font-size: 0.7rem;
    white-space: nowrap;
    min-width: 48px;
    font-family: 'Inter', sans-serif;
}

.event-icon {
    font-size: 0.75rem;
    min-width: 16px;
    text-align: center;
}

.event-icon.southern { color: var(--southern-gold); }
.event-icon.northern { color: var(--northern-blue); }
.event-icon.neutral { color: var(--neutral-silver); }

.event-message {
    color: var(--text-secondary);
}

/* --- Map Container --- */
#mapContainer {
    background: #1a1f2e !important;
    border: 1px solid var(--border-subtle) !important;
    border-radius: 10px !important;
    overflow: hidden;
}

/* --- Action Panel (right sidebar) --- */
.action-panel .card {
    border: 1px solid var(--border-subtle);
}

.action-panel .card-header {
    font-family: 'Cinzel', serif;
    font-size: 0.95rem;
}

.action-panel .btn {
    font-size: 0.85rem;
    padding: 0.5rem 0.75rem;
}

/* --- AI Toggle --- */
.ai-toggle-card .form-check-input:checked {
    background-color: #22c55e;
    border-color: #22c55e;
}

.ai-status-dot {
    display: inline-block;
    width: 8px;
    height: 8px;
    border-radius: 50%;
    margin-right: 6px;
}

.ai-status-dot.active {
    background: #22c55e;
    box-shadow: 0 0 8px rgba(34, 197, 94, 0.6);
    animation: statusPulse 2s infinite;
}

.ai-status-dot.inactive {
    background: var(--text-muted);
}

@keyframes statusPulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.4; }
}

/* --- Game Over Overlay --- */
#gameOverOverlay {
    /* Near-opaque dark scrim drawn over the map once the game has ended. */
    background: rgba(13, 17, 23, 0.92);
    z-index: 100;
    color: white;
    /* Prefixed declaration first: Safari releases before 18 only recognise the
       -webkit- form, and would otherwise render the overlay without any blur. */
    -webkit-backdrop-filter: blur(8px);
    backdrop-filter: blur(8px);
}

.victory-text {
    font-family: 'Cinzel', serif;
    text-shadow: 0 0 20px var(--southern-gold), 0 0 40px rgba(255, 215, 0, 0.3);
    animation: victoryPulse 2s infinite;
}

.defeat-text {
    font-family: 'Cinzel', serif;
    text-shadow: 0 0 20px #ef4444, 0 0 40px rgba(239, 68, 68, 0.3);
    animation: defeatPulse 2s infinite;
}

.victory-icon {
    animation: bounce 2s infinite;
}

.defeat-icon {
    animation: shake 2s infinite;
}

/* Gold glow that swells at the midpoint and returns to its resting size. */
@keyframes victoryPulse {
    0%, 100% { text-shadow: 0 0 20px var(--southern-gold); }
    50% { text-shadow: 0 0 40px var(--southern-gold), 0 0 60px rgba(255, 215, 0, 0.4); }
}

/* Red glow that swells at the midpoint and returns to its resting size. */
@keyframes defeatPulse {
    0%, 100% { text-shadow: 0 0 20px #ef4444; }
    50% { text-shadow: 0 0 40px #ef4444, 0 0 60px rgba(239, 68, 68, 0.4); }
}

@keyframes bounce {
    0%, 20%, 50%, 80%, 100% { transform: translateY(0); }
    40% { transform: translateY(-30px); }
    60% { transform: translateY(-15px); }
}

@keyframes shake {
    0%, 100% { transform: translateX(0); }
    10%, 30%, 50%, 70%, 90% { transform: translateX(-10px); }
    20%, 40%, 60%, 80% { transform: translateX(10px); }
}

/* --- Transfer Indicators --- */
.transfer-indicator {
    /* Circular 40x40 badge positioned absolutely; it never intercepts clicks. */
    position: absolute;
    transform-origin: 50% 50%;
    z-index: 15;
    opacity: 0.9;
    pointer-events: none;
    /* Centre the icon inside the badge. */
    display: flex;
    align-items: center;
    justify-content: center;
    font-size: 24px;
    color: white;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5);
    background: rgba(13, 17, 23, 0.7);
    width: 40px;
    height: 40px;
    border-radius: 50%;
    /* Prefixed declaration first: Safari releases before 18 only recognise the
       -webkit- form of backdrop-filter. */
    -webkit-backdrop-filter: blur(4px);
    backdrop-filter: blur(4px);
}

.transfer-indicator.southern {
    border: 2px solid var(--southern-gold);
    box-shadow: 0 0 12px var(--southern-glow);
}

.transfer-indicator.northern {
    border: 2px solid var(--northern-blue);
    box-shadow: 0 0 12px var(--northern-glow);
}

.transfer-indicator.attack i {
    color: #ff4444;
}

.transfer-indicator.resources i {
    color: var(--southern-gold);
}

/* --- Move Army Modal (dark) --- */
.modal-content {
    background: var(--bg-secondary);
    border: 1px solid var(--border-subtle);
    color: var(--text-primary);
    border-radius: 12px;
}

.modal-header {
    border-bottom-color: var(--border-subtle);
}

.modal-header .modal-title {
    font-family: 'Cinzel', serif;
}

.modal-footer {
    border-top-color: var(--border-subtle);
}

.modal-header .btn-close {
    filter: invert(1);
}

.list-group-item {
    background: rgba(255, 255, 255, 0.03);
    border-color: var(--border-subtle);
    color: var(--text-primary);
}

.list-group-item:hover,
.list-group-item-action:hover {
    background: rgba(255, 255, 255, 0.08);
    color: var(--text-primary);
}

.destination-item {
    cursor: pointer;
    transition: all var(--transition-speed);
}

.destination-item:hover {
    background: rgba(255, 255, 255, 0.08) !important;
    border-color: var(--southern-gold);
}

/* --- Replay Page Styles --- */
.attribute-item {
    border-left: 3px solid var(--northern-blue);
    padding-left: 8px;
    margin-bottom: 8px;
}

#span-attributes {
    max-height: 200px;
    overflow-y: auto;
}

/* Movement arrow (replay) */
.movement-arrow {
    position: absolute;
    z-index: 15;
    pointer-events: none;
    color: #fff;
    font-size: 24px;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.7);
    animation: moveAlongPath 2s ease-in-out;
}

@keyframes moveAlongPath {
    0% { opacity: 0; transform: scale(0.5); }
    50% { opacity: 1; transform: scale(1); }
    100% { opacity: 0; transform: scale(0.5); }
}

/* Action highlight (replay) */
.location-marker.active {
    animation: markerPulse 1.5s infinite;
}

.location-marker.action-highlight {
    animation: actionPulse 1.5s ease-in-out;
}

/* Scale up to 120% at the midpoint; the translate keeps the marker centred
   on its anchor point for the whole animation. */
@keyframes markerPulse {
    0%, 100% { transform: translate(-50%, -50%) scale(1); }
    50% { transform: translate(-50%, -50%) scale(1.2); }
}

/* Scale up to 130% with a yellow glow at the midpoint; the translate keeps the
   marker centred on its anchor point for the whole animation. */
@keyframes actionPulse {
    0%, 100% { transform: translate(-50%, -50%) scale(1); }
    50% { transform: translate(-50%, -50%) scale(1.3); box-shadow: 0 0 30px rgba(255, 255, 0, 0.6); }
}

/* --- Spinner (dark) --- */
.spinner-border {
    color: var(--northern-blue) !important;
}

/* --- Small text helpers --- */
/* Force the theme's muted colour on muted text. Both selectors are kept so the
   <small> variant retains its higher specificity. */
.text-muted,
small.text-muted {
    color: var(--text-muted) !important;
}

/* --- Traveling Unit Animations --- */
.traveling-unit {
    position: absolute;
    z-index: 20;
    pointer-events: none;
    display: flex;
    align-items: center;
    justify-content: center;
    border-radius: 50%;
    font-size: 18px;
    transition: none;
}

.traveling-unit.army {
    width: 42px;
    height: 42px;
    background: rgba(13, 17, 23, 0.9);
    /* Border colour is supplied by the .southern / .northern modifier rules. */
    border: 2px solid;
    /* Prefixed declaration first: Safari releases before 18 only recognise the
       -webkit- form of backdrop-filter. */
    -webkit-backdrop-filter: blur(4px);
    backdrop-filter: blur(4px);
    animation: unitBob 0.6s ease-in-out infinite;
}

.traveling-unit.army.southern {
    border-color: var(--southern-gold);
    box-shadow: 0 0 16px var(--southern-glow), 0 0 32px rgba(255, 215, 0, 0.2);
    color: var(--southern-gold);
}

.traveling-unit.army.northern {
    border-color: var(--northern-blue);
    box-shadow: 0 0 16px var(--northern-glow), 0 0 32px rgba(79, 195, 247, 0.2);
    color: var(--northern-blue);
}

.traveling-unit.cart {
    width: 36px;
    height: 36px;
    background: rgba(13, 17, 23, 0.9);
    border: 2px solid var(--southern-gold);
    box-shadow: 0 0 14px rgba(255, 215, 0, 0.35);
    color: var(--southern-gold);
    font-size: 15px;
    animation: unitBob 0.8s ease-in-out infinite;
}

.traveling-unit.cart.northern {
    border-color: var(--northern-blue);
    box-shadow: 0 0 14px rgba(79, 195, 247, 0.35);
    color: var(--northern-blue);
}

/* Army count badge on traveling unit */
.traveling-unit .army-count {
    position: absolute;
    top: -8px;
    right: -8px;
    min-width: 18px;
    height: 18px;
    border-radius: 9px;
    background: var(--southern-crimson);
    color: #fff;
    font-size: 10px;
    font-weight: 700;
    font-family: 'Inter', sans-serif;
    display: flex;
    align-items: center;
    justify-content: center;
    padding: 0 4px;
    line-height: 1;
}

@keyframes unitBob {
    0%, 100% { margin-top: 0; }
    50% { margin-top: -4px; }
}

/* Trail particles left behind by traveling units */
.trail-particle {
    position: absolute;
    z-index: 18;
    pointer-events: none;
    width: 6px;
    height: 6px;
    border-radius: 50%;
    animation: trailFade 1.5s ease-out forwards;
}

.trail-particle.southern {
    background: var(--southern-gold);
    box-shadow: 0 0 8px var(--southern-glow), 0 0 16px rgba(255, 215, 0, 0.15);
}

.trail-particle.northern {
    background: var(--northern-blue);
    box-shadow: 0 0 8px var(--northern-glow), 0 0 16px rgba(79, 195, 247, 0.15);
}

.trail-particle.resource {
    background: var(--southern-gold);
    box-shadow: 0 0 6px rgba(255, 215, 0, 0.5);
    width: 5px;
    height: 5px;
}

/* Persistent glowing trail line segment */
.trail-line-segment {
    position: absolute;
    z-index: 17;
    pointer-events: none;
    height: 2px;
    transform-origin: 0 50%;
    animation: trailLineFade 2.5s ease-out forwards;
}

.trail-line-segment.southern {
    background: linear-gradient(90deg, transparent, var(--southern-gold), transparent);
    box-shadow: 0 0 6px var(--southern-glow);
}

.trail-line-segment.northern {
    background: linear-gradient(90deg, transparent, var(--northern-blue), transparent);
    box-shadow: 0 0 6px var(--northern-glow);
}

.trail-line-segment.resource {
    background: linear-gradient(90deg, transparent, var(--southern-gold), transparent);
    box-shadow: 0 0 4px rgba(255, 215, 0, 0.3);
    height: 1.5px;
}

@keyframes trailFade {
    0% { opacity: 0.9; transform: scale(1.2); }
    50% { opacity: 0.4; transform: scale(0.8); }
    100% { opacity: 0; transform: scale(0.1); }
}

@keyframes trailLineFade {
    0% { opacity: 0.7; }
    60% { opacity: 0.3; }
    100% { opacity: 0; }
}

/* Clash/explosion effect at destination */
.clash-burst {
    position: absolute;
    z-index: 25;
    pointer-events: none;
    width: 80px;
    height: 80px;
    border-radius: 50%;
    transform: translate(-50%, -50%);
    animation: clashExpand 1.2s ease-out forwards;
}

.clash-burst.attack {
    background: radial-gradient(circle, rgba(239, 68, 68, 0.7), rgba(255, 165, 0, 0.3) 50%, rgba(239, 68, 68, 0) 70%);
    box-shadow: 0 0 40px rgba(239, 68, 68, 0.5), 0 0 80px rgba(239, 68, 68, 0.2);
}

.clash-burst.capture {
    background: radial-gradient(circle, rgba(34, 197, 94, 0.6), rgba(255, 215, 0, 0.3) 50%, rgba(34, 197, 94, 0) 70%);
    box-shadow: 0 0 40px rgba(34, 197, 94, 0.4), 0 0 80px rgba(34, 197, 94, 0.15);
}

.clash-burst.reinforce {
    background: radial-gradient(circle, rgba(99, 102, 241, 0.5), rgba(99, 102, 241, 0) 70%);
    box-shadow: 0 0 30px rgba(99, 102, 241, 0.4);
}

@keyframes clashExpand {
    0% { transform: translate(-50%, -50%) scale(0.2); opacity: 1; }
    30% { transform: translate(-50%, -50%) scale(1.2); opacity: 0.9; }
    60% { transform: translate(-50%, -50%) scale(1.8); opacity: 0.5; }
    100% { transform: translate(-50%, -50%) scale(2.5); opacity: 0; }
}

/* Capture sparkle particles */
.capture-sparkle {
    position: absolute;
    z-index: 24;
    pointer-events: none;
    width: 6px;
    height: 6px;
    border-radius: 50%;
    animation: sparkleFloat 1.2s ease-out forwards;
}

@keyframes sparkleFloat {
    0% { opacity: 1; transform: translate(-50%, -50%) scale(1); }
    100% { opacity: 0; transform: translate(-50%, -50%) translateY(-40px) scale(0); }
}

/* Marker captured flash */
.location-marker.just-captured {
    animation: capturedFlash 0.6s ease-out;
}

/* Brief over-bright flash that peaks at 30% and settles back to normal. */
@keyframes capturedFlash {
    0%, 100% { filter: brightness(1); }
    30% { filter: brightness(2.5); }
}

/* Glowing connection line pulse during movement */
.connection-pulse {
    position: absolute;
    z-index: 16;
    pointer-events: none;
    width: 10px;
    height: 10px;
    border-radius: 50%;
    animation: connectionGlow 1s ease-in-out infinite;
}

.connection-pulse.southern {
    background: var(--southern-gold);
    box-shadow: 0 0 12px var(--southern-glow), 0 0 24px rgba(255, 215, 0, 0.15);
}

.connection-pulse.northern {
    background: var(--northern-blue);
    box-shadow: 0 0 12px var(--northern-glow), 0 0 24px rgba(79, 195, 247, 0.15);
}

@keyframes connectionGlow {
    0%, 100% { opacity: 0.3; transform: translate(-50%, -50%) scale(0.6); }
    50% { opacity: 1; transform: translate(-50%, -50%) scale(1.4); }
}

/* --- Responsive --- */
@media (max-width: 768px) {
    .location-marker {
        width: 32px !important;
        height: 32px !important;
    }

    .location-marker.capital {
        width: 42px !important;
        height: 42px !important;
    }

    .faction-hero-title {
        font-size: 1.8rem;
    }

    .game-hud {
        flex-wrap: wrap;
        gap: 0.75rem;
    }

    .location-label {
        font-size: 0.55rem;
    }
}


================================================
FILE: game-of-tracing/war_map/telemetry.py
================================================
import os

from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry import trace

# Logging setup
import logging
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry._logs import set_logger_provider

# Profiling setup (Pyroscope v2 + OTel span-profile linking)
import pyroscope
from pyroscope.otel import PyroscopeSpanProcessor

class GameTelemetry:
    """Bundle the logging, tracing and profiling setup for a single service.

    Constructing an instance has process-wide side effects: it installs a
    global OpenTelemetry logger provider and tracer provider, attaches an
    OTLP-backed handler to the root ``logging`` logger, and configures
    Pyroscope profiling.

    Args:
        service_name: Used as the ``SERVICE_NAME`` resource attribute, as the
            Pyroscope application name, and as the name of the logger returned
            by :meth:`get_logger`.
        logging_endpoint: Base URL of the OTLP/HTTP log receiver; ``/v1/logs``
            is appended to it.
        tracing_endpoint: Address of the OTLP/gRPC trace receiver.
    """

    def __init__(self, service_name, logging_endpoint="http://alloy:4318", tracing_endpoint="http://alloy:4317"):
        self.service_name = service_name
        self.logging_endpoint = logging_endpoint
        self.tracing_endpoint = tracing_endpoint
        # One shared resource so logs and spans carry the same service name.
        self.resource = Resource.create(attributes={
            SERVICE_NAME: service_name
        })

        # Tracing must be configured before profiling: _setup_profiling adds a
        # span processor to the tracer provider installed by _setup_tracing.
        self._setup_logging()
        self._setup_tracing()
        self._setup_profiling()

    def _setup_logging(self):
        """Configure OpenTelemetry logging and hook it into the root logger."""
        self.logger_provider = LoggerProvider(resource=self.resource)
        set_logger_provider(self.logger_provider)

        log_exporter = OTLPLogExporter(
            endpoint=f"{self.logging_endpoint}/v1/logs"
        )

        # Small queue and batch sizes; presumably chosen so log records leave
        # the process quickly in this demo -- confirm before tuning.
        self.logger_provider.add_log_record_processor(
            BatchLogRecordProcessor(
                exporter=log_exporter,
                max_queue_size=30,
                max_export_batch_size=5
            )
        )

        # The handler itself does not filter by level (NOTSET); the INFO level
        # set on the root logger below is what gates records.
        handler = LoggingHandler(
            level=logging.NOTSET,
            logger_provider=self.logger_provider
        )
        root_logger = logging.getLogger()
        root_logger.addHandler(handler)
        root_logger.setLevel(logging.INFO)

        self.logger = logging.getLogger(self.service_name)

    def _setup_tracing(self):
        """Configure OpenTelemetry tracing with an OTLP/gRPC exporter."""
        trace.set_tracer_provider(TracerProvider(resource=self.resource))

        otlp_exporter = OTLPSpanExporter(
            # The gRPC exporter addresses a host:port target. "/v1/traces" is
            # the OTLP/HTTP route and has no meaning on a gRPC channel, so the
            # endpoint is passed through without a path.
            endpoint=self.tracing_endpoint,
            insecure=True
        )

        # A batch size of 1 means every export call carries a single span.
        span_processor = BatchSpanProcessor(
            span_exporter=otlp_exporter,
            max_export_batch_size=1
        )

        trace.get_tracer_provider().add_span_processor(span_processor)
        self.tracer = trace.get_tracer(__name__)

    def _setup_profiling(self):
        """Configure Pyroscope profiling + OTel span-profile linkage.

        The server address comes from the ``PYROSCOPE_SERVER_ADDRESS``
        environment variable and falls back to ``http://alloy:9999``.
        """
        pyroscope.configure(
            application_name=self.service_name,
            server_address=os.getenv("PYROSCOPE_SERVER_ADDRESS", "http://alloy:9999"),
            tags={"service_name": self.service_name},
            oncpu=True,
            gil_only=True,
        )
        # Registered on the global tracer provider so spans and profiles can
        # be linked.
        trace.get_tracer_provider().add_span_processor(PyroscopeSpanProcessor())

    def get_tracer(self):
        """Return the tracer created in :meth:`_setup_tracing`."""
        return self.tracer

    def get_logger(self):
        """Return the ``logging`` logger named after the service."""
        return self.logger

    def shutdown(self):
        """Flush and shutdown all telemetry providers.

        Deliberately best-effort: an error while shutting down one provider is
        ignored so the other provider is still shut down and the caller's own
        exit path is never interrupted.
        """
        try:
            trace.get_tracer_provider().shutdown()
        except Exception:
            pass
        try:
            self.logger_provider.shutdown()
        except Exception:
            pass


================================================
FILE: game-of-tracing/war_map/templates/index.html
================================================
{% extends "layout.html" %}

{% block title %}
    {% if single_player %}Take the Black — {{ map_meta.display_name }}{% else %}Choose Your Faction{% endif %}
{% endblock %}

{% block content %}
<div class="faction-hero">
    <div class="col-lg-8 col-xl-7">
        <div class="text-center mb-5">
            <h1 class="faction-hero-title">
                {% if single_player %}{{ map_meta.display_name }}{% else %}A Game of Traces{% endif %}
            </h1>
            <p class="faction-hero-subtitle">
                {% if single_player %}{{ map_meta.description }}
                {% else %}Choose your kingdom. Command your armies. Master distributed tracing.
                {% endif %}
            </p>
            <p class="small">
                <a href="{{ url_for('map_picker') }}" class="text-decoration-none">
                    <i class="fas fa-map me-1"></i>Pick a different map
                </a>
            </p>
        </div>

        <!-- Show reset status if coming from restart -->
        {% if request.args.get('reset') %}
        <div class="alert {% if request.args.get('reset') == 'success' %}alert-success{% else %}alert-danger{% endif %} mb-4">
            <i class="fas {% if request.args.get('reset') == 'success' %}fa-check-circle{% else %}fa-exclamation-triangle{% endif %} me-2"></i>
            {{ request.args.get('message', 'Game reset status unknown') }}
        </div>
        {% endif %}

        {% if error %}
        <div class="alert alert-danger mb-4">{{ error }}</div>
        {% endif %}

        <form method="POST" action="{{ url_for('select_faction') }}" id="factionForm">
            <div class="mb-4">
                <label for="player_name" class="form-label">
                    {% if single_player %}Your name, brother of the Watch{% else %}Commander Name{% endif %}
                </label>
                <input type="text" class="form-control form-control-lg" id="player_name" name="player_name"
                       placeholder="Enter your name..." required
                       style="max-width: 400px; margin: 0 auto; text-align: center;">
            </div>
            <input type="hidden" name="faction" id="factionInput" value="{% if single_player %}{{ player_faction }}{% endif %}" required>

            {% if single_player %}
            <!-- Single-player: one preset faction card, auto-selected. -->
            <div class="row g-4 faction-selection justify-content-center mb-4">
                <div class="col-md-6">
                    <div class="card faction-card faction-nights-watch faction-selected {% if not player_available %}faction-unavailable{% endif %}"
                         data-faction="{{ player_faction }}">
                        <div class="card-body">
                            <span class="faction-icon nights-watch-icon">
                                <i class="fas fa-shield-halved"></i>
                            </span>
                            <h4>The Night's Watch</h4>
                            <p class="faction-motto">"Night gathers, and now my watch begins."</p>
                            <p class="faction-start">
                                <i class="fas fa-map-marker-alt me-1"></i>Castle Black
                            </p>
                            {% if not player_available %}
                            <div class="mt-2">
                                <span class="badge bg-danger"><i class="fas fa-ban me-1"></i>Already taken — reset the game</span>
                            </div>
                            {% endif %}
                        </div>
                    </div>
                </div>
            </div>

            <div class="text-center">
                <button type="submit" id="enterGameBtn" class="btn btn-primary btn-lg px-5 py-2"
                        {% if not player_available %}disabled{% endif %}>
                    <i class="fas fa-chess-knight me-2"></i>Take the Black
                </button>
            </div>
            {% else %}
            <!-- Two-faction WoK selection -->
            <div class="row g-4 faction-selection justify-content-center mb-4">
                <div class="col-md-5">
                    <div class="card faction-card faction-southern {% if not southern_available %}faction-unavailable{% endif %}"
                         data-faction="southern" {% if not southern_available %}aria-disabled="true"{% endif %}>
                        <div class="card-body">
                            <span class="faction-icon southern-icon">
                                <i class="fas fa-sun"></i>
                            </span>
                            <h4>Southern Kingdom</h4>
                            <p class="faction-motto">"Glory and Honor!"</p>
                            <p class="faction-start">
                                <i class="fas fa-map-marker-alt me-1"></i>Start at Southern Capital
                            </p>
                            {% if not southern_available %}
                            <div class="mt-2">
                                <span class="badge bg-danger"><i class="fas fa-ban me-1"></i>Already taken</span>
                            </div>
                            {% endif %}
                        </div>
                    </div>
                </div>

                <div class="col-md-5">
                    <div class="card faction-card faction-northern {% if not northern_available %}faction-unavailable{% endif %}"
                         data-faction="northern" {% if not northern_available %}aria-disabled="true"{% endif %}>
                        <div class="card-body">
                            <span class="faction-icon northern-icon">
                                <i class="fas fa-snowflake"></i>
                            </span>
                            <h4>Northern Kingdom</h4>
                            <p class="faction-motto">"Strength and Unity"</p>
                            <p class="faction-start">
                                <i class="fas fa-map-marker-alt me-1"></i>Start at Northern Capital
                            </p>
                            {% if not northern_available %}
                            <div class="mt-2">
                                <span class="badge bg-danger"><i class="fas fa-ban me-1"></i>Already taken</span>
                            </div>
                            {% endif %}
                        </div>
                    </div>
                </div>
            </div>

            <div class="text-center">
                {# Always rendered disabled: the click handler on an available faction
                   card enables it once a faction has been picked. A second, conditional
                   `disabled` would only duplicate the attribute, which is invalid HTML. #}
                <button type="submit" id="enterGameBtn" class="btn btn-primary btn-lg px-5 py-2" disabled>
                    <i class="fas fa-dungeon me-2"></i>Enter The Game
                </button>
            </div>
            {% endif %}
        </form>

        <!-- Reset & Replay links -->
        <div class="row g-3 mt-4 justify-content-center">
            <div class="col-auto">
                <button id="reset-game-btn" class="btn btn-outline-light btn-sm">
                    <i class="fas fa-redo-alt me-1"></i>Reset Game
                </button>
                <div id="reset-status" class="mt-2 text-center small"></div>
            </div>
            <div class="col-auto">
                {# Resolve the replay page by endpoint name (as the navbar in layout.html
                   does) instead of hard-coding the path. #}
                <a href="{{ url_for('replay_page') }}" class="btn btn-outline-info btn-sm">
                    <i class="fas fa-play me-1"></i>View Game Replays
                </a>
            </div>
        </div>
    </div>
</div>
{% endblock %}

{% block scripts %}
<script>
    $(document).ready(function() {
        const factionInput = $('#factionInput');
        const enterBtn = $('#enterGameBtn');
        const resetBtn = $('#reset-game-btn');
        const resetStatus = $('#reset-status');

        // Show a coloured status line under the reset button. The message is
        // inserted with .text(), so server-provided text is never parsed as HTML.
        function showResetStatus(color, message) {
            resetStatus.empty().append(
                $('<span>').attr('style', 'color: ' + color + ';').text(message)
            );
        }

        // Make the entire card clickable for faction selection.
        // Cards marked unavailable get no handler at all.
        $('.faction-card:not(.faction-unavailable)').click(function() {
            const faction = $(this).data('faction');
            factionInput.val(faction);

            // Update visual selection
            $('.faction-card').removeClass('faction-selected');
            $(this).addClass('faction-selected');

            // Enable submit button
            enterBtn.prop('disabled', false);
        });

        // Prevent submit if no faction has been written into the hidden input
        $('#factionForm').on('submit', function(e) {
            if (!factionInput.val()) {
                e.preventDefault();
                // Flash the selectable cards briefly as a hint
                $('.faction-card:not(.faction-unavailable)').addClass('border-warning');
                setTimeout(() => $('.faction-card').removeClass('border-warning'), 1000);
            }
        });

        // Reset Game button handler
        resetBtn.click(function() {
            if (!confirm('Are you sure you want to reset the game? This will clear all progress.')) return;

            // Block repeated clicks while the request is in flight.
            resetBtn.prop('disabled', true);
            showResetStatus('var(--northern-blue)', 'Resetting game...');

            fetch('/api/reset_game', {method: 'POST'})
                .then(response => response.json())
                .then(data => {
                    if (data.success) {
                        // Fall back to a fixed text when the response has no message.
                        showResetStatus('#22c55e', data.message || 'Game reset.');
                        // Leave the confirmation visible for a moment, then reload.
                        setTimeout(() => { window.location.reload(); }, 1000);
                    } else {
                        showResetStatus('#ef4444', 'Failed to reset game.');
                        resetBtn.prop('disabled', false);
                    }
                })
                .catch(() => {
                    // Reached on a network failure or a response that is not valid JSON.
                    showResetStatus('#ef4444', 'Network error. Try again.');
                    resetBtn.prop('disabled', false);
                });
        });
    });
</script>
{% endblock %}


================================================
FILE: game-of-tracing/war_map/templates/layout.html
================================================
<!DOCTYPE html>
<html lang="en" data-bs-theme="dark">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="theme-color" content="#0d1117">
    <title>A Game of Traces - {% block title %}Game Map{% endblock %}</title>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Cinzel:wght@400;600;700&family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
    {% block extra_css %}{% endblock %}
</head>
<body>
    <nav class="navbar navbar-expand-lg navbar-dark">
        <div class="container-fluid">
            <a class="navbar-brand" href="{{ url_for('index') }}">
                <i class="fas fa-chess-rook me-2"></i>A Game of Traces
            </a>
            <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
                <span class="navbar-toggler-icon"></span>
            </button>
            <div class="collapse navbar-collapse" id="navbarNav">
                <ul class="navbar-nav">
                    <li class="nav-item">
                        <a class="nav-link" href="{{ url_for('index') }}">Home</a>
                    </li>
                    {% if session.get('faction') %}
                    <li class="nav-item">
                        <a class="nav-link" href="{{ url_for('game_map') }}">Game Map</a>
                    </li>
                    <li class="nav-item">
                        <a class="nav-link" href="{{ url_for('replay_page') }}">
                            <i class="fas fa-history me-1"></i>Replay
                        </a>
                    </li>
                    {% endif %}
                </ul>
                <ul class="navbar-nav ms-auto">
                    {% if session.get('faction') %}
                    <li class="nav-item">
                        <span class="nav-link">
                            {# Sun icon for the southern faction; every other faction gets the snowflake. #}
                            {% if session.get('faction') == 'southern' %}
                            <i class="fas fa-sun me-1" style="color: var(--southern-gold);"></i>
                            {% else %}
                            <i class="fas fa-snowflake me-1" style="color: var(--northern-blue);"></i>
                            {% endif %}
                            {# Turn the faction id into a label: underscores become spaces and each
                               word is capitalised, so multi-word ids do not show a raw underscore. #}
                            {{ session.get('player_name', 'Player') }} <span class="text-muted">({{ session.get('faction', '').replace('_', ' ').title() }})</span>
                        </span>
                    </li>
                    <li class="nav-item">
                        <a class="nav-link" href="{{ url_for('logout') }}">Logout</a>
                    </li>
                    {% endif %}
                </ul>
            </div>
        </div>
    </nav>

    <div class="container-fluid mt-4">
        {% block content %}{% endblock %}
    </div>

    <footer class="text-center py-3 mt-5">
        <div class="container">
            <p class="mb-0">A Game of Traces</p>
        </div>
    </footer>

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    {% block scripts %}{% endblock %}
</body>
</html>


================================================
FILE: game-of-tracing/war_map/templates/map.html
================================================
{% extends "layout.html" %}

{% block title %}Game Map{% endblock %}

{% block content %}
<!-- Template variables for JavaScript -->
{# Every value goes through `tojson`. Hand-quoting an autoescaped `{{ value }}`
   would HTML-escape quotes and apostrophes, and the contents of a <script>
   element are not entity-decoded before the JSON is parsed. `or none` / `or ''`
   keep the previous output for missing or empty values: null for winner and
   victoryMessage, an empty string for playerFaction. #}
<script type="application/json" id="templateData">
    {
        "locations": {{ locations | tojson }},
        "connections": {{ connections | tojson }},
        "playerFaction": {{ (faction or '') | tojson }},
        "gameOver": {{ game_over | tojson }},
        "winner": {{ (winner or none) | tojson }},
        "victoryMessage": {{ (victory_message or none) | tojson }}
    }
</script>

<!-- Game HUD -->
<div class="game-hud" id="gameHud">
    <div class="hud-item {{ faction }}">
        {# Sun icon for the southern faction; every other faction gets the snowflake. #}
        {% if faction == 'southern' %}
        <i class="fas fa-sun" style="color: var(--southern-gold);"></i>
        {% else %}
        <i class="fas fa-snowflake" style="color: var(--northern-blue);"></i>
        {% endif %}
        {# Turn the faction id into a label: underscores become spaces and each word
           is capitalised, so multi-word ids do not show a raw underscore. #}
        <span class="hud-value">{{ faction | replace('_', ' ') | title }}</span>
    </div>
    <div class="hud-divider"></div>
    <div class="hud-item">
        <i class="fas fa-coins" style="color: var(--southern-gold);"></i>
        <span>Resources:</span>
        <span class="hud-value" id="hudResources">0</span>
    </div>
    <div class="hud-divider"></div>
    <div class="hud-item">
        <i class="fas fa-shield-alt" style="color: #ef4444;"></i>
        <span>Armies:</span>
        <span class="hud-value" id="hudArmies">0</span>
    </div>
    <div class="hud-divider"></div>
    <div class="hud-item">
        <i class="fas fa-map-marked-alt" style="color: #22c55e;"></i>
        <span>Territory:</span>
        <span class="hud-value" id="hudTerritory">0/8</span>
    </div>
    <div class="ms-auto">
        <button id="refreshMapBtn" class="btn btn-outline-light btn-sm">
            <i class="fas fa-sync-alt"></i>
        </button>
    </div>
</div>

<div class="row g-3">
    <!-- Game Map -->
    <div class="col-lg-9">
        <div id="mapContainer" class="position-relative w-100" style="height: 65vh; min-height: 400px;">
            <!-- Map Canvas -->
            <canvas id="mapCanvas" class="h-100 w-100"></canvas>

            <!-- Location Markers (added dynamically) -->
            <div id="mapMarkers"></div>

            {% if wall_hold %}
            <!-- Wall-hold HUD (WWA win-condition tracker) -->
            <div class="wall-hold-hud" id="wallHoldHud">
                <h6><i class="fas fa-gavel me-1"></i>Wall Hold</h6>
                <div class="small mb-1">Hold every keep for {{ wall_hold.threshold }} ticks to win.</div>
                <div class="hold-row nights_watch">
                    <span>Night's Watch</span>
                    <span class="ticks" id="hudHoldNightsWatch">
                        {{ wall_hold.holds.get('nights_watch', 0) }}/{{ wall_hold.threshold }}
                    </span>
                </div>
                <div class="hold-row white_walkers">
                    <span>White Walkers</span>
                    <span class="ticks" id="hudHoldWhiteWalkers">
                        {{ wall_hold.holds.get('white_walkers', 0) }}/{{ wall_hold.threshold }}
                    </span>
                </div>
            </div>
            {% endif %}

            <!-- Alert messages -->
            <div id="mapAlert" class="position-absolute top-0 start-0 end-0 alert alert-danger m-3 d-none">
                An error occurred
            </div>

            <!-- Game Over Overlay -->
            <div id="gameOverOverlay" class="position-absolute top-0 start-0 w-100 h-100 d-none">
                <div class="h-100 d-flex flex-column justify-content-center align-items-center text-center p-4">
                    <h1 id="gameOverTitle" class="mb-4"></h1>
                    <p id="gameOverMessage" class="mb-5 fs-4"></p>
                    <div id="victoryAnimation" class="d-none">
                        <i class="fas fa-crown fa-5x mb-4 victory-icon" style="color: var(--southern-gold);"></i>
                    </div>
                    <div id="defeatAnimation" class="d-none">
                        <i class="fas fa-skull-crossbones fa-5x mb-4 defeat-icon" style="color: #ef4444;"></i>
                    </div>
                    <a href="/restart-game" class="btn btn-primary btn-lg mt-4">
                        <i class="fas fa-redo-alt me-2"></i>Restart Game
                    </a>
                </div>
            </div>
        </div>

        <!-- Event Feed -->
        <div class="event-feed mt-2" id="eventFeed">
            <div class="event-feed-header">
                <i class="fas fa-scroll me-1"></i> Battle Log
            </div>
            <div id="eventFeedBody">
                <div class="event-item">
                    <span class="event-time">--:--</span>
                    <span class="event-icon neutral"><i class="fas fa-flag"></i></span>
                    <span class="event-message">Game started. Select a location on the map to begin.</span>
                </div>
            </div>
        </div>
    </div>

    <!-- Right Panel -->
    <div class="col-lg-3 action-panel">
        <!-- AI Opponent Control -->
        <div class="card ai-toggle-card mb-3">
            <div class="card-header">
                <h6 class="mb-0"><i class="fas fa-robot me-2"></i>AI Opponent</h6>
            </div>
            <div class="card-body py-2">
                <div class="form-check form-switch">
                    <input class="form-check-input" type="checkbox" id="aiToggle" role="switch">
                    <label class="form-check-label small" for="aiToggle">
                        Enable AI
                    </label>
                </div>
                <div id="aiStatus" class="mt-1 small" style="color: var(--text-muted);">
                    <span class="ai-status-dot inactive"></span>AI is inactive
                </div>
            </div>
        </div>

        <!-- Location Details -->
        <div class="card mb-3">
            <div class="card-header">
                <h6 class="mb-0" id="locationName">Select a Location</h6>
            </div>
            <div class="card-body">
                <div id="emptyState" class="text-center py-4">
                    <i class="fas fa-map-marker-alt fa-2x mb-2" style="color: var(--text-muted);"></i>
                    <p class="small mb-0" style="color: var(--text-secondary);">Click on a location on the map</p>
                </div>

                <div id="locationDetails" style="display: none;">
                    <div class="d-flex justify-content-between mb-3">
                        <span class="small" style="color: var(--text-secondary);">Faction</span>
                        <span id="locationFaction" class="badge">Neutral</span>
                    </div>

                    <!-- Resources bar -->
                    <div class="d-flex justify-content-between mb-1">
                        <small style="color: var(--text-secondary);"><i class="fas fa-coins me-1" style="color: var(--southern-gold);"></i>Resources</small>
                        <small id="resourcesValue" class="fw-bold">0</small>
                    </div>
                    <div class="progress mb-3">
                        <div id="resourcesBar" class="progress-bar bg-warning" role="progressbar" style="width: 0%"></div>
                    </div>

                    <!-- Army bar -->
                    <div class="d-flex justify-content-between mb-1">
                        <small style="color: var(--text-secondary);"><i class="fas fa-shield-alt me-1" style="color: #ef4444;"></i>Army</small>
                        <small id="armyValue" class="fw-bold">0</small>
                    </div>
                    <div class="progress mb-3">
                        <div id="armyBar" class="progress-bar bg-danger" role="progressbar" style="width: 0%"></div>
                    </div>

                    <div id="actionButtons">
                        <button id="collectResourcesBtn" class="btn btn-warning btn-sm w-100 mb-2" style="display: none;">
                            <i class="fas fa-coins me-1"></i> Collect Resources
                            <small id="resourceCooldown" class="d-none">(Wait: <span>0</span>s)</small>
                        </button>

                        <button id="createArmyBtn" class="btn btn-danger btn-sm w-100 mb-2" style="display: none;">
                            <i class="fas fa-shield-alt me-1"></i> Create Army <small class="opacity-75">(30 res)</small>
                        </button>

                        <button id="allOutAttackBtn" class="btn btn-sm w-100 mb-2" style="display: none; background: linear-gradient(135deg, #ef4444, #991b1b); border: none; color: #fff;">
                            <i class="fas fa-skull-crossbones me-1"></i> All Out Attack
                            <small class="d-block opacity-75">Send all armies to enemy capital</small>
                        </button>

                        <button id="sendResourcesBtn" class="btn btn-success btn-sm w-100 mb-2" style="display: none;">
                            <i class="fas fa-route me-1"></i> Send Resources to Capital
                        </button>

                        <button id="moveArmyBtn" class="btn btn-info btn-sm w-100" data-bs-toggle="modal" data-bs-target="#moveArmyModal">
                            <i class="fas fa-people-arrows me-1"></i> Move Army
                        </button>
                    </div>

                    <div id="actionStatus" class="alert alert-info mt-3 small" style="display: none;">
                        Action result will appear here
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<!-- Move Army Modal -->
<div class="modal fade" id="moveArmyModal" tabindex="-1" role="dialog" aria-labelledby="moveArmyModalLabel">
    <div class="modal-dialog" role="document">
        <div class="modal-content">
            <div class="modal-header">
                <h5 class="modal-title" id="moveArmyModalLabel">Move Army</h5>
                <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
            </div>
            <div class="modal-body">
                <p>Select a destination for your army from <strong id="sourceLocationName">this location</strong>:</p>

                <div class="list-group" id="destinationsList" role="listbox" aria-label="Available destinations">
                </div>

                <div id="moveArmyStatus" class="alert alert-warning mt-3" style="display: none;" role="alert">
                    Status message will appear here
                </div>
            </div>
            <div class="modal-footer">
                <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Cancel</button>
            </div>
        </div>
    </div>
</div>
{% endblock %}

{% block extra_css %}
<style>
    /* Pin the connection canvas to the top-left corner of its positioned
       parent (#mapContainer carries the position-relative class) and give it
       an explicit stacking level of 5. */
    #mapCanvas {
        position: absolute;
        top: 0;
        left: 0;
        z-index: 5;
    }
</style>
{% endblock %}

{% block scripts %}
<script>
    // Parse the JSON data from the template (read from the text content of
    // the element with id "templateData").
    const templateData = JSON.parse(document.getElementById('templateData').textContent);

    // Store the current game state.
    //   locations / connections - map data used for drawing and for building
    //                             the move-army destination list
    //   selectedLocation        - id of the currently selected marker, or null
    //   playerFaction           - faction whose locations count as "owned"
    //   gameOver / winner / victoryMessage - end-of-game information shown by
    //                             checkGameOver()
    let gameState = {
        locations: templateData.locations,
        connections: templateData.connections,
        selectedLocation: null,
        playerFaction: templateData.playerFaction,
        gameOver: templateData.gameOver,
        winner: templateData.winner,
        victoryMessage: templateData.victoryMessage
    };

    // Event feed log (client-side); addEvent() appends one entry per message.
    let eventLog = [];

    // Animation tracking -- prevents map refresh from destroying in-flight animations.
    // refreshMapData() sets pendingRefresh instead of rebuilding the map while
    // activeAnimations > 0; flushPendingRefresh() performs the deferred rebuild.
    let activeAnimations = 0;
    let pendingRefresh = false;

    // DOM elements that are looked up once and reused by the map functions
    const mapContainer = document.getElementById('mapContainer');
    const mapCanvas = document.getElementById('mapCanvas');
    const markersContainer = document.getElementById('mapMarkers');
    const gameOverOverlay = document.getElementById('gameOverOverlay');

    // --- HUD ---
    /**
     * Recompute the three HUD counters (resources, armies, territory) from
     * gameState and write them into the HUD elements.
     */
    function updateHUD() {
        const locationTotal = Object.keys(gameState.locations).length;

        // Only locations held by the player's faction contribute to the totals.
        const owned = Object.values(gameState.locations)
            .filter(place => place.faction === gameState.playerFaction);

        const resourceSum = owned.reduce((sum, place) => sum + place.resources, 0);
        const armySum = owned.reduce((sum, place) => sum + place.army, 0);

        document.getElementById('hudResources').textContent = resourceSum;
        document.getElementById('hudArmies').textContent = armySum;
        document.getElementById('hudTerritory').textContent = `${owned.length}/${locationTotal}`;
    }

    // --- Event Feed ---
    /**
     * Append an entry to the client-side event log and render it at the top
     * of the Battle Log feed.
     *
     * @param {string} message  Text to display; rendered as plain text, never as HTML.
     * @param {string} [faction] CSS class for the icon wrapper (defaults to 'neutral').
     * @param {string} [icon]    Font Awesome icon class (defaults to 'fa-circle').
     */
    function addEvent(message, faction, icon) {
        const now = new Date();
        const timeStr = now.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit', second: '2-digit' });
        const factionClass = faction || 'neutral';
        const iconClass = icon || 'fa-circle';

        eventLog.push({ time: timeStr, message, factionClass, iconClass });

        const feedBody = document.getElementById('eventFeedBody');
        const eventEl = document.createElement('div');
        eventEl.className = 'event-item';

        // Build a static skeleton (keeps the whitespace between the inline
        // spans) and fill in the dynamic parts through className/textContent,
        // so a message containing markup characters is shown literally
        // instead of being parsed as HTML.
        eventEl.innerHTML = `
            <span class="event-time"></span>
            <span class="event-icon"><i class="fas"></i></span>
            <span class="event-message"></span>
        `;
        eventEl.querySelector('.event-time').textContent = timeStr;
        eventEl.querySelector('.event-icon').className = `event-icon ${factionClass}`;
        eventEl.querySelector('.event-icon i').className = `fas ${iconClass}`;
        eventEl.querySelector('.event-message').textContent = message;

        // Insert at top (newest first). firstElementChild is null for an
        // empty feed, in which case insertBefore appends instead of throwing.
        feedBody.insertBefore(eventEl, feedBody.firstElementChild);

        // Cap the feed at 31 rendered entries, dropping the oldest first.
        while (feedBody.children.length > 31) {
            feedBody.removeChild(feedBody.lastElementChild);
        }
    }

    // --- Map Init ---
    /**
     * First full render of the map: rebuild canvas, markers and HUD, show the
     * empty-state placeholder in the details card, then apply any game-over state.
     */
    function initMap() {
        clearMap();
        drawConnections();
        createLocationMarkers();
        updateHUD();

        // Details card starts on the placeholder with the detail pane hidden.
        const detailsPane = document.getElementById('locationDetails');
        detailsPane.style.display = 'none';
        const placeholder = document.getElementById('emptyState');
        placeholder.style.display = 'block';

        checkGameOver();
    }

    /**
     * When gameState.gameOver is set, populate and reveal the game-over
     * overlay (victory or defeat variant) and disable every action button.
     * Does nothing while the game is still running.
     */
    function checkGameOver() {
        if (!gameState.gameOver) return;

        const playerWon = gameState.winner === gameState.playerFaction;
        const gameOverTitle = document.getElementById('gameOverTitle');
        const gameOverMessage = document.getElementById('gameOverMessage');

        if (playerWon) {
            gameOverTitle.textContent = "VICTORY!";
            // Drop the opposite class so a repeated call cannot leave both applied.
            gameOverTitle.classList.remove('defeat-text');
            gameOverTitle.classList.add('victory-text');
            gameOverMessage.textContent = gameState.victoryMessage;
        } else {
            gameOverTitle.textContent = "DEFEAT!";
            gameOverTitle.classList.remove('victory-text');
            gameOverTitle.classList.add('defeat-text');
            if (gameState.winner === 'southern') {
                gameOverMessage.textContent = "The Southern Kingdom has conquered your capital! Glory to the South!";
            } else if (gameState.winner === 'northern') {
                gameOverMessage.textContent = "The Northern Kingdom has conquered your capital! Victory through unity!";
            } else {
                // Other factions (e.g. nights_watch, white_walkers) have no
                // dedicated defeat line; use the victory message carried in
                // gameState rather than mislabelling the winner as the
                // Northern Kingdom.
                gameOverMessage.textContent = gameState.victoryMessage || 'Your enemies have prevailed.';
            }
        }

        // Exactly one of the two animations is visible.
        document.getElementById('victoryAnimation').classList.toggle('d-none', !playerWon);
        document.getElementById('defeatAnimation').classList.toggle('d-none', playerWon);

        gameOverOverlay.classList.remove('d-none');
        document.querySelectorAll('#actionButtons button').forEach(b => b.disabled = true);
    }

    /** Remove every marker element and erase the whole canvas bitmap. */
    function clearMap() {
        markersContainer.innerHTML = '';
        const { width, height } = mapCanvas;
        mapCanvas.getContext('2d').clearRect(0, 0, width, height);
    }

    // --- Draw Connections (faction-colored) ---
    /**
     * Size the canvas to the map container, paint a faint background grid and
     * draw one line per connection. A line takes the shared faction's colour
     * when both ends belong to the same non-neutral faction, is red and dashed
     * when the ends belong to two different non-neutral factions, and is grey
     * otherwise.
     */
    function drawConnections() {
        const canvas = mapCanvas;
        canvas.width = mapContainer.clientWidth;
        canvas.height = mapContainer.clientHeight;
        const pen = canvas.getContext('2d');

        // Stroke a single straight segment as its own path.
        const strokeSegment = (fromX, fromY, toX, toY) => {
            pen.beginPath();
            pen.moveTo(fromX, fromY);
            pen.lineTo(toX, toY);
            pen.stroke();
        };

        // Subtle grid overlay: one line every 40px in each direction.
        pen.strokeStyle = 'rgba(255, 255, 255, 0.02)';
        pen.lineWidth = 1;
        const cell = 40;
        for (let col = 0; col < canvas.width; col += cell) {
            strokeSegment(col, 0, col, canvas.height);
        }
        for (let row = 0; row < canvas.height; row += cell) {
            strokeSegment(0, row, canvas.width, row);
        }

        // Connection lines
        pen.lineWidth = 2;

        const ownedLineColors = {
            southern: 'rgba(255, 215, 0, 0.35)',
            northern: 'rgba(79, 195, 247, 0.35)',
            nights_watch: 'rgba(215, 228, 241, 0.45)',
            white_walkers: 'rgba(136, 196, 230, 0.45)',
            barbarian: 'rgba(193, 68, 46, 0.35)'
        };
        const plainLineColor = 'rgba(120, 144, 156, 0.2)';
        const contestedLineColor = 'rgba(239, 68, 68, 0.3)';

        for (const link of gameState.connections) {
            const endA = gameState.locations[link[0]];
            const endB = gameState.locations[link[1]];

            // Skip connections that reference an unknown location.
            if (!(endA && endB)) continue;

            const ownerA = endA.faction;
            const ownerB = endB.faction;

            let lineColor = plainLineColor;
            let dashPattern = [];
            if (ownerA !== 'neutral' && ownerA === ownerB) {
                lineColor = ownedLineColors[ownerA] || plainLineColor;
            } else if (ownerA !== 'neutral' && ownerB !== 'neutral' && ownerA !== ownerB) {
                // Two different non-neutral factions = contested.
                lineColor = contestedLineColor;
                dashPattern = [8, 6];
            }
            pen.strokeStyle = lineColor;
            pen.setLineDash(dashPattern);

            // Location x/y are percentages of the container size.
            strokeSegment(
                mapContainer.clientWidth * (endA.x / 100),
                mapContainer.clientHeight * (endA.y / 100),
                mapContainer.clientWidth * (endB.x / 100),
                mapContainer.clientHeight * (endB.y / 100)
            );
            pen.setLineDash([]);
        }
    }

    // --- Create Location Markers (with labels) ---
    /**
     * Build one clickable marker element per location (icon, optional army
     * badge, name label) and append it to the markers container.
     */
    function createLocationMarkers() {
        // Icons used for capitals, keyed by holding faction.
        const capitalIcons = {
            southern: 'fas fa-sun',
            northern: 'fas fa-snowflake',
            nights_watch: 'fas fa-shield-halved',
            white_walkers: 'fas fa-icicles',
            barbarian: 'fas fa-campground',
            neutral: 'fas fa-chess-rook'
        };

        // Icon class chosen by (type, faction). Wall keeps always get the
        // tower icon regardless of holder; the marker colour conveys the faction.
        const iconClassFor = place => {
            if (place.type === 'wall') return 'fas fa-tower-cell';
            if (place.type === 'capital') return capitalIcons[place.faction] || 'fas fa-chess-rook';
            return place.faction === 'barbarian' ? 'fas fa-campground' : 'fas fa-map-marker-alt';
        };

        Object.entries(gameState.locations).forEach(([locationId, place]) => {
            const marker = document.createElement('div');
            marker.id = `marker-${locationId}`;
            marker.className = `location-marker ${place.faction} ${place.type}`;
            marker.dataset.locationId = locationId;
            marker.style.left = `${place.x}%`;
            marker.style.top = `${place.y}%`;

            const glyph = document.createElement('i');
            glyph.className = iconClassFor(place);
            marker.appendChild(glyph);

            // Army badge, only when the location holds at least one army.
            if (place.army > 0) {
                const badge = document.createElement('span');
                badge.className = 'position-absolute top-0 start-100 translate-middle badge rounded-pill bg-danger';
                badge.textContent = place.army;
                badge.style.fontSize = '0.65em';
                badge.style.zIndex = '20';
                marker.appendChild(badge);
            }

            // Name label rendered as the marker's last child.
            const caption = document.createElement('span');
            caption.className = 'location-label';
            caption.textContent = place.name;
            marker.appendChild(caption);

            marker.addEventListener('click', () => selectLocation(locationId));

            markersContainer.appendChild(marker);
        });
    }

    // --- Select Location ---
    /**
     * Make `locationId` the selected location: move the marker highlight,
     * fill the details card (name, faction badge, resource/army bars), decide
     * which action buttons are visible and enabled, and rebuild the move-army
     * destination list.
     *
     * @param {string} locationId key into gameState.locations
     */
    function selectLocation(locationId) {
        const byId = id => document.getElementById(id);

        // Move the 'selected' highlight from the previous marker to the new one.
        if (gameState.selectedLocation) {
            byId(`marker-${gameState.selectedLocation}`)?.classList.remove('selected');
        }
        gameState.selectedLocation = locationId;
        byId(`marker-${locationId}`)?.classList.add('selected');

        const place = gameState.locations[locationId];

        byId('locationName').textContent = place.name;

        // Faction badge: first letter upper-cased, faction name doubles as CSS class.
        const badge = byId('locationFaction');
        badge.textContent = `${place.faction.charAt(0).toUpperCase()}${place.faction.slice(1)}`;
        badge.className = `badge ${place.faction}`;

        // Bars are scaled against fixed caps (200 resources, 10 armies), clamped to 100%.
        byId('resourcesBar').style.width = `${Math.min(place.resources / 200 * 100, 100)}%`;
        byId('resourcesValue').textContent = place.resources;

        byId('armyBar').style.width = `${Math.min(place.army / 10 * 100, 100)}%`;
        byId('armyValue').textContent = place.army;

        // Button visibility: capital actions on the player's capital,
        // "send resources" on the player's villages.
        const atCapital = place.type === 'capital';
        const atVillage = place.type === 'village';
        const ownedByPlayer = place.faction === gameState.playerFaction;

        const collectBtn = byId('collectResourcesBtn');
        const createBtn = byId('createArmyBtn');
        const attackBtn = byId('allOutAttackBtn');
        const sendBtn = byId('sendResourcesBtn');

        const capitalDisplay = (atCapital && ownedByPlayer) ? 'block' : 'none';
        collectBtn.style.display = capitalDisplay;
        createBtn.style.display = capitalDisplay;
        attackBtn.style.display = capitalDisplay;
        sendBtn.style.display = (atVillage && ownedByPlayer) ? 'block' : 'none';

        // Cooldown label: shown only for a capital with a truthy cooldown value.
        const cooldownLabel = byId('resourceCooldown');
        if (!(atCapital && place.resource_cooldown)) {
            cooldownLabel.classList.add('d-none');
        } else {
            cooldownLabel.classList.remove('d-none');
            cooldownLabel.querySelector('span').textContent = place.resource_cooldown;
            if (place.resource_cooldown > 0) startCooldownTimer(place.resource_cooldown);
        }

        // Enabled/disabled state
        if (atCapital) {
            collectBtn.disabled = !ownedByPlayer || gameState.gameOver || place.resource_cooldown > 0;
            createBtn.disabled = !ownedByPlayer || place.resources < 30 || gameState.gameOver;
            attackBtn.disabled = !ownedByPlayer || place.army <= 0 || gameState.gameOver;
        }
        byId('moveArmyBtn').disabled = !ownedByPlayer || place.army <= 0 || gameState.gameOver;
        sendBtn.disabled = !ownedByPlayer || place.resources <= 0 || gameState.gameOver;

        byId('sourceLocationName').textContent = place.name;

        // Swap the placeholder for the details pane.
        byId('locationDetails').style.display = 'block';
        byId('emptyState').style.display = 'none';

        updateMoveArmyDestinations(locationId);
    }

    // --- Move Army Destinations ---
    /**
     * Rebuild the destination list of the Move Army modal with one button per
     * location directly connected to `sourceId`. Clicking (or pressing
     * Enter/Space on) an entry marks it selected and calls moveArmy().
     *
     * @param {string} sourceId id of the location the army would leave from
     */
    function updateMoveArmyDestinations(sourceId) {
        const listEl = document.getElementById('destinationsList');
        listEl.innerHTML = '';

        const origin = gameState.locations[sourceId];

        // For every connection touching the source, keep the id at the other end.
        const neighbourIds = gameState.connections.flatMap(link => {
            if (link[0] === sourceId) return [link[1]];
            if (link[1] === sourceId) return [link[0]];
            return [];
        });

        for (const destId of neighbourIds) {
            const target = gameState.locations[destId];

            const item = document.createElement('button');
            item.className = 'list-group-item list-group-item-action destination-item';
            item.setAttribute('role', 'option');
            item.setAttribute('aria-selected', 'false');
            item.setAttribute('id', `dest-${destId}`);

            // Same faction as the source = reinforcement, anything else = attack.
            let attackWarning;
            if (target.faction === origin.faction) {
                attackWarning = `<span style="color: #22c55e;">(Friendly reinforcement)</span>`;
            } else {
                attackWarning = `<span style="color: #ef4444;">(Attack! ${origin.army} vs ${target.army})</span>`;
            }

            item.innerHTML = `
                <div class="d-flex justify-content-between align-items-center">
                    <div>
                        <h6 class="mb-0">${target.name}
                            <small style="color: var(--text-muted);">${target.faction}</small>
                        </h6>
                        <small>${attackWarning}</small>
                    </div>
                    <span class="badge bg-danger">Army: ${target.army}</span>
                </div>
            `;

            item.addEventListener('click', () => {
                // Clear the previous selection before marking this entry.
                for (const opt of document.querySelectorAll('#destinationsList [role="option"]')) {
                    opt.setAttribute('aria-selected', 'false');
                }
                item.setAttribute('aria-selected', 'true');
                moveArmy(sourceId, destId);
            });

            item.addEventListener('keydown', (e) => {
                const activates = e.key === 'Enter' || e.key === ' ';
                if (!activates) return;
                e.preventDefault();
                item.click();
            });

            listEl.appendChild(item);
        }
    }

    // --- Actions ---
    /**
     * POST a collect-resources request for `locationId` and reflect the
     * outcome in the UI. On failure the status banner shows a cooldown
     * warning or an error; any thrown exception is reported as a network error.
     *
     * @param {string} locationId id of the location to collect from
     */
    async function collectResources(locationId) {
        try {
            const reply = await fetch('/api/collect_resources', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ location_id: locationId })
            });
            const outcome = await reply.json();

            if (!outcome.success) {
                const onCooldown = outcome.cooldown && outcome.cooldown_seconds;
                if (onCooldown) {
                    startCooldownTimer(outcome.cooldown_seconds);
                    showActionStatus('warning', outcome.message);
                } else {
                    showActionStatus('danger', outcome.message || 'Failed to collect resources');
                }
                return;
            }

            showActionStatus('success', outcome.message);

            // Store the new total locally, then re-render the card and the HUD.
            const place = gameState.locations[locationId];
            place.resources = outcome.current_resources;
            selectLocation(locationId);
            updateHUD();
            addEvent(`Resources collected at ${gameState.locations[locationId].name}`, gameState.playerFaction, 'fa-coins');
        } catch (error) {
            showActionStatus('danger', 'Network error. Please try again.');
        }
    }

    /**
     * POST a create-army request for `locationId`; on success store the
     * returned resource and army totals, redraw the map and log the event.
     * Any thrown exception is reported as a network error.
     *
     * @param {string} locationId id of the location to create the army at
     */
    async function createArmy(locationId) {
        try {
            const reply = await fetch('/api/create_army', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ location_id: locationId })
            });
            const outcome = await reply.json();

            if (!outcome.success) {
                showActionStatus('danger', outcome.message || 'Failed to create army');
                return;
            }

            showActionStatus('success', outcome.message);

            const place = gameState.locations[locationId];
            place.resources = outcome.current_resources;
            place.army = outcome.current_army;

            // Redraw markers (army badge changed), then refresh the card and HUD.
            refreshMap();
            selectLocation(locationId);
            updateHUD();
            addEvent(`Army created at ${gameState.locations[locationId].name}`, gameState.playerFaction, 'fa-shield-alt');
        } catch (error) {
            showActionStatus('danger', 'Network error. Please try again.');
        }
    }

    /**
     * Ask the server to move the army at `sourceId` to `targetId`, then play
     * the march animation, log the move, schedule follow-up map refreshes and
     * close the Move Army modal.
     *
     * @param {string} sourceId id of the location the army leaves
     * @param {string} targetId id of the destination location
     */
    async function moveArmy(sourceId, targetId) {
        showMoveArmyStatus('warning', 'Moving army...');

        try {
            const response = await fetch('/api/move_army', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ source_id: sourceId, target_id: targetId })
            });
            const result = await response.json();

            // NOTE(review): a reply is treated as successful when it carries
            // either `success` or a `message`; the failure branch below reads
            // `result.error`. Presumably failures never include `message` --
            // confirm against the /api/move_army handler.
            if (result.success || result.message) {
                showMoveArmyStatus('success', result.message);

                // Animated army march from source to target
                // (an attack is any target whose faction differs from the player's;
                // the target's faction is read from the local, pre-move state)
                const targetFaction = gameState.locations[targetId]?.faction;
                const isAttack = targetFaction && targetFaction !== gameState.playerFaction;
                animateArmyMove(sourceId, targetId, gameState.playerFaction, isAttack);

                const srcName = gameState.locations[sourceId].name;
                const tgtName = gameState.locations[targetId].name;
                addEvent(`Army moved from ${srcName} to ${tgtName}`, gameState.playerFaction, 'fa-people-arrows');

                // Delay periodic refresh until animation completes (~4s max)
                // After the initial 4.5 s wait, refreshMapData() runs every
                // 2.5 s for 6 rounds; checkGameStatus() follows 1 s after the
                // last round.
                let updateCount = 0;
                const maxUpdates = 6;
                setTimeout(() => {
                    const updateInterval = setInterval(() => {
                        refreshMapData();
                        updateCount++;
                        if (updateCount >= maxUpdates) {
                            clearInterval(updateInterval);
                            setTimeout(checkGameStatus, 1000);
                        }
                    }, 2500);
                }, 4500);

                // The move itself may have ended the game; show the overlay immediately.
                if (result.game_over) {
                    gameState.gameOver = true;
                    gameState.winner = result.winner;
                    gameState.victoryMessage = result.victory_message;
                    checkGameOver();
                }

                // getInstance() yields a falsy value when no Modal instance
                // exists for the element, hence the guard before hide().
                const moveArmyModal = bootstrap.Modal.getInstance(document.getElementById('moveArmyModal'));
                if (moveArmyModal) moveArmyModal.hide();
            } else {
                showMoveArmyStatus('danger', result.error || 'Failed to move army');
            }
        } catch (error) {
            // Any exception from the request or from the handling above is
            // reported with this message.
            showMoveArmyStatus('danger', 'Network error. Please try again.');
        }
    }

    /**
     * Show `message` in the action status banner using the Bootstrap alert
     * variant `type`, and hide it again 5 seconds later.
     *
     * @param {string} type    alert variant suffix, e.g. 'success', 'warning', 'danger'
     * @param {string} message text to display (set via textContent)
     */
    function showActionStatus(type, message) {
        const statusElement = document.getElementById('actionStatus');
        statusElement.className = `alert alert-${type} mt-3 small`;
        statusElement.textContent = message;
        statusElement.style.display = 'block';

        // Cancel the hide timer of any earlier message; otherwise that older
        // timer would hide this newer message before its own 5 s have elapsed.
        clearTimeout(showActionStatus.hideTimer);
        showActionStatus.hideTimer = setTimeout(() => { statusElement.style.display = 'none'; }, 5000);
    }

    /**
     * Show `message` in the Move Army modal's status banner using the
     * Bootstrap alert variant `type`. The banner stays visible afterwards.
     */
    function showMoveArmyStatus(type, message) {
        const banner = document.getElementById('moveArmyStatus');
        Object.assign(banner, {
            className: `alert alert-${type} mt-3`,
            textContent: message
        });
        banner.style.display = 'block';
    }

    /**
     * Rebuild every visual layer from the current gameState and, when a
     * location is selected, re-render its details card.
     */
    function refreshMap() {
        const renderSteps = [clearMap, drawConnections, createLocationMarkers, updateHUD, checkGameOver];
        for (const step of renderSteps) step();

        // Markers were recreated above, so the selection must be re-applied.
        const selectedId = gameState.selectedLocation;
        if (selectedId) selectLocation(selectedId);
    }

    /**
     * Fetch the latest map data from the server, copy it into gameState,
     * update the Wall Hold HUD when present, and redraw the map (deferred
     * while animations are running). Failures show a temporary alert and
     * leave gameState untouched.
     */
    async function refreshMapData() {
        try {
            const response = await fetch('/api/map_data');
            // Reject error responses before touching gameState: assigning an
            // error body's missing `locations` would leave gameState.locations
            // undefined and break every later render.
            if (!response.ok) {
                throw new Error(`Map data request failed with status ${response.status}`);
            }
            const data = await response.json();
            if (!data || typeof data.locations !== 'object' || data.locations === null) {
                throw new Error('Map data response is missing locations');
            }

            gameState.locations = data.locations;
            gameState.gameOver = data.game_over;
            gameState.winner = data.winner;
            gameState.victoryMessage = data.victory_message;

            // Update the Wall Hold HUD if present (White Walkers Attack).
            if (data.wall_hold) {
                const nw = document.getElementById('hudHoldNightsWatch');
                const ww = document.getElementById('hudHoldWhiteWalkers');
                const threshold = data.wall_hold.threshold || 5;
                const holds = data.wall_hold.holds || {};
                if (nw) nw.textContent = `${holds.nights_watch || 0}/${threshold}`;
                if (ww) ww.textContent = `${holds.white_walkers || 0}/${threshold}`;
            }

            // Defer visual rebuild while animations are playing
            if (activeAnimations > 0) {
                pendingRefresh = true;
                updateHUD(); // HUD is safe to update immediately
                return;
            }

            refreshMap();

            // NOTE(review): this also runs on every periodic poll, so an open
            // Move Army modal is closed by the next refresh -- confirm that
            // this is intended.
            const moveArmyModal = bootstrap.Modal.getInstance(document.getElementById('moveArmyModal'));
            if (moveArmyModal) moveArmyModal.hide();
        } catch (error) {
            console.error('Error refreshing map data:', error);
            const mapAlert = document.getElementById('mapAlert');
            mapAlert.textContent = 'Failed to refresh map data. Please try again.';
            mapAlert.classList.remove('d-none');
            setTimeout(() => { mapAlert.classList.add('d-none'); }, 5000);
        }
    }

    // Flush a deferred refresh once all animations finish
    /**
     * Run the map rebuild that refreshMapData() postponed, but only once no
     * animation is active; otherwise leave the pending flag untouched.
     */
    function flushPendingRefresh() {
        if (!pendingRefresh || activeAnimations !== 0) return;

        pendingRefresh = false;
        refreshMap();
    }

    // On window resize, redraw the connection lines (drawConnections re-reads
    // the container size) and remove every element with the transfer-arrow class.
    window.addEventListener('resize', () => {
        drawConnections();
        for (const arrow of document.querySelectorAll('.transfer-arrow')) {
            arrow.remove();
        }
    });

    // --- Init ---
    // Page bootstrap: builds the map, wires the action buttons, starts the
    // polling timers and keeps the AI toggle in sync with the server.
    document.addEventListener('DOMContentLoaded', function() {
        initMap();

        // Location actions operate on the currently selected location, if any.
        document.getElementById('collectResourcesBtn').addEventListener('click', () => {
            if (gameState.selectedLocation) collectResources(gameState.selectedLocation);
        });

        document.getElementById('createArmyBtn').addEventListener('click', () => {
            if (gameState.selectedLocation) createArmy(gameState.selectedLocation);
        });

        document.getElementById('refreshMapBtn').addEventListener('click', refreshMapData);

        // Poll every 5 s so the wall-hold counter and resource HUD reflect
        // the wall-tick thread (30 s cadence) and AI moves within seconds
        // rather than up to a minute later.
        setInterval(refreshMapData, 5000);
        setInterval(checkGameStatus, 5000);

        document.getElementById('sendResourcesBtn').addEventListener('click', () => {
            if (gameState.selectedLocation) sendResourcesToCapital(gameState.selectedLocation);
        });

        document.getElementById('allOutAttackBtn').addEventListener('click', () => {
            if (!gameState.selectedLocation) return;
            const location = gameState.locations[gameState.selectedLocation];
            // The selection may point at a location that is no longer in the map data.
            if (!location) return;
            if (!confirm(`Launch an all-out attack with ${location.army} armies from ${location.name}? This cannot be undone!`)) return;
            launchAllOutAttack(gameState.selectedLocation);
        });

        // AI Toggle
        const aiToggle = document.getElementById('aiToggle');
        const aiStatus = document.getElementById('aiStatus');

        // Render the status line as "<dot> text". The text originates from
        // server responses, so it is inserted as a text node instead of being
        // concatenated into innerHTML.
        function renderAIStatus(active, text) {
            const dot = document.createElement('span');
            dot.className = `ai-status-dot ${active ? 'active' : 'inactive'}`;
            aiStatus.textContent = '';
            aiStatus.appendChild(dot);
            aiStatus.appendChild(document.createTextNode(text));
        }

        checkAIStatus();

        // Ask the server to switch the AI on/off; revert the checkbox on failure.
        aiToggle.addEventListener('change', async function() {
            const enable = this.checked;
            try {
                const response = await fetch('/api/ai_toggle', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({ enable: enable })
                });
                const result = await response.json();

                if (result.success) {
                    if (enable) {
                        renderAIStatus(true, 'AI is active - ' + result.message);
                        addEvent('AI opponent activated', 'neutral', 'fa-robot');
                    } else {
                        renderAIStatus(false, 'AI is inactive');
                        addEvent('AI opponent deactivated', 'neutral', 'fa-robot');
                    }
                } else {
                    this.checked = !enable;
                    alert('Failed to toggle AI: ' + result.message);
                }
            } catch (error) {
                this.checked = !enable;
                alert('Failed to connect to AI service');
            }
        });

        // Re-sync the toggle with the server every 10 s.
        setInterval(checkAIStatus, 10000);

        // Fetch the AI state and mirror it in the checkbox and status line.
        async function checkAIStatus() {
            try {
                const response = await fetch('/api/ai_status');
                const status = await response.json();

                if (status.active) {
                    aiToggle.checked = true;
                    renderAIStatus(true, `AI is active (${status.faction} faction)`);
                } else {
                    aiToggle.checked = false;
                    renderAIStatus(false, 'AI is inactive');
                }
            } catch (error) {
                console.error('Failed to check AI status:', error);
            }
        }
    });

    // =============================================
    // ANIMATED TRAVEL SYSTEM
    // =============================================

    // Animate a unit traveling from one location marker to another.
    //   fromLocId / toLocId: keys into gameState.locations
    //   faction: CSS class applied to the unit, pulses and (for armies) the trail
    //   type: 'army' renders a knight icon; any other value renders the coin cart
    //   onArrive: optional callback invoked once the unit reaches the destination
    //   opts: optional { armyCount: number } shown as a badge on the unit
    // Returns immediately, without calling onArrive, if either location is unknown.
    // While the unit is moving, activeAnimations is incremented so map rebuilds
    // can be deferred; a deferred rebuild is flushed on arrival.
    function animateTravelingUnit(fromLocId, toLocId, faction, type, onArrive, opts) {
        const fromLoc = gameState.locations[fromLocId];
        const toLoc = gameState.locations[toLocId];
        if (!fromLoc || !toLoc) return;

        activeAnimations++;

        // Location coordinates are percentages of the map container size.
        const fromX = mapContainer.clientWidth * (fromLoc.x / 100);
        const fromY = mapContainer.clientHeight * (fromLoc.y / 100);
        const toX = mapContainer.clientWidth * (toLoc.x / 100);
        const toY = mapContainer.clientHeight * (toLoc.y / 100);

        // Create the traveling unit element
        const unit = document.createElement('div');
        unit.className = `traveling-unit ${type} ${faction}`;

        const icon = document.createElement('i');
        icon.className = type === 'army' ? 'fas fa-chess-knight' : 'fas fa-coins';
        unit.appendChild(icon);

        // Add army count badge if provided
        if (opts && opts.armyCount && opts.armyCount > 0) {
            const badge = document.createElement('span');
            badge.className = 'army-count';
            badge.textContent = opts.armyCount;
            unit.appendChild(badge);
        }

        // Start position
        unit.style.left = `${fromX}px`;
        unit.style.top = `${fromY}px`;
        unit.style.transform = 'translate(-50%, -50%)';
        mapContainer.appendChild(unit);

        // Calculate travel distance and duration -- longer for more drama
        const dx = toX - fromX;
        const dy = toY - fromY;
        const distance = Math.sqrt(dx * dx + dy * dy);
        const duration = Math.max(2000, Math.min(4000, distance * 6)); // 2s - 4s

        // Add connection pulse dots along the path (evenly spaced, endpoints excluded)
        const pulseCount = Math.max(3, Math.round(distance / 80));
        const pulses = [];
        for (let i = 1; i <= pulseCount; i++) {
            const t = i / (pulseCount + 1);
            const pulse = document.createElement('div');
            pulse.className = `connection-pulse ${faction}`;
            pulse.style.left = `${fromX + dx * t}px`;
            pulse.style.top = `${fromY + dy * t}px`;
            pulse.style.animationDelay = `${i * 0.15}s`;
            mapContainer.appendChild(pulse);
            pulses.push(pulse);
        }

        // Carts leave a 'resource' coloured trail; armies use their faction colour.
        const trailClass = type === 'cart' ? 'resource' : faction;

        // Animate with requestAnimationFrame for smooth movement
        const startTime = performance.now();
        let lastFrame = null;   // timestamp of the previous frame, null before the first
        let trailTimer = 0;     // ms accumulated since the last trail drop
        let prevX = fromX;
        let prevY = fromY;

        function step(now) {
            // Clamp at 0: the first frame timestamp may precede startTime.
            const elapsed = Math.max(0, now - startTime);
            const progress = Math.min(elapsed / duration, 1);

            // Ease-in-out curve
            const ease = progress < 0.5
                ? 2 * progress * progress
                : 1 - Math.pow(-2 * progress + 2, 2) / 2;

            const cx = fromX + dx * ease;
            const cy = fromY + dy * ease;
            unit.style.left = `${cx}px`;
            unit.style.top = `${cy}px`;

            // Drop trail particles every ~60ms
            trailTimer += lastFrame === null ? 0 : now - lastFrame;
            lastFrame = now;
            if (trailTimer > 60) {
                trailTimer = 0;

                // Glowing particle, slightly jittered around the unit; self-removing
                const particle = document.createElement('div');
                particle.className = `trail-particle ${trailClass}`;
                particle.style.left = `${cx + (Math.random() - 0.5) * 8}px`;
                particle.style.top = `${cy + (Math.random() - 0.5) * 8}px`;
                mapContainer.appendChild(particle);
                setTimeout(() => particle.remove(), 1500);

                // Glowing trail line segment from previous position; self-removing
                const segDx = cx - prevX;
                const segDy = cy - prevY;
                const segLen = Math.sqrt(segDx * segDx + segDy * segDy);
                if (segLen > 2) {
                    const seg = document.createElement('div');
                    seg.className = `trail-line-segment ${trailClass}`;
                    seg.style.left = `${prevX}px`;
                    seg.style.top = `${prevY}px`;
                    seg.style.width = `${segLen}px`;
                    seg.style.transform = `rotate(${Math.atan2(segDy, segDx)}rad)`;
                    mapContainer.appendChild(seg);
                    setTimeout(() => seg.remove(), 2500);
                }

                prevX = cx;
                prevY = cy;
            }

            if (progress < 1) {
                requestAnimationFrame(step);
                return;
            }

            // Arrived! Clean up unit and pulses
            unit.remove();
            pulses.forEach(p => p.remove());

            activeAnimations--;
            if (onArrive) onArrive();
            flushPendingRefresh();
        }

        requestAnimationFrame(step);
    }

    // Play a short burst animation centred on a location.
    // `type` is appended to the burst's CSS class; the value 'capture'
    // additionally scatters a ring of sparkles. The location's marker is
    // flashed in every case. Unknown location ids are ignored.
    function showClashEffect(locationId, type) {
        const target = gameState.locations[locationId];
        if (!target) return;

        const centerX = mapContainer.clientWidth * (target.x / 100);
        const centerY = mapContainer.clientHeight * (target.y / 100);

        // Main burst, removed again after 900 ms
        const burstEl = document.createElement('div');
        burstEl.className = `clash-burst ${type}`;
        burstEl.style.left = `${centerX}px`;
        burstEl.style.top = `${centerY}px`;
        mapContainer.appendChild(burstEl);
        setTimeout(() => burstEl.remove(), 900);

        // Sparkle particles for captures: eight, evenly spread around a circle
        const sparkleTotal = type === 'capture' ? 8 : 0;
        for (let n = 0; n < sparkleTotal; n++) {
            const dot = document.createElement('div');
            dot.className = 'capture-sparkle';

            const theta = (n / 8) * Math.PI * 2;
            const reach = 20 + Math.random() * 15;
            dot.style.left = `${centerX + Math.cos(theta) * reach}px`;
            dot.style.top = `${centerY + Math.sin(theta) * reach}px`;

            // Randomly gold or green; the glow reuses whatever colour was applied
            if (Math.random() > 0.5) {
                dot.style.background = 'var(--southern-gold)';
            } else {
                dot.style.background = '#22c55e';
            }
            dot.style.boxShadow = `0 0 4px ${dot.style.background}`;
            dot.style.animationDelay = `${Math.random() * 0.3}s`;

            mapContainer.appendChild(dot);
            setTimeout(() => dot.remove(), 1500);
        }

        // Flash the marker
        const markerEl = document.getElementById(`marker-${locationId}`);
        if (!markerEl) return;
        markerEl.classList.add('just-captured');
        setTimeout(() => markerEl.classList.remove('just-captured'), 700);
    }

    // March an army from source to target, then play an 'attack' burst on
    // arrival when isAttack is truthy, or a 'reinforce' burst otherwise.
    function animateArmyMove(sourceId, targetId, faction, isAttack) {
        const onArrival = () => {
            const effect = isAttack ? 'attack' : 'reinforce';
            showClashEffect(targetId, effect);
        };
        animateTravelingUnit(sourceId, targetId, faction, 'army', onArrival);
    }

    // Animate a resource cart along a multi-hop path, one hop after another.
    //   path: ordered array of location ids (source first, destination last)
    //   faction: CSS class used for the cart styling
    // Intermediate stops get a 'reinforce' flash, the final stop a 'capture'
    // sparkle. Does nothing when path is missing, is not an array, or has
    // fewer than two stops, so an incomplete server response cannot throw here.
    function animateResourcePath(path, faction) {
        if (!Array.isArray(path) || path.length < 2) return;

        const lastHop = path.length - 2;
        let hopIndex = 0;

        function nextHop() {
            if (hopIndex > lastHop) return;

            const from = path[hopIndex];
            const to = path[hopIndex + 1];
            animateTravelingUnit(from, to, faction, 'cart', () => {
                showClashEffect(to, hopIndex < lastHop ? 'reinforce' : 'capture');
                hopIndex++;
                // Short pause at each stop before the cart moves on
                setTimeout(nextHop, 300);
            });
        }
        nextHop();
    }

    // Legacy entry point kept for older callers: maps the old
    // (from, to, faction, type) call onto the animated travel functions.
    // type 'resources' sends a cart; any other type marches an army.
    function createTransferIndicator(fromLoc, toLoc, faction, type) {
        if (type === 'resources') {
            const flashOnArrival = () => showClashEffect(toLoc, 'reinforce');
            animateTravelingUnit(fromLoc, toLoc, faction, 'cart', flashOnArrival);
            return;
        }

        const destination = gameState.locations[toLoc];
        const targetFaction = destination?.faction;
        // Hostile: the destination has a faction that is neither ours nor neutral.
        const hostile = targetFaction && targetFaction !== faction && targetFaction !== 'neutral';
        // Moving onto neutral ground is animated as an attack as well.
        const treatAsAttack = hostile || targetFaction === 'neutral';
        animateArmyMove(fromLoc, toLoc, faction, treatAsAttack);
    }

    // Ask the server to ship the resources at locationId to the capital.
    // On success: shows the server message, logs an event, and (when the
    // response carries a usable path of at least two stops) animates the cart
    // and schedules follow-up map refreshes. On failure: shows the server
    // message or a generic fallback. Any thrown error is logged to the console
    // and reported to the player.
    async function sendResourcesToCapital(locationId) {
        try {
            const response = await fetch('/api/send_resources_to_capital', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ location_id: locationId })
            });
            const result = await response.json();

            if (!result.success) {
                showActionStatus('danger', result.message || 'Failed to send resources');
                return;
            }

            showActionStatus('success', result.message);
            // Fall back to the raw id if the location is not in the local map data.
            const originName = gameState.locations[locationId]?.name ?? locationId;
            addEvent(`Resources sent from ${originName} to capital`, gameState.playerFaction, 'fa-route');

            // Animate resource cart along the full path. A response without a
            // usable path must not surface as a "Network error" after the
            // success message was already shown, so it is simply skipped.
            if (Array.isArray(result.path) && result.path.length >= 2) {
                animateResourcePath(result.path, gameState.playerFaction);
                startResourceTransferUpdates(result.path);
            }
        } catch (error) {
            console.error('Error sending resources to capital:', error);
            showActionStatus('danger', 'Network error. Please try again.');
        }
    }

    // Schedule a burst of map refreshes that begins once the cart animation
    // for `path` should have finished (4.5 s budgeted per hop plus 1 s).
    // Refreshes then run every 2.5 s, path.length * 2 times in total.
    function startResourceTransferUpdates(path) {
        const hopCount = path.length - 1;
        const startAfterMs = hopCount * 4500 + 1000;
        const refreshLimit = path.length * 2;

        setTimeout(() => {
            let refreshesDone = 0;
            const poller = setInterval(() => {
                refreshMapData();
                refreshesDone += 1;
                if (refreshesDone >= refreshLimit) {
                    clearInterval(poller);
                }
            }, 2500);
        }, startAfterMs);
    }

    // Show the resource cooldown indicator and count it down once per second.
    // The collect button is disabled for the duration and re-enabled at the
    // end only if the currently selected location belongs to the player.
    function startCooldownTimer(seconds) {
        const cooldownEl = document.getElementById('resourceCooldown');
        const counterEl = cooldownEl.querySelector('span');
        const collectButton = document.getElementById('collectResourcesBtn');

        cooldownEl.classList.remove('d-none');
        collectButton.disabled = true;

        let remaining = seconds;
        const ticker = setInterval(function tick() {
            remaining--;
            counterEl.textContent = remaining;

            // Keep ticking until the counter has reached zero
            if (!(remaining <= 0)) return;

            clearInterval(ticker);
            cooldownEl.classList.add('d-none');

            if (!gameState.selectedLocation) return;
            const selected = gameState.locations[gameState.selectedLocation];
            if (selected.faction === gameState.playerFaction) {
                collectButton.disabled = false;
            }
        }, 1000);
    }

    // Launch an all-out attack from locationId.
    // On success: shows the server message, logs an event, animates the army
    // hop by hop along result.path (when it has at least two stops), schedules
    // follow-up map refreshes, and applies an immediate game-over result if the
    // response reports one. On failure: shows the server message or a generic
    // fallback. Non-2xx responses and thrown errors are logged to the console
    // and reported to the player.
    async function launchAllOutAttack(locationId) {
        try {
            const response = await fetch('/api/all_out_attack', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json', 'Accept': 'application/json' },
                body: JSON.stringify({ location_id: locationId })
            });

            if (!response.ok) throw new Error(`HTTP error! status: ${response.status}`);
            const result = await response.json();

            if (!result.success) {
                showActionStatus('danger', result.message || 'Failed to launch attack');
                return;
            }

            showActionStatus('success', result.message);
            // Fall back to the raw id if the location is not in the local map data.
            const originName = gameState.locations[locationId]?.name ?? locationId;
            addEvent(`All-out attack launched from ${originName}!`, gameState.playerFaction, 'fa-skull-crossbones');

            // Animate army marching along the full attack path, hop by hop
            const path = Array.isArray(result.path) ? result.path : [];
            const hops = path.length >= 2 ? path.length - 1 : 0;
            let hopIdx = 0;
            const marchNextHop = () => {
                if (hopIdx >= hops) return;
                const from = path[hopIdx];
                const to = path[hopIdx + 1];
                const isLastHop = hopIdx === hops - 1;
                animateTravelingUnit(from, to, gameState.playerFaction, 'army', () => {
                    // Only the final stop is the actual battle
                    showClashEffect(to, isLastHop ? 'attack' : 'reinforce');
                    hopIdx++;
                    setTimeout(marchNextHop, 200);
                });
            };
            marchNextHop();

            // Delay refreshes until animations finish (~4s per hop + buffer)
            const animDelay = hops * 4500 + 1000;
            let updateCount = 0;
            const maxUpdates = Math.max(3, hops * 2);
            setTimeout(() => {
                const updateInterval = setInterval(() => {
                    refreshMapData();
                    updateCount++;
                    if (updateCount >= maxUpdates) {
                        clearInterval(updateInterval);
                        setTimeout(checkGameStatus, 1000);
                    }
                }, 2000);
            }, animDelay);

            if (result.game_over) {
                gameState.gameOver = true;
                gameState.winner = result.winner;
                gameState.victoryMessage = result.victory_message;
                checkGameOver();
            }
        } catch (error) {
            // The status message points the player at the console, so log here.
            console.error('Error launching all-out attack:', error);
            showActionStatus('danger', 'Failed to launch attack. Check console for details.');
        }
    }

    // Poll the server for the game status. The first time it reports a
    // finished game, record the result locally, refresh the map, show the
    // game-over state and log a "wins" event. Errors are logged to the console.
    async function checkGameStatus() {
        try {
            const reply = await fetch('/api/game_status');
            const status = await reply.json();

            // Nothing to do while the game is running or once the end is recorded
            if (!status.game_over || gameState.gameOver) return;

            gameState.gameOver = true;
            gameState.winner = status.winner;
            gameState.victoryMessage = status.victory_message;

            await refreshMapData();
            checkGameOver();

            const winnerLabel = status.winner.charAt(0).toUpperCase() + status.winner.slice(1);
            addEvent(`Game over! ${winnerLabel} wins!`, status.winner, 'fa-crown');
        } catch (err) {
            console.error('Error checking game status:', err);
        }
    }

    // Debugging/testing hook: makes the animation helpers reachable from the
    // browser console as window._gameAnimations.<name>(...).
    window._gameAnimations = {
        animateArmyMove: animateArmyMove,
        animateTravelingUnit: animateTravelingUnit,
        showClashEffect: showClashEffect,
        animateResourcePath: animateResourcePath,
        addEvent: addEvent
    };
</script>
{% endblock %}


================================================
FILE: game-of-tracing/war_map/templates/map_picker.html
================================================
{% extends "layout.html" %}

{% block title %}Pick a Map{% endblock %}

{% block content %}
<!-- Map picker: one card per available map plus a submit button -->
<div class="faction-hero">
    <div class="col-lg-9 col-xl-8">
        <div class="text-center mb-5">
            <h1 class="faction-hero-title">A Game of Traces</h1>
            <p class="faction-hero-subtitle">Pick a battlefield. Each map has its own factions, economy, and win conditions.</p>
        </div>

        <!-- Posts the chosen map_id to the select_map endpoint -->
        <form method="POST" action="{{ url_for('select_map') }}" id="mapPickerForm">
            <!-- Filled in by the .map-card click handler in the scripts block; the submit handler there blocks submission while it is empty -->
            <input type="hidden" name="map_id" id="mapIdInput" value="" required>

            <!-- One selectable card is rendered for every entry of the maps mapping -->
            <div class="row g-4 justify-content-center mb-4">
                {% for map_id, meta in maps.items() %}
                <div class="col-md-6">
                    <div class="card faction-card map-card" data-map-id="{{ map_id }}">
                        <div class="card-body">
                            <span class="faction-icon">
                                <i class="fas {{ meta.icon }}"></i>
                            </span>
                            <h4>{{ meta.display_name }}</h4>
                            <p class="faction-motto">
                                {% if meta.single_player %}Single-player · Hold to win
                                {% else %}Two-player · Capture to win
                                {% endif %}
                            </p>
                            <p class="faction-start">{{ meta.description }}</p>
                            <p class="small mt-2 mb-0">
                                <strong>Factions:</strong>
                                {{ meta.factions | join(', ') | replace('_', ' ') | title }}
                            </p>
                        </div>
                    </div>
                </div>
                {% endfor %}
            </div>

            <!-- Starts disabled; enabled by the card click handler once a map is selected -->
            <div class="text-center">
                <button type="submit" id="enterMapBtn" class="btn btn-primary btn-lg px-5 py-2" disabled>
                    <i class="fas fa-play me-2"></i>Enter The Realm
                </button>
            </div>
        </form>
    </div>
</div>
{% endblock %}

{% block scripts %}
<script>
    // Map picker behaviour: clicking a card selects that map and unlocks the
    // submit button; submitting without a selection is blocked and the cards
    // are briefly highlighted instead.
    $(document).ready(function() {
        const $mapField = $('#mapIdInput');
        const $submitButton = $('#enterMapBtn');
        const $cards = $('.map-card');

        function selectCard() {
            const $chosen = $(this);
            $mapField.val($chosen.data('map-id'));
            $cards.removeClass('faction-selected');
            $chosen.addClass('faction-selected');
            $submitButton.prop('disabled', false);
        }

        function guardSubmit(e) {
            if ($mapField.val()) return;

            e.preventDefault();
            $cards.addClass('border-warning');
            setTimeout(function() {
                $cards.removeClass('border-warning');
            }, 900);
        }

        $cards.on('click', selectCard);
        $('#mapPickerForm').on('submit', guardSubmit);
    });
</script>
{% endblock %}


================================================
FILE: game-of-tracing/war_map/templates/replay.html
================================================
{% extends "layout.html" %}

{% block title %}Game Replay{% endblock %}

{% block content %}
<!-- Replay index: session list on the left, explanatory sidebar on the right -->
<div class="row g-3">
    <!-- Session list panel -->
    <div class="col-md-8">
        <div class="card">
            <div class="card-header">
                <h4 class="mb-0"><i class="fas fa-history me-2"></i>Game Session Replay</h4>
            </div>
            <div class="card-body">
                <p class="mb-4" style="color: var(--text-secondary);">Replay previous game sessions using span links and distributed tracing. Each session shows the complete chain of actions linked together through OpenTelemetry spans.</p>

                <!-- Visible on page load; hidden by loadGameSessions() once the request settles -->
                <div id="loading" class="text-center py-4">
                    <div class="spinner-border" role="status">
                        <span class="visually-hidden">Loading...</span>
                    </div>
                    <p class="mt-2" style="color: var(--text-secondary);">Loading game sessions from Tempo...</p>
                </div>

                <!-- Hidden initially; revealed by displaySessions(), which fills #sessions-tbody -->
                <div id="sessions-list" style="display: none;">
                    <h5>Available Sessions</h5>
                    <div class="table-responsive">
                        <table class="table table-striped">
                            <thead>
                                <tr>
                                    <th>Session ID</th>
                                    <th>Action</th>
                                </tr>
                            </thead>
                            <tbody id="sessions-tbody">
                            </tbody>
                        </table>
                    </div>
                </div>

                <!-- Shown by loadGameSessions() when there are no sessions to display -->
                <div id="no-sessions" class="alert alert-info" style="display: none;">
                    <i class="fas fa-info-circle me-2"></i>
                    No game sessions found. Play a game first to create replay data!
                </div>

                <!-- Shown by loadGameSessions() when loading fails; #error-text receives the message -->
                <div id="error-message" class="alert alert-danger" style="display: none;">
                    <i class="fas fa-exclamation-triangle me-2"></i>
                    <span id="error-text"></span>
                </div>
            </div>
        </div>
    </div>

    <!-- Sidebar: static explanation and sample TraceQL queries -->
    <div class="col-md-4">
        <div class="card mb-3">
            <div class="card-header">
                <h5 class="mb-0"><i class="fas fa-question-circle me-2"></i>How It Works</h5>
            </div>
            <div class="card-body">
                <h6 style="color: var(--northern-blue);">Span Links & Replay</h6>
                <p class="small" style="color: var(--text-secondary);">Each game action creates a span link to the previous action, forming a chain across different traces.</p>

                <h6 style="color: var(--northern-blue);">What You'll See</h6>
                <ul class="small" style="color: var(--text-secondary);">
                    <li><strong>Action Sequence</strong> - Chronological order of game moves</li>
                    <li><strong>Span Links</strong> - How actions connect to each other</li>
                    <li><strong>Trace Context</strong> - Full distributed tracing information</li>
                    <li><strong>Game Narrative</strong> - Complete story of how the game unfolded</li>
                </ul>

                <h6 style="color: var(--northern-blue);">Educational Value</h6>
                <ul class="small" style="color: var(--text-secondary);">
                    <li>Cross-trace relationships</li>
                    <li>Tempo API integration</li>
                    <li>TraceQL queries</li>
                    <li>Game state reconstruction</li>
                </ul>

                <div class="mt-3">
                    <a href="https://grafana.com/docs/tempo/latest/traceql/" target="_blank" class="btn btn-sm btn-outline-info">
                        <i class="fas fa-external-link-alt me-1"></i>Learn TraceQL
                    </a>
                </div>
            </div>
        </div>

        <!-- Example queries; display-only text, nothing on this page executes them -->
        <div class="card">
            <div class="card-header">
                <h6 class="mb-0"><i class="fas fa-search me-2"></i>Try TraceQL Queries</h6>
            </div>
            <div class="card-body">
                <p class="small" style="color: var(--text-secondary);">Use these queries in Grafana Tempo:</p>
                <div class="mb-2">
                    <code>{game.session.id!=""}</code>
                    <small class="d-block" style="color: var(--text-muted);">Find all game sessions</small>
                </div>
                <div class="mb-2">
                    <code>{link.type="game_sequence"}</code>
                    <small class="d-block" style="color: var(--text-muted);">Find spans with links</small>
                </div>
                <div class="mb-2">
                    <code>{game.action.type="move_army"}</code>
                    <small class="d-block" style="color: var(--text-muted);">Find specific actions</small>
                </div>
            </div>
        </div>
    </div>
</div>
{% endblock %}

{% block scripts %}
<script>
// Start loading the session list as soon as the DOM is ready.
document.addEventListener('DOMContentLoaded', () => {
    loadGameSessions();
});

/**
 * Fetch the replayable sessions from /api/replay/sessions and update the page.
 *
 * Hides the loading spinner once the request settles, then either lists the
 * sessions, shows the "no sessions" notice, or shows the error box. A non-2xx
 * response is treated as a failure so that a server error carrying a JSON
 * body is not presented as "No game sessions found".
 */
async function loadGameSessions() {
    const loadingEl = document.getElementById('loading');
    try {
        const response = await fetch('/api/replay/sessions');
        if (!response.ok) throw new Error(`HTTP error! status: ${response.status}`);
        const data = await response.json();

        loadingEl.style.display = 'none';

        if (data.success && data.sessions && data.sessions.length > 0) {
            displaySessions(data.sessions);
        } else {
            document.getElementById('no-sessions').style.display = 'block';
        }
    } catch (error) {
        loadingEl.style.display = 'none';
        document.getElementById('error-message').style.display = 'block';
        document.getElementById('error-text').textContent = 'Failed to load game sessions: ' + error.message;
    }
}

// Renders one table row per session (id plus a "Replay" link) and reveals the
// sessions table.
//
// sessions: array of objects carrying a `session_id` field.
//
// The id comes from the API response, so it is HTML-escaped before being placed
// in markup and URL-encoded before being placed in the link path.
function displaySessions(sessions) {
    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

    const tbody = document.getElementById('sessions-tbody');
    tbody.innerHTML = '';

    sessions.forEach(session => {
        const sessionId = String(session.session_id);
        const row = document.createElement('tr');
        row.innerHTML = `
            <td><code>${escapeHtml(sessionId)}</code></td>
            <td>
                <a href="/replay/${encodeURIComponent(sessionId)}" class="btn btn-sm btn-primary">
                    <i class="fas fa-play me-1"></i>Replay
                </a>
            </td>
        `;
        tbody.appendChild(row);
    });

    document.getElementById('sessions-list').style.display = 'block';
}
</script>
{% endblock %}


================================================
FILE: game-of-tracing/war_map/templates/replay_session.html
================================================
{% extends "layout.html" %}

{% block title %}Session Replay{% endblock %}

{% block content %}
<div class="row g-3">
    <!-- Map Replay Area -->
    <div class="col-md-8">
        <div class="card">
            <div class="card-header d-flex justify-content-between align-items-center">
                <h4 class="mb-0"><i class="fas fa-play me-2"></i>Visual Game Replay</h4>
                <div>
                    <button id="play-pause-btn" class="btn btn-primary btn-sm me-1">
                        <i class="fas fa-play"></i> Play
                    </button>
                    <button id="step-btn" class="btn btn-outline-light btn-sm me-1">
                        <i class="fas fa-step-forward"></i> Step
                    </button>
                    <button id="reset-btn" class="btn btn-outline-light btn-sm me-1">
                        <i class="fas fa-undo"></i> Reset
                    </button>
                    <a href="/replay" class="btn btn-outline-light btn-sm">
                        <i class="fas fa-arrow-left me-1"></i>Back
                    </a>
                </div>
            </div>
            <div class="card-body">
                <div id="loading" class="text-center py-4">
                    <div class="spinner-border" role="status">
                        <span class="visually-hidden">Loading...</span>
                    </div>
                    <p class="mt-2" style="color: var(--text-secondary);">Loading session data from Tempo...</p>
                </div>

                <div id="replay-content" style="display: none;">
                    <!-- Progress Bar -->
                    <div class="mb-3">
                        <div class="d-flex justify-content-between align-items-center mb-2">
                            <small style="color: var(--text-secondary);"><strong>Session:</strong> <code id="session-id">{{ session_id }}</code></small>
                            <small style="color: var(--text-secondary);">Action <span id="current-step">0</span> of <span id="total-steps">0</span></small>
                        </div>
                        <div class="progress">
                            <div id="replay-progress" class="progress-bar bg-success" role="progressbar" style="width: 0%"></div>
                        </div>
                    </div>

                    <!-- Game Map -->
                    <div id="mapContainer" class="position-relative" style="height: 500px;">
                        <canvas id="mapCanvas" class="h-100 w-100"></canvas>
                        <div id="mapMarkers"></div>

                        <!-- Action Indicator -->
                        <div id="actionIndicator" class="position-absolute top-0 start-0 m-3 p-2 rounded d-none"
                             style="background: var(--bg-card); border: 1px solid var(--border-subtle); backdrop-filter: blur(8px);">
                            <strong id="actionType" style="color: var(--text-primary);">Action</strong>
                            <small id="actionDetails" class="d-block" style="color: var(--text-secondary);"></small>
                        </div>

                        <!-- Replay Speed Control -->
                        <div class="position-absolute bottom-0 start-0 m-3">
                            <label for="speed-control" class="form-label small" style="color: var(--text-muted);">Speed:</label>
                            <select id="speed-control" class="form-select form-select-sm" style="width: 100px;">
                                <option value="3000">Slow</option>
                                <option value="2000" selected>Normal</option>
                                <option value="1000">Fast</option>
                                <option value="500">Very Fast</option>
                            </select>
                        </div>
                    </div>
                </div>

                <div id="error-message" class="alert alert-danger" style="display: none;">
                    <i class="fas fa-exclamation-triangle me-2"></i>
                    <span id="error-text"></span>
                </div>
            </div>
        </div>
    </div>

    <!-- Span Details Panel -->
    <div class="col-md-4">
        <!-- Current Action Details -->
        <div class="card mb-3">
            <div class="card-header">
                <h6 class="mb-0"><i class="fas fa-crosshairs me-2" style="color: var(--southern-gold);"></i>Current Action</h6>
            </div>
            <div class="card-body">
                <div id="current-action-details">
                    <p style="color: var(--text-muted);">Click play to start replay</p>
                </div>
            </div>
        </div>

        <!-- Span Attributes -->
        <div class="card mb-3">
            <div class="card-header">
                <h6 class="mb-0"><i class="fas fa-tags me-2" style="color: var(--northern-blue);"></i>Span Attributes</h6>
            </div>
            <div class="card-body">
                <div id="span-attributes">
                    <p class="small" style="color: var(--text-muted);">No action selected</p>
                </div>
            </div>
        </div>

        <!-- Session Information -->
        <div class="card mb-3">
            <div class="card-header">
                <h6 class="mb-0"><i class="fas fa-info-circle me-2"></i>Session Info</h6>
            </div>
            <div class="card-body">
                <div id="session-info">
                    <p class="small mb-2"><strong>Player:</strong> <span id="player-name" style="color: var(--text-secondary);">Loading...</span></p>
                    <p class="small mb-2"><strong>Faction:</strong> <span id="faction-badge" class="badge">Loading...</span></p>
                    <p class="small mb-2"><strong>Total Actions:</strong> <span id="total-actions">0</span></p>
                    <p class="small mb-0"><strong>Data Source:</strong> <span id="data-source" class="badge bg-info">Tempo</span></p>
                </div>
            </div>
        </div>

        <!-- Span Links -->
        <div class="card">
            <div class="card-header">
                <h6 class="mb-0"><i class="fas fa-link me-2" style="color: #8b5cf6;"></i>Span Links</h6>
            </div>
            <div class="card-body">
                <div id="span-links-info">
                    <p class="small" style="color: var(--text-muted);">Loading span link analysis...</p>
                </div>
            </div>
        </div>
    </div>
</div>
{% endblock %}

{% block extra_css %}
<style>
    /* Army/resource counters attached to a map marker: small, fixed-height
       pills with their text centred via flexbox. */
    .location-marker .badge {
        font-size: 0.7em !important;
        min-width: 1.5em;
        height: 1.5em;
        line-height: 1.3;
        border: 1px solid rgba(255, 255, 255, 0.8);
        font-weight: bold;
        text-align: center;
        display: flex;
        align-items: center;
        justify-content: center;
    }

    /* Resource badge (created with the bg-warning class): dark text on yellow. */
    .location-marker .badge.bg-warning {
        color: #000 !important;
        background-color: #ffc107 !important;
    }

    /* Army badge (created with the bg-danger class): white text on red. */
    .location-marker .badge.bg-danger {
        color: #fff !important;
        background-color: #dc3545 !important;
    }

    /* Pin the connection canvas to the top-left corner of #mapContainer. */
    #mapCanvas {
        position: absolute;
        top: 0;
        left: 0;
        z-index: 5;
    }
</style>
{% endblock %}

{% block scripts %}
<script>
// Game state and replay control
// sessionData   - API response for this session; null until loadSessionData() succeeds.
// currentStep   - index of the next entry in sessionData.actions to execute.
// isPlaying     - true while the auto-play interval is running.
// replayInterval- handle returned by setInterval, or null when paused.
// gameLocations - simulated per-location state (position + faction/army/resources), keyed by location id.
// replaySpeed   - delay between auto-played actions in milliseconds (matches the "Normal" option).
let sessionData = null;
let currentStep = 0;
let isPlaying = false;
let replayInterval = null;
let gameLocations = {};
let replaySpeed = 2000;

// Layout for the map this session was played on — provided server-side
// from ``LOCATION_POSITIONS_BY_MAP[map_id]`` so the replay matches WWA or
// any future map, not just the WoK default.
// LOCATION_POSITIONS is read as an object keyed by location id (x/y are used as
// percentages of the map size); LOCATION_CONNECTIONS is iterated as a list of
// [locationIdA, locationIdB] pairs.
const REPLAY_MAP_ID = {{ map_id | tojson }};
const LOCATION_POSITIONS = {{ location_positions | tojson }};
const LOCATION_CONNECTIONS = {{ location_connections | tojson }};

// Once the DOM is parsed: begin loading the session and wire up the controls.
document.addEventListener('DOMContentLoaded', () => {
    loadSessionData();
    setupEventListeners();
});

// Wires the replay toolbar: play/pause, single step, reset and the speed selector.
function setupEventListeners() {
    document.getElementById('play-pause-btn').addEventListener('click', togglePlayPause);
    document.getElementById('step-btn').addEventListener('click', stepForward);
    document.getElementById('reset-btn').addEventListener('click', resetReplay);
    document.getElementById('speed-control').addEventListener('change', function(e) {
        const delayMs = parseInt(e.target.value, 10);
        // Ignore anything that is not a positive number of milliseconds.
        if (Number.isNaN(delayMs) || delayMs <= 0) return;

        replaySpeed = delayMs;

        // A running interval keeps the delay it was created with, so restart the
        // timer to make the new speed take effect immediately.
        if (isPlaying) {
            pauseReplay();
            startReplay();
        }
    });
}

// Loads the session's actions from /api/replay/session/<id> and, on success,
// initialises the replay state, the map and the side panels. Any failure is
// reported through showError().
async function loadSessionData() {
    // Take the id from the server-rendered #session-id element instead of
    // templating it into a JS string literal: HTML escaping is the wrong
    // escaping for a script context (entities would end up in the id and a
    // backslash would break the literal), whereas the DOM text is already decoded.
    const sessionId = document.getElementById('session-id').textContent.trim();

    try {
        // Encode the id so reserved characters cannot alter the request path.
        const response = await fetch(`/api/replay/session/${encodeURIComponent(sessionId)}`);
        const data = await response.json();

        document.getElementById('loading').style.display = 'none';

        if (data.success) {
            sessionData = data;
            initializeReplay();
            initializeMap();
            displaySessionInfo(data);
            displaySpanLinkChain(data.span_link_chain || []);
        } else {
            showError(data.error || 'Failed to load session');
        }
    } catch (error) {
        document.getElementById('loading').style.display = 'none';
        showError('Failed to load session: ' + error.message);
    }
}

// Reveals the replay area, rewinds to step 0 and seeds gameLocations with every
// location's layout data plus its starting faction, army and resources.
function initializeReplay() {
    document.getElementById('replay-content').style.display = 'block';
    currentStep = 0;
    updateProgress();

    for (const locationId of Object.keys(LOCATION_POSITIONS)) {
        const layout = LOCATION_POSITIONS[locationId];
        gameLocations[locationId] = Object.assign({}, layout, {
            faction: getInitialFaction(locationId),
            army: getInitialArmy(locationId),
            resources: getInitialResources(locationId)
        });
    }
}

// Starting owner of a location: each capital belongs to its own faction,
// everything else begins neutral.
function getInitialFaction(locationId) {
    switch (locationId) {
        case 'southern_capital':
            return 'southern';
        case 'northern_capital':
            return 'northern';
        default:
            return 'neutral';
    }
}

// Starting army size: 2 for either capital, 0 for every other location.
function getInitialArmy(locationId) {
    const isCapital = locationId === 'southern_capital' || locationId === 'northern_capital';
    return isCapital ? 2 : 0;
}

// Starting resources: 50 for a capital, 10 for ids beginning with "village_",
// 0 otherwise.
function getInitialResources(locationId) {
    switch (locationId) {
        case 'southern_capital':
        case 'northern_capital':
            return 50;
        default:
            return locationId.startsWith('village_') ? 10 : 0;
    }
}

// Sizes the canvas backing store to its container, then paints the connection
// lines and builds the location markers.
function initializeMap() {
    const [container, canvas] = ['mapContainer', 'mapCanvas'].map(id => document.getElementById(id));

    canvas.width = container.clientWidth;
    canvas.height = container.clientHeight;

    drawConnections();
    createLocationMarkers();
}

// Paints the background grid and one line per entry of LOCATION_CONNECTIONS on
// the map canvas. Line style encodes ownership of the two endpoints:
//   same non-neutral faction -> solid, faction colour
//   two different factions   -> dashed red
//   anything involving neutral -> faint grey
function drawConnections() {
    const canvas = document.getElementById('mapCanvas');
    const ctx = canvas.getContext('2d');

    const strokeSegment = (fromX, fromY, toX, toY) => {
        ctx.beginPath();
        ctx.moveTo(fromX, fromY);
        ctx.lineTo(toX, toY);
        ctx.stroke();
    };

    // Subtle grid
    ctx.strokeStyle = 'rgba(255, 255, 255, 0.02)';
    ctx.lineWidth = 1;
    const gridSize = 40;
    for (let x = 0; x < canvas.width; x += gridSize) {
        strokeSegment(x, 0, x, canvas.height);
    }
    for (let y = 0; y < canvas.height; y += gridSize) {
        strokeSegment(0, y, canvas.width, y);
    }

    // Connections with faction colors
    ctx.lineWidth = 2;

    const factionOf = (id) => (gameLocations[id] || {}).faction || 'neutral';

    for (const connection of LOCATION_CONNECTIONS) {
        const firstId = connection[0];
        const secondId = connection[1];
        // Prefer live state; fall back to the static layout for positions.
        const first = gameLocations[firstId] || LOCATION_POSITIONS[firstId];
        const second = gameLocations[secondId] || LOCATION_POSITIONS[secondId];
        if (!first || !second) continue;

        // x/y are percentages of the canvas size.
        const x1 = canvas.width * (first.x / 100);
        const y1 = canvas.height * (first.y / 100);
        const x2 = canvas.width * (second.x / 100);
        const y2 = canvas.height * (second.y / 100);

        const f1 = factionOf(firstId);
        const f2 = factionOf(secondId);
        const bothClaimed = f1 !== 'neutral' && f2 !== 'neutral';

        if (bothClaimed && f1 === f2) {
            ctx.strokeStyle = f1 === 'southern' ? 'rgba(255, 215, 0, 0.35)' : 'rgba(79, 195, 247, 0.35)';
            ctx.setLineDash([]);
        } else if (bothClaimed) {
            ctx.strokeStyle = 'rgba(239, 68, 68, 0.3)';
            ctx.setLineDash([8, 6]);
        } else {
            ctx.strokeStyle = 'rgba(120, 144, 156, 0.2)';
            ctx.setLineDash([]);
        }

        strokeSegment(x1, y1, x2, y2);
        ctx.setLineDash([]);
    }
}

// Rebuilds every map marker from the current gameLocations state: an icon,
// optional army and resource badges, and a name label. Existing markers are
// discarded first.
function createLocationMarkers() {
    const container = document.getElementById('mapMarkers');
    container.innerHTML = '';

    // Builds one counter pill; the badge shows the floored amount while the
    // tooltip carries the raw value.
    const buildBadge = (className, caption, amount) => {
        const badge = document.createElement('span');
        badge.className = className;
        badge.textContent = Math.floor(amount);
        badge.style.fontSize = '0.7em';
        badge.style.minWidth = '1.5em';
        badge.title = `${caption}: ${amount}`;
        return badge;
    };

    for (const [locationId, location] of Object.entries(gameLocations)) {
        const marker = document.createElement('div');

        marker.id = `marker-${locationId}`;
        marker.className = `location-marker ${location.faction} ${location.type}`;
        marker.style.left = `${location.x}%`;
        marker.style.top = `${location.y}%`;

        marker.title = `${location.name}\nFaction: ${location.faction}\nArmy: ${location.army}\nResources: ${location.resources}`;

        let iconClass = 'fas fa-map-marker-alt';
        if (location.type === 'capital') {
            if (location.faction === 'southern') {
                iconClass = 'fas fa-sun';
            } else if (location.faction === 'northern') {
                iconClass = 'fas fa-snowflake';
            } else {
                iconClass = 'fas fa-chess-rook';
            }
        }
        const icon = document.createElement('i');
        icon.className = iconClass;
        marker.appendChild(icon);

        if (location.army > 0) {
            marker.appendChild(buildBadge(
                'position-absolute top-0 start-100 translate-middle badge rounded-pill bg-danger',
                'Army',
                location.army
            ));
        }

        if (location.resources > 0) {
            marker.appendChild(buildBadge(
                'position-absolute bottom-0 start-100 translate-middle badge rounded-pill bg-warning text-dark',
                'Resources',
                location.resources
            ));
        }

        // Label
        const label = document.createElement('span');
        label.className = 'location-label';
        label.textContent = location.name;
        marker.appendChild(label);

        container.appendChild(marker);
    }
}

// Fills the "Session Info" card: player and faction come from the first
// recorded action (when there is one), totals and data source from the response.
function displaySessionInfo(data) {
    const byId = (id) => document.getElementById(id);
    const hasActions = Boolean(data.actions && data.actions.length > 0);

    if (hasActions) {
        const opening = data.actions[0];
        byId('player-name').textContent = opening.player_name || 'Unknown';

        const badge = byId('faction-badge');
        const side = opening.faction || 'unknown';
        // Capitalise the first letter for display; the raw value doubles as a CSS class.
        badge.textContent = side.charAt(0).toUpperCase() + side.slice(1);
        badge.className = `badge ${side}`;
    }

    byId('total-actions').textContent = data.total_actions || 0;
    byId('data-source').textContent = data.data_source || 'Tempo';
}

// Renders the span-link analysis card: an overall validity badge followed by
// the first five links and a "... and N more" footer for the remainder.
//
// spanLinkChain: array of objects with `valid_chain`, `sequence`, `action_type`
// and `note` fields.
//
// The text fields originate from trace data, so they are HTML-escaped before
// being inserted as markup.
function displaySpanLinkChain(spanLinkChain) {
    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

    const linksInfo = document.getElementById('span-links-info');

    if (spanLinkChain.length === 0) {
        linksInfo.innerHTML = '<p class="small" style="color: var(--text-muted);">No span link data available</p>';
        return;
    }

    const validChain = spanLinkChain.every(link => link.valid_chain);

    let html = `<div class="mb-2">
        <span class="badge ${validChain ? 'bg-success' : 'bg-warning'}">
            ${validChain ? 'Valid Chain' : 'Issues Found'}
        </span>
    </div>`;

    spanLinkChain.slice(0, 5).forEach((link) => {
        const accent = link.valid_chain ? '#22c55e' : '#f59e0b';
        html += `<div class="small mb-2 p-2" style="border-left: 3px solid ${accent}; padding-left: 8px;">
            <strong style="color: var(--text-primary);">Step ${escapeHtml(link.sequence)}:</strong> ${escapeHtml(link.action_type)}<br>
            <span style="color: var(--text-muted);">${escapeHtml(link.note)}</span>
        </div>`;
    });

    if (spanLinkChain.length > 5) {
        html += `<small style="color: var(--text-muted);">... and ${spanLinkChain.length - 5} more</small>`;
    }

    linksInfo.innerHTML = html;
}

// Play/Pause button handler: flips between the running and paused states.
function togglePlayPause() {
    if (!isPlaying) {
        startReplay();
        return;
    }
    pauseReplay();
}

// Starts auto-play: every `replaySpeed` milliseconds the next action is
// executed until the list is exhausted, at which point playback pauses itself.
// Does nothing when no session data has been loaded.
function startReplay() {
    const hasActions = Boolean(sessionData && sessionData.actions);
    if (!hasActions) return;

    isPlaying = true;
    document.getElementById('play-pause-btn').innerHTML = '<i class="fas fa-pause"></i> Pause';

    const tick = () => {
        const remaining = currentStep < sessionData.actions.length;
        if (!remaining) {
            pauseReplay();
            return;
        }
        executeAction(sessionData.actions[currentStep]);
        currentStep++;
        updateProgress();
    };

    replayInterval = setInterval(tick, replaySpeed);
}

// Stops auto-play (if a timer is active) and switches the button back to "Play".
function pauseReplay() {
    isPlaying = false;

    const activeTimer = replayInterval;
    if (activeTimer) {
        clearInterval(activeTimer);
        replayInterval = null;
    }

    const button = document.getElementById('play-pause-btn');
    button.innerHTML = '<i class="fas fa-play"></i> Play';
}

// Step button handler: executes exactly one action and advances the cursor.
// Does nothing when the session has not loaded yet (the button is clickable
// before the fetch completes) or when every action has already been replayed.
function stepForward() {
    if (!sessionData || !sessionData.actions) return;

    if (currentStep < sessionData.actions.length) {
        executeAction(sessionData.actions[currentStep]);
        currentStep++;
        updateProgress();
    }
}

// Reset button handler: stops playback, rewinds to step 0, restores every
// location to its starting state and clears the map overlays and side panels.
function resetReplay() {
    pauseReplay();
    currentStep = 0;

    for (const locationId of Object.keys(LOCATION_POSITIONS)) {
        gameLocations[locationId] = Object.assign({}, LOCATION_POSITIONS[locationId], {
            faction: getInitialFaction(locationId),
            army: getInitialArmy(locationId),
            resources: getInitialResources(locationId)
        });
    }

    // Redraw
    const canvas = document.getElementById('mapCanvas');
    canvas.getContext('2d').clearRect(0, 0, canvas.width, canvas.height);
    drawConnections();
    createLocationMarkers();

    // Reset the progress bar and every panel that describes the last action.
    updateProgress();
    clearActionIndicator();
    clearCurrentActionDetails();
    clearSpanAttributes();
}

// Restores the "Span Attributes" card to its empty placeholder.
function clearSpanAttributes() {
    const panel = document.getElementById('span-attributes');
    panel.innerHTML = '<p class="small" style="color: var(--text-muted);">No action selected</p>';
}

// Replays a single recorded action: updates the side panels, applies the
// action to the simulated game state, repaints the map and finally highlights
// the affected location(s).
//
// The highlight is applied AFTER the repaint on purpose: createLocationMarkers()
// discards and rebuilds every marker element, so a highlight class added
// beforehand would be thrown away with the old element.
function executeAction(action) {
    showActionIndicator(action);
    updateCurrentActionDetails(action);
    updateSpanAttributes(action);

    const focusId = action.location_id || action.source_location;
    // Deferred highlight, run once the new markers exist; stays null for
    // unknown action types and for moves that lack a source or target.
    let applyHighlight = null;

    switch(action.action_type) {
        case 'collect_resources':
            simulateCollectResources(action);
            applyHighlight = () => highlightLocation(focusId);
            break;
        case 'create_army':
            simulateCreateArmy(action);
            applyHighlight = () => highlightLocation(focusId);
            break;
        case 'move_army':
            if (action.source_location && action.target_location) {
                simulateMoveArmy(action);
                applyHighlight = () => highlightMovement(action.source_location, action.target_location);
            }
            break;
        case 'all_out_attack':
            simulateAllOutAttack(action);
            applyHighlight = () => highlightLocation(focusId, 'attack');
            break;
        case 'send_resources_to_capital':
            simulateSendResources(action);
            applyHighlight = () => highlightLocation(focusId);
            break;
    }

    // Redraw connections for updated factions then recreate markers
    const canvas = document.getElementById('mapCanvas');
    const ctx = canvas.getContext('2d');
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    drawConnections();
    createLocationMarkers();

    if (applyHighlight) {
        applyHighlight();
    }
}

// Simulated "collect resources": the acting location gains 10 resources.
// Unknown locations are ignored.
function simulateCollectResources(action) {
    const site = gameLocations[action.location_id || action.source_location];
    if (!site) return;

    site.resources = Math.floor(site.resources + 10);
}

// Simulated "create army": spends 30 resources (never dropping below zero) and
// adds one army at the acting location. Unknown locations are ignored.
function simulateCreateArmy(action) {
    const site = gameLocations[action.location_id || action.source_location];
    if (!site) return;

    const remaining = Math.floor(site.resources - 30);
    site.resources = Math.max(0, remaining);
    site.army = Math.floor(site.army + 1);
}

// Simulated "move army": the whole army leaves the source location.
//   - Same faction at the target: the armies merge.
//   - Different faction: the mover captures the target only when it is strictly
//     stronger, keeping the difference; otherwise the defender keeps the
//     location with at least one army left.
// Nothing happens if either location is unknown or the source has no army.
function simulateMoveArmy(action) {
    const origin = gameLocations[action.source_location];
    const destination = gameLocations[action.target_location];
    if (!origin || !destination) return;
    if (!(origin.army > 0)) return;

    const marching = Math.floor(origin.army);
    origin.army = 0;

    if (origin.faction === destination.faction) {
        destination.army = Math.floor(destination.army + marching);
        return;
    }

    if (marching > destination.army) {
        destination.faction = origin.faction;
        destination.army = Math.floor(marching - destination.army);
    } else {
        destination.army = Math.max(1, Math.floor(destination.army - marching));
    }
}

function simulateAllOutAttack(action) {
    const locationId = action.location_id || action.source_location;
    if (gameLocations[locationId]) {
        const location = gameLocations[locationId];
        const faction = location.faction;

        let enemyCapital = faction === 'southern' ? 'northern_capital' :
                           faction === 'northern' ? 'southern_capital' : null;

        if (enemyCapital && gameLocations[enemyCapital]) {
            const attackingArmies = Math.floor(location.army);
            location.army = 0;

            const enemyLocation = gameLocations[enemyCapital];
            if (attackingArmies > enemyLocation.army) {
                enemyLocation.faction = faction;
                enemyLocation.army = Math.floor(attackingArmies - enemyLocation.army);
            } else {
                enemyLocation.army = Math.max(1, Math.floor(enemyLocation.army - attackingArmies));
            }
        }
    }
}

// Simulated "send resources to capital": empties the acting location's
// resources and credits them to its faction's capital. A neutral sender still
// loses its resources, but no capital receives them.
function simulateSendResources(action) {
    const sender = gameLocations[action.location_id || action.source_location];
    if (!sender) return;

    const shipment = Math.floor(sender.resources);
    sender.resources = 0;

    let capitalId = null;
    if (sender.faction === 'southern') {
        capitalId = 'southern_capital';
    } else if (sender.faction === 'northern') {
        capitalId = 'northern_capital';
    }
    if (!capitalId) return;

    const capital = gameLocations[capitalId];
    if (capital) {
        capital.resources = Math.floor(capital.resources + shipment);
    }
}

// Moves the action highlight to the given location's marker: clears the
// highlight classes from every marker, then flags the target for two seconds.
// Silently does nothing when no marker exists for the id.
function highlightLocation(locationId) {
    for (const el of document.querySelectorAll('.location-marker')) {
        el.classList.remove('active', 'action-highlight');
    }

    const target = document.getElementById(`marker-${locationId}`);
    if (!target) return;

    target.classList.add('action-highlight');
    setTimeout(() => { target.classList.remove('action-highlight'); }, 2000);
}

// Visualises an army move: highlights the source, drops a rotated arrow at the
// midpoint between the two locations, switches the highlight to the target
// after one second and removes the arrow after 2.5 seconds. Does nothing when
// either location is missing from the layout.
function highlightMovement(sourceId, targetId) {
    const from = LOCATION_POSITIONS[sourceId];
    const to = LOCATION_POSITIONS[targetId];
    if (!(from && to)) return;

    highlightLocation(sourceId);

    // Heading of the arrow in degrees, measured from source to target.
    const headingDeg = Math.atan2(to.y - from.y, to.x - from.x) * (180 / Math.PI);
    const midX = (from.x + to.x) / 2;
    const midY = (from.y + to.y) / 2;

    const arrow = document.createElement('div');
    arrow.className = 'movement-arrow';
    arrow.innerHTML = '<i class="fas fa-arrow-right"></i>';
    arrow.style.left = `${midX}%`;
    arrow.style.top = `${midY}%`;
    arrow.style.transform = `translate(-50%, -50%) rotate(${headingDeg}deg)`;

    document.getElementById('mapContainer').appendChild(arrow);

    setTimeout(() => { highlightLocation(targetId); }, 1000);
    setTimeout(() => { arrow.remove(); }, 2500);
}

// Shows the floating action banner on the map for three seconds, with the
// action type and a one-line summary (sequence, player, source and target).
//
// Only one hide timer is kept alive: without cancelling the previous one, the
// timer started for an earlier action would hide the banner of a newer action
// whenever actions arrive less than three seconds apart.
function showActionIndicator(action) {
    const indicator = document.getElementById('actionIndicator');
    const actionType = document.getElementById('actionType');
    const actionDetails = document.getElementById('actionDetails');

    actionType.textContent = action.action_type || 'Unknown Action';

    let details = `Sequence: ${action.sequence || 'N/A'}`;
    if (action.player_name) details += ` | Player: ${action.player_name}`;
    if (action.source_location) details += ` | From: ${action.source_location}`;
    if (action.target_location) details += ` | To: ${action.target_location}`;

    actionDetails.textContent = details;
    indicator.classList.remove('d-none');

    // The pending timer handle lives on the function object so that no extra
    // module-level state is needed.
    if (showActionIndicator.hideTimer) {
        clearTimeout(showActionIndicator.hideTimer);
    }
    showActionIndicator.hideTimer = setTimeout(() => {
        showActionIndicator.hideTimer = null;
        indicator.classList.add('d-none');
    }, 3000);
}

// Hides the floating action banner on the map.
function clearActionIndicator() {
    const banner = document.getElementById('actionIndicator');
    banner.classList.add('d-none');
}

// Fills the "Current Action" card with the action's type, step, player,
// faction, start time, duration and span-link count.
//
// Every value taken from the action is HTML-escaped before being inserted as
// markup: player names and other span data are externally supplied text.
function updateCurrentActionDetails(action) {
    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

    const detailsDiv = document.getElementById('current-action-details');
    // NOTE(review): dividing by 1e6 presumes start_time/duration are in
    // nanoseconds (giving milliseconds) — confirm against the API.
    const timestamp = new Date(action.start_time / 1000000);
    const duration = action.duration ? (action.duration / 1000000).toFixed(2) + 'ms' : 'N/A';
    // Fall back to the replay cursor when the action carries no sequence number.
    const displaySequence = action.sequence || currentStep;

    detailsDiv.innerHTML = `
        <h6 style="color: var(--text-primary);">${escapeHtml(action.action_type || action.operation)}</h6>
        <p class="small mb-2"><strong>Step:</strong> ${escapeHtml(displaySequence)}</p>
        <p class="small mb-2"><strong>Player:</strong> ${escapeHtml(action.player_name || 'Unknown')}</p>
        <p class="small mb-2"><strong>Faction:</strong> ${escapeHtml(action.faction || 'Unknown')}</p>
        <p class="small mb-2"><strong>Time:</strong> ${escapeHtml(timestamp.toLocaleTimeString())}</p>
        <p class="small mb-2"><strong>Duration:</strong> ${escapeHtml(duration)}</p>
        <p class="small mb-0"><strong>Span Links:</strong> ${action.span_links ? action.span_links.length : 0}</p>
    `;
}

// Restores the "Current Action" card to its initial prompt.
function clearCurrentActionDetails() {
    const panel = document.getElementById('current-action-details');
    panel.innerHTML = '<p style="color: var(--text-muted);">Click play to start replay</p>';
}

// Lists the action's span attributes (key above value) in the "Span
// Attributes" card, or a placeholder when there are none.
//
// Attribute keys and values are arbitrary span data, so both are HTML-escaped
// before being inserted as markup.
function updateSpanAttributes(action) {
    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

    const attributesDiv = document.getElementById('span-attributes');

    if (!action.attributes || Object.keys(action.attributes).length === 0) {
        attributesDiv.innerHTML = '<p class="small" style="color: var(--text-muted);">No attributes available</p>';
        return;
    }

    let html = '';
    Object.entries(action.attributes).forEach(([key, value]) => {
        html += `<div class="attribute-item">
            <strong class="small" style="color: var(--text-secondary);">${escapeHtml(key)}</strong><br>
            <code class="small">${escapeHtml(value)}</code>
        </div>`;
    });

    attributesDiv.innerHTML = html;
}

// Syncs the progress bar and the "Action X of Y" counters with the replay
// cursor. Safe to call before the session has loaded (e.g. Reset pressed while
// the fetch is still in flight): a missing action list counts as zero steps.
function updateProgress() {
    const actions = (sessionData && sessionData.actions) || [];
    const totalSteps = actions.length;
    const progress = totalSteps > 0 ? (currentStep / totalSteps) * 100 : 0;

    document.getElementById('replay-progress').style.width = `${progress}%`;
    document.getElementById('current-step').textContent = Math.max(0, currentStep);

    const totalStepsElement = document.getElementById('total-steps');
    if (totalStepsElement) totalStepsElement.textContent = totalSteps;
}

// Reveals the error alert and sets its text to the given message.
function showError(message) {
    const alertBox = document.getElementById('error-message');
    const alertText = document.getElementById('error-text');

    alertBox.style.display = 'block';
    alertText.textContent = message;
}
</script>
{% endblock %}


================================================
FILE: gelf-log-ingestion/README.md
================================================
# GELF Log Ingestion Scenario

This scenario demonstrates how to ingest GELF (Graylog Extended Log Format) logs using Grafana Alloy's `loki.source.gelf` component. A Python application sends structured GELF messages over UDP to Alloy, which relabels GELF metadata (host, level, facility) into Loki labels before forwarding to Loki for storage and querying in Grafana.

## Architecture

```
gelf-logger (Python/pygelf) --UDP:12201--> Alloy (loki.source.gelf) --> Loki --> Grafana
```

## Running the Demo

### Step 1: Clone the repository
```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

### Step 2: Deploy the monitoring stack
```bash
cd alloy-scenarios/gelf-log-ingestion
docker-compose up -d
```

### Step 3: Access Grafana Alloy UI
Open your browser and go to `http://localhost:12345` to inspect the Alloy pipeline and live debugging output.

### Step 4: Access Grafana UI
Open your browser and go to `http://localhost:3000`. Navigate to **Explore** and select the **Loki** datasource. Query logs using `{host="gelf-logger"}` or filter by label (e.g., `{level="6"}` for INFO).

## GELF Level Mapping

| GELF Level | Syslog Severity |
|------------|-----------------|
| 0          | Emergency       |
| 1          | Alert           |
| 2          | Critical        |
| 3          | Error           |
| 4          | Warning         |
| 5          | Notice          |
| 6          | Informational   |
| 7          | Debug           |


================================================
FILE: gelf-log-ingestion/app/main.py
================================================
import logging
import time
import random
from pygelf import GelfUdpHandler

logger = logging.getLogger("gelf-demo")
logger.setLevel(logging.DEBUG)
# include_extra_fields=True makes pygelf ship the per-record ``extra`` dict as
# additional GELF fields; with the library default (False) the structured
# context attached to every message below would be silently dropped.
handler = GelfUdpHandler(
    host="alloy",
    port=12201,
    compress=False,
    include_extra_fields=True,
)
logger.addHandler(handler)

# (level, message, extra fields) templates the generator picks from at random.
messages = [
    (logging.INFO, "User authentication successful", {"user_id": "42", "method": "oauth2"}),
    (logging.WARNING, "Slow database query detected", {"query_time_ms": "2500", "table": "orders"}),
    (logging.ERROR, "Failed to connect to payment gateway", {"gateway": "stripe", "retry_count": "3"}),
    (logging.INFO, "Order processed successfully", {"order_id": "ORD-12345", "total": "99.99"}),
    (logging.DEBUG, "Cache lookup completed", {"cache_hit": "true", "key": "user:42:profile"}),
    (logging.CRITICAL, "Disk space critically low", {"mount": "/data", "available_pct": "2"}),
    (logging.INFO, "Health check passed", {"service": "api", "response_ms": "12"}),
    (logging.WARNING, "Rate limit approaching threshold", {"client_ip": "10.0.1.50", "requests": "980"}),
]


def main():
    """Emit one random demo log record every 1-3 seconds, forever."""
    # flush=True: stdout is block-buffered when it is not a TTY (e.g. inside a
    # container), so without it the banner may never reach the container logs.
    print("Starting GELF log generator...", flush=True)
    while True:
        level, msg, extra = random.choice(messages)
        logger.log(level, msg, extra=extra)
        time.sleep(random.uniform(1, 3))


if __name__ == "__main__":
    main()


================================================
FILE: gelf-log-ingestion/config.alloy
================================================
// Turn on live debugging so component data can be inspected from the Alloy UI
// (served on port 12345 in this scenario).
livedebugging {
	enabled = true
}

// Receive GELF logs over UDP.
//
// The __gelf_message_* labels are internal: the source drops every label that
// starts with "__" before it forwards an entry, so a downstream loki.relabel
// never sees them. They can only be turned into real labels by handing the
// rules to the source itself through relabel_rules.
loki.source.gelf "default" {
	forward_to    = [loki.write.local.receiver]
	relabel_rules = loki.relabel.gelf.rules
}

// Rule set that maps GELF metadata onto Loki labels. Only the exported `rules`
// are consumed (by loki.source.gelf above), so nothing is forwarded from here.
loki.relabel "gelf" {
	forward_to = []

	// Sending host -> host
	rule {
		source_labels = ["__gelf_message_host"]
		target_label  = "host"
	}

	// Message severity -> level
	rule {
		source_labels = ["__gelf_message_level"]
		target_label  = "level"
	}

	// GELF facility -> facility
	rule {
		source_labels = ["__gelf_message_facility"]
		target_label  = "facility"
	}
}

loki.write "local" {
	endpoint {
		url = "http://loki:3100/loki/api/v1/push"
	}
}


================================================
FILE: gelf-log-ingestion/docker-compose.coda.yml
================================================
services:
  # GELF log generator using pygelf
  gelf-logger:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: gelf-logger
    # container_name does not change the hostname inside the container, and
    # the GELF `host` field is taken from that hostname. Pin it so logs can be
    # selected by a stable host value instead of a random container ID.
    hostname: gelf-logger
    volumes:
      - ./app/main.py:/app/main.py
    command: ["sh", "-c", "pip install pygelf && python3 /app/main.py"]


================================================
FILE: gelf-log-ingestion/docker-compose.yml
================================================

services:

  # GELF log generator using pygelf
  gelf-logger:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: gelf-logger
    # container_name does not change the hostname inside the container, and
    # the GELF `host` field is taken from that hostname. Pin it so logs can be
    # selected by a stable host value instead of a random container ID.
    hostname: gelf-logger
    volumes:
      - ./app/main.py:/app/main.py
    # pygelf is installed at start-up because the stock python image is used.
    command: ["sh", "-c", "pip install pygelf && python3 /app/main.py"]
    depends_on:
      - alloy

  # Grafana Alloy: UI/API on 12345 (TCP), GELF input on 12201 (UDP).
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    # Port mappings are quoted so YAML can never re-type them, matching the
    # quoted mapping used by the loki service.
    ports:
      - "12345:12345"
      - "12201:12201/udp"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    # Folded scalar: the lines below are joined with single spaces into the
    # same one-line command string as before.
    command: >-
      run
      --server.http.listen-addr=0.0.0.0:12345
      --stability.level=experimental
      --storage.path=/var/lib/alloy/data
      /etc/alloy/config.alloy
    depends_on:
      - loki

  # Loki, started with the config file mounted from this directory.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command: ["-config.file=/etc/loki/local-config.yaml"]
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    ports:
      - "3100:3100"

  # Grafana with anonymous Admin access. The entrypoint script writes a Loki
  # datasource provisioning file and then starts Grafana via /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: gelf-log-ingestion/loki-config.yaml
================================================

---
# Complete Loki configuration backed by the local filesystem.
# The index is shipped to storage via tsdb_shipper.

auth_enabled: false

server:
  http_listen_port: 3100

common:
  path_prefix: /tmp/loki
  replication_factor: 1
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2020-05-15
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

storage_config:
  filesystem:
    directory: /tmp/loki/chunks
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

pattern_ingester:
  enabled: true


================================================
FILE: image-versions.env
================================================
# Centralized Docker image versions for all examples.
#
# Renovate tracks each variable below — the `# renovate:` annotation
# tells the bot which docker image the version refers to. Bumps to
# this file land via renovate PRs that also bump the matching
# `${VAR:-default}` fallback in every docker-compose file. Both sides
# are driven by customManagers in renovate.json: one for this file,
# one generic rule that captures the depName from the `image:` line
# in compose files. Keep them in lockstep — the check-image-versions
# workflow will fail PRs where they drift.
#
# Adding a new image: declare `# renovate: datasource=docker depName=<image>`
# + `<NAME>_VERSION=<value>` here, and reference it in compose as
# `image: <image>:${<NAME>_VERSION:-<value>}`. No renovate.json edit needed.

# Grafana images
# renovate: datasource=docker depName=grafana/loki
GRAFANA_LOKI_VERSION=3.6.10
# renovate: datasource=docker depName=grafana/grafana
GRAFANA_VERSION=13.0.1
# renovate: datasource=docker depName=grafana/alloy
GRAFANA_ALLOY_VERSION=v1.16.1
# renovate: datasource=docker depName=grafana/tempo
GRAFANA_TEMPO_VERSION=2.10.4
# renovate: datasource=docker depName=grafana/pyroscope
GRAFANA_PYROSCOPE_VERSION=2.0.1

# Prometheus images
# renovate: datasource=docker depName=prom/prometheus
PROMETHEUS_VERSION=v3.11.3

# Other images
# renovate: datasource=docker depName=python
PYTHON_VERSION=3.11-slim

# nginx-monitoring scenario
# renovate: datasource=docker depName=nginx
NGINX_VERSION=1.30-alpine
# renovate: datasource=docker depName=nginx/nginx-prometheus-exporter
NGINX_EXPORTER_VERSION=1.5.1
# renovate: datasource=docker depName=curlimages/curl
CURL_VERSION=8.20.0

# rabbitmq-monitoring scenario
# renovate: datasource=docker depName=rabbitmq
RABBITMQ_VERSION=4.3.0-management
# renovate: datasource=docker depName=pivotalrabbitmq/perf-test
RABBITMQ_PERF_TEST_VERSION=2.24.0

# vault-secrets scenario
# renovate: datasource=docker depName=hashicorp/vault
VAULT_VERSION=2.0.0

# cloudwatch-metrics scenario
# renovate: datasource=docker depName=localstack/localstack
LOCALSTACK_VERSION=4.4.0


================================================
FILE: k8s/README.md
================================================

# Monitor Kubernetes with Grafana Alloy

> Note: these scenarios use the K8s Monitoring Helm chart. It abstracts away the need to configure Alloy by hand and deploys best practices for monitoring Kubernetes clusters. The chart supports metrics, logs, profiling, and tracing.

In this directory you will find a series of scenarios that demonstrate how to set up Alloy via the Kubernetes Monitoring Helm chart. Examples specific to each telemetry source are provided in the respective directories.

| Scenario | Description |
| --- | --- |
| [Logs](./logs) | Monitor Kubernetes logs with Grafana Alloy and Loki |
| [Metrics](./metrics) | Monitor Kubernetes metrics with Grafana Alloy and Prometheus |
| [Profiling](./profiling) | Monitor Kubernetes profiling with Grafana Alloy and Pyroscope |
| [Tracing](./tracing) | Monitor Kubernetes tracing with Grafana Alloy and Tempo |


================================================
FILE: k8s/events/README.md
================================================
# Kubernetes events to Loki — without the k8s-monitoring Helm chart

A focused scenario showing how `loki.source.kubernetes_events` works under the hood: Alloy is deployed as a plain `Deployment` with explicit RBAC and an Alloy `ConfigMap`, instead of being abstracted behind the [`k8s-monitoring` Helm chart](https://github.com/grafana/k8s-monitoring-helm) used in [`k8s/logs/`](../logs/).

## How this differs from `k8s/logs/`

| Aspect | `k8s/logs/` (existing) | `k8s/events/` (this) |
|---|---|---|
| Alloy deployment | `k8s-monitoring` Helm chart (collector preset) | Plain `kubectl apply` of ConfigMap + RBAC + Deployment |
| `loki.source.kubernetes_events` | Hidden inside the chart | **Visible directly in `alloy-config.yaml`** |
| Scope | Pod logs + cluster events (mixed) | **Cluster events only** with `type` / `reason` / `namespace` / `kind` labels |
| Demo intent | "ship everything for K8s monitoring" | "show how events ingestion actually works" |

If you want production-grade Kubernetes observability, use `k8s/logs/`. If you're learning the component or want to extend it (custom filtering, namespace scoping, alerting on event reasons), this scenario is the minimal moving-parts version.

## Prerequisites

- [Kind](https://kind.sigs.k8s.io/docs/user/quick-start/)
- [Helm](https://helm.sh/docs/intro/install/)
- The Grafana Helm repo: `helm repo add grafana https://grafana.github.io/helm-charts`

## Step 1 — Create the cluster

```bash
git clone https://github.com/grafana/alloy-scenarios.git
cd alloy-scenarios/k8s/events

kind create cluster --config kind.yml
```

## Step 2 — Create the `meta` namespace and install Loki + Grafana

```bash
kubectl create namespace meta

helm install --values loki-values.yml loki    grafana/loki    -n meta
helm install --values grafana-values.yml grafana grafana/grafana -n meta
```

Wait for them to be ready:

```bash
kubectl get pods -n meta -w
```

## Step 3 — Apply Alloy

```bash
kubectl apply -f alloy-rbac.yaml
kubectl apply -f alloy-config.yaml
kubectl apply -f alloy-deployment.yaml
```

The RBAC grants cluster-wide `get/list/watch` on `events` (and only that). The ConfigMap holds the Alloy pipeline. The Deployment is **single-replica on purpose** — events are cluster-scoped, so multiple Alloy replicas would produce duplicate log lines.

## Step 4 — Open Grafana

```bash
kubectl port-forward -n meta svc/grafana 3000:80
```

Username `admin`, password `adminadminadmin` (it's a dev scenario — see `grafana-values.yml`).

## Step 5 — Generate some events

```bash
# Trigger Created/Started/Pulled events
kubectl run events-test --image=nginx --restart=Never

# Trigger BackOff/Failed events
kubectl run events-fail --image=does-not-exist --restart=Never

# Wait, then trigger Killing
sleep 30
kubectl delete pod events-test events-fail
```

## Step 6 — Query in Loki

```logql
# All events
{job="kubernetes-events"}

# Just warnings
{job="kubernetes-events", type="Warning"}

# Pod events in default namespace
{job="kubernetes-events", namespace="default", kind="Pod"}

# Pull failures
{job="kubernetes-events", reason="Failed"}

# Backoff loops
{job="kubernetes-events", reason="BackOff"}
```

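The promoted labels are `type`, `reason`, `namespace`, and `kind`. The involved-object name (`name`) is kept as **structured metadata** — it is high cardinality, so it stays out of the label index, but it is still searchable with a label filter such as `{job="kubernetes-events"} | name="events-test"`.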

## Inspecting the Alloy pipeline

```bash
kubectl port-forward -n meta svc/alloy 12345:12345
```

Open http://localhost:12345 to see the component graph and use **livedebugging** to inspect events flowing through each stage.

## Tear down

```bash
kind delete cluster
```

## Customization ideas

- **Namespace scoping**: add `namespaces = ["prod", "default"]` to the `loki.source.kubernetes_events` block to filter at the source rather than at query time.
- **Drop noisy reasons**: add a `stage.match` block dropping `reason=~"Pulled|Pulling|Created"` if you only care about Warnings.
- **Alerting**: pair this with a Grafana alert on `count_over_time({type="Warning"}[5m])` for cluster-health monitoring.


================================================
FILE: k8s/events/alloy-config.yaml
================================================
# Alloy pipeline as a ConfigMap. Mounted into the alloy Deployment at
# /etc/alloy/config.alloy.
#
# Pipeline:
#   loki.source.kubernetes_events  (cluster-wide events feed)
#     → loki.process               (parse JSON, promote labels)
#     → loki.write                 (push to Loki in this cluster)
apiVersion: v1
kind: ConfigMap
metadata:
  name: alloy-config
  namespace: meta
data:
  config.alloy: |
    // Live debugging is disabled by default; it has to be enabled
    // explicitly for the Alloy UI to show data flowing through components.
    livedebugging {
      enabled = true
    }

    loki.source.kubernetes_events "cluster" {
      job_name   = "kubernetes-events"
      log_format = "json"
      forward_to = [loki.process.events.receiver]
    }

    loki.process "events" {
      // The component emits a flat JSON envelope (top-level fields:
      // type, reason, kind, name, count, msg, sourcecomponent, etc).
      // The `namespace` label is already attached by the source component
      // itself, so we don't need to extract it here.
      stage.json {
        expressions = {
          type   = "type",
          reason = "reason",
          kind   = "kind",
          name   = "name",
        }
      }

      // Indexed labels — fast filtering for "show all Warnings in
      // namespace X with reason Y on a Pod".
      stage.labels {
        values = {
          type   = "",
          reason = "",
          kind   = "",
        }
      }

      // High-cardinality fields kept out of the label index but still
      // queryable with a label filter after the stream selector
      // (for example `| name="events-test"`).
      stage.structured_metadata {
        values = {
          name = "",
        }
      }

      forward_to = [loki.write.loki.receiver]
    }

    loki.write "loki" {
      endpoint {
        url = "http://loki-gateway.meta.svc.cluster.local/loki/api/v1/push"
      }
    }


================================================
FILE: k8s/events/alloy-deployment.yaml
================================================
# A single-replica Deployment is the right shape for this scenario:
# `loki.source.kubernetes_events` watches a cluster-scoped resource, so
# more than one replica would just produce duplicate log lines for every
# event. (A DaemonSet would be wrong for the same reason.)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alloy
  namespace: meta
  labels:
    app.kubernetes.io/name: alloy
    app.kubernetes.io/part-of: alloy-events
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: alloy
  template:
    metadata:
      labels:
        app.kubernetes.io/name: alloy
    spec:
      # ServiceAccount bound to the events-reader ClusterRole (alloy-rbac.yaml).
      serviceAccountName: alloy
      containers:
        - name: alloy
          # Kept on the same Alloy release that image-versions.env pins for
          # the docker-compose scenarios.
          image: grafana/alloy:v1.16.1
          args:
            - run
            - /etc/alloy/config.alloy
            - --server.http.listen-addr=0.0.0.0:12345
            - --storage.path=/var/lib/alloy/data
          ports:
            - name: http
              containerPort: 12345
          # Only mark the pod Ready once Alloy's HTTP server reports ready,
          # so the Service never routes to a pod that is still starting.
          readinessProbe:
            httpGet:
              path: /-/ready
              port: http
            initialDelaySeconds: 10
            timeoutSeconds: 1
          volumeMounts:
            # config.alloy from the alloy-config ConfigMap.
            - name: config
              mountPath: /etc/alloy
            # Scratch space for --storage.path; lost when the pod is deleted.
            - name: storage
              mountPath: /var/lib/alloy/data
      volumes:
        - name: config
          configMap:
            name: alloy-config
        - name: storage
          emptyDir: {}
---
# ClusterIP Service in front of the Alloy pod, so the UI can be reached with
# `kubectl port-forward svc/alloy`.
apiVersion: v1
kind: Service
metadata:
  namespace: meta
  name: alloy
spec:
  ports:
    - name: http
      port: 12345
      targetPort: 12345
  selector:
    app.kubernetes.io/name: alloy


================================================
FILE: k8s/events/alloy-rbac.yaml
================================================
# RBAC for `loki.source.kubernetes_events`: a ServiceAccount plus a
# ClusterRole/ClusterRoleBinding that allow reading events cluster-wide
# and nothing else.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: meta
  name: alloy
---
# Read-only access to events in both the core and events.k8s.io API groups.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: alloy-events-reader
rules:
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - events.k8s.io
    resources:
      - events
    verbs:
      - get
      - list
      - watch
---
# Grants the ClusterRole above to the alloy ServiceAccount in `meta`.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: alloy-events-reader
subjects:
  - kind: ServiceAccount
    namespace: meta
    name: alloy
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: alloy-events-reader


================================================
FILE: k8s/events/grafana-values.yml
================================================
---
# Grafana Helm chart values: persistent storage, fixed dev credentials and a
# pre-provisioned Loki datasource.
persistence:
  enabled: true
  type: pvc

# DO NOT DO THIS IN PRODUCTION USECASES
adminUser: admin
adminPassword: adminadminadmin
# CONSIDER USING AN EXISTING SECRET
# Use an existing secret for the admin user.
# admin:
#   # Name of the secret. Can be templated.
#   existingSecret: ""
#   userKey: admin-user
#   passwordKey: admin-password

service:
  enabled: true
  type: ClusterIP

datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        orgId: 1
        url: http://loki-gateway.meta.svc.cluster.local:80
        basicAuth: false
        isDefault: false
        version: 1
        editable: false


================================================
FILE: k8s/events/kind.yml
================================================
---
# Kind cluster: one control-plane node and two workers — matches the other
# k8s/ scenarios.
apiVersion: kind.x-k8s.io/v1alpha4
kind: Cluster
nodes:
  - role: control-plane
  - role: worker
  - role: worker


================================================
FILE: k8s/events/loki-values.yml
================================================
---
# Loki Helm chart values: a single-binary Loki with the chart's bundled MinIO
# enabled and every other deployment mode scaled to zero.
loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      # Quoted so YAML keeps the date as a plain string instead of
      # resolving it as a timestamp.
      - from: "2024-04-01"
        store: tsdb
        object_store: s3
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  ingester:
    chunk_encoding: snappy
  tracing:
    enabled: true
  pattern_ingester:
    enabled: true
  limits_config:
    # Needed by the events pipeline, which sends structured metadata.
    allow_structured_metadata: true
    volume_enabled: true
  ruler:
    enable_api: true
  querier:
    # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing
    max_concurrent: 4

minio:
  enabled: true

deploymentMode: SingleBinary
singleBinary:
  replicas: 1
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 2Gi
  extraEnv:
    # Keep a little bit lower than memory limits
    - name: GOMEMLIMIT
      value: 3750MiB

chunksCache:
  # default is 500MB, with limited memory keep this smaller
  writebackSizeLimit: 10MB


# Zero out replica counts of other deployment modes
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0

ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0

================================================
FILE: k8s/logs/README.md
================================================

# Monitor Kubernetes Logs with Grafana Alloy and Loki

> Note: this scenario uses the K8s Monitoring Helm chart. It abstracts away the need to configure Alloy by hand and deploys best practices for monitoring Kubernetes clusters. The chart supports metrics, logs, profiling, and tracing. For this scenario, we will use the K8s Monitoring Helm chart to monitor Kubernetes logs.

This scenario demonstrates how to set up the Kubernetes Monitoring Helm chart and Loki. This scenario will install three Helm charts: Loki, Grafana, and k8s-monitoring-helm. Loki will be used to store the logs, Grafana will be used to visualize the logs, and Alloy (k8s-monitoring-helm) will be used to collect two different log sources:
* Pod Logs
* Kubernetes Events

## Prerequisites

Clone the repository:

```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

Change to the directory:

```bash
cd alloy-scenarios/k8s/logs
```

Next you will need a Kubernetes cluster (In this example, we will configure a local Kubernetes cluster using [Kind](https://kind.sigs.k8s.io/docs/user/quick-start/))

An example kind cluster configuration is provided in the `kind.yml` file. To create a kind cluster using this configuration, run the following command:

```bash
kind create cluster --config kind.yml
```

Lastly you will need to make sure you install Helm on your local machine. You can install Helm by following the instructions [here](https://helm.sh/docs/intro/install/). You will also need to install the Grafana Helm repository:

```bash
helm repo add grafana https://grafana.github.io/helm-charts
```

## Create the `meta` and `prod` namespaces

The first step is to create the `meta` and `prod` namespaces. To create the namespaces, run the following commands:

```bash
kubectl create namespace meta && \
kubectl create namespace prod
```


## Install the Loki Helm Chart

The next step is to install the Loki Helm chart. This will install Loki in the `meta` namespace. The `loki-values.yml` file contains the configuration for the Loki Helm chart. To install Loki, run the following command:

```bash
helm install --values loki-values.yml loki grafana/loki -n meta
```

This installs Loki in monolithic mode. For more information on Loki modes, see the [Loki documentation](https://grafana.com/docs/loki/latest/get-started/deployment-modes/).

## Install the Grafana Helm Chart

The next step is to install the Grafana Helm chart. This will install Grafana in the `meta` namespace. The `grafana-values.yml` file contains the configuration for the Grafana Helm chart. To install Grafana, run the following command:

```bash
helm install --values grafana-values.yml grafana grafana/grafana --namespace meta
```
Note that the `grafana-values.yml` file provisions the Loki data source for Grafana. This is done by setting the `datasources.datasources.yaml` field to the Loki data source configuration.

## Install the K8s Monitoring Helm Chart

The final step is to install the K8s monitoring Helm chart. This will install Alloy in the `meta` namespace. The `k8s-monitoring-values.yml` file contains the configuration for the K8s monitoring Helm chart. This scenario requires `grafana/k8s-monitoring` chart v4 or later. To install the K8s monitoring Helm chart, run the following command:

```bash
helm install --values ./k8s-monitoring-values.yml k8s grafana/k8s-monitoring --version "^4.0.0" -n meta --create-namespace
```
Within the `k8s-monitoring-values.yml` file we declare the Alloy configuration. This configuration specifies the log sources that Alloy will collect logs from. In this scenario, we are collecting logs from two different sources: Pod Logs and Kubernetes Events.

## Accessing the Grafana UI

To access the Grafana UI, you will need to port-forward the Grafana pod to your local machine. First, get the name of the Grafana pod:

```bash
export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}")
```

Next, port-forward the Grafana pod to your local machine:

```bash
kubectl --namespace meta port-forward $POD_NAME 3000
```

Open your browser and go to [http://localhost:3000](http://localhost:3000). You can log in with the default username `admin` and password `adminadminadmin`.

## Accessing the Alloy UI

To access the Alloy UI, you will need to port-forward the Alloy pod to your local machine. First, get the name of the Alloy pod:

```bash
export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=alloy-logs,app.kubernetes.io/instance=k8s" -o jsonpath="{.items[0].metadata.name}")
```

Next, port-forward the Alloy pod to your local machine:

```bash
kubectl --namespace meta port-forward $POD_NAME 12345
```

## View the logs using Explore Logs in Grafana

Explore Logs is a new feature in Grafana which provides a queryless way to explore logs. To access Explore Logs, open a browser and go to [http://localhost:3000/a/grafana-lokiexplore-app](http://localhost:3000/a/grafana-lokiexplore-app).

## Adding a demo prod app

The k8s monitoring app is configured to collect logs from two namespaces: `meta` and `prod`. To add a demo prod app, run the following command:

```bash
helm install tempo grafana/tempo-distributed -n prod
```

This will install the Tempo distributed tracing system in the `prod` namespace.

================================================
FILE: k8s/logs/grafana-values.yml
================================================
---
# Grafana Helm chart values: persistent storage, fixed dev credentials and a
# pre-provisioned Loki datasource.
persistence:
  enabled: true
  type: pvc

# DO NOT DO THIS IN PRODUCTION USECASES
adminUser: admin
adminPassword: adminadminadmin
# CONSIDER USING AN EXISTING SECRET
# Use an existing secret for the admin user.
# admin:
#   # Name of the secret. Can be templated.
#   existingSecret: ""
#   userKey: admin-user
#   passwordKey: admin-password

service:
  enabled: true
  type: ClusterIP

datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        orgId: 1
        url: http://loki-gateway.meta.svc.cluster.local:80
        basicAuth: false
        isDefault: false
        version: 1
        editable: false


================================================
FILE: k8s/logs/k8s-monitoring-values.yml
================================================
---
# Values for the grafana/k8s-monitoring chart: cluster events and pod logs
# from the `meta` and `prod` namespaces, delivered to Loki.
cluster:
  name: meta-monitoring-tutorial

destinations:
  loki:
    type: loki
    url: http://loki-gateway.meta.svc.cluster.local/loki/api/v1/push

# Kubernetes events, assigned to the alloy-singleton collector.
clusterEvents:
  enabled: true
  collector: alloy-singleton
  namespaces:
    - meta
    - prod

# Pod logs read through the Kubernetes API, assigned to alloy-logs.
podLogsViaKubernetesApi:
  enabled: true
  collector: alloy-logs
  namespaces:
    - meta
    - prod
  structuredMetadata:
    pod: pod

collectors:
  alloy-singleton:
    presets:
      - singleton
  alloy-logs:
    presets:
      - clustered


================================================
FILE: k8s/logs/killercoda/loki-values.yml
================================================
---
# Loki Helm chart values for a resource-constrained environment: a
# single-binary Loki with small limits, caches/canary/test disabled, and every
# other deployment mode scaled to zero.
loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      # Quoted so YAML keeps the date as a plain string instead of
      # resolving it as a timestamp.
      - from: "2024-04-01"
        store: tsdb
        object_store: s3
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  ingester:
    chunk_encoding: snappy
  tracing:
    enabled: true
  pattern_ingester:
    enabled: true
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
  ruler:
    enable_api: true
  querier:
    # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing
    max_concurrent: 4

minio:
  enabled: true

deploymentMode: SingleBinary
singleBinary:
  replicas: 1
  resources:
    limits:
      cpu: 0.5
      memory: 1Gi
    requests:
      cpu: 0.5
      memory: 1Gi
  extraEnv:
    # Keep a little bit lower than memory limits
    - name: GOMEMLIMIT
      value: 750MiB
  # Allow scheduling onto control-plane nodes.
  tolerations:
    - key: "node-role.kubernetes.io/control-plane"
      operator: "Exists"
      effect: "NoSchedule"


chunksCache:
  # default is 500MB, with limited memory keep this smaller
  writebackSizeLimit: 10MB
  enabled: false

resultsCache:
  writebackSizeLimit: 10MB
  enabled: false

test:
  enabled: false
lokiCanary:
  enabled: false


# Zero out replica counts of other deployment modes
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0

ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0

================================================
FILE: k8s/logs/kind.yml
================================================
# a cluster with 1 control-plane node and 2 workers
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
- role: worker


================================================
FILE: k8s/logs/loki-values.yml
================================================
---
# Loki Helm chart values: a single-binary Loki with the chart's bundled MinIO
# enabled and every other deployment mode scaled to zero.
loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      # Quoted so YAML keeps the date as a plain string instead of
      # resolving it as a timestamp.
      - from: "2024-04-01"
        store: tsdb
        object_store: s3
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  ingester:
    chunk_encoding: snappy
  tracing:
    enabled: true
  pattern_ingester:
    enabled: true
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
  ruler:
    enable_api: true
  querier:
    # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing
    max_concurrent: 4

minio:
  enabled: true

deploymentMode: SingleBinary
singleBinary:
  replicas: 1
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 2Gi
  extraEnv:
    # Keep a little bit lower than memory limits
    - name: GOMEMLIMIT
      value: 3750MiB

chunksCache:
  # default is 500MB, with limited memory keep this smaller
  writebackSizeLimit: 10MB


# Zero out replica counts of other deployment modes
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0

ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0

================================================
FILE: k8s/metrics/README.md
================================================

# Monitor Kubernetes Metrics with Grafana Alloy and Prometheus

> Note this scenario works using the K8s Monitoring Helm chart. This abstracts the need to configure Alloy and deploys best practices for monitoring Kubernetes clusters.

This scenario demonstrates how to set up the Kubernetes monitoring Helm chart with Prometheus. This scenario will install three Helm charts: Prometheus, Grafana, and k8s-monitoring. Prometheus will be used to store the metrics, Grafana will be used to visualize the metrics, and Alloy (k8s-monitoring) will be used to collect:
* Cluster Metrics (kube-state-metrics, node-exporter, kubelet, cadvisor)
* Annotation-based autodiscovery (Prometheus-style annotations on pods)

## Prerequisites

Clone the repository:

```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

Change to the directory:

```bash
cd alloy-scenarios/k8s/metrics
```

Next you will need a Kubernetes cluster. An example Kind cluster configuration is provided in the `kind.yml` file:

```bash
kind create cluster --config kind.yml
```

Install Helm and add required repositories:

```bash
helm repo add grafana https://grafana.github.io/helm-charts
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
```

## Create the `meta` namespace

```bash
kubectl create namespace meta
```

## Install Prometheus

```bash
helm install --values prometheus-values.yml prometheus prometheus-community/prometheus -n meta
```

## Install Grafana

```bash
helm install --values grafana-values.yml grafana grafana/grafana -n meta
```

## Install the K8s Monitoring Helm Chart

This scenario requires `grafana/k8s-monitoring` chart v4 or later.

```bash
helm install --values k8s-monitoring-values.yml k8s grafana/k8s-monitoring --version "^4.0.0" -n meta
```

## Accessing the Grafana UI

```bash
export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace meta port-forward $POD_NAME 3000
```

Open [http://localhost:3000](http://localhost:3000) and log in with `admin` / `adminadminadmin`.

## Accessing the Alloy UI

```bash
export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=alloy-metrics,app.kubernetes.io/instance=k8s" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace meta port-forward $POD_NAME 12345
```

## Explore Metrics

In Grafana, go to **Explore** and select the **Prometheus** datasource. Try these queries:

* `up` - See all targets being scraped
* `container_cpu_usage_seconds_total` - Container CPU usage
* `container_memory_working_set_bytes` - Container memory usage
* `kube_pod_info` - Pod metadata from kube-state-metrics


================================================
FILE: k8s/metrics/grafana-values.yml
================================================
---
# Values for the Grafana Helm chart used by the metrics scenario.

# Demo login referenced by the README; only meant for a throwaway cluster.
adminUser: admin
adminPassword: adminadminadmin

# Keep Grafana's data on a PersistentVolumeClaim.
persistence:
  enabled: true
  type: pvc

# Cluster-internal service; the README reaches the UI through a port-forward.
service:
  enabled: true
  type: ClusterIP

# Provision Prometheus as the default, non-editable datasource.
datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        orgId: 1
        url: http://prometheus-server.meta.svc.cluster.local:80
        basicAuth: false
        isDefault: true
        version: 1
        editable: false


================================================
FILE: k8s/metrics/k8s-monitoring-values.yml
================================================
---
# Values for the grafana/k8s-monitoring chart used by the metrics scenario.
cluster:
  name: meta-monitoring-tutorial

# Metrics are pushed to the write endpoint of the in-cluster Prometheus server.
destinations:
  prometheus:
    type: prometheus
    url: http://prometheus-server.meta.svc.cluster.local:80/api/v1/write

# Collector referenced by annotationAutodiscovery below.
collectors:
  alloy-metrics:
    presets:
      - clustered
      - statefulset

clusterMetrics:
  enabled: true

annotationAutodiscovery:
  enabled: true
  collector: alloy-metrics

# kube-state-metrics is deployed by this chart.
telemetryServices:
  kube-state-metrics:
    deploy: true


================================================
FILE: k8s/metrics/kind.yml
================================================
---
# Kind cluster layout: one control-plane node plus two worker nodes.
apiVersion: kind.x-k8s.io/v1alpha4
kind: Cluster
nodes:
  - role: control-plane
  - role: worker
  - role: worker


================================================
FILE: k8s/metrics/prometheus-values.yml
================================================
---
# Values for the prometheus-community/prometheus chart. Prometheus is the
# metrics store for this scenario.
server:
  # No persistent volume is requested for the server.
  persistentVolume:
    enabled: false
  extraFlags:
    # The k8s-monitoring destination pushes to /api/v1/write, so the
    # remote-write receiver has to be switched on.
    - web.enable-remote-write-receiver
    - enable-feature=native-histograms
    - enable-feature=exemplar-storage

# The optional components of the chart are all switched off.
alertmanager:
  enabled: false

kube-state-metrics:
  enabled: false

prometheus-node-exporter:
  enabled: false

prometheus-pushgateway:
  enabled: false


================================================
FILE: k8s/profiling/README.md
================================================
# Monitor Kubernetes Profiles with Grafana Alloy and Pyroscope

> **Note:** This scenario uses the K8s Monitoring Helm chart, which removes the need to configure Alloy by hand and applies best practices for monitoring Kubernetes clusters.

This scenario demonstrates how to set up the Kubernetes monitoring Helm chart with Pyroscope for continuous profiling. This scenario will install three Helm charts: Pyroscope, Grafana, and k8s-monitoring. Pyroscope will store the profiles, Grafana will visualize them, and Alloy (k8s-monitoring) will scrape pprof endpoints from pods.

Alloy discovers pods with profiling annotations and scrapes their pprof endpoints (CPU, memory, goroutine, etc.).

## Prerequisites

Clone the repository:

```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

Change to the directory:

```bash
cd alloy-scenarios/k8s/profiling
```

Next you will need a Kubernetes cluster. An example Kind cluster configuration is provided in the `kind.yml` file:

```bash
kind create cluster --config kind.yml
```

Install Helm and add the Grafana Helm repository:

```bash
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
```

## Create the `meta` namespace

```bash
kubectl create namespace meta
```

## Install Pyroscope

```bash
helm install --values pyroscope-values.yml pyroscope grafana/pyroscope -n meta
```

## Install Grafana

```bash
helm install --values grafana-values.yml grafana grafana/grafana -n meta
```

## Install the K8s Monitoring Helm Chart

This scenario requires `grafana/k8s-monitoring` chart v4 or later.

```bash
helm install --values k8s-monitoring-values.yml k8s grafana/k8s-monitoring --version "^4.0.0" -n meta
```

## Accessing the Grafana UI

```bash
export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace meta port-forward $POD_NAME 3000
```

Open [http://localhost:3000](http://localhost:3000) and log in with `admin` / `adminadminadmin`.

## Accessing the Alloy UI

```bash
export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=alloy-profiles,app.kubernetes.io/instance=k8s" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace meta port-forward $POD_NAME 12345
```

## Enabling Profiling on Your Pods

To profile a Go application, ensure it exposes a pprof endpoint (typically at `:6060/debug/pprof/`) and add these annotations to the pod:

```yaml
metadata:
  annotations:
    profiles.grafana.com/memory.scrape: "true"
    profiles.grafana.com/memory.port_name: "http-metrics"
    profiles.grafana.com/cpu.scrape: "true"
    profiles.grafana.com/cpu.port_name: "http-metrics"
    profiles.grafana.com/goroutine.scrape: "true"
    profiles.grafana.com/goroutine.port_name: "http-metrics"
```

## Adding a Demo App

Deploy Pyroscope's demo Ride Share app to generate profiles:

```bash
kubectl apply -n meta -f - <<EOF
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ride-share-go
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ride-share-go
  template:
    metadata:
      labels:
        app: ride-share-go
      annotations:
        profiles.grafana.com/memory.scrape: "true"
        profiles.grafana.com/memory.port: "6060"
        profiles.grafana.com/cpu.scrape: "true"
        profiles.grafana.com/cpu.port: "6060"
        profiles.grafana.com/goroutine.scrape: "true"
        profiles.grafana.com/goroutine.port: "6060"
    spec:
      containers:
      - name: ride-share-go
        image: grafana/pyroscope-rideshare-go:latest
        ports:
        - containerPort: 5000
          name: http
        - containerPort: 6060
          name: pprof
        env:
        - name: REGION
          value: us-east-1
EOF
```

## Explore Profiles

In Grafana, navigate to the **Pyroscope** app or use **Explore** with the Pyroscope datasource. You can view:

* CPU profiles - flame graphs showing where CPU time is spent
* Memory profiles - heap allocation and usage
* Goroutine profiles - concurrent goroutine analysis


================================================
FILE: k8s/profiling/grafana-values.yml
================================================
---
# Values for the Grafana Helm chart used by the profiling scenario.

# Demo login referenced by the README; only meant for a throwaway cluster.
adminUser: admin
adminPassword: adminadminadmin

# Keep Grafana's data on a PersistentVolumeClaim.
persistence:
  enabled: true
  type: pvc

# Cluster-internal service; the README reaches the UI through a port-forward.
service:
  enabled: true
  type: ClusterIP

# Plugin list handed to the Grafana chart.
plugins:
  - grafana-pyroscope-app

# Provision Pyroscope as the default, non-editable datasource.
datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      - name: Pyroscope
        type: grafana-pyroscope-datasource
        access: proxy
        orgId: 1
        url: http://pyroscope.meta.svc.cluster.local:4040
        basicAuth: false
        isDefault: true
        version: 1
        editable: false


================================================
FILE: k8s/profiling/k8s-monitoring-values.yml
================================================
---
# Values for the grafana/k8s-monitoring chart used by the profiling scenario.
cluster:
  name: meta-monitoring-tutorial

# Profiles are delivered to the in-cluster Pyroscope service.
destinations:
  pyroscope:
    type: pyroscope
    url: http://pyroscope.meta.svc.cluster.local:4040

# Collector referenced by the profiling feature below.
collectors:
  alloy-profiles:
    presets:
      - privileged
      - daemonset

profiling:
  enabled: true
  collector: alloy-profiles
  pprof:
    enabled: true


================================================
FILE: k8s/profiling/kind.yml
================================================
---
# Kind cluster layout: one control-plane node plus two worker nodes.
apiVersion: kind.x-k8s.io/v1alpha4
kind: Cluster
nodes:
  - role: control-plane
  - role: worker
  - role: worker


================================================
FILE: k8s/profiling/pyroscope-values.yml
================================================
---
# Values for the grafana/pyroscope chart used by the profiling scenario.
pyroscope:
  # Extra arguments for Pyroscope, given as flag-name/value pairs.
  extraArgs:
    store.max-block-duration: 5m
  # Requested and maximum CPU/memory.
  resources:
    limits:
      cpu: 1
      memory: 1Gi
    requests:
      cpu: 500m
      memory: 512Mi


================================================
FILE: k8s/tracing/README.md
================================================
# Monitor Kubernetes Traces with Grafana Alloy and Tempo

> **Note:** This scenario uses the K8s Monitoring Helm chart, which removes the need to configure Alloy by hand and applies best practices for monitoring Kubernetes clusters.

This scenario demonstrates how to set up the Kubernetes monitoring Helm chart with Tempo for distributed trace collection. This scenario will install three Helm charts: Tempo, Grafana, and k8s-monitoring. Tempo will store the traces, Grafana will visualize them, and Alloy (k8s-monitoring) will receive traces via OTLP and forward them to Tempo.

Applications send traces to Alloy's OTLP endpoint, which then forwards them to Tempo.

## Prerequisites

Clone the repository:

```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

Change to the directory:

```bash
cd alloy-scenarios/k8s/tracing
```

Next you will need a Kubernetes cluster. An example Kind cluster configuration is provided in the `kind.yml` file:

```bash
kind create cluster --config kind.yml
```

Install Helm and add the Grafana Helm repository:

```bash
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
```

## Create the `meta` and `prod` namespaces

```bash
kubectl create namespace meta && \
kubectl create namespace prod
```

## Install Tempo

```bash
helm install --values tempo-values.yml tempo grafana/tempo -n meta
```

## Install Grafana

```bash
helm install --values grafana-values.yml grafana grafana/grafana -n meta
```

## Install the K8s Monitoring Helm Chart

This scenario requires `grafana/k8s-monitoring` chart v4 or later.

```bash
helm install --values k8s-monitoring-values.yml k8s grafana/k8s-monitoring --version "^4.0.0" -n meta
```

This configures Alloy to receive OTLP traces on ports 4317 (gRPC) and 4318 (HTTP), then forward them to Tempo.

## Accessing the Grafana UI

```bash
export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace meta port-forward $POD_NAME 3000
```

Open [http://localhost:3000](http://localhost:3000) and log in with `admin` / `adminadminadmin`.

## Accessing the Alloy UI

```bash
export POD_NAME=$(kubectl get pods --namespace meta -l "app.kubernetes.io/name=alloy-receiver,app.kubernetes.io/instance=k8s" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace meta port-forward $POD_NAME 12345
```

## Sending Traces

Applications in your cluster should set their OTLP exporter endpoint to the Alloy receiver service:

```
OTEL_EXPORTER_OTLP_ENDPOINT=http://k8s-alloy-receiver.meta.svc.cluster.local:4317
```

## Adding a Demo App

Deploy a sample instrumented application in the `prod` namespace to generate traces:

```bash
helm install tempo-distributed grafana/tempo-distributed -n prod
```

Or deploy any application instrumented with OpenTelemetry SDK pointing to the Alloy OTLP endpoint above.

## Explore Traces

In Grafana, go to **Explore** and select the **Tempo** datasource. Use TraceQL to search for traces:

* `{}` - View all traces
* `{resource.service.name="my-service"}` - Filter by service name
* `{status=error}` - Find error traces


================================================
FILE: k8s/tracing/grafana-values.yml
================================================
---
# Values for the Grafana Helm chart used by the tracing scenario.

# Demo login referenced by the README; only meant for a throwaway cluster.
adminUser: admin
adminPassword: adminadminadmin

# Keep Grafana's data on a PersistentVolumeClaim.
persistence:
  enabled: true
  type: pvc

# Cluster-internal service; the README reaches the UI through a port-forward.
service:
  enabled: true
  type: ClusterIP

# Provision Tempo as the default, non-editable datasource.
datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      - name: Tempo
        type: tempo
        access: proxy
        orgId: 1
        url: http://tempo.meta.svc.cluster.local:3200
        basicAuth: false
        isDefault: true
        version: 1
        editable: false


================================================
FILE: k8s/tracing/k8s-monitoring-values.yml
================================================
---
# Values for the grafana/k8s-monitoring chart used by the tracing scenario.
cluster:
  name: meta-monitoring-tutorial

# OTLP destination pointing at Tempo; only the traces signal is enabled.
destinations:
  tempo:
    type: otlp
    url: http://tempo.meta.svc.cluster.local:4317
    traces:
      enabled: true
    metrics:
      enabled: false
    logs:
      enabled: false

# Collector referenced by applicationObservability below.
collectors:
  alloy-receiver:
    presets:
      - deployment

# Accept OTLP over both gRPC and HTTP; metrics and logs handling is disabled.
applicationObservability:
  enabled: true
  collector: alloy-receiver
  receivers:
    otlp:
      grpc:
        enabled: true
      http:
        enabled: true
  metrics:
    enabled: false
  logs:
    enabled: false


================================================
FILE: k8s/tracing/kind.yml
================================================
---
# Kind cluster layout: one control-plane node plus two worker nodes.
apiVersion: kind.x-k8s.io/v1alpha4
kind: Cluster
nodes:
  - role: control-plane
  - role: worker
  - role: worker


================================================
FILE: k8s/tracing/tempo-values.yml
================================================
---
# Values for the grafana/tempo chart used by the tracing scenario.
tempo:
  # HTTP port; the Grafana datasource in this scenario points at 3200.
  server:
    http_listen_port: 3200
  # OTLP receivers on the standard gRPC (4317) and HTTP (4318) ports.
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
  # Traces and the write-ahead log are kept on the local filesystem.
  storage:
    trace:
      backend: local
      local:
        path: /var/tempo/traces
      wal:
        path: /var/tempo/wal


================================================
FILE: kafka/README.md
================================================
# Kafka Scenarios

Learn how to use Grafana Alloy to monitor logs from Kafka.

## Overview

This demo showcases how to:
- Collect logs from a Kafka topic
- Process and transform JSON log data with Alloy
- Forward processed logs to Loki
- Visualize the logs in Grafana

## Components

- **Kafka**: Message broker storing logs
- **Kafka Producer**: Generates sample logs and sends them to Kafka
- **Grafana Alloy**: Observability pipeline that processes logs
- **Loki**: Log aggregation system
- **Grafana**: Visualization platform

## Running the Demo

### Step 1: Clone the repository
```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

### Step 2: Deploy the monitoring stack
```bash
cd alloy-scenarios/kafka
docker-compose up -d
```

### Step 3: Access Grafana Alloy UI
Open your browser and go to `http://localhost:12345`. 

### Step 4: Access Grafana UI
Open your browser and go to `http://localhost:3000`.

In the Grafana navigation menu, click **Drilldown** to explore the logs.

## How It Works

1. The `gen_log.sh` script generates random JSON logs with different log levels, applications, and messages
2. These logs are sent to the Kafka topic `alloy-logs`
3. Alloy reads from this Kafka topic, processes the JSON data, and forwards it to Loki
4. Grafana connects to Loki to display and query the processed logs

Try creating dashboards in Grafana to visualize log frequencies by application or error levels!


================================================
FILE: kafka/config.alloy
================================================


// Turn on live debugging for this Alloy instance.
livedebugging {
  enabled = true
}

// Consume records from the `alloy-logs` topic on the Kafka broker and hand
// them to the processing pipeline.
loki.source.kafka "kafka" {
  brokers    = ["kafka:9092"]
  topics     = ["alloy-logs"]
  version    = "3.8.0"
  forward_to = [loki.process.log_data.receiver]

  // Static labels attached to every entry read from Kafka.
  labels = {
    component = "loki.source.kafka",
    source    = "kafka",
  }
}

// Reshape each Kafka record before it is written to Loki:
//   1. parse the JSON body and extract `level`, `msg` and the nested `app`,
//   2. parse `app` to obtain the application name and version,
//   3. render a flat JSON document from the extracted values,
//   4. make that document the log line.
loki.process "log_data" {
  forward_to = [loki.write.local.receiver]

  // An empty expression uses the key name itself as the lookup path.
  // Attributes in a block are newline-terminated, so `drop_malformed` must not
  // be followed by a comma.
  stage.json {
    drop_malformed = true
    expressions    = {
      level = "",
      msg   = "",
      app   = "app",
    }
  }

  // Second pass: read the extracted `app` value and pull out its fields.
  stage.json {
    source      = "app"
    expressions = {
      app_name    = "name",
      app_version = "version",
    }
  }

  // Build the flattened document and store it as the extracted value `new_json`.
  stage.template {
    source   = "new_json"
    template = "{\"level\":\"{{ .level }}\",\"msg\":\"{{ .msg }}\",\"app_name\":\"{{ .app_name }}\",\"app_version\":\"{{ .app_version }}\"}"
  }

  // Replace the log line with the rendered document.
  stage.output {
    source = "new_json"
  }
}

// Push the processed log entries to Loki's push API on the `loki` host.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}


================================================
FILE: kafka/docker-compose.coda.yml
================================================
services:
  # Single-node Kafka in KRaft mode: node 0 is both controller and broker.
  kafka:
    image: 'bitnami/kafka:3.8'
    ports:
      - "9092:9092"
    volumes:
      - kafka_data:/bitnami/kafka
    environment:
      # KRaft settings
      - KAFKA_CFG_NODE_ID=0
      - KAFKA_CFG_PROCESS_ROLES=controller,broker
      - KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093
      - KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093
      - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
      - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT
      - KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER
    healthcheck:
      # `--list` has to talk to the broker, so the check only passes once Kafka
      # is serving requests. `--version` exits successfully without connecting.
      test: ["CMD", "kafka-topics.sh", "--bootstrap-server", "localhost:9092", "--list"]
      interval: 10s
      timeout: 10s
      retries: 5
      start_period: 20s

  # Runs gen_log.sh, which streams generated log lines into Kafka.
  kafka-producer:
    image: 'bitnami/kafka:3.8'
    # Wait for a healthy broker instead of racing it at start-up.
    depends_on:
      kafka:
        condition: service_healthy
    volumes:
      - ./gen_log.sh:/bin/gen_log.sh
      - kafka_data:/bitnami/kafka
    entrypoint: ["sh", "-c", "/bin/gen_log.sh"]

volumes:
  kafka_data: {}


================================================
FILE: kafka/docker-compose.yml
================================================
# The top-level `version` key is omitted: current Docker Compose ignores it and
# warns that it is obsolete.
services:
  # Single-node Kafka in KRaft mode: node 0 is both controller and broker.
  kafka:
    image: 'bitnami/kafka:3.8'
    ports:
      - "9092:9092"
    volumes:
      - kafka_data:/bitnami/kafka
    environment:
      # KRaft settings
      - KAFKA_CFG_NODE_ID=0
      - KAFKA_CFG_PROCESS_ROLES=controller,broker
      - KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093
      - KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093
      - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
      - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT
      - KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER
    healthcheck:
      # `--list` has to talk to the broker, so the check only passes once Kafka
      # is serving requests. `--version` exits successfully without connecting.
      test: ["CMD", "kafka-topics.sh", "--bootstrap-server", "localhost:9092", "--list"]
      interval: 10s
      timeout: 10s
      retries: 5
      start_period: 20s

  # Runs gen_log.sh, which streams generated log lines into Kafka.
  kafka-producer:
    image: 'bitnami/kafka:3.8'
    # Wait for a healthy broker instead of racing it at start-up.
    depends_on:
      kafka:
        condition: service_healthy
    volumes:
      - ./gen_log.sh:/bin/gen_log.sh
      - kafka_data:/bitnami/kafka
    entrypoint: ["sh", "-c", "/bin/gen_log.sh"]

  # Reads the Kafka topic, reshapes the records and pushes them to Loki.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"
      - "4318:4318"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - ./logs:/temp/logs
    command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental  --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      # The Kafka source needs a reachable broker when Alloy starts.
      kafka:
        condition: service_healthy
      loki:
        condition: service_started

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Grafana with anonymous Admin access; the entrypoint writes the Loki
  # datasource provisioning file and then starts Grafana.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh

volumes:
  kafka_data: {}


================================================
FILE: kafka/gen_log.sh
================================================
#!/usr/bin/env bash
# Emits one random JSON log line every two seconds and streams the lines into
# the `alloy-logs` Kafka topic through kafka-console-producer.sh.
set -euo pipefail

LEVELS=(info warn error debug)
APPS=(test auth payment order catalog)
MSGS=(
  "Hello World from Grafana Alloy integration – log pipeline initialized successfully."
  "User authentication succeeded: user_id=42, ip=192.168.1.100, method=OAuth2."
  "Order created: order_id=12345, items=[{\"sku\":\"ABC\",\"qty\":2},{\"sku\":\"XYZ\",\"qty\":1}], total=USD 299.99."
  "Payment processing failed: transaction_id=67890, error_code=PMT-402, reason=Insufficient funds."
  "Cache miss on key user_profile_42; fetching from primary DB and repopulating cache."
  "Background job completed: task=metrics-aggregation, duration=12.34s, processed=2500 records."
  "High memory usage detected on host host-01: usage=87.5%, threshold=80% — consider scaling up."
  "Debug info: received payload with 15 fields, sample_field=\"some long detailed info here\", parsing succeeded."
)

# Always running, sending logs to kafka every two seconds.
while true; do
  level=${LEVELS[RANDOM % ${#LEVELS[@]}]}
  msg=${MSGS[RANDOM % ${#MSGS[@]}]}
  app=${APPS[RANDOM % ${#APPS[@]}]}
  version="0.$((RANDOM % 10)).$((RANDOM % 100))"

  # Some messages contain literal double quotes. Escape backslashes first and
  # quotes second so the value remains a valid JSON string; inserted raw, the
  # quotes would end the "msg" value early and produce malformed JSON.
  msg=${msg//\\/\\\\}
  msg=${msg//\"/\\\"}

  printf '{"level":"%s","msg":"%s","app":{"name":"%s","version":"%s"}}\n' \
    "$level" "$msg" "$app" "$version"
  sleep 2
done | kafka-console-producer.sh \
    --bootstrap-server kafka:9092 \
    --topic alloy-logs


================================================
FILE: kafka/loki-config.yaml
================================================

---
# Complete single-process Loki configuration backed by the local filesystem.
# The index is shipped to storage through the TSDB shipper.

auth_enabled: false

server:
  http_listen_port: 3100

common:
  path_prefix: /tmp/loki
  replication_factor: 1
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

schema_config:
  configs:
    - from: 2020-05-15
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

storage_config:
  filesystem:
    directory: /tmp/loki/chunks
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache

pattern_ingester:
  enabled: true

# The max chunk age is set far lower than the usual 2 hours so that chunks are
# flushed quickly. The original rationale refers to the LogCLI demo
# (logcli stats --since 24h '{service_name="Delivery World", package_size="Large"}').
# NOTE(review): the Kafka scenario files shown here never produce that stream,
# so this reasoning was presumably inherited from another scenario; confirm
# whether the short flush time is still wanted here.
ingester:
  max_chunk_age: 5m # Should be 2 hours

================================================
FILE: linux/README.md
================================================
# Monitoring Linux with Alloy

Grafana Alloy can be used to monitor Linux servers and containers. In this guide, we will show you how to deploy Grafana Alloy in a Docker environment to monitor Linux system metrics and logs. The setup consists of:
* Node Exporter metrics for system performance monitoring
* System logs collection with Loki

## Prerequisites

* Git - You will need Git to clone the repository.
* Docker and Docker Compose - This tutorial uses Docker to host Grafana, Loki, Prometheus, and Alloy.
* Linux environment - Either a Linux host running Docker or a Linux VM.

## About this Demo

This demo runs Alloy in a container alongside Grafana, Prometheus, and Loki, creating a self-contained monitoring stack. The Alloy container acts as a "fake Linux server" to demonstrate monitoring capabilities out of the box.

In a production environment, you would typically install Alloy directly on each Linux server you want to monitor.

## Step 1: Clone the Repository

Clone the repository to your machine:

```bash
git clone https://github.com/grafana/alloy-scenarios.git
cd alloy-scenarios/linux
```

## Step 2: Deploy the Monitoring Stack

Use Docker Compose to deploy Grafana, Loki, Prometheus, and Alloy:

```bash
docker-compose up -d
```

You can check the status of the containers:

```bash
docker ps
```

Grafana should be running on [http://localhost:3000](http://localhost:3000).

## Step 3: Explore the Monitoring Data

Once the stack is running, you can explore the collected metrics and logs:

1. Access Grafana at [http://localhost:3000](http://localhost:3000). The provided Docker Compose file enables anonymous access with the Admin role, so no login is required.
2. Import the Node Exporter dashboard to visualize system metrics:
   - Go to Dashboards → Import
   - Upload the JSON file from [here](https://grafana.com/api/dashboards/1860/revisions/37/download)
   - Select the Prometheus data source and click Import

This community dashboard provides comprehensive system metrics including CPU, memory, disk, and network usage.

## Step 4: Viewing Logs

Open your browser and go to [http://localhost:3000/a/grafana-lokiexplore-app](http://localhost:3000/a/grafana-lokiexplore-app). This will take you to the Loki explorer in Grafana.

## Deploying on Bare Metal

To monitor actual Linux servers in production, you would:

1. Install Alloy directly on each Linux server

2. Modify the `config.alloy` file to point to your Prometheus and Loki instances:
   ```
   prometheus.remote_write "local" {
     endpoint {
       url = "http://localhost:9090/api/v1/write"
     }
   }
   
   loki.write "local" {
     endpoint {
       url = "http://localhost:3100/loki/api/v1/push"
     }
   }
   ```

3. Run Alloy as a service:
   ```bash
   sudo alloy run /path/to/config.alloy
   ```

## Configuration Customization

The included `config.alloy` file sets up:

1. Node Exporter integration to collect system metrics
2. Log collection from system logs and journal
3. Relabeling rules to organize metrics and logs
4. Remote write endpoints for Prometheus and Loki

You can customize which collectors are enabled/disabled and adjust scrape intervals in the configuration file.

## Troubleshooting

If you encounter issues:

* Check container logs: `docker-compose logs`
* Verify Alloy is running: `docker-compose ps`
* Ensure ports are not conflicting with existing services
* Review the Alloy configuration in `config.alloy`


================================================
FILE: linux/config.alloy
================================================
// Rewrites the node_exporter targets so that every series carries a fixed
// `job` label and an `instance` label set to this machine's hostname.
discovery.relabel "integrations_node_exporter" {
  targets = prometheus.exporter.unix.integrations_node_exporter.targets

  // job: one constant name shared by all node_exporter metrics.
  rule {
    replacement  = "integrations/node_exporter"
    target_label = "job"
  }

  // instance: hostname of the machine Alloy runs on.
  rule {
    replacement  = constants.hostname
    target_label = "instance"
  }
}

// Embedded node_exporter that exposes the system metrics of this machine.
prometheus.exporter.unix "integrations_node_exporter" {
  // `meminfo` is enabled explicitly; the listed collectors are turned off to
  // reduce overhead.
  enable_collectors  = ["meminfo"]
  disable_collectors = ["ipvs", "btrfs", "infiniband", "xfs", "zfs"]

  // Skip virtual and container network interfaces in both network collectors.
  netdev {
    device_exclude = "^(veth.*|cali.*|[a-f0-9]{15})$"
  }

  netclass {
    ignored_devices = "^(veth.*|cali.*|[a-f0-9]{15})$"
  }

  filesystem {
    // Give up on a mount that does not answer within five seconds.
    mount_timeout        = "5s"
    // Mount points that are not relevant for monitoring.
    mount_points_exclude = "^/(dev|proc|run/credentials/.+|sys|var/lib/docker/.+)($|/)"
    // Filesystem types that are not relevant for monitoring.
    fs_types_exclude     = "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
  }
}

// Scrape the relabeled node_exporter targets every 15 seconds and forward the
// samples directly to the remote-write component.
prometheus.scrape "integrations_node_exporter" {
  targets         = discovery.relabel.integrations_node_exporter.output
  scrape_interval = "15s"
  forward_to      = [prometheus.remote_write.local.receiver]
}


// Define where to send the metrics for storage
prometheus.remote_write "local" {
  endpoint {
    // Remote-write API of the `prometheus` host on port 9090, which is the
    // Prometheus service defined in this scenario's Docker Compose file.
    url = "http://prometheus:9090/api/v1/write"
  }
}

// Read entries from the systemd journal, label them with the journal relabel
// rules, and ship them to Loki.
loki.source.journal "logs_integrations_integrations_node_exporter_journal_scrape" {
  forward_to    = [loki.write.local.receiver]
  relabel_rules = discovery.relabel.logs_integrations_integrations_node_exporter_journal_scrape.rules
  // Entries older than 24 hours are not collected.
  max_age       = "24h0m0s"
}

// Expand the glob over the standard system log files into file targets that
// carry the same `instance` and `job` labels as the metrics.
local.file_match "logs_integrations_integrations_node_exporter_direct_scrape" {
  path_targets = [{
    __address__ = "localhost",
    // syslog, messages and every *.log file directly under /var/log.
    __path__    = "/var/log/{syslog,messages,*.log}",
    job         = "integrations/node_exporter",
    instance    = constants.hostname,
  }]
}

// Relabeling rules for journal entries. The target list is empty because the
// journal source above only consumes this component's `rules` export. Each
// rule copies one journal field into a regular label.
discovery.relabel "logs_integrations_integrations_node_exporter_journal_scrape" {
  targets = []

  // systemd unit -> unit
  rule {
    source_labels = ["__journal__systemd_unit"]
    target_label  = "unit"
  }

  // boot ID -> boot_id
  rule {
    source_labels = ["__journal__boot_id"]
    target_label  = "boot_id"
  }

  // hostname of the machine -> instance
  rule {
    source_labels = ["__journal__hostname"]
    target_label  = "instance"
  }

  // machine ID -> machine_id
  rule {
    source_labels = ["__journal__machine_id"]
    target_label  = "machine_id"
  }

  // transport -> transport
  rule {
    source_labels = ["__journal__transport"]
    target_label  = "transport"
  }

  // priority keyword -> level
  rule {
    source_labels = ["__journal_priority_keyword"]
    target_label  = "level"
  }
}

// Tail the files matched by local.file_match and ship their lines to Loki.
loki.source.file "logs_integrations_integrations_node_exporter_direct_scrape" {
  forward_to = [loki.write.local.receiver]
  targets    = local.file_match.logs_integrations_integrations_node_exporter_direct_scrape.targets
}

// Destination for all collected logs: the push API of the `loki` host.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

// Enable live debugging in the Alloy UI. `enabled` defaults to false, so an empty
// block would leave the feature switched off; it has to be set explicitly.
livedebugging {
  enabled = true
}


================================================
FILE: linux/docker-compose.yml
================================================
# Linux monitoring stack: Loki (logs), Prometheus (metrics), Grafana (UI) and
# Alloy (collector). The top-level `version` key is omitted on purpose: it is
# obsolete in the Compose Specification and ignored by current Docker Compose.
services:

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100/tcp"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      # Let Prometheus accept metrics pushed via remote write.
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090/tcp"
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      # Anonymous admin access: no login is required for this demo.
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    # Write the datasource provisioning file, then start Grafana via /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy


================================================
FILE: linux/loki-config.yaml
================================================

# This is a complete configuration to deploy Loki backed by the filesystem.
# The index will be shipped to the storage via tsdb-shipper.
# All data paths below live under /tmp/loki; docker-compose.yml mounts only this
# config file into the container, so no host directory backs that path.

# Multi-tenant authentication is turned off.
auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

server:
  # Port published by docker-compose.yml and targeted by Alloy's loki.write URL.
  http_listen_port: 3100

common:
  # Single-instance setup: in-memory ring store and a replication factor of 1.
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

schema_config:
  configs:
  # TSDB index with filesystem object storage, schema v13, daily index tables.
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

# Note: max_chunk_age is set far lower than the usual value.
# The stated reason is that this scenario is used within the LogCLI demo, which needs a short flush time
# to show how logcli stats --since 24h '{service_name="Delivery World", package_size="Large"}' works.
# NOTE(review): this note looks copied from another scenario — confirm it applies here.
ingester:
  max_chunk_age: 5m # Should be 2 hours

================================================
FILE: linux/prom-config.yaml
================================================
# Global Prometheus settings. This file defines no scrape_configs.
# NOTE(review): metrics presumably arrive through the remote-write receiver that
# docker-compose.yml enables on the Prometheus command line — confirm.
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).


================================================
FILE: log-api-gateway/README.md
================================================
# Log API Gateway

This scenario demonstrates using **Grafana Alloy** as a centralized log gateway via the `loki.source.api` component. Instead of scraping logs from files or containers, Alloy exposes a Loki-compatible push API endpoint that applications can send logs to directly.

## Architecture

```
┌──────────────────┐         ┌───────────────────────┐         ┌──────┐         ┌─────────┐
│  log-producer    │──POST──▶│  Alloy (loki.source.  │──push──▶│ Loki │◀─query──│ Grafana │
│  (Python script) │         │  api on :3500)        │         │      │         │         │
└──────────────────┘         └───────────────────────┘         └──────┘         └─────────┘
```

1. **log-producer** - A Python script that simulates multiple microservices (auth, order, notification) pushing structured logs to Alloy's Loki push API endpoint.
2. **Alloy** - Receives logs via `loki.source.api` on port 3500, enriches them with a `gateway=alloy` label, and forwards to Loki.
3. **Loki** - Stores and indexes the logs.
4. **Grafana** - Pre-configured with the Loki datasource for querying logs.

## Running

```bash
# From the repo root (uses centralized image versions)
./run-example.sh log-api-gateway

# Or directly
cd log-api-gateway && docker compose up -d
```

## Exploring

- **Grafana**: [http://localhost:3000](http://localhost:3000) - Query logs in the Explore view using the Loki datasource
- **Alloy UI**: [http://localhost:12345](http://localhost:12345) - Inspect the pipeline graph and component health

### Example LogQL Queries

```logql
# All logs from a specific service
{service_name="auth-service"}

# All logs passing through the gateway
{gateway="alloy"}

# Filter by environment
{environment="demo"}
```

## How It Works

The `loki.source.api` component in Alloy exposes a Loki-compatible HTTP endpoint (`/loki/api/v1/push`) that any application can push logs to. This is useful when:

- Applications already use the Loki push API format
- You want a centralized gateway to enrich, filter, or route logs before they reach Loki
- You need to decouple log producers from the storage backend

The Alloy pipeline in this scenario:

1. **`loki.source.api`** - Listens on port 3500 for incoming log push requests
2. **`loki.process`** - Adds a `gateway=alloy` static label to all received logs
3. **`loki.write`** - Forwards the enriched logs to Loki

## Stopping

```bash
cd log-api-gateway && docker compose down
```


================================================
FILE: log-api-gateway/app/producer.py
================================================
import requests
import time
import random
import json

# Loki push API endpoint exposed by Alloy's loki.source.api component,
# addressed through the compose service name.
ALLOY_URL = "http://alloy:3500/loki/api/v1/push"

# Upper bound in seconds for one push request. Without a timeout a stalled
# connection would block the producer loop forever.
REQUEST_TIMEOUT_SECONDS = 5

# Simulated microservices and the messages each of them may emit.
services = [
    {"name": "auth-service", "messages": [
        "User login attempt from IP 10.0.1.50",
        "Token refresh completed for user_id=42",
        "Failed login: invalid credentials for user@example.com",
        "Session expired for session_id=abc123",
    ]},
    {"name": "order-service", "messages": [
        "New order created: ORD-98765",
        "Payment processed for order ORD-98765",
        "Order shipped: tracking_id=TRACK123",
        "Inventory check: item SKU-001 has 5 units remaining",
    ]},
    {"name": "notification-service", "messages": [
        "Email sent to user@example.com",
        "SMS notification queued for +1234567890",
        "Push notification delivered to device_id=xyz",
        "Notification batch completed: 150 messages sent",
    ]},
]

# flush=True on every print: stdout is block-buffered when it is not a TTY,
# which would otherwise delay these messages in `docker logs`.
print("Starting log producer...", flush=True)
while True:
    service = random.choice(services)
    message = random.choice(service["messages"])

    # One stream with one entry; the stream labels become Loki labels.
    payload = {
        "streams": [{
            "stream": {
                "service_name": service["name"],
                "environment": "demo",
            },
            "values": [
                # Nanosecond Unix timestamp as a string, taken directly from the
                # integer clock to avoid float rounding.
                [str(time.time_ns()), message]
            ]
        }]
    }

    try:
        resp = requests.post(
            ALLOY_URL,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        # The script treats 204 as the only success response.
        if resp.status_code != 204:
            print(f"Unexpected status: {resp.status_code}", flush=True)
    except Exception as e:
        # Best effort: report the failure (e.g. Alloy not up yet) and keep producing.
        print(f"Error sending log: {e}", flush=True)

    # Random pause so the log rate looks irregular.
    time.sleep(random.uniform(0.5, 2.0))


================================================
FILE: log-api-gateway/config.alloy
================================================
// Turn on live debugging so the pipeline can be inspected in the Alloy UI.
livedebugging {
	enabled = true
}

// Accept logs via Loki push API - acts as a centralized log gateway
loki.source.api "default" {
	http {
		// Listen on all interfaces so other containers (the log-producer) can reach the endpoint.
		listen_address = "0.0.0.0"
		// Port the producer posts to and that docker-compose.yml publishes.
		listen_port    = 3500
	}

	forward_to = [loki.process.enrich.receiver]
}

// Enrich logs with gateway metadata
loki.process "enrich" {
	forward_to = [loki.write.local.receiver]

	// Attach a constant gateway="alloy" label to every entry.
	stage.static_labels {
		values = {
			"gateway" = "alloy",
		}
	}
}

// Push the enriched entries to the Loki container's push API.
loki.write "local" {
	endpoint {
		url = "http://loki:3100/loki/api/v1/push"
	}
}


================================================
FILE: log-api-gateway/docker-compose.coda.yml
================================================
# Defines only the log-producer service; Alloy, Loki and Grafana are not declared here.
# NOTE(review): presumably used by the .coda tooling together with an externally
# provided stack — confirm.
services:
  log-producer:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: log-producer
    volumes:
      - ./app/producer.py:/app/producer.py
    # Install the one third-party dependency at start-up, then run the producer.
    command: sh -c "pip install requests && python3 /app/producer.py"


================================================
FILE: log-api-gateway/docker-compose.yml
================================================

services:

  # Python script that pushes logs to Alloy's Loki push API endpoint.
  log-producer:
    container_name: log-producer
    image: python:${PYTHON_VERSION:-3.11-slim}
    depends_on:
      - alloy
    volumes:
      - ./app/producer.py:/app/producer.py
    # Install the one third-party dependency at start-up, then run the producer.
    command: sh -c "pip install requests && python3 /app/producer.py"

  # Alloy: the UI listens on 12345, the loki.source.api push endpoint on 3500.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    depends_on:
      - loki
    ports:
      - "12345:12345"
      - "3500:3500"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: >-
      run
      --server.http.listen-addr=0.0.0.0:12345
      --stability.level=experimental
      --storage.path=/var/lib/alloy/data
      /etc/alloy/config.alloy

  # Loki: log storage, configured from the mounted loki-config.yaml.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Grafana with anonymous admin access and a provisioned Loki datasource.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    ports:
      - "3000:3000/tcp"
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    # Write the datasource provisioning file, then start Grafana via /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: log-api-gateway/loki-config.yaml
================================================
# Loki configuration for the log-api-gateway demo: single instance, filesystem storage.

# Multi-tenant authentication is turned off.
auth_enabled: false

server:
  # HTTP port published by docker-compose.yml and targeted by Alloy's loki.write URL.
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: debug
  grpc_server_max_concurrent_streams: 1000

common:
  instance_addr: 127.0.0.1
  # Everything is stored under /tmp/storage; docker-compose.yml mounts only this
  # config file into the container, so no host directory backs that path.
  path_prefix: /tmp/storage
  storage:
    filesystem:
      chunks_directory: /tmp/storage/chunks
      rules_directory: /tmp/storage/rules
  # Single-instance setup: one replica and an in-memory ring store.
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

limits_config:
  metric_aggregation_enabled: true

schema_config:
  configs:
    # TSDB index with filesystem object storage, schema v13, daily index tables.
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

pattern_ingester:
  enabled: true
  metric_aggregation:
    # Points back at this Loki instance's own HTTP port.
    loki_address: localhost:3100

ruler:
  # NOTE(review): docker-compose.yml defines no Alertmanager service — confirm this URL is needed.
  alertmanager_url: http://localhost:9093

frontend:
  encoding: protobuf

# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
#
# Statistics help us better understand how Loki is used, and they show us performance
# levels for most users. This helps us prioritize features and documentation.
# For more information on what's sent, look at
# https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go
# Refer to the buildReport method to see what goes into a report.
#
# If you would like to disable reporting, uncomment the following lines:
#analytics:
#  reporting_enabled: false


================================================
FILE: log-secret-filtering/README.md
================================================
# Log Secret Filtering

Demonstrates how Grafana Alloy's `loki.secretfilter` component automatically redacts secrets from log lines before they reach Loki.

## Overview

A Python application continuously writes log lines -- some containing fake secrets (AWS keys, database connection strings, GitHub tokens, JWTs, Slack webhooks) -- to a shared log file. Alloy tails the file, passes every line through `loki.secretfilter` using built-in Gitleaks patterns, and forwards the sanitized output to Loki. By the time logs appear in Grafana, sensitive values have been replaced with `<REDACTED:$SECRET_NAME>`.

The example includes:

- **secret-logger** -- Python app that emits a mix of normal and secret-containing log lines every 2 seconds.
- **Alloy** -- Tails the log file, applies `loki.secretfilter`, and pushes to Loki. Runs with `--stability.level=experimental` because `loki.secretfilter` is an experimental component.
- **Loki** -- Stores the redacted logs.
- **Grafana** -- Visualize and query logs to verify secrets have been removed.

## Running the Demo

1. Clone the repository:
   ```
   git clone https://github.com/grafana/alloy-scenarios.git
   cd alloy-scenarios
   ```

2. Navigate to this example directory:
   ```
   cd log-secret-filtering
   ```

3. Run using Docker Compose:
   ```
   docker compose up -d
   ```

   Or use the centralized image management:
   ```
   cd ..
   ./run-example.sh log-secret-filtering
   ```

4. Access Grafana at [http://localhost:3000](http://localhost:3000)

## What to Expect

1. Open Grafana and navigate to **Explore**.
2. Select the **Loki** datasource.
3. Run the query `{job="secret-app"}`.
4. You should see log lines where secrets have been replaced, for example:
   - `Found config: <REDACTED:aws-access-token> with secret`
   - `Database connection: <REDACTED:generic-api-key>`
   - Normal log lines (health checks, request timings) pass through unchanged.

## Architecture

```
┌─────────────────┐      ┌───────────────────────────────────────┐      ┌──────┐      ┌─────────┐
│  secret-logger  │─────▶│  Alloy                                │─────▶│ Loki │─────▶│ Grafana │
│  (writes logs)  │ file │  local.file_match ─▶ loki.source.file │ push │      │ query│         │
└─────────────────┘      │  ─▶ loki.secretfilter ─▶ loki.write   │      │      │      │         │
                         └───────────────────────────────────────┘      └──────┘      └─────────┘
```

## Alloy Pipeline

The `config.alloy` pipeline:

1. `local.file_match` -- discovers log files at `/tmp/logs/*.log`.
2. `loki.source.file` -- tails matched files and forwards log entries.
3. `loki.secretfilter` -- scans each log line against Gitleaks secret patterns and replaces matches with `<REDACTED:$SECRET_NAME>`.
4. `loki.write` -- pushes sanitized logs to Loki.

Visit the Alloy UI at [http://localhost:12345](http://localhost:12345) to inspect the running pipeline and use the live debugging view.


================================================
FILE: log-secret-filtering/app/main.py
================================================
import time
import random
import datetime

# Log lines that embed fake credentials for the pipeline's secret filter to redact.
secrets = [
    'Found config: AKIAIOSFODNN7EXAMPLE with secret',
    'Database connection: postgresql://admin:SuperSecret123@db:5432/prod',
    'Setting API_KEY=ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdef12',
    'Bearer token: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U',
    'Slack webhook: https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX',
]

# Harmless log lines that contain no credentials.
normal = [
    'Processing request from 192.168.1.100',
    'User login successful for user_id=42',
    'Health check passed: all systems operational',
    'Cache hit ratio: 94.2%',
    'Request completed in 23ms',
]

LOG_PATH = "/logs/app.log"

# Create the log file, or empty it if a previous run left content behind.
open(LOG_PATH, "w").close()

# Listing `normal` twice yields a 2:1 ratio of normal to secret lines.
candidates = secrets + normal + normal

while True:
    line = random.choice(candidates)
    ts = datetime.datetime.now().isoformat()
    # Reopen in append mode for every line so it is written out when the file closes.
    with open(LOG_PATH, "a") as log:
        log.write(f"{ts} {line}\n")
    time.sleep(2)


================================================
FILE: log-secret-filtering/config.alloy
================================================
// Turn on live debugging so the pipeline can be inspected in the Alloy UI.
livedebugging {
	enabled = true
}

// Discover the application's log files. /tmp/logs is where docker-compose.yml
// mounts the ./logs directory that the secret-logger container writes to.
local.file_match "app_logs" {
	path_targets = [{"__path__" = "/tmp/logs/*.log", "job" = "secret-app"}]
	// Re-evaluate the glob every 5 seconds.
	sync_period  = "5s"
}

// Tail the discovered files and send every entry through the secret filter.
loki.source.file "log_scrape" {
	targets       = local.file_match.app_logs.targets
	forward_to    = [loki.secretfilter.default.receiver]
	tail_from_end = true
}

// Redact secrets from log lines before sending to Loki.
// Uses built-in Gitleaks patterns to detect API keys, passwords, tokens, etc.
loki.secretfilter "default" {
	forward_to = [loki.write.local.receiver]

	// Replacement text for each match; $SECRET_NAME is substituted per the README examples.
	redact_with = "<REDACTED:$SECRET_NAME>"
}

// Push the filtered entries to the Loki container's push API.
loki.write "local" {
	endpoint {
		url = "http://loki:3100/loki/api/v1/push"
	}
}


================================================
FILE: log-secret-filtering/docker-compose.coda.yml
================================================
# Defines only the secret-logger service; Alloy, Loki and Grafana are not declared here.
# NOTE(review): presumably used by the .coda tooling together with an externally
# provided stack — confirm.
services:
  secret-logger:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./app/main.py:/app/main.py
      # Directory the script writes app.log into.
      - ./logs:/logs
    command: python3 /app/main.py


================================================
FILE: log-secret-filtering/docker-compose.yml
================================================
services:

  # Python app that periodically logs fake secrets (API keys, passwords, tokens).
  secret-logger:
    image: python:${PYTHON_VERSION:-3.11-slim}
    command: python3 /app/main.py
    volumes:
      - ./app/main.py:/app/main.py
      # Shared with Alloy, which sees the same directory at /tmp/logs.
      - ./logs:/logs

  # Alloy telemetry pipeline: tails the log file and redacts secrets before forwarding to Loki.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    depends_on:
      - loki
    ports:
      - "12345:12345"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - ./logs:/tmp/logs
    command: >-
      run
      --server.http.listen-addr=0.0.0.0:12345
      --storage.path=/var/lib/alloy/data
      --stability.level=experimental
      /etc/alloy/config.alloy

  # Loki for log aggregation.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100/tcp"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Grafana for visualization, with anonymous admin access and a provisioned Loki datasource.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    depends_on:
      - loki
    ports:
      - "3000:3000/tcp"
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    # Write the datasource provisioning file, then start Grafana via /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: log-secret-filtering/loki-config.yaml
================================================
# Loki configuration for the log-secret-filtering demo: single instance, filesystem storage.
# All data paths live under /tmp/loki; docker-compose.yml mounts only this config
# file into the container, so no host directory backs that path.

# Multi-tenant authentication is turned off.
auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

server:
  # Port published by docker-compose.yml and targeted by Alloy's loki.write URL.
  http_listen_port: 3100

common:
  # Single-instance setup: in-memory ring store and a replication factor of 1.
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

schema_config:
  configs:
  # TSDB index with filesystem object storage, schema v13, daily index tables.
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

ingester:
  max_chunk_age: 2h


================================================
FILE: logs-file/README.md
================================================
# File Scenarios

Learn how to use Grafana Alloy to monitor logs from a file.

## Running the Demo

### Step 1: Clone the repository
```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

### Step 2: Deploy the monitoring stack
```bash
cd alloy-scenarios/logs-file
docker compose up -d
```

### Step 3: Access Grafana Alloy UI
Open your browser and go to `http://localhost:12345`. 

### Step 4: Access Grafana UI
Open your browser and go to `http://localhost:3000`.


================================================
FILE: logs-file/config.alloy
================================================


// Turn on live debugging so the pipeline can be inspected in the Alloy UI.
livedebugging {
  enabled = true
}

// Discover the log files written by the Python app. /temp/logs is where
// docker-compose.yml mounts the shared ./logs directory.
local.file_match "local_files" {
  sync_period  = "5s"
  path_targets = [
    {
      "__path__"  = "/temp/logs/*.log",
      "job"       = "python",
      "hostname"  = constants.hostname,
    },
  ]
}

// Tail the discovered files and forward every entry to Loki.
loki.source.file "log_scrape" {
  forward_to    = [loki.write.local.receiver]
  targets       = local.file_match.local_files.targets
  tail_from_end = true
}

// Push entries to the Loki container's push API.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

================================================
FILE: logs-file/docker-compose.coda.yml
================================================
# Defines only the logs-file service; Alloy, Loki and Grafana are not declared here.
# NOTE(review): presumably used by the .coda tooling together with an externally
# provided stack — confirm.
services:
  logs-file:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: logs-file
    volumes:
      - ./main.py:/main.py
      # Directory main.py writes app.log into.
      - ./logs:/logs
    command: ["python3", "/main.py"]


================================================
FILE: logs-file/docker-compose.yml
================================================
# File-based log collection demo: a Python app writes a log file, Alloy tails it
# and pushes to Loki, Grafana queries Loki. The top-level `version` key is omitted
# on purpose: it is obsolete in the Compose Specification.
services:

  # Python script that writes application logs to /logs/app.log.
  logs-file:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: logs-file
    volumes:
      - ./main.py:/main.py  # Log generator script
      - ./logs:/logs  # Log directory, shared with Alloy (mounted there as /temp/logs)
    depends_on:
      - alloy
    command: ["python3", "/main.py"]

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"
      # NOTE(review): config.alloy declares no listener on 4318 — confirm this mapping is needed.
      - "4318:4318"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - ./logs:/temp/logs
    command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - loki

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      # Anonymous admin access: no login is required for this demo.
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    # Write the datasource provisioning file, then start Grafana via /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: logs-file/loki-config.yaml
================================================

# This is a complete configuration to deploy Loki backed by the filesystem.
# The index will be shipped to the storage via tsdb-shipper.
# All data paths below live under /tmp/loki; docker-compose.yml mounts only this
# config file into the container, so no host directory backs that path.

# Multi-tenant authentication is turned off.
auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

server:
  # Port published by docker-compose.yml and targeted by Alloy's loki.write URL.
  http_listen_port: 3100

common:
  # Single-instance setup: in-memory ring store and a replication factor of 1.
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

schema_config:
  configs:
  # TSDB index with filesystem object storage, schema v13, daily index tables.
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

# Note: max_chunk_age is set far lower than the usual value.
# The stated reason is that this scenario is used within the LogCLI demo, which needs a short flush time
# to show how logcli stats --since 24h '{service_name="Delivery World", package_size="Large"}' works.
# NOTE(review): this note looks copied from another scenario — confirm it applies here.
ingester:
  max_chunk_age: 5m # Should be 2 hours

================================================
FILE: logs-file/main.py
================================================
import logging
import time
import random
import os

# The log file lives in /logs, the directory docker-compose.yml bind-mounts
# so that Alloy can tail the same file.
log_directory = "/logs"
log_file = os.path.join(log_directory, "app.log")

# Create the directory when missing. exist_ok=True avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(log_directory, exist_ok=True)

# Send every record (DEBUG and above) both to the log file and to the console.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger("ExampleApp")

def simulate_process():
    """Emit one randomly chosen log record every 5 seconds, forever."""
    # Each simulated action maps to the level and message it is reported with.
    outcomes = {
        "start": (logging.INFO, "Process started successfully."),
        "process": (logging.DEBUG, "Processing data..."),
        "error": (logging.ERROR, "An error occurred during processing."),
        "complete": (logging.WARNING, "Process completed with minor warnings."),
    }
    action_names = list(outcomes)

    while True:  # Runs until the process is stopped
        level, text = outcomes[random.choice(action_names)]
        logger.log(level, text)
        time.sleep(5)  # One record every 5 seconds

if __name__ == "__main__":
    # Script entry point: log start-up, run the simulation, log shutdown.
    logger.info("Application started.")

    try:
        simulate_process()
    except Exception as exc:
        # simulate_process() loops forever, so this is reached only when it raises.
        logger.critical("Unhandled exception: %s", exc)

    logger.info("Application finished.")


================================================
FILE: logs-tcp/README.md
================================================
# Logs Over TCP Scenario

This scenario demonstrates how to send JSON-formatted logs to Alloy over TCP. Alloy then uses `loki.process` to parse each JSON log line and extract its fields, which are used to generate labels and structured metadata for the logs.

## Running the Demo

### Step 1: Clone the repository
```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

### Step 2: Deploy the monitoring stack
```bash
cd alloy-scenarios/logs-tcp
docker compose up -d
```

### Step 3: Access Grafana Alloy UI
Open your browser and go to `http://localhost:12345`. 

### Step 4: Access Grafana UI
Open your browser and go to `http://localhost:3000`.


================================================
FILE: logs-tcp/config.alloy
================================================


// Turn on live debugging so the pipeline can be inspected in the Alloy UI.
livedebugging {
  enabled = true
}

// Loki push API listener; the simulator container sends its logs to port 9999.
loki.source.api "loki_push_api" {
  forward_to = [loki.process.labels.receiver]

  http {
    listen_address = "0.0.0.0"
    listen_port    = 9999
  }
}

// Parse each JSON log line and promote selected fields.
loki.process "labels" {
  forward_to = [loki.write.local.receiver]

  // Extract three JSON fields into the shared extracted-values map.
  stage.json {
    expressions = {
      "extracted_service"   = "service_name",
      "extracted_code_line" = "code_line",
      "extracted_server"    = "server_id",
    }
  }

  // The service name becomes an indexed label.
  stage.labels {
    values = {
      "service_name" = "extracted_service",
    }
  }

  // Code line and server ID are attached as structured metadata.
  stage.structured_metadata {
    values = {
      "code_line" = "extracted_code_line",
      "server"    = "extracted_server",
    }
  }
}

// Push the processed entries to the Loki container's push API.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

================================================
FILE: logs-tcp/docker-compose.coda.yml
================================================
# Defines only the simulator service; Alloy, Loki and Grafana are not declared here.
# NOTE(review): presumably used by the .coda tooling together with an externally
# provided stack — confirm.
services:
  simulator:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: simulator
    volumes:
      - ./simulator.py:/simulator.py
    environment:
      # Host and port simulator.py connects to (read via os.getenv).
      - TARGET_HOST=alloy
      - TARGET_PORT=9999
    command: ["python3", "/simulator.py"]


================================================
FILE: logs-tcp/docker-compose.yml
================================================
# Logs-over-TCP demo: a Python simulator sends JSON logs to Alloy, Alloy pushes
# to Loki, Grafana queries Loki. The top-level `version` key is omitted on
# purpose (obsolete in the Compose Specification), and the previously declared
# `rsyslog_data` named volume was dropped because no service referenced it.
services:

  # Python script that generates JSON log messages and sends them to Alloy.
  simulator:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: simulator
    volumes:
      - ./simulator.py:/simulator.py  # Log simulator script
    environment:
      # Host and port simulator.py connects to; 9999 is Alloy's loki.source.api port.
      - TARGET_HOST=alloy
      - TARGET_PORT=9999
    command: ["python3", "/simulator.py"]
    depends_on:
      - alloy

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"
      # NOTE(review): config.alloy declares no listener on 4318 — confirm this mapping is needed.
      - "4318:4318"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      # NOTE(review): config.alloy does not read /tmp/app-logs — confirm this mount is needed.
      - ./logs:/tmp/app-logs/
    command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - loki

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      # Anonymous admin access: no login is required for this demo.
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    # Write the datasource provisioning file, then start Grafana via /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: logs-tcp/loki-config.yaml
================================================
# Loki configuration for the logs-tcp demo: single instance, filesystem storage.

# Multi-tenant authentication is turned off.
auth_enabled: false

server:
  # HTTP port published by docker-compose.yml and targeted by Alloy's loki.write URL.
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: debug
  grpc_server_max_concurrent_streams: 1000

common:
  instance_addr: 127.0.0.1
  # Everything is stored under /tmp/storage; docker-compose.yml mounts only this
  # config file into the container, so no host directory backs that path.
  path_prefix: /tmp/storage
  storage:
    filesystem:
      chunks_directory: /tmp/storage/chunks
      rules_directory: /tmp/storage/rules
  # Single-instance setup: one replica and an in-memory ring store.
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

limits_config:
  metric_aggregation_enabled: true

schema_config:
  configs:
    # TSDB index with filesystem object storage, schema v13, daily index tables.
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

pattern_ingester:
  enabled: true
  metric_aggregation:
    # Points back at this Loki instance's own HTTP port.
    loki_address: localhost:3100

ruler:
  # NOTE(review): docker-compose.yml defines no Alertmanager service — confirm this URL is needed.
  alertmanager_url: http://localhost:9093

frontend:
  encoding: protobuf


# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
#
# Statistics help us better understand how Loki is used, and they show us performance
# levels for most users. This helps us prioritize features and documentation.
# For more information on what's sent, look at
# https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go
# Refer to the buildReport method to see what goes into a report.
#
# If you would like to disable reporting, uncomment the following lines:
#analytics:
#  reporting_enabled: false

================================================
FILE: logs-tcp/simulator.py
================================================
import socket
import time
import os
import random
import json
from datetime import datetime

# Timezone-aware UTC timestamps replace the deprecated datetime.utcnow().
# Imported here so this block brings its own dependency into scope.
from datetime import timezone

# Get the target host and port from environment variables
target_host = os.getenv('TARGET_HOST', 'localhost')
target_port = int(os.getenv('TARGET_PORT', 5140))

# Define the endpoint path
endpoint_path = "/loki/api/v1/raw"

# Create a TCP socket and connect once; the same connection is reused for
# every request (HTTP keep-alive).
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    sock.connect((target_host, target_port))
except socket.error as e:
    print(f"Failed to connect to {target_host}:{target_port} - {e}")
    sock.close()
    # SystemExit is raised directly; the bare exit() helper is only
    # available when the site module is loaded.
    raise SystemExit(1)

# Define log levels and messages
log_levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
messages = [
    "System started successfully",
    "User login successful",
    "Configuration loaded",
    "Connection to database failed",
    "Data processed successfully",
    "Invalid API request received",
    "Memory usage high",
    "Disk space low",
    "Unknown error occurred",
    "Service restarted",
]

# Define extra fields for the log payload
service_names = ["AuthService", "DataService", "PaymentService", "NotificationService"]
regions = ["us-east-1", "eu-west-1", "ap-south-1", "sa-east-1"]
server_ids = ["srv-101", "srv-202", "srv-303", "srv-404"]

# Generate and send JSON log messages every few seconds.
# NOTE(review): responses from the server are never read from the socket;
# for a very long-running demo consider draining them.
try:
    while True:
        try:
            # UTC timestamp in RFC 3339 form, e.g. 2024-01-01T12:00:00Z
            timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
            log_level = random.choice(log_levels)
            message_text = random.choice(messages)
            service_name = random.choice(service_names)
            region = random.choice(regions)
            server_id = random.choice(server_ids)
            code_line = random.randint(20, 120)  # Simulate random code line numbers

            # Create the JSON log payload
            log_payload = {
                "timestamp": timestamp,
                "severity": log_level,
                "body": message_text,
                "service_name": service_name,
                "code_line": code_line,
                "region": region,
                "server_id": server_id
            }

            # Serialize once; Content-Length must count bytes on the wire,
            # not characters, so it is taken from the encoded body.
            log_json = json.dumps(log_payload)
            body_bytes = log_json.encode()

            # Build the HTTP POST request head for the raw push endpoint
            request_head = (
                f"POST {endpoint_path} HTTP/1.1\r\n"
                f"Host: {target_host}\r\n"
                "Content-Type: application/json\r\n"
                f"Content-Length: {len(body_bytes)}\r\n"
                "Connection: keep-alive\r\n"
                "\r\n"
            )

            # Send the HTTP request over TCP
            sock.sendall(request_head.encode() + body_bytes)
            print(f"Sent JSON log message to {target_host}:{target_port} - {log_json}")
        except socket.error as e:
            print(f"Failed to send log message - {e}")
            break

        # Wait for a few seconds before sending the next message
        time.sleep(random.randint(3, 8))  # Send a message every 3-8 seconds
finally:
    # Release the connection however the loop ends (send failure, Ctrl-C, ...).
    sock.close()


================================================
FILE: mail-house/README.md
================================================
# Mail House Scenario

Learn how to parse structured logs into Labels and Structured Metadata.

## Running the Demo

### Step 1: Clone the repository
```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

### Step 2: Deploy the monitoring stack
```bash
cd alloy-scenarios/mail-house
docker compose up -d
```

### Step 3: Access Grafana Alloy UI
Open your browser and go to `http://localhost:12345`. 

### Step 4: Access Grafana UI
Open your browser and go to `http://localhost:3000`.


================================================
FILE: mail-house/config.alloy
================================================


// Enable live debugging for the Alloy UI.
livedebugging {
	enabled = true
}

// Receive log pushes over HTTP on port 9999 (all interfaces) and hand every
// entry to the processing pipeline below.
loki.source.api "loki_push_api" {
	forward_to = [loki.process.labels.receiver]

	http {
		listen_address = "0.0.0.0"
		listen_port    = 9999
	}
}

// Turn the JSON log body into a timestamp, labels and structured metadata.
// Stages run in the order written.
loki.process "labels" {
	forward_to = [loki.write.local.receiver]

	// Pull these fields out of the JSON line. The expressions are left empty;
	// presumably the key name itself is then used as the lookup path.
	stage.json {
		expressions = {
			"timestamp"      = "",
			"state"          = "",
			"package_size"   = "",
			"package_status" = "",
			"package_id"     = "",
			"mail_house_id"  = "",
		}
	}

	// Use the extracted `timestamp` field as the entry's timestamp.
	stage.timestamp {
		source = "timestamp"
		format = "RFC3339"
	}

	// Promote these extracted fields to labels.
	stage.labels {
		values = {
			"state"         = "",
			"package_size"  = "",
			"mail_house_id" = "",
		}
	}

	// Attach these extracted fields as structured metadata instead of labels.
	stage.structured_metadata {
		values = {
			"package_status" = "",
			"package_id"     = "",
		}
	}

	// Every entry gets the same service_name label.
	stage.static_labels {
		values = {
			"service_name" = "Delivery World",
		}
	}

	// NOTE(review): no `message` field is extracted by stage.json above, so
	// this stage looks like a no-op — confirm before relying on it.
	stage.output {
		source = "message"
	}
}

// Ship processed entries to the Loki container's push endpoint.
loki.write "local" {
	endpoint {
		url = "http://loki:3100/loki/api/v1/push"
	}
}

================================================
FILE: mail-house/docker-compose.coda.yml
================================================
# Three simulated mail-house depots. Each runs the same main.py from a bind
# mount and differs only in its MAIL_HOUSE_ID.
# NOTE(review): no `alloy` service is defined in this file, while main.py
# defaults TARGET_HOST to `alloy`; presumably the receiver is provided by the
# surrounding environment — confirm.
services:
  mail-house-01:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./main.py:/main.py
    command: ["python3", "/main.py"]
    environment:
      - MAIL_HOUSE_ID=DEPOT-01
    restart: unless-stopped

  mail-house-02:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./main.py:/main.py
    command: ["python3", "/main.py"]
    environment:
      - MAIL_HOUSE_ID=DEPOT-02
    restart: unless-stopped

  mail-house-03:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./main.py:/main.py
    command: ["python3", "/main.py"]
    environment:
      - MAIL_HOUSE_ID=DEPOT-03
    restart: unless-stopped


================================================
FILE: mail-house/docker-compose.yml
================================================
# Mail House scenario: three simulated depots send JSON logs to Alloy, Alloy
# forwards them to Loki, and Grafana is provisioned with a Loki datasource.
# The top-level `version` key is intentionally absent: it is obsolete in the
# Compose specification and the other scenarios' compose files do not set it.
services:
  # Each depot runs the same main.py and differs only in MAIL_HOUSE_ID.
  # main.py exits on a send failure, so `restart: unless-stopped` brings the
  # sender back up.
  mail-house-01:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./main.py:/main.py
    depends_on:
      - alloy
    command: ["python3", "/main.py"]
    environment:
      - MAIL_HOUSE_ID=DEPOT-01
    restart: unless-stopped

  mail-house-02:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./main.py:/main.py
    depends_on:
      - alloy
    command: ["python3", "/main.py"]
    environment:
      - MAIL_HOUSE_ID=DEPOT-02
    restart: unless-stopped

  mail-house-03:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./main.py:/main.py
    depends_on:
      - alloy
    command: ["python3", "/main.py"]
    environment:
      - MAIL_HOUSE_ID=DEPOT-03
    restart: unless-stopped

  # Alloy UI is published on 12345. The depots reach Alloy's push listener
  # (port 9999 in config.alloy) over the compose network, so it is not published.
  # NOTE(review): config.alloy in this scenario defines no listener on 4318;
  # that mapping appears unused — confirm before removing it.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"
      - "4318:4318"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental  --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - loki

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Anonymous admin access, no login. The entrypoint writes a Loki datasource
  # provisioning file and then starts Grafana via /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: mail-house/loki-config.yaml
================================================

# This is a complete configuration to deploy Loki backed by the filesystem.
# The index will be shipped to the storage via tsdb-shipper.

# Authentication is disabled.
auth_enabled: false

# Structured metadata is allowed because the Alloy pipeline in this scenario
# attaches package_status and package_id as structured metadata.
limits_config:
  allow_structured_metadata: true
  volume_enabled: true

server:
  http_listen_port: 3100

# Replication factor of 1 with an in-memory ring store; state lives under /tmp/loki.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# One schema period: TSDB index on the filesystem, schema v13, 24h index tables.
schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

# Index and chunks are both written to local directories under /tmp/loki.
storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

# Note: the max chunk age is set far lower than the value normally expected.
# This scenario is used in the LogCLI demo, which needs chunks flushed quickly
# so that a command such as
#   logcli stats --since 24h '{service_name="Delivery World", package_size="Large"}'
# has data to report.
ingester:
  max_chunk_age: 5m # The usual value would be 2 hours

================================================
FILE: mail-house/main.py
================================================
import random
import json
import time
import socket
from datetime import datetime
import os


# Get the target host and port from environment variables.
# Defaults point at the `alloy` host on port 9999.
target_host = os.getenv('TARGET_HOST', 'alloy')
target_port = int(os.getenv('TARGET_PORT', 9999))
# Get the mail house ID from environment variables; it is copied into every
# log entry as "mail_house_id".
mail_house_id = os.getenv('MAIL_HOUSE_ID', 'MAIL-01')

# Define the endpoint path used in the request line of every POST
endpoint_path = "/loki/api/v1/raw"

# List of states and cities in America (abbreviated version).
# Maps a state name to the cities a package can be sent from / delivered to.
STATES_CITIES = {
    "California": ["Los Angeles", "San Francisco", "San Diego"],
    "Texas": ["Houston", "Dallas", "Austin"],
    "New York": ["New York City", "Buffalo", "Rochester"],
    "Florida": ["Miami", "Orlando", "Tampa"],
    "Illinois": ["Chicago", "Springfield", "Naperville"],
}

# Package statuses and metadata: pools that generate_log_entry() samples from
PACKAGE_SIZES = ["Small", "Medium", "Large"]
PACKAGE_TYPES = ["Documents", "Electronics", "Clothing", "Food", "Furniture"]
PACKAGE_STATUS_LEVELS = ["info", "warning", "critical", "error"]
PACKAGE_NOTES = [
    "In transit",
    "Out for delivery",
    "Delivered successfully",
    "Delayed due to weather",
    "Address not found",
    "Returned to sender",
    "Damaged during transit",
]


def generate_log_entry():
    """Build one randomized package-tracking log entry.

    Returns:
        dict: a JSON-serializable record with a UTC ``timestamp``, a random
        ``state``/``city`` pair, package details (id, type, size, status,
        note), nested ``sender`` and ``receiver`` objects, and the
        ``mail_house_id`` of this process.
    """
    # Local import keeps this function self-contained; the module only
    # imports `datetime` itself.
    from datetime import timezone

    state = random.choice(list(STATES_CITIES.keys()))
    city = random.choice(STATES_CITIES[state])

    log_entry = {
        # The trailing "Z" denotes UTC, so the clock reading must be UTC too;
        # a naive local datetime.now() would mislabel the time on any host
        # whose timezone is not UTC.
        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f") + "Z",
        "state": state,
        "city": city,
        "package_id": f"PKG{random.randint(10000, 99999)}",
        "package_type": random.choice(PACKAGE_TYPES),
        "package_size": random.choice(PACKAGE_SIZES),
        "package_status": random.choice(PACKAGE_STATUS_LEVELS),
        "note": random.choice(PACKAGE_NOTES),
        "sender": {
            "name": f"Sender{random.randint(1, 100)}",
            "address": f"{random.randint(100, 999)} {random.choice(['Main St', 'Broadway', 'Elm St', 'Maple Ave'])}, {city}, {state}",
        },
        # The receiver is in the same state as the sender, but its city is
        # drawn independently.
        "receiver": {
            "name": f"Receiver{random.randint(1, 100)}",
            "address": f"{random.randint(100, 999)} {random.choice(['Oak St', 'Pine Rd', 'Cedar Blvd', 'Willow Ln'])}, {random.choice(STATES_CITIES[state])}, {state}",
        },
        "mail_house_id": mail_house_id,
    }
    return log_entry


def main():
    """Connect to the log receiver and send one JSON log entry per second.

    Connection attempts are retried once per second until one succeeds.
    After that, entries from generate_log_entry() are sent as HTTP POST
    requests over the single keep-alive TCP connection.

    Raises:
        SystemExit: with code 1 when sending fails, after closing the socket.
    """
    # Retry in a loop rather than by recursion: each failed attempt closes its
    # socket, and a long outage cannot exhaust the call stack.
    while True:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.connect((target_host, target_port))
            break
        except socket.error as e:
            print(f"Failed to connect to {target_host}:{target_port} - {e}")
            sock.close()
            time.sleep(1)

    while True:
        try:
            log_entry = generate_log_entry()
            log_entry_json = json.dumps(log_entry)
            # Content-Length counts bytes on the wire, so measure the encoded body.
            body_bytes = log_entry_json.encode()

            request_head = (
                f"POST {endpoint_path} HTTP/1.1\r\n"
                f"Host: {target_host}\r\n"
                "Content-Type: application/json\r\n"
                f"Content-Length: {len(body_bytes)}\r\n"
                "Connection: keep-alive\r\n"
                "\r\n"
            )

            # Send the HTTP request over TCP
            sock.sendall(request_head.encode() + body_bytes)
            print(f"Sent JSON log message to {target_host}:{target_port} - {log_entry_json}")

            # Wait one second before sending the next log
            time.sleep(1)
        except socket.error as e:
            print(f"Failed to send log message - {e}")
            # Close the socket and exit with a failure status
            sock.close()
            raise SystemExit(1)
            

if __name__ == "__main__":
    main()


================================================
FILE: memcached-monitoring/README.md
================================================
# Memcached Monitoring with Grafana Alloy

This scenario demonstrates how to monitor a Memcached instance using Grafana Alloy's built-in `prometheus.exporter.memcached` component.

## Architecture

- **Memcached** - The monitored Memcached instance
- **Grafana Alloy** - Collects Memcached metrics via `prometheus.exporter.memcached` and remote writes them to Prometheus
- **Prometheus** - Stores the scraped metrics
- **Grafana** - Visualizes Memcached metrics (auto-provisioned with Prometheus datasource)

## Running

```bash
# From this directory
docker compose up -d

# Or from the repo root using centralized image versions
./run-example.sh memcached-monitoring
```

## Accessing

- **Grafana**: http://localhost:3000 (no login required)
- **Alloy UI**: http://localhost:12345
- **Prometheus**: http://localhost:9090

## Key Metrics

Once running, you can query Memcached metrics in Grafana or Prometheus. Some useful metrics include:

- `memcached_up` - Whether Memcached is reachable
- `memcached_current_connections` - Number of current connections
- `memcached_current_bytes` - Current number of bytes stored
- `memcached_current_items` - Current number of items stored
- `memcached_commands_total` - Total commands by command type (get, set, etc.)
- `memcached_items_evicted_total` - Total number of items evicted
- `memcached_read_bytes_total` / `memcached_written_bytes_total` - Network throughput

## Stopping

```bash
docker compose down
```


================================================
FILE: memcached-monitoring/config.alloy
================================================
// Memcached Monitoring with Grafana Alloy
// This configuration scrapes Memcached metrics using the built-in prometheus.exporter.memcached component
// and remote writes them to Prometheus.

// Enable live debugging for the Alloy UI.
livedebugging {
	enabled = true
}

// Built-in exporter pointed at the `memcached` host on port 11211.
prometheus.exporter.memcached "default" {
	address = "memcached:11211"
}

// Scrape the exporter's targets and pass the samples to remote_write.
prometheus.scrape "memcached" {
	targets    = prometheus.exporter.memcached.default.targets
	forward_to = [prometheus.remote_write.default.receiver]
}

// Push samples to the Prometheus container's remote-write endpoint.
prometheus.remote_write "default" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}


================================================
FILE: memcached-monitoring/docker-compose.coda.yml
================================================
# Only the monitored Memcached instance is defined here; the image is pinned
# by digest and port 11211 is published on the host.
services:
  memcached:
    image: memcached:1.6@sha256:277e0c4f249b118e95ab10e535bae2fa1af772271d9152f3468e58d59348db56
    ports:
      - "11211:11211"


================================================
FILE: memcached-monitoring/docker-compose.yml
================================================
services:
  # The Memcached instance being monitored (image pinned by digest).
  memcached:
    image: memcached:1.6@sha256:277e0c4f249b118e95ab10e535bae2fa1af772271d9152f3468e58d59348db56
    ports:
      - "11211:11211"

  # Metrics store. The remote-write receiver is enabled so Alloy can push
  # samples to it.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    ports:
      - "9090:9090"
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml

  # Anonymous admin access, no login. The entrypoint writes a Prometheus
  # datasource provisioning file and then starts Grafana via /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    ports:
      - "3000:3000/tcp"
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  # Alloy runs config.alloy and serves its UI on 12345; it is started after
  # memcached and prometheus.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    depends_on:
      - memcached
      - prometheus
    ports:
      - "12345:12345"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command:
      - run
      - --server.http.listen-addr=0.0.0.0:12345
      - --storage.path=/var/lib/alloy/data
      - /etc/alloy/config.alloy


================================================
FILE: memcached-monitoring/prom-config.yaml
================================================
# Only global intervals are set; no scrape_configs are defined in this file.
# The scenario's compose file starts Prometheus with the remote-write
# receiver enabled.
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: mysql-monitoring/README.md
================================================
# MySQL Monitoring with Grafana Alloy

This scenario demonstrates how to monitor a MySQL database using Grafana Alloy's `prometheus.exporter.mysql` component. Alloy scrapes MySQL metrics and remote-writes them to Prometheus, which Grafana queries for visualization.

## Prerequisites

- Docker and Docker Compose installed

## Getting Started

```bash
git clone https://github.com/grafana/alloy-scenarios.git
cd alloy-scenarios/mysql-monitoring
docker compose up -d
```

## Access Points

| Service    | URL                          |
|------------|------------------------------|
| Grafana    | http://localhost:3000        |
| Alloy UI   | http://localhost:12345       |
| Prometheus | http://localhost:9090        |

## What to Expect

Once the stack is running, Alloy connects to the MySQL instance and exposes metrics via the `prometheus.exporter.mysql` component. These metrics are scraped every 15 seconds and forwarded to Prometheus using remote write.

Open Grafana at http://localhost:3000, navigate to **Explore**, select the **Prometheus** datasource, and query for `mysql_` prefixed metrics (e.g., `mysql_up`, `mysql_global_status_connections`, `mysql_global_status_threads_connected`).

You can also inspect the Alloy pipeline at http://localhost:12345 to verify that the exporter, scrape, and remote write components are healthy. Live debugging is enabled for real-time pipeline inspection.

## Stopping the Scenario

```bash
docker compose down
```


================================================
FILE: mysql-monitoring/config.alloy
================================================
// ###############################
// #### Metrics Configuration ####
// ###############################

// Enable live debugging for the Alloy UI.
livedebugging {
	enabled = true
}

// Expose MySQL metrics using the prometheus.exporter.mysql component.
// The DSN logs in as `root` with password `alloy` on host `mysql`, port 3306;
// the password matches MYSQL_ROOT_PASSWORD in this scenario's compose file.
prometheus.exporter.mysql "default" {
	data_source_name = "root:alloy@(mysql:3306)/"
}

// Configure a prometheus.scrape component to collect MySQL metrics
// from the exporter above every 15 seconds.
prometheus.scrape "mysql" {
	targets    = prometheus.exporter.mysql.default.targets
	forward_to = [prometheus.remote_write.default.receiver]

	scrape_interval = "15s"
}

// Configure a prometheus.remote_write component to send metrics to Prometheus.
prometheus.remote_write "default" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}


================================================
FILE: mysql-monitoring/docker-compose.coda.yml
================================================
# Only the monitored MySQL instance is defined here (image pinned by digest).
services:
  mysql:
    image: mysql:9.7@sha256:f0ef1d92fa650fcfa5b85f1d82bb1a56a6dd579bf256b8f8f2a5a0b1b61c8b0b
    environment:
      - MYSQL_ROOT_PASSWORD=alloy
      - MYSQL_DATABASE=alloy
    ports:
      - "3306:3306"
    # Healthy once `mysqladmin ping` succeeds as root; checked every 10s with
    # a 5s timeout and up to 5 retries.
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root", "-palloy"]
      interval: 10s
      timeout: 5s
      retries: 5


================================================
FILE: mysql-monitoring/docker-compose.yml
================================================
services:
  # The MySQL instance being monitored (image pinned by digest). The root
  # password matches the DSN used in config.alloy.
  mysql:
    image: mysql:9.7@sha256:f0ef1d92fa650fcfa5b85f1d82bb1a56a6dd579bf256b8f8f2a5a0b1b61c8b0b
    environment:
      - MYSQL_ROOT_PASSWORD=alloy
      - MYSQL_DATABASE=alloy
    ports:
      - "3306:3306"
    # Healthy once `mysqladmin ping` succeeds as root; Alloy waits for this.
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root", "-palloy"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Metrics store. The remote-write receiver is enabled so Alloy can push
  # samples to it.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  # Anonymous admin access, no login. The entrypoint writes a Prometheus
  # datasource provisioning file and then starts Grafana via /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  # Alloy runs config.alloy and serves its UI on 12345.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    # Alloy reads from MySQL and remote-writes to Prometheus, so it depends on
    # both: MySQL must pass its healthcheck, Prometheus only has to be started.
    depends_on:
      mysql:
        condition: service_healthy
      prometheus:
        condition: service_started


================================================
FILE: mysql-monitoring/prom-config.yaml
================================================
# Only global intervals are set; no scrape_configs are defined in this file.
# The scenario's compose file starts Prometheus with the remote-write
# receiver enabled.
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: nginx-monitoring/README.md
================================================
# NGINX Monitoring with Grafana Alloy

End-to-end NGINX observability with a single Alloy pipeline:

- **Logs** — `loki.source.file` tails NGINX access and error logs; `loki.process` parses the combined log format and promotes `method` and `status` to labels.
- **Metrics** — `prometheus.scrape` scrapes `nginx-prometheus-exporter` (which itself reads NGINX's built-in `stub_status`) and remote-writes to Prometheus.

## Architecture

- **NGINX** — the monitored web server (`/nginx_status` enabled, access/error logs written to a shared volume)
- **nginx-prometheus-exporter** — translates `stub_status` into Prometheus metrics on `:9113`
- **loadgen** — small `curl` loop that sends two requests to NGINX roughly every second (one returning 200, one returning 404) so the demo has visible activity
- **Grafana Alloy** — the pipeline above, exposed at `:12345`
- **Loki / Prometheus / Grafana** — backends and visualization, with Loki and Prometheus datasources auto-provisioned

## Running

```bash
# From this directory
docker compose up -d

# Or from the repo root using centralized image versions
./run-example.sh nginx-monitoring
```

## Accessing

- **Grafana**: http://localhost:3000 (no login required)
- **Alloy UI**: http://localhost:12345 — verify components are healthy and inspect the live data flow
- **Prometheus**: http://localhost:9090
- **NGINX**: http://localhost:8080 — `/` returns "ok", `/nginx_status` returns connection counters

## Trying it out

The `loadgen` container sends two requests to NGINX roughly every second (one that returns 200 and one that returns 404). Within ~30 seconds you should see:

### Logs (Loki)

```logql
# All access logs
{job="nginx", log_type="access"}

# Just 4xx
{job="nginx", log_type="access", status=~"4.."}

# Error log
{job="nginx", log_type="error"}
```

The combined-log regex extracts `remote_addr`, `remote_user`, `time_local`, `method`, `path`, `status`, and `bytes_sent`. Of those, only `method` and `status` are promoted to Loki labels for fast filtering; the others are not attached to the entry and remain available only in the log line text.

### Metrics (Prometheus)

```promql
# Active connections
nginx_connections_active

# Accepted-since-start counter (per second)
rate(nginx_connections_accepted[1m])

# Total HTTP requests
nginx_http_requests_total
```

## Customization

- **Different log format**: edit the regex in `config.alloy` under `loki.process.nginx`. The default expects NGINX's built-in `combined` format.
- **Different exporter target**: change the `--nginx.scrape-uri` flag on `nginx-exporter` in `docker-compose.yml`.
- **More log sources**: add entries to `local.file_match.nginx.path_targets`.

## Stopping

```bash
docker compose down -v
```

The `-v` removes the shared `nginx-logs` volume so the next run starts with a clean log file.


================================================
FILE: nginx-monitoring/config.alloy
================================================
// NGINX Monitoring with Grafana Alloy.
// Logs: tail access.log + error.log via loki.source.file, parse the access log
// with a combined-format regex, and ship to Loki with method/status labels.
// Metrics: scrape nginx-prometheus-exporter and remote_write to Prometheus.

// Enable live debugging for the Alloy UI.
livedebugging {
	enabled = true
}

// --- logs pipeline ---

// Two fixed log files, each tagged with job="nginx" and a log_type label.
// The paths match where nginx.conf writes its access and error logs.
local.file_match "nginx" {
	path_targets = [
		{
			__path__ = "/var/log/nginx-data/access.log",
			job      = "nginx",
			log_type = "access",
		},
		{
			__path__ = "/var/log/nginx-data/error.log",
			job      = "nginx",
			log_type = "error",
		},
	]
	sync_period = "5s"
}

// Tail the matched files and pass each line to the processing stage.
// tail_from_end is set, so presumably content already in the files when
// tailing starts is skipped — confirm against the component documentation.
loki.source.file "nginx" {
	targets       = local.file_match.nginx.targets
	forward_to    = [loki.process.nginx.receiver]
	tail_from_end = true
}

loki.process "nginx" {
	// Extract `method` and `status` from access logs (combined format).
	// Error logs pass through unchanged.
	stage.match {
		selector = "{log_type=\"access\"}"

		// Named groups: remote_addr, remote_user, time_local, method, path,
		// status, bytes_sent. Only method and status are used below.
		stage.regex {
			expression = `^(?P<remote_addr>\S+) - (?P<remote_user>\S+) \[(?P<time_local>[^\]]+)\] "(?P<method>\S+) (?P<path>\S+) [^"]+" (?P<status>\d+) (?P<bytes_sent>\d+)`
		}

		// Promote the two extracted fields to labels.
		stage.labels {
			values = {
				method = "",
				status = "",
			}
		}
	}

	forward_to = [loki.write.local.receiver]
}

// Ship processed entries to the Loki container's push endpoint.
loki.write "local" {
	endpoint {
		url = "http://loki:3100/loki/api/v1/push"
	}
}

// --- metrics pipeline ---

// Scrape the nginx-exporter container on port 9113 every 15 seconds.
prometheus.scrape "nginx" {
	targets = [{
		__address__ = "nginx-exporter:9113",
		job         = "nginx",
	}]
	forward_to      = [prometheus.remote_write.local.receiver]
	scrape_interval = "15s"
}

// Push samples to the Prometheus container's remote-write endpoint.
prometheus.remote_write "local" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}


================================================
FILE: nginx-monitoring/docker-compose.yml
================================================
services:
  # The monitored web server, published on host port 8080. Its logs go to the
  # shared nginx-logs volume so Alloy can tail them.
  nginx:
    image: nginx:${NGINX_VERSION:-1.30-alpine}
    container_name: nginx-monitoring-nginx
    ports:
      - "8080:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
      - nginx-logs:/var/log/nginx-data

  # Reads NGINX's /nginx_status page and is published on 9113.
  nginx-exporter:
    image: nginx/nginx-prometheus-exporter:${NGINX_EXPORTER_VERSION:-1.5.1}
    container_name: nginx-monitoring-exporter
    command:
      - --nginx.scrape-uri=http://nginx:80/nginx_status
    ports:
      - "9113:9113"
    depends_on:
      - nginx

  # Traffic generator: waits until NGINX answers, then on every loop iteration
  # requests `/` and `/missing-page` and sleeps for one second.
  loadgen:
    image: curlimages/curl:${CURL_VERSION:-8.20.0}
    container_name: nginx-monitoring-loadgen
    entrypoint:
      - sh
      - -c
      - |
        until curl -s -o /dev/null --max-time 2 http://nginx/; do sleep 1; done
        while true; do
          curl -s -o /dev/null http://nginx/
          curl -s -o /dev/null http://nginx/missing-page
          sleep 1
        done
    depends_on:
      - nginx
    restart: unless-stopped

  # Alloy runs config.alloy, serves its UI on 12345, and mounts the shared
  # log volume read-only.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    container_name: nginx-monitoring-alloy
    ports:
      - "12345:12345"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - nginx-logs:/var/log/nginx-data:ro
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - nginx
      - nginx-exporter
      - loki
      - prometheus

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    container_name: nginx-monitoring-loki
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Metrics store. The remote-write receiver is enabled so Alloy can push
  # samples to it.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    container_name: nginx-monitoring-prometheus
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  # Anonymous admin access, no login. The entrypoint writes provisioning for
  # two datasources (Loki, and Prometheus as the default) and then starts
  # Grafana via /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    container_name: nginx-monitoring-grafana
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          access: proxy
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

# Named volume shared by nginx (read-write) and alloy (read-only).
volumes:
  nginx-logs:


================================================
FILE: nginx-monitoring/loki-config.yaml
================================================
# Loki configuration for the nginx-monitoring scenario.
# Authentication is disabled.
auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

server:
  http_listen_port: 3100

# Replication factor of 1 with an in-memory ring store; state lives under /tmp/loki.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# One schema period: TSDB index on the filesystem, schema v13, 24h index tables.
schema_config:
  configs:
    - from: 2020-05-15
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

# Index and chunks are both written to local directories under /tmp/loki.
storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

pattern_ingester:
  enabled: true

# NOTE(review): 5m is a short chunk age, presumably chosen so the demo
# flushes chunks quickly — confirm.
ingester:
  max_chunk_age: 5m


================================================
FILE: nginx-monitoring/nginx.conf
================================================
# Minimal NGINX for the monitoring demo: one worker, one server on port 80.
worker_processes 1;
events {
    worker_connections 1024;
}

http {
    # Use the nginx built-in "combined" log format:
    # '$remote_addr - $remote_user [$time_local] "$request" '
    # '$status $body_bytes_sent "$http_referer" "$http_user_agent"'
    #
    # Write to a fresh path outside /var/log/nginx — that directory
    # has access.log/error.log pre-symlinked to /dev/stdout in the
    # nginx image, which Alloy's tailer cannot follow across containers.
    access_log /var/log/nginx-data/access.log combined;
    error_log  /var/log/nginx-data/error.log warn;

    server {
        listen 80;
        server_name _;

        # Always answers 200 with a plain-text body.
        # default_type sets the response's Content-Type; using add_header for
        # this appends a second Content-Type header next to the one nginx
        # already emits for the response.
        location = / {
            default_type text/plain;
            return 200 "ok\n";
        }

        # Deterministic 404 so the load generator produces error-status logs.
        location = /missing-page {
            return 404;
        }

        # stub_status page read by nginx-prometheus-exporter; its requests are
        # kept out of the access log.
        location /nginx_status {
            stub_status on;
            access_log off;
        }
    }
}


================================================
FILE: nginx-monitoring/prom-config.yaml
================================================
# Only global intervals are set; no scrape_configs are defined in this file.
# The scenario's compose file starts Prometheus with the remote-write
# receiver enabled.
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: otel-basic-tracing/README.md
================================================
# OpenTelemetry Basic Tracing with Grafana Alloy

This example demonstrates how to collect and visualize OpenTelemetry traces using Grafana Alloy and Tempo.

## Overview

The example includes:

- A sample Python Flask application that generates various types of traces
- Grafana Alloy as the telemetry pipeline
- Tempo for trace storage and querying
- Prometheus for metrics collection (service graphs)
- Grafana for visualization

## Running the Demo

1. Clone the repository:
   ```
   git clone https://github.com/grafana/alloy-scenarios.git
   cd alloy-scenarios
   ```

2. Navigate to this example directory:
   ```
   cd otel-basic-tracing
   ```

3. Run using Docker Compose:
   ```
   docker compose up -d
   ```
   
   Or use the centralized image management:
   ```
   cd ..
   ./run-example.sh otel-basic-tracing
   ```

4. Access the demo application at http://localhost:8080
5. Access Grafana at http://localhost:3000
6. Access Prometheus at http://localhost:9090

## What to Expect

The demo application provides several endpoints that generate different types of traces:

- **/simple**: Generates a simple trace with a single span
- **/nested**: Generates a trace with nested spans (parent-child relationships)
- **/error**: Generates a trace that includes an error
- **/chain**: Simulates a chain of service calls to demonstrate distributed tracing

After accessing these endpoints, you can view the traces in Grafana by:

1. Opening http://localhost:3000
2. Navigating to Explore
3. Selecting the Tempo data source
4. Using the Search tab to find and visualize traces

## Service Graphs

This example includes service graph visualization capabilities. As you generate traces with the demo app (especially with the `/chain` endpoint), Tempo will generate service graph metrics that are sent to Prometheus.

To view the service graph:

1. Open Grafana (http://localhost:3000)
2. Navigate to Explore
3. Select the Tempo data source
4. Click on the "Service Graph" tab
5. You should see a visual representation of the relationships between services

## Architecture

```
┌────────────┐     ┌──────────┐      ┌───────┐      ┌─────────┐
│ Demo App   │────▶│ Alloy    │─────▶│ Tempo │─────▶│ Grafana │
│ (OTel SDK) │     │          │      │       │      │         │
└────────────┘     └──────────┘      └───┬───┘      └─────────┘
                                         │                ▲
                                         ▼                │
                                    ┌─────────┐           │
                                    │Prometheus│───────────┘
                                    └─────────┘
```

The Demo App generates traces using the OpenTelemetry SDK and sends them to Alloy, which processes and forwards them to Tempo. Tempo generates service graph metrics and sends them to Prometheus. Grafana queries both Tempo and Prometheus to visualize traces and service graphs.

## Customizing

The Alloy configuration is a simple placeholder. You can modify `config.alloy` to add processors, filters, or additional exporters to demonstrate more complex telemetry pipelines. 

================================================
FILE: otel-basic-tracing/app/Dockerfile
================================================
# Base image tag (pinned by digest); can be overridden at build time.
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}

WORKDIR /app

# Copy and install dependencies before the application code so this layer is
# reused until requirements.txt changes. --no-cache-dir keeps pip's download
# cache out of the image.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

CMD ["python", "app.py"]

================================================
FILE: otel-basic-tracing/app/app.py
================================================
import os
import random
import time
from flask import Flask, request
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
import requests

# Configure the tracer provider with a resource naming this service "trace-demo"
resource = Resource.create(attributes={
    SERVICE_NAME: "trace-demo"
})
trace.set_tracer_provider(TracerProvider(resource=resource))

# Configure the OTLP gRPC exporter.
# The endpoint is passed explicitly here (Alloy on port 4317, plaintext), so the
# destination is set in code rather than read from OTEL_EXPORTER_OTLP_ENDPOINT.
otlp_exporter = OTLPSpanExporter(endpoint="http://alloy:4317/v1/traces", insecure=True)
# max_export_batch_size=1 limits each export batch to a single span
span_processor = BatchSpanProcessor(span_exporter=otlp_exporter, max_export_batch_size=1)
trace.get_tracer_provider().add_span_processor(span_processor)

# Create the tracer used by the route handlers below for manual spans
tracer = trace.get_tracer(__name__)

# Create a Flask application
app = Flask(__name__)

# Instrument Flask (incoming requests handled by `app`)
FlaskInstrumentor().instrument_app(app)

# Instrument the `requests` library (outgoing HTTP calls made by the handlers)
RequestsInstrumentor().instrument()

@app.route('/')
def home():
    """Serve the landing page that links to each trace-generating endpoint."""
    landing_page = """
    <h1>OpenTelemetry Demo</h1>
    <p>This app demonstrates OpenTelemetry tracing with Grafana Alloy.</p>
    <ul>
        <li><a href="/simple">Simple Trace</a></li>
        <li><a href="/nested">Nested Trace</a></li>
        <li><a href="/error">Error Trace</a></li>
        <li><a href="/chain">Chain of Services</a></li>
        <li><a href="/delayed-chain">Delayed Chain (with Service D having high latency)</a></li>
    </ul>
    """
    return landing_page

@app.route('/simple')
def simple_trace():
    """Emit a single span carrying two attributes, then return a JSON status."""
    with tracer.start_as_current_span("simple-operation") as current:
        attributes = (
            ("operation.type", "simple"),
            ("operation.value", random.randint(1, 100)),
        )
        for key, value in attributes:
            current.set_attribute(key, value)
        time.sleep(0.1)  # Simulate work
    return {"status": "ok", "message": "Simple trace generated"}

@app.route('/nested')
def nested_trace():
    """Emit a parent span with two child spans; the second child has a grandchild."""
    work = 0.05  # Simulated work per span, in seconds

    with tracer.start_as_current_span("parent-operation") as root_span:
        root_span.set_attribute("operation.type", "parent")
        time.sleep(work)

        # First child: a leaf span
        with tracer.start_as_current_span("child-operation-1") as first:
            first.set_attribute("operation.type", "child")
            first.set_attribute("child.number", 1)
            time.sleep(work)

        # Second child: does its own work, then opens a nested grandchild span
        with tracer.start_as_current_span("child-operation-2") as second:
            second.set_attribute("operation.type", "child")
            second.set_attribute("child.number", 2)
            time.sleep(work)

            with tracer.start_as_current_span("grandchild-operation") as leaf:
                leaf.set_attribute("operation.type", "grandchild")
                time.sleep(work)

    return {"status": "ok", "message": "Nested trace generated"}

@app.route('/error')
def error_trace():
    """Emit a span that records a deliberately raised exception.

    The failure is reported on the span (recorded exception plus ERROR
    status); the handler itself returns a JSON body without setting an
    explicit HTTP error status.
    """
    with tracer.start_as_current_span("error-operation") as span:
        span.set_attribute("operation.type", "error")
        try:
            # Simulate an error (same type and message as evaluating 1 / 0)
            raise ZeroDivisionError("division by zero")
        except ZeroDivisionError as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return {"status": "error", "message": "Error trace generated"}

@app.route('/chain')
def chain_trace():
    """Start a multi-hop trace by calling /service/b on this same app.

    Returns the downstream JSON on success. Any exception from the call is
    recorded on the root span, which is marked ERROR, and a JSON error body
    is returned instead.
    """
    with tracer.start_as_current_span("chain-root") as span:
        span.set_attribute("operation.step", "start")

        # Simulate a chain of service calls
        try:
            # Call ourselves to simulate microservice calls
            # In a real world example these would be different services
            service_b_url = f"http://localhost:8080/service/b?id={random.randint(1000, 9999)}"
            # Bound the wait so a stuck downstream hop cannot block this request forever
            response = requests.get(service_b_url, timeout=5)
            return {"status": "ok", "message": "Chain trace generated", "data": response.json()}
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return {"status": "error", "message": "Failed to complete chain"}

@app.route('/service/b')
def service_b():
    """Simulated service B: tag a span, do some work, then call /service/c.

    The request id comes from the `id` query parameter ('unknown' if absent)
    and is forwarded to service C.
    """
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-b-handler") as span:
        span.set_attribute("service", "B")
        span.set_attribute("request.id", req_id)
        time.sleep(0.1)  # Simulate work

        # Call service C; the timeout keeps a stuck hop from blocking forever
        service_c_url = f"http://localhost:8080/service/c?id={req_id}"
        response = requests.get(service_c_url, timeout=5)
        return {"status": "ok", "message": "Service B completed", "data": response.json()}

@app.route('/service/c')
def service_c():
    """Simulated service C: last hop of the /chain demo.

    Roughly one call in five marks its span ERROR and returns an error
    payload; the rest return a success payload.
    """
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-c-handler") as span:
        span.set_attribute("service", "C")
        span.set_attribute("request.id", req_id)
        time.sleep(0.15)  # Simulate work

        # Randomly fail sometimes to show error traces
        if random.random() < 0.2:  # 20% chance of failure
            span.set_status(trace.StatusCode.ERROR, "Random failure")
            return {"status": "error", "message": "Service C failed randomly"}

        return {"status": "ok", "message": "Service C completed successfully"}

# New delayed chain implementation
@app.route('/delayed-chain')
def delayed_chain_trace():
    """Start the delayed chain by calling /delayed/service-a on this same app.

    The chain runs A -> B -> C -> D -> E, with service D sleeping 3-4 seconds.
    Any exception from the call is recorded on the root span, which is marked
    ERROR, and a JSON error body is returned.
    """
    with tracer.start_as_current_span("delayed-chain-root") as span:
        span.set_attribute("operation.step", "start")
        span.set_attribute("operation.type", "delayed-chain")

        try:
            # Start the chain with Service A
            service_a_url = f"http://localhost:8080/delayed/service-a?id={random.randint(1000, 9999)}"
            # The whole chain sleeps under 5 seconds; 10 seconds leaves headroom
            # while still preventing an indefinite hang.
            response = requests.get(service_a_url, timeout=10)
            return {
                "status": "ok",
                "message": "Delayed chain trace generated",
                "data": response.json()
            }
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return {"status": "error", "message": "Failed to complete delayed chain"}

@app.route('/delayed/service-a')
def delayed_service_a():
    """Delayed chain, hop A: normal latency, then call /delayed/service-b."""
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-a-handler") as span:
        span.set_attribute("service", "A")
        span.set_attribute("request.id", req_id)
        span.set_attribute("service.latency", "normal")
        time.sleep(0.1)  # Normal latency

        # Call service B; the timeout is above the chain's 3-4 second slow hop
        service_b_url = f"http://localhost:8080/delayed/service-b?id={req_id}"
        response = requests.get(service_b_url, timeout=10)
        return {"status": "ok", "message": "Service A completed", "data": response.json()}

@app.route('/delayed/service-b')
def delayed_service_b():
    """Delayed chain, hop B: normal latency, then call /delayed/service-c."""
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-b-handler") as span:
        span.set_attribute("service", "B")
        span.set_attribute("request.id", req_id)
        span.set_attribute("service.latency", "normal")
        time.sleep(0.15)  # Normal latency

        # Call service C; the timeout is above the chain's 3-4 second slow hop
        service_c_url = f"http://localhost:8080/delayed/service-c?id={req_id}"
        response = requests.get(service_c_url, timeout=10)
        return {"status": "ok", "message": "Service B completed", "data": response.json()}

@app.route('/delayed/service-c')
def delayed_service_c():
    """Delayed chain, hop C: normal latency, then call the slow /delayed/service-d."""
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-c-handler") as span:
        span.set_attribute("service", "C")
        span.set_attribute("request.id", req_id)
        span.set_attribute("service.latency", "normal")
        time.sleep(0.2)  # Normal latency

        # Call the slow service D; it sleeps 3-4 seconds, so allow well above that
        service_d_url = f"http://localhost:8080/delayed/service-d?id={req_id}"
        response = requests.get(service_d_url, timeout=10)
        return {"status": "ok", "message": "Service C completed", "data": response.json()}

@app.route('/delayed/service-d')
def delayed_service_d():
    """Delayed chain, hop D: the deliberate bottleneck.

    Sleeps a random 3-4 seconds, records that delay on the span, then calls
    /delayed/service-e.
    """
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-d-handler") as span:
        span.set_attribute("service", "D")
        span.set_attribute("request.id", req_id)
        span.set_attribute("service.latency", "high")
        span.set_attribute("latency.category", "bottleneck")

        # This service consistently has high latency (3-4 seconds)
        delay = random.uniform(3.0, 4.0)
        span.set_attribute("latency.seconds", delay)
        time.sleep(delay)  # High latency

        # Call final service E; the timeout keeps a stuck hop from blocking forever
        service_e_url = f"http://localhost:8080/delayed/service-e?id={req_id}"
        response = requests.get(service_e_url, timeout=10)
        return {"status": "ok", "message": "Service D completed (with delay)", "data": response.json()}

@app.route('/delayed/service-e')
def delayed_service_e():
    """Delayed chain, final hop E: tag the span, wait briefly, and return."""
    request_id = request.args.get('id', 'unknown')
    span_attributes = {
        "service": "E",
        "request.id": request_id,
        "service.latency": "normal",
    }
    with tracer.start_as_current_span("service-e-handler") as handler_span:
        for name, value in span_attributes.items():
            handler_span.set_attribute(name, value)
        time.sleep(0.1)  # Normal latency

    return {"status": "ok", "message": "Service E completed (chain end)"}

# Run Flask's built-in server on all interfaces, port 8080. The handlers above
# call back into this same port via http://localhost:8080.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)

================================================
FILE: otel-basic-tracing/app/requirements.txt
================================================
flask
requests
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
opentelemetry-instrumentation-requests

================================================
FILE: otel-basic-tracing/config-otel.yaml
================================================
#
# OTel Collector YAML Configuration for OpenTelemetry Trace Collection
#
# This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine.
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
#

# Accept OTLP from applications over gRPC (4317) and HTTP (4318) on all interfaces
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

# Batch telemetry before export; the empty mapping sets no options
processors:
  batch: {}

# Send traces to Tempo's OTLP gRPC port without TLS
exporters:
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

# Single traces pipeline: otlp -> batch -> otlp/tempo
service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [otlp/tempo]


================================================
FILE: otel-basic-tracing/config.alloy
================================================
/*
 * Alloy Configuration for OpenTelemetry Trace Collection
 */

// Receive OTLP over both HTTP and gRPC. The empty blocks set no options, so
// each protocol uses the component's default listen settings (the demo app
// targets alloy:4317 for gRPC). Received traces go to the batch processor.
otelcol.receiver.otlp "default" {
  http {}
  grpc {}

  output {
    traces = [otelcol.processor.batch.default.input]
  }
}

// Batch traces (no options set) and hand them to the Tempo exporter.
otelcol.processor.batch "default" {
  output {
    traces = [otelcol.exporter.otlp.tempo.input]
  }
}

// Export traces to Tempo over OTLP gRPC. TLS is disabled because the traffic
// stays inside the compose network.
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo:4317"
    tls {
      insecure = true
    }
  }
}

================================================
FILE: otel-basic-tracing/docker-compose-otel.yml
================================================
# OTel Engine Override
#
# Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config
# instead of the River/HCL config.alloy file.
#
# Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
# Stop:  docker compose -f docker-compose.yml -f docker-compose-otel.yml down
#

services:
  # Replace Alloy's default `run` command with the OTel Engine and mount the
  # Collector-style YAML config it reads.
  alloy:
    command: otel --config=/etc/alloy/config-otel.yaml
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
    ports:
      - 8888:8888      # OTel Engine HTTP server
      - 4317:4317      # OTLP gRPC
      - 4318:4318      # OTLP HTTP

  # Override demo-app endpoint to use standard OTLP gRPC port
  demo-app:
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=demo-service
      - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo


================================================
FILE: otel-basic-tracing/docker-compose.coda.yml
================================================
# Coda variant: builds the same demo app but runs it on the host network.
services:
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    # NOTE(review): with host networking the `alloy` hostname used below (and
    # hard-coded in app.py) must resolve on the host itself - confirm for the
    # Coda environment.
    network_mode: host
    restart: unless-stopped
    environment:
      # NOTE(review): 12345 is the port docker-compose.yml labels as the Alloy
      # HTTP server; app.py sends spans to alloy:4317 - confirm this value.
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:12345
      - OTEL_SERVICE_NAME=demo-service
      - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo


================================================
FILE: otel-basic-tracing/docker-compose.yml
================================================
version: '3.8'

services:
  # Prometheus for metrics collection
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      # Accept remote-write pushes; tempo-config.yaml sends its generated
      # metrics to http://prometheus:9090/api/v1/write.
      - --web.enable-remote-write-receiver
      - --enable-feature=native-histograms
      - --enable-feature=exemplar-storage
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - 9090:9090/tcp
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  # Tempo for tracing
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - 3200:3200/tcp    # tempo
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml
    # Start Prometheus first: Tempo's metrics generator remote-writes to it
    depends_on:
      - prometheus
  
  # Memcached used by Tempo's frontend-search cache (see tempo-config.yaml)
  memcached:
    image: memcached:1.6.40@sha256:572b011ce33954ee809066d8cecbeb3ec98912109ee3be3663a3197425fd81ac
    container_name: memcached
    ports:
      - "11211:11211"
    # The official memcached image is configured through command-line flags;
    # it does not read MEMCACHED_* environment variables.
    command:
      - memcached
      - --memory-limit=64  # maximum cache memory, in megabytes
      - --threads=4        # number of threads to use


  # Grafana for visualization
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      # Anonymous Admin access so the local demo needs no login
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
      - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app
    ports:
      - 3000:3000/tcp
    # Write the datasource provisioning file at container start, then hand over
    # to Grafana's regular /run.sh. The Prometheus datasource declares an
    # explicit uid so that Tempo's serviceMap.datasourceUid resolves to it.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          uid: Prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        EOF
        /run.sh
    depends_on:
      - prometheus
      - tempo

  # Alloy for telemetry pipeline
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345      # Alloy HTTP server
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    # Only the HTTP server port is published to the host; the demo app reaches
    # the OTLP receiver at alloy:4317 over the compose network.
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy

  # Demo app that generates OpenTelemetry traces
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8080:8080
    environment:
      # OTLP gRPC port on Alloy. 12345 is Alloy's own HTTP server, not an OTLP
      # endpoint. app.py also passes alloy:4317 to its exporter explicitly.
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=demo-service
      - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo

================================================
FILE: otel-basic-tracing/prom-config.yaml
================================================
# Global intervals only; this file defines no scrape_configs. In this scenario
# metrics reach Prometheus through Tempo's remote write.
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: otel-basic-tracing/tempo-config.yaml
================================================
stream_over_http_enabled: true
# Tempo HTTP API port; docker-compose.yml publishes it and Grafana queries it
server:
  http_listen_port: 3200
  log_level: info


# Memcached-backed cache for the frontend-search role (the `memcached` compose service)
cache:
  background:
    writeback_goroutines: 5
  caches:
  - roles:
    - frontend-search
    memcached:
      addresses: dns+memcached:11211

# Query frontend SLO and limit settings
query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
    metadata_slo:
        duration_slo: 5s
        throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 100ms
  metrics:
    max_duration: 200h                # maximum duration of a metrics query, increase for local setups
    query_backend_after: 5m
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09

distributor:
  receivers:                           # this configuration will listen on all ports and protocols that tempo is capable of.
    jaeger:                            # the receivers all come from the OpenTelemetry collector.  more configuration information can
      protocols:                       # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
        thrift_http:                   #
          endpoint: "tempo:14268"      # for a production deployment you should only enable the receivers you need!
        grpc:
          endpoint: "tempo:14250"
        thrift_binary:
          endpoint: "tempo:6832"
        thrift_compact:
          endpoint: "tempo:6831"
    zipkin:
      endpoint: "tempo:9411"
    # OTLP is the protocol Alloy uses to send traces here (tempo:4317)
    otlp:
      protocols:
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
    opencensus:
      endpoint: "tempo:55678"

ingester:
  max_block_duration: 5m               # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally

compactor:
  compaction:
    block_retention: 720h                # overall Tempo trace retention. set for demo purposes

# Metrics derived from traces are remote-written to Prometheus with exemplars
metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true

storage:
  trace:
    backend: local                     # backend configuration to use
    wal:
      path: /var/tempo/wal             # where to store the wal locally
    local:
      path: /var/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator
      generate_native_histograms: both
      

================================================
FILE: otel-examples/README.md
================================================
# OTel Engine Examples

These scenarios use the **Alloy OTel Engine** -- an experimental feature introduced in Alloy v1.14 that lets you run standard [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) YAML configurations directly inside Alloy. Instead of writing Alloy's River/HCL syntax, you use the exact same YAML format that the upstream OTel Collector uses.

## What is the Alloy OTel Engine?

Grafana Alloy has traditionally used its own **River** configuration language (HCL-like syntax in `config.alloy` files). Starting with v1.14, Alloy ships an experimental **OTel Engine** that accepts standard OTel Collector YAML. This means:

- **No new language to learn** -- if you already know OTel Collector config, you can use Alloy directly
- **Copy-paste from upstream docs** -- OTel Collector examples work as-is
- **Migration path** -- move from vanilla OTel Collector to Alloy without rewriting configs
- **Best of both worlds** -- Alloy's single-binary distribution with OTel Collector's YAML config

The OTel Engine is started with:

```bash
alloy otel --config=<CONFIG_FILE>
```

You can validate configs before running:

```bash
alloy otel validate --config=<CONFIG_FILE>
```

## Running These Examples

Each scenario has a `docker-compose.yml` with the full stack:

```bash
cd <scenario-dir> && docker compose up -d
```

Or from the repo root with centralized image versions:

```bash
cd otel-examples/<scenario-dir> && docker compose --env-file ../../image-versions.env up -d
```

### Access the stack

- **Grafana**: [http://localhost:3000](http://localhost:3000) (no login required)
- **Alloy UI**: [http://localhost:12345](http://localhost:12345) (pipeline debugging UI, enabled via the `alloyengine` extension)

### Stop

```bash
docker compose down
```

## Scenarios

| Scenario | Description | Key OTel Components |
|----------|-------------|-------------------|
| [filelog-processing](filelog-processing/) | Collect and parse mixed-format log files (JSON + plaintext) using the filelog receiver's operator chain | `filelog` receiver, `json_parser`, `regex_parser`, `severity_parser` operators |
| [pii-redaction](pii-redaction/) | Scrub credit cards, emails, and IP addresses from traces and logs using OTTL `replace_pattern` | `transform` processor (OTTL) |
| [routing-multi-tenant](routing-multi-tenant/) | Route logs to different Loki tenants based on resource attributes using fan-out + filter | `forward` connector, `filter` processor, `resource` processor |
| [cost-control](cost-control/) | Drop health checks, filter debug logs, and apply head-based sampling to reduce telemetry volume | `filter` processor, `probabilistic_sampler` processor |
| [resource-enrichment](resource-enrichment/) | Auto-discover and attach host/OS/Docker metadata to all telemetry signals | `resourcedetection` processor (env, system, docker) |
| [count-connector](count-connector/) | Derive count metrics (request rate, error rate) from traces and logs | `count` connector |
| [ottl-transform](ottl-transform/) | A cookbook of OTTL patterns: JSON parsing, severity mapping, attribute promotion, truncation | `transform` processor (OTTL) |
| [host-metrics](host-metrics/) | Collect CPU, memory, disk, network metrics -- an OTel-native replacement for node_exporter | `hostmetrics` receiver |
| [multi-pipeline-fanout](multi-pipeline-fanout/) | Send traces to two backends with different processing per destination (full vs. sampled) | `forward` connector, `probabilistic_sampler` processor |
| [kafka-buffer](kafka-buffer/) | Buffer traces through Kafka for durability and backpressure handling | `kafka` receiver/exporter |

## Alloy UI and the `alloyengine` Extension

Each scenario includes an `alloyengine` extension in `config-otel.yaml` that starts the Alloy River UI alongside the OTel pipeline. This gives you the visual pipeline debugging UI at [http://localhost:12345](http://localhost:12345).

If you prefer a pure OTel Collector config without the Alloy UI, you can remove the `alloyengine` extension block and the `extensions: [alloyengine]` line from the `service:` section in `config-otel.yaml`, and drop the `config.alloy` volume mount from `docker-compose.yml`. The OTel pipeline will work identically -- you just won't have the UI.

## OTel Engine vs. River Configs

For comparison, the parent repo's existing scenarios (e.g., `otel-basic-tracing/`, `otel-span-metrics/`) also have OTel YAML alternatives alongside their River configs. Run those with:

```bash
docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
```

## Available Connectors

The Alloy OTel Engine supports these connectors: `count`, `grafanacloud`, `servicegraph`, `spanmetrics`, `forward`.

## Further Reading

- [Alloy OTel Engine Documentation](https://grafana.com/docs/alloy/latest/set-up/otel_engine/)
- [OpenTelemetry Collector Configuration](https://opentelemetry.io/docs/collector/configuration/)
- [OTTL (OpenTelemetry Transformation Language)](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/pkg/ottl)


================================================
FILE: otel-examples/cost-control/README.md
================================================
# Telemetry Cost Control

Reduce observability costs by filtering noisy telemetry and applying probabilistic sampling in the Alloy OTel pipeline, before data reaches your backends.

## What This Demonstrates

- **Filter processor** to drop unwanted spans (health checks, readiness probes, metrics endpoints)
- **Filter processor** to drop low-severity logs (DEBUG level)
- **Probabilistic sampler** for head-based trace sampling (keeps 25% of remaining traces)
- **Transform processor** to strip high-cardinality attributes (`http.user_agent`, cookies) that inflate storage

## Prerequisites

- Docker and Docker Compose

## Run

```bash
docker compose up -d
```

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the River UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

Open Grafana at [http://localhost:3000](http://localhost:3000).

### Verify filtering is working

1. **Traces (Tempo):** Go to Explore > Tempo. Search for traces from `cost-control-demo`. You should see `/api/order` and `/api/error` spans but **no** `/health`, `/ready`, or `/metrics` spans -- those are dropped by the filter processor.

2. **Logs (Loki):** Go to Explore > Loki. Query `{service_name="cost-control-demo"}`. You should see INFO and ERROR logs but **no** DEBUG logs.

3. **Sampling:** Only ~25% of the remaining (non-filtered) traces make it through. Compare the demo app's request rate with the trace count in Tempo to see the reduction.

### Sample Loki query

```logql
{service_name="cost-control-demo"} | json
```

### Check the Alloy OTel pipeline

Visit the Alloy OTel HTTP server at [http://localhost:8888](http://localhost:8888).

## Key Configuration

The `config-otel.yaml` pipeline applies four cost-control stages:

1. **`filter/traces`** -- Drops spans where `http.target` or `http.route` matches `/health`, `/ready`, or `/metrics`. These high-frequency probes generate enormous trace volume with no diagnostic value.

2. **`filter/logs`** -- Drops log records with `severity_number < 9` (below INFO). DEBUG logs are useful in development but costly at scale.

3. **`probabilistic_sampler`** -- Keeps 25% of remaining traces via consistent head-based sampling. Adjust `sampling_percentage` to trade off between cost and visibility.

4. **`transform/strip`** -- Removes `http.user_agent` and `http.request.header.cookie` attributes from spans. These high-cardinality fields consume significant index and storage space.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/cost-control/app/Dockerfile
================================================
# Base image tag (digest-pinned by default); overridable via the PYTHON_VERSION build argument
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}
WORKDIR /app
# Install dependencies before copying the source so this layer stays cached
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
CMD ["python", "app.py"]


================================================
FILE: otel-examples/cost-control/app/app.py
================================================
"""
Demo Flask app for the cost-control scenario.

Generates a noisy mix of telemetry: frequent health/ready checks, DEBUG logs,
and occasional real business traces. The Alloy OTel pipeline filters out the
noise using filter processors and probabilistic sampling.
"""

import logging
import random
import threading
import time

from flask import Flask, jsonify
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.trace import StatusCode

# --- OTel Setup ---
# Resource attributes shared by the tracer and logger providers below
resource = Resource.create({
    "service.name": "cost-control-demo",
    "service.version": "1.0.0",
})

# Traces: batch spans and export them over plaintext OTLP gRPC to alloy:4317
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="alloy:4317", insecure=True))
)
trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer(__name__)

# Logs via OTel: same endpoint, exported through a standard logging handler
logger_provider = LoggerProvider(resource=resource)
logger_provider.add_log_record_processor(
    BatchLogRecordProcessor(OTLPLogExporter(endpoint="alloy:4317", insecure=True))
)
# The handler and the root logger are both set to DEBUG, so the app emits its
# noisy DEBUG records.
handler = LoggingHandler(level=logging.DEBUG, logger_provider=logger_provider)
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("cost-control-demo")
logger.addHandler(handler)

# --- Flask App ---
app = Flask(__name__)
FlaskInstrumentor().instrument_app(app)


@app.route("/health")
def health():
    """Health probe: log one DEBUG line and report a healthy status."""
    logger.debug("Health check OK")
    payload = {"status": "healthy"}
    return jsonify(payload)


@app.route("/ready")
def ready():
    """Readiness probe: log one DEBUG line and report a ready status."""
    logger.debug("Readiness check OK")
    payload = {"status": "ready"}
    return jsonify(payload)


@app.route("/api/order")
def order():
    """Business endpoint: emit a `process-order` span with random order details."""
    with tracer.start_as_current_span("process-order") as order_span:
        # Draw the random order fields first, in the same order as they are recorded
        order_id = f"ORD-{random.randint(1000, 9999)}"
        amount = round(random.uniform(10.0, 500.0), 2)
        tier = random.choice(["gold", "silver", "bronze"])

        order_span.set_attribute("order.id", order_id)
        order_span.set_attribute("order.amount", amount)
        order_span.set_attribute("customer.tier", tier)

        # Simulate processing time
        processing_seconds = random.uniform(0.05, 0.2)
        time.sleep(processing_seconds)

        # Logged while the span is still current
        logger.info("Order %s processed successfully", order_id)
        body = {"order_id": order_id, "status": "completed"}
        return jsonify(body)


@app.route("/api/error")
def error():
    """Failing endpoint: mark a `handle-error` span as ERROR and respond with HTTP 500."""
    with tracer.start_as_current_span("handle-error") as failing_span:
        code = random.choice(["TIMEOUT", "INVALID_INPUT", "DB_ERROR"])
        failing_span.set_attribute("error.code", code)
        failing_span.set_status(StatusCode.ERROR, f"Simulated error: {code}")
        failing_span.record_exception(Exception(f"Simulated {code}"))

        logger.error("Request failed with error: %s", code)
        body = jsonify({"error": code})
        return body, 500


def load_generator():
    """Drive the app forever with a probe-heavy traffic mix.

    Roughly 70% /health, 10% /ready, 15% /api/order and 5% /api/error.
    Request failures are ignored on purpose (best effort), and one DEBUG
    log is emitted per iteration as extra noise.
    """
    import requests

    base_url = "http://localhost:8080"
    # Cumulative upper bounds for the random roll; anything above the last
    # bound falls through to the error endpoint.
    traffic_mix = ((0.70, "/health"), (0.80, "/ready"), (0.95, "/api/order"))

    # Give Flask a few seconds to come up before the first request.
    time.sleep(5)

    while True:
        roll = random.random()
        path = next((route for bound, route in traffic_mix if roll < bound), "/api/error")
        try:
            requests.get(f"{base_url}{path}", timeout=2)
        except Exception:
            pass

        # Frequent DEBUG logs add to the noise the pipeline is expected to drop.
        logger.debug("Background tick at %s", time.time())
        pause = random.uniform(0.2, 1.0)
        time.sleep(pause)


if __name__ == "__main__":
    # Daemon thread: the traffic generator must not keep the process alive
    # once the Flask server exits.
    thread = threading.Thread(target=load_generator, daemon=True)
    thread.start()
    app.run(port=8080, host="0.0.0.0")


================================================
FILE: otel-examples/cost-control/app/requirements.txt
================================================
flask
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
# Imported directly by the load generator in app.py; declared explicitly
# instead of relying on it being pulled in as a transitive dependency.
requests


================================================
FILE: otel-examples/cost-control/config-otel.yaml
================================================
#
# OTel Collector YAML: Telemetry Cost Control
#
# Demonstrates using filter and probabilistic_sampler processors
# to drop noisy telemetry (health checks, debug logs) and apply
# head-based sampling to reduce observability costs.
#

# Alloy engine extension: loads config.alloy and serves the Alloy HTTP
# listener on port 12345 on all interfaces.
extensions:
  alloyengine:
    config:
      file: "/etc/alloy/config.alloy"
    flags:
      server.http.listen-addr: "0.0.0.0:12345"

# OTLP ingest over both gRPC (4317) and HTTP (4318), bound to all interfaces.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: "0.0.0.0:4317"
      http:
        endpoint: "0.0.0.0:4318"

processors:
  # Drop health check and readiness probe spans.
  # A span is dropped when ANY of the conditions below matches.
  filter/traces:
    error_mode: ignore
    traces:
      span:
        - attributes["http.target"] == "/health"
        - attributes["http.target"] == "/ready"
        - attributes["http.target"] == "/metrics"
        - attributes["http.route"] == "/health"
        - attributes["http.route"] == "/ready"

  # Drop TRACE/DEBUG-level logs (anything below INFO).
  # Records whose severity was never set (SEVERITY_NUMBER_UNSPECIFIED = 0) are
  # kept: a plain "severity_number < 9" would silently discard them as well,
  # although "no severity" does not mean "debug".
  filter/logs:
    error_mode: ignore
    logs:
      log_record:
        - 'severity_number != SEVERITY_NUMBER_UNSPECIFIED and severity_number < SEVERITY_NUMBER_INFO'

  # Head-based probabilistic sampling: keep 25% of remaining traces
  probabilistic_sampler:
    sampling_percentage: 25

  # Strip high-cardinality attributes to reduce storage
  transform/strip:
    error_mode: ignore
    trace_statements:
      - context: span
        statements:
          - delete_key(attributes, "http.user_agent")
          - delete_key(attributes, "http.request.header.cookie")

  batch: {}

exporters:
  # Traces go to Tempo over plaintext OTLP/gRPC.
  otlp/tempo:
    endpoint: "tempo:4317"
    tls:
      insecure: true

  # Logs go to Loki's OTLP/HTTP endpoint.
  otlphttp/loki:
    endpoint: "http://loki:3100/otlp"

  # Summary output of exported logs for local troubleshooting.
  debug:
    verbosity: basic

service:
  extensions:
    - alloyengine
  pipelines:
    # Traces: drop probe spans, sample what is left, strip bulky attributes, batch.
    traces:
      receivers:
        - otlp
      processors:
        - filter/traces
        - probabilistic_sampler
        - transform/strip
        - batch
      exporters:
        - otlp/tempo
    # Logs: drop low-severity records, then batch.
    logs:
      receivers:
        - otlp
      processors:
        - filter/logs
        - batch
      exporters:
        - otlphttp/loki
        - debug


================================================
FILE: otel-examples/cost-control/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/cost-control/docker-compose.coda.yml
================================================
services:
  # Demo app on the host network, pointed at an OTLP endpoint on localhost.
  demo-app:
    restart: unless-stopped
    network_mode: host
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        PYTHON_VERSION: "${PYTHON_VERSION:-3.11-slim}"
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://localhost:4317"
      OTEL_SERVICE_NAME: "cost-control-demo"


================================================
FILE: otel-examples/cost-control/docker-compose.yml
================================================
services:
  # Loki for log storage; the local loki-config.yaml is mounted over the
  # image's default config path and port 3100 is published to the host.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command: -config.file=/etc/loki/local-config.yaml
    ports:
      - "3100:3100/tcp"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml

  # Tempo for trace storage; only the HTTP port 3200 is published.
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml
    command:
      - "-config.file=/etc/tempo.yaml"
    ports:
      - "3200:3200/tcp"

  # Grafana for visualization: anonymous Admin access, datasources written at startup.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    depends_on:
      - loki
      - tempo
    ports:
      - "3000:3000/tcp"
    environment:
      GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin"
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_BASIC_ENABLED: "false"
    # Write the datasource provisioning file, then hand over to the image's /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  # Alloy in OTel engine mode, started with the collector-style YAML config.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    depends_on:
      - loki
      - tempo
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
    command:
      - otel
      - --config=/etc/alloy/config-otel.yaml
    ports:
      - "8888:8888"       # OTel engine HTTP server
      - "4317:4317"       # OTLP gRPC
      - "4318:4318"       # OTLP HTTP
      - "12345:12345"     # Alloy UI

  # Demo app that generates noisy telemetry; built from ./app.
  demo-app:
    depends_on:
      - alloy
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        PYTHON_VERSION: "${PYTHON_VERSION:-3.11-slim}"
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://alloy:4317"
      OTEL_SERVICE_NAME: "cost-control-demo"
    ports:
      - "8080:8080"


================================================
FILE: otel-examples/cost-control/loki-config.yaml
================================================
# Loki configuration for the demo stack: filesystem storage under /tmp/storage,
# an in-memory ring key-value store and a replication factor of 1.
auth_enabled: false
server:
  # 3100 is the port published by docker-compose.yml and used by the
  # otlphttp/loki exporter and the Grafana datasource.
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info
common:
  instance_addr: 127.0.0.1
  path_prefix: /tmp/storage
  storage:
    filesystem:
      chunks_directory: /tmp/storage/chunks
      rules_directory: /tmp/storage/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
query_range:
  results_cache:
    cache:
      # Embedded results cache, capped at 100 MB.
      embedded_cache:
        enabled: true
        max_size_mb: 100
limits_config:
  metric_aggregation_enabled: true
schema_config:
  configs:
    # Single schema period: TSDB index on the filesystem, schema v13, daily index tables.
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h
pattern_ingester:
  enabled: true
  metric_aggregation:
    # Points back at this instance's own HTTP port (3100 above).
    loki_address: localhost:3100
ruler:
  # NOTE(review): no Alertmanager service is defined in this scenario's compose file.
  alertmanager_url: http://localhost:9093
frontend:
  encoding: protobuf


================================================
FILE: otel-examples/cost-control/tempo-config.yaml
================================================
# Tempo configuration for the demo stack: OTLP ingest on 4317 (gRPC) and
# 4318 (HTTP), HTTP server on 3200, traces stored on local disk.
stream_over_http_enabled: true
server:
  # 3200 is published by docker-compose.yml and used by the Grafana Tempo datasource.
  http_listen_port: 3200
  log_level: info
distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          # Address targeted by the otlp/tempo exporter in config-otel.yaml.
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
ingester:
  max_block_duration: 5m
compactor:
  compaction:
    # 720h = 30 days.
    block_retention: 720h
storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks
# No per-tenant overrides are set in this scenario.
overrides:
  defaults: {}


================================================
FILE: otel-examples/count-connector/README.md
================================================
# Count Connector (Derive Metrics from Signals)

Use the OTel count connector to automatically derive count metrics from traces and logs -- the "metrics from signals" pattern -- without additional instrumentation.

## What This Demonstrates

- **Count connector** deriving metrics from trace spans and log records
- Generating error metrics (`span.error.count`, `log.error.count`) from span status codes and log severity levels
- Generating volume metrics (`span.count`, `log.count`) for throughput monitoring
- Routing derived metrics to Prometheus while original signals go to Tempo and Loki

## Prerequisites

- Docker and Docker Compose

## Run

```bash
docker compose up -d
```

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the Alloy UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

Open Grafana at [http://localhost:3000](http://localhost:3000).

### View derived metrics in Prometheus

Go to Explore > Prometheus and query the following metrics:

```promql
# Total span count (rate per second)
rate(span_count_total[5m])

# Error span count (rate per second)
rate(span_error_count_total[5m])

# Error rate as a percentage
rate(span_error_count_total[5m]) / rate(span_count_total[5m]) * 100

# Total log record count
rate(log_count_total[5m])

# Error log count
rate(log_error_count_total[5m])
```

### View original traces in Tempo

Go to Explore > Tempo and search for `count-connector-demo` traces. You will see both successful (OK) and error traces.

### View original logs in Loki

Go to Explore > Loki and query:

```logql
{service_name="count-connector-demo"} | json
```

### Check the Alloy OTel pipeline

Visit the Alloy OTel HTTP server at [http://localhost:8888](http://localhost:8888).

## Key Configuration

The `config-otel.yaml` pipeline uses the **count connector** to bridge signals:

1. **`connectors/count`** -- Defines four derived metrics:
   - `span.count` -- Total number of spans received
   - `span.error.count` -- Spans where `status.code == 2` (ERROR)
   - `log.count` -- Total number of log records received
   - `log.error.count` -- Logs where `severity_number >= 17` (ERROR and above)

2. **Pipeline wiring:**
   - `traces` pipeline: receives OTLP, exports to both `count` connector and `otlp/tempo`
   - `logs` pipeline: receives OTLP, exports to both `count` connector and `otlphttp/loki`
   - `metrics` pipeline: receives from `count` connector, converts the delta counts to cumulative with the `deltatocumulative` processor, and exports to `otlphttp/prometheus`

The count connector acts as both an exporter (in the traces/logs pipelines) and a receiver (in the metrics pipeline), bridging signals without any application changes.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/count-connector/app/Dockerfile
================================================
# Base image tag; overridable at build time through the PYTHON_VERSION build arg.
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}
WORKDIR /app
# Copy and install dependencies before the app code so this layer stays cached
# when only app.py changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
CMD ["python", "app.py"]


================================================
FILE: otel-examples/count-connector/app/app.py
================================================
"""
Demo Flask app for the count-connector scenario.

Generates a mix of successful and error traces plus log records at various
severity levels. The Alloy OTel pipeline uses the count connector to derive
metrics (span.count, span.error.count, log.count, log.error.count) from
these signals.
"""

import logging
import os
import random
import threading
import time

from flask import Flask, jsonify
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.trace import StatusCode

# --- OTel Setup ---
resource = Resource.create({
    "service.name": "count-connector-demo",
    "service.version": "1.0.0",
})

# Collector endpoint: honour OTEL_EXPORTER_OTLP_ENDPOINT, which both compose
# files set, and fall back to the in-network Alloy address when it is unset or
# empty. A hard-coded endpoint would ignore the variable, so the host-network
# (coda) deployment, which points at localhost:4317, could never reach Alloy.
otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") or "alloy:4317"

# Traces: spans are batched and exported over plaintext OTLP/gRPC.
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True))
)
trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer(__name__)

# Logs via OTel: stdlib logging records are bridged into a LoggerProvider and
# exported to the same endpoint.
logger_provider = LoggerProvider(resource=resource)
logger_provider.add_log_record_processor(
    BatchLogRecordProcessor(OTLPLogExporter(endpoint=otlp_endpoint, insecure=True))
)
handler = LoggingHandler(level=logging.DEBUG, logger_provider=logger_provider)
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("count-connector-demo")
logger.addHandler(handler)

# --- Flask App ---
app = Flask(__name__)
# Auto-instrument Flask so the app's requests are traced.
FlaskInstrumentor().instrument_app(app)


@app.route("/api/process")
def process():
    """Handle a simulated processing request; about one call in five fails with HTTP 500."""
    with tracer.start_as_current_span("process-request") as req_span:
        req_id = f"REQ-{random.randint(1000, 9999)}"
        req_span.set_attribute("request.id", req_id)

        # Simulated work: 20-150 ms.
        time.sleep(random.uniform(0.02, 0.15))

        succeeded = random.random() >= 0.20
        if succeeded:
            logger.info("Request %s processed successfully", req_id)
            return jsonify({"request_id": req_id, "status": "ok"})

        # Failure path: mark the span as errored and attach an exception event.
        failure = random.choice(["ValidationError", "TimeoutError", "DatabaseError"])
        req_span.set_status(StatusCode.ERROR, f"Simulated {failure}")
        req_span.set_attribute("error.type", failure)
        req_span.record_exception(Exception(f"Simulated {failure}"))
        logger.error("Request %s failed: %s", req_id, failure)
        return jsonify({"request_id": req_id, "error": failure}), 500


@app.route("/api/notify")
def notify():
    """Pretend to deliver a notification; about one call in ten fails with HTTP 500."""
    with tracer.start_as_current_span("send-notification") as notify_span:
        via = random.choice(["email", "sms", "push"])
        notify_span.set_attribute("notification.channel", via)

        # Simulated delivery latency: 10-100 ms.
        time.sleep(random.uniform(0.01, 0.1))

        delivered = random.random() >= 0.10
        if delivered:
            logger.info("Notification sent via %s", via)
            return jsonify({"channel": via, "status": "sent"})

        notify_span.set_status(StatusCode.ERROR, "Notification delivery failed")
        logger.error("Notification via %s failed", via)
        return jsonify({"channel": via, "status": "failed"}), 500


@app.route("/health")
def health():
    """Health endpoint; returns a static JSON body and does no logging of its own."""
    body = {"status": "healthy"}
    return jsonify(body)


def load_generator():
    """Drive the app forever: one request plus one standalone log record every 2 seconds.

    /api/process is picked twice as often as /api/notify. Request failures
    are ignored on purpose (best effort).
    """
    import requests

    base_url = "http://localhost:8080"
    # /api/process is listed twice so it is chosen twice as often.
    endpoints = ["/api/process", "/api/process", "/api/notify"]
    # Severity label -> (logging call, message) for the standalone records.
    background_logs = {
        "info": (logger.info, "Background task check - all systems normal"),
        "warn": (logger.warning, "Background task check - queue depth elevated"),
        "error": (logger.error, "Background task check - connectivity issue detected"),
    }

    # Give Flask a few seconds to come up before the first request.
    time.sleep(5)

    while True:
        endpoint = random.choice(endpoints)
        try:
            requests.get(f"{base_url}{endpoint}", timeout=5)
        except Exception:
            pass

        # Standalone log record: 60% info, 25% warning, 15% error.
        (severity,) = random.choices(
            ["info", "warn", "error"],
            weights=[60, 25, 15],
            k=1,
        )
        emit, message = background_logs[severity]
        emit(message)

        time.sleep(2)


if __name__ == "__main__":
    # Daemon thread: the traffic generator must not keep the process alive
    # once the Flask server exits.
    thread = threading.Thread(target=load_generator, daemon=True)
    thread.start()
    app.run(port=8080, host="0.0.0.0")


================================================
FILE: otel-examples/count-connector/app/requirements.txt
================================================
flask
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
# Imported directly by the load generator in app.py; declared explicitly
# instead of relying on it being pulled in as a transitive dependency.
requests


================================================
FILE: otel-examples/count-connector/config-otel.yaml
================================================
#
# OTel Collector YAML: Count Connector (Derive Metrics from Signals)
#
# Demonstrates using the count connector to derive count metrics
# from traces and logs: total and error counts for spans and log
# records -- the "metrics from signals" pattern.
#

# Alloy engine extension: loads config.alloy and serves the Alloy HTTP
# listener on port 12345 on all interfaces.
extensions:
  alloyengine:
    config:
      file: "/etc/alloy/config.alloy"
    flags:
      server.http.listen-addr: "0.0.0.0:12345"

# OTLP ingest over both gRPC (4317) and HTTP (4318), bound to all interfaces.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: "0.0.0.0:4317"
      http:
        endpoint: "0.0.0.0:4318"

# The count connector is an exporter for the traces/logs pipelines and a
# receiver for the metrics pipeline (see the service section).
connectors:
  count:
    spans:
      # No conditions: every span is counted.
      span.count:
        description: "Total number of spans received"
      # Only spans whose status code is 2 (ERROR).
      span.error.count:
        description: "Number of error spans"
        conditions:
          - "status.code == 2"
    logs:
      # No conditions: every log record is counted.
      log.count:
        description: "Total number of log records"
      # Only records at severity number 17 (ERROR) or above.
      log.error.count:
        description: "Number of error log records"
        conditions:
          - "severity_number >= 17"

processors:
  # Default batching.
  batch: {}

  # The count connector emits delta temporality; convert to cumulative for Prometheus.
  deltatocumulative: {}

exporters:
  # Original traces go to Tempo over plaintext OTLP/gRPC.
  otlp/tempo:
    endpoint: "tempo:4317"
    tls:
      insecure: true

  # Original logs go to Loki's OTLP/HTTP endpoint.
  otlphttp/loki:
    endpoint: "http://loki:3100/otlp"

  # Derived count metrics go to Prometheus' OTLP receiver.
  otlphttp/prometheus:
    endpoint: "http://prometheus:9090/api/v1/otlp"
    tls:
      insecure: true

service:
  extensions:
    - alloyengine
  pipelines:
    # Ingest traces and forward to count connector + Tempo
    traces:
      receivers:
        - otlp
      processors:
        - batch
      exporters:
        - count
        - otlp/tempo
    # Ingest logs and forward to count connector + Loki
    logs:
      receivers:
        - otlp
      processors:
        - batch
      exporters:
        - count
        - otlphttp/loki
    # Export derived count metrics to Prometheus
    metrics:
      receivers:
        - count
      processors:
        - deltatocumulative
        - batch
      exporters:
        - otlphttp/prometheus


================================================
FILE: otel-examples/count-connector/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/count-connector/docker-compose.coda.yml
================================================
services:
  # Demo app on the host network, pointed at an OTLP endpoint on localhost.
  demo-app:
    restart: unless-stopped
    network_mode: host
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        PYTHON_VERSION: "${PYTHON_VERSION:-3.11-slim}"
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://localhost:4317"
      OTEL_SERVICE_NAME: "count-connector-demo"


================================================
FILE: otel-examples/count-connector/docker-compose.yml
================================================
services:
  # Loki for log storage; the local loki-config.yaml is mounted over the
  # image's default config path and port 3100 is published to the host.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command: -config.file=/etc/loki/local-config.yaml
    ports:
      - "3100:3100/tcp"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml

  # Prometheus for metrics storage (receives derived count metrics via OTLP);
  # the remote-write receiver is enabled as well.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090/tcp"
    command:
      - "--web.enable-remote-write-receiver"
      - "--web.enable-otlp-receiver"
      - "--enable-feature=native-histograms"
      - "--config.file=/etc/prometheus/prometheus.yml"

  # Tempo for trace storage; only the HTTP port 3200 is published.
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml
    command:
      - "-config.file=/etc/tempo.yaml"
    ports:
      - "3200:3200/tcp"

  # Grafana for visualization: anonymous Admin access, datasources written at startup.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    # Write the datasource provisioning file, then hand over to the image's /run.sh.
    # The Prometheus datasource gets an explicit uid because the Tempo
    # datasource's serviceMap refers to it by uid ('Prometheus'); without it
    # that reference would not match the provisioned datasource.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          uid: Prometheus
          access: proxy
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        EOF
        /run.sh
    depends_on:
      - loki
      - prometheus
      - tempo

  # Alloy in OTel engine mode, started with the collector-style YAML config.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    depends_on:
      - loki
      - prometheus
      - tempo
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
    command:
      - otel
      - --config=/etc/alloy/config-otel.yaml
    ports:
      - "8888:8888"       # OTel engine HTTP server
      - "4317:4317"       # OTLP gRPC
      - "4318:4318"       # OTLP HTTP
      - "12345:12345"     # Alloy UI

  # Demo app that generates traces and logs for count connector; built from ./app.
  demo-app:
    depends_on:
      - alloy
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        PYTHON_VERSION: "${PYTHON_VERSION:-3.11-slim}"
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://alloy:4317"
      OTEL_SERVICE_NAME: "count-connector-demo"
    ports:
      - "8080:8080"


================================================
FILE: otel-examples/count-connector/loki-config.yaml
================================================
# Loki configuration for the demo stack: filesystem storage under /tmp/storage,
# an in-memory ring key-value store and a replication factor of 1.
auth_enabled: false
server:
  # 3100 is the port published by docker-compose.yml and used by the
  # otlphttp/loki exporter and the Grafana datasource.
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info
common:
  instance_addr: 127.0.0.1
  path_prefix: /tmp/storage
  storage:
    filesystem:
      chunks_directory: /tmp/storage/chunks
      rules_directory: /tmp/storage/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
query_range:
  results_cache:
    cache:
      # Embedded results cache, capped at 100 MB.
      embedded_cache:
        enabled: true
        max_size_mb: 100
limits_config:
  metric_aggregation_enabled: true
schema_config:
  configs:
    # Single schema period: TSDB index on the filesystem, schema v13, daily index tables.
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h
pattern_ingester:
  enabled: true
  metric_aggregation:
    # Points back at this instance's own HTTP port (3100 above).
    loki_address: localhost:3100
ruler:
  # NOTE(review): no Alertmanager service is defined in this scenario's compose file.
  alertmanager_url: http://localhost:9093
frontend:
  encoding: protobuf


================================================
FILE: otel-examples/count-connector/prom-config.yaml
================================================
# Prometheus configuration for the count-connector demo. No scrape jobs are
# defined in this file; it only sets global intervals, OTLP resource-attribute
# promotion and an out-of-order window.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
otlp:
  # Resource attributes to promote for OTLP-ingested metrics.
  promote_resource_attributes:
    - service.instance.id
    - service.name
    - service.namespace
    - service.version
    - deployment.environment
storage:
  tsdb:
    # Out-of-order samples are tolerated within a 30-minute window.
    out_of_order_time_window: 30m


================================================
FILE: otel-examples/count-connector/tempo-config.yaml
================================================
# Tempo configuration for the count-connector demo: OTLP ingest on 4317 (gRPC)
# and 4318 (HTTP), HTTP server on 3200, traces stored on local disk, and a
# metrics generator that remote-writes to Prometheus.
stream_over_http_enabled: true
server:
  # 3200 is published by docker-compose.yml and used by the Grafana Tempo datasource.
  http_listen_port: 3200
  log_level: info
distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          # Address targeted by the otlp/tempo exporter in config-otel.yaml.
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
ingester:
  max_block_duration: 5m
compactor:
  compaction:
    # 720h = 30 days.
    block_retention: 720h
metrics_generator:
  registry:
    # Labels attached to the generated metrics.
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      # Prometheus is started with --web.enable-remote-write-receiver in docker-compose.yml.
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true

storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      # Generator processors enabled by default.
      processors: [service-graphs, span-metrics, local-blocks]
      generate_native_histograms: both


================================================
FILE: otel-examples/filelog-processing/README.md
================================================
# Filelog Processing

Demonstrates the OTel Collector **filelog receiver** with operator chains to parse mixed-format log files. A log generator writes both JSON and plaintext log lines to a shared volume, and Alloy (running the OTel engine) reads, parses, and ships them to Loki.

## What This Demonstrates

- **Filelog receiver** reading log files from disk using glob patterns
- **Conditional operator chains** that detect log format and apply the correct parser (JSON vs regex)
- **Severity parsing** to map log levels to OTel severity
- **Resource attribute injection** to tag all logs with a service name
- Exporting parsed logs to **Loki via OTLP/HTTP**

## Prerequisites

- Docker and Docker Compose

## Run

```bash
docker compose up -d
```

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the Alloy UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

1. Open Grafana at [http://localhost:3000](http://localhost:3000) (no login required).
2. Go to **Explore** and select the **Loki** datasource.
3. Try these LogQL queries:

```logql
{service_name="log-demo"}
```

```logql
{service_name="log-demo"} | json
```

```logql
{service_name="log-demo"} |= "ERROR"
```

4. Observe that both JSON and plaintext lines are ingested, with severity levels and timestamps correctly parsed.

## Key Configuration

The `config-otel.yaml` defines a filelog receiver with chained operators:

- **`json_parser`** (conditional) -- fires when the log line starts with `{`, extracting structured fields and timestamps.
- **`regex_parser`** (conditional) -- fires when the log line starts with a date pattern, capturing timestamp, level, and message.
- **`severity_parser`** -- maps the parsed `level` attribute to OTel severity.
- **`add` operator** -- injects `service.name` as a resource attribute.

Logs are batched and exported to Loki's OTLP endpoint at `http://loki:3100/otlp`.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/filelog-processing/app/generate_logs.py
================================================
"""
Log generator that writes mixed-format log lines to /var/log/app/demo.log.

Alternates between JSON and plaintext formats with random log levels
to exercise the filelog receiver's operator chains.
"""

import json
import os
import random
import time
from datetime import datetime, timezone

# Output location; the directory is created by main() if it does not exist.
LOG_DIR = "/var/log/app"
LOG_FILE = os.path.join(LOG_DIR, "demo.log")

# Levels are drawn uniformly from this list, so INFO (listed three times)
# is three times as likely as any other level.
LEVELS = ["DEBUG", "INFO", "INFO", "INFO", "WARN", "ERROR"]

# (message, extra fields) pairs for JSON lines; the extra fields are merged
# into the top level of the JSON record.
JSON_MESSAGES = [
    ("User logged in", {"user_id": "u123", "region": "us-east"}),
    ("Order placed", {"order_id": "ord-9876", "amount": 49.99}),
    ("Cache hit", {"cache_key": "session:abc", "ttl": 300}),
    ("Payment processed", {"user_id": "u456", "method": "credit_card"}),
    ("Item shipped", {"order_id": "ord-5432", "carrier": "fedex"}),
    ("User signed up", {"user_id": "u789", "plan": "premium"}),
]

# Message bodies for plaintext lines ("<timestamp> <level> <message>").
PLAIN_MESSAGES = [
    "Failed to process request for user u456",
    "Connection timeout reaching database primary",
    "Rate limit exceeded for API key ak-1234",
    "Scheduled cleanup completed, removed 42 expired sessions",
    "Health check passed for service order-api",
    "Retrying failed webhook delivery attempt 3/5",
    "Disk usage at 78% on volume /data",
]


def write_json_line(f, level):
    """Write one JSON log line to ``f``.

    The record carries a UTC timestamp with millisecond precision and a
    trailing "Z", the given level, a randomly chosen message, and that
    message's extra fields merged in at the top level.
    """
    message, extra_fields = random.choice(JSON_MESSAGES)
    now = datetime.now(timezone.utc)
    # strftime yields microseconds; drop the last three digits to get milliseconds.
    stamp = now.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
    record = {"timestamp": stamp, "level": level, "message": message}
    record.update(extra_fields)
    f.write(json.dumps(record) + "\n")


def write_plain_line(f, level):
    """Write one plaintext log line to ``f``: "<UTC time with ,milliseconds> <level> <message>"."""
    now = datetime.now(timezone.utc)
    # strftime yields microseconds; drop the last three digits to get milliseconds.
    stamp = now.strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
    text = random.choice(PLAIN_MESSAGES)
    f.write(f"{stamp} {level} {text}\n")


def main():
    """Append one log line every 2 seconds, forever, picking JSON or plaintext with equal odds."""
    os.makedirs(LOG_DIR, exist_ok=True)
    print(f"Writing logs to {LOG_FILE}")

    while True:
        level = random.choice(LEVELS)
        # Open and close the file on every iteration so each line is
        # written out before the sleep.
        with open(LOG_FILE, "a") as f:
            writer = write_json_line if random.random() < 0.5 else write_plain_line
            writer(f, level)
        time.sleep(2)


if __name__ == "__main__":
    main()


================================================
FILE: otel-examples/filelog-processing/config-otel.yaml
================================================
#
# OTel Collector YAML: File Log Processing
#
# Demonstrates the filelog receiver with operator chains for parsing
# mixed-format log files (JSON and plaintext lines).
#

# Alloy engine extension: loads config.alloy and serves the Alloy HTTP
# listener on port 12345 on all interfaces.
extensions:
  alloyengine:
    config:
      file: "/etc/alloy/config.alloy"
    flags:
      server.http.listen-addr: "0.0.0.0:12345"

# Tail every *.log file under /var/log/app and run each line through the
# operator chain below, in order.
receivers:
  filelog:
    include:
      - /var/log/app/*.log
    operators:
      # Lines starting with "{" are parsed as JSON into attributes; the
      # "timestamp" field (e.g. 2024-01-01T12:00:00.123Z) becomes the record time.
      - type: json_parser
        if: body matches "^\\{"
        parse_from: body
        parse_to: attributes
        timestamp:
          parse_from: attributes.timestamp
          layout: "%Y-%m-%dT%H:%M:%S.%fZ"

      # Lines starting with a YYYY-MM-DD date are plaintext:
      # "<date> <time>,<millis> <level> <message>" is split into the
      # timestamp, level and message capture groups.
      - type: regex_parser
        if: body matches "^\\d{4}-\\d{2}-\\d{2}"
        regex: "^(?P<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}) (?P<level>\\w+) (?P<message>.*)"
        timestamp:
          parse_from: attributes.timestamp
          layout: "%Y-%m-%d %H:%M:%S,%f"

      # Map severity from the parsed level; skipped when neither parser
      # produced a "level" attribute.
      - type: severity_parser
        parse_from: attributes.level
        if: attributes.level != nil

      # Add a static service.name resource attribute to every record.
      - type: add
        field: resource["service.name"]
        value: log-demo

processors:
  # Flush a batch after 2 seconds or 256 records.
  batch:
    send_batch_size: 256
    timeout: "2s"

exporters:
  # Parsed logs go to Loki's OTLP/HTTP endpoint.
  otlphttp/loki:
    endpoint: "http://loki:3100/otlp"

service:
  extensions:
    - alloyengine
  pipelines:
    # Single logs pipeline: file tailing -> batching -> Loki.
    logs:
      receivers:
        - filelog
      processors:
        - batch
      exporters:
        - otlphttp/loki


================================================
FILE: otel-examples/filelog-processing/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/filelog-processing/docker-compose.coda.yml
================================================
services:
  # Runs generate_logs.py on the host network. The host directory
  # /var/log/alloy-demo is mounted at /var/log/app inside the container.
  log-generator:
    image: python:${PYTHON_VERSION:-3.11-slim}
    command:
      - python3
      - /app/generate_logs.py
    volumes:
      - ./app/generate_logs.py:/app/generate_logs.py
      - /var/log/alloy-demo:/var/log/app
    network_mode: host
    restart: unless-stopped


================================================
FILE: otel-examples/filelog-processing/docker-compose.yml
================================================
# Demo stack: log-generator -> (shared volume) -> Alloy (OTel engine) -> Loki -> Grafana.
# The top-level `version` key is intentionally omitted: Compose v2 treats it
# as obsolete and only emits a warning when it is present.

services:
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "8888:8888"
      - "12345:12345"   # Alloy UI
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
      # Shared with log-generator so the filelog receiver can tail its output.
      - app-logs:/var/log/app
    command: otel --config=/etc/alloy/config-otel.yaml
    depends_on:
      - loki

  log-generator:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./app/generate_logs.py:/app/generate_logs.py
      - app-logs:/var/log/app
    command: ["python3", "/app/generate_logs.py"]
    depends_on:
      - alloy

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    depends_on:
      - loki
    # Write the Loki datasource provisioning file, then hand over to Grafana's
    # regular startup script.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

volumes:
  app-logs:


================================================
FILE: otel-examples/filelog-processing/loki-config.yaml
================================================
# Loki configuration for the file-log demo (mounted into the loki container
# by docker-compose.yml).

# Authentication is switched off for this demo.
auth_enabled: false
server:
  # 3100 is the port published by docker-compose.yml and targeted by the
  # collector's otlphttp/loki exporter.
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info
common:
  instance_addr: 127.0.0.1
  # All state lives under /tmp/storage on the container filesystem.
  path_prefix: /tmp/storage
  storage:
    filesystem:
      chunks_directory: /tmp/storage/chunks
      rules_directory: /tmp/storage/rules
  replication_factor: 1
  ring:
    kvstore:
      # In-memory ring store, paired with replication_factor 1 above.
      store: inmemory
query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100
limits_config:
  metric_aggregation_enabled: true
schema_config:
  configs:
    # One schema period: TSDB index on filesystem object storage, schema v13.
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h
pattern_ingester:
  enabled: true
  metric_aggregation:
    loki_address: localhost:3100
ruler:
  # NOTE(review): no Alertmanager service is defined in this scenario's
  # compose file; this URL looks like a leftover default -- confirm.
  alertmanager_url: http://localhost:9093
frontend:
  encoding: protobuf


================================================
FILE: otel-examples/host-metrics/README.md
================================================
# Host Metrics with OTel Hostmetrics Receiver

Collect CPU, memory, disk, filesystem, network, and process metrics using the OpenTelemetry `hostmetrics` receiver -- an OTel-native replacement for Prometheus node_exporter. Metrics are exported via OTLP to Prometheus.

## What This Demonstrates

- **Hostmetrics receiver**: Collects system-level metrics without a separate exporter binary
- **Scrapers**: CPU (with utilization), memory (with utilization), disk, filesystem, network, load, and process scrapers
- **Resource detection**: Automatically adds host metadata (hostname, OS type) to all metrics
- **OTLP export to Prometheus**: Metrics are sent via OTLP to Prometheus's native OTLP receiver
- **Stress testing**: A stress container generates CPU and memory load to produce interesting metric data

## Metrics Collected

| Scraper    | Example Metrics                                                    |
|------------|-------------------------------------------------------------------|
| CPU        | `system_cpu_time`, `system_cpu_utilization`                        |
| Memory     | `system_memory_usage`, `system_memory_utilization`                 |
| Disk       | `system_disk_io`, `system_disk_operations`                         |
| Filesystem | `system_filesystem_usage`, `system_filesystem_utilization`         |
| Network    | `system_network_io`, `system_network_packets`                      |
| Load       | `system_cpu_load_average_1m`, `system_cpu_load_average_5m`         |
| Process    | `process_cpu_time`, `process_memory_physical_usage`                |

## Prerequisites

- Docker and Docker Compose
- Linux host (hostmetrics requires access to `/proc` and `/sys`)

## Run

```bash
docker compose up -d
```

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the Alloy UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

Open Grafana at [http://localhost:3000](http://localhost:3000) and go to **Explore > Prometheus**.

### Sample PromQL Queries

**CPU utilization:**
```promql
system_cpu_utilization{state="user"}
```

**Memory usage (bytes):**
```promql
system_memory_usage{state="used"}
```

**Disk I/O rate:**
```promql
rate(system_disk_io_total[5m])
```

**Network bytes transmitted:**
```promql
rate(system_network_io_total{direction="transmit"}[5m])
```

**System load averages:**
```promql
system_cpu_load_average_1m
```

**Top processes by CPU:**
```promql
topk(10, rate(process_cpu_time_total[5m]))
```

## Key Configuration

The `config-otel.yaml` configures:

1. **`hostmetrics` receiver**: Enables all major scrapers with 15s collection interval. CPU and memory utilization metrics are explicitly enabled.
2. **`resourcedetection` processor**: Uses `env` and `system` detectors to add hostname and OS metadata.
3. **`otlphttp/prometheus` exporter**: Sends metrics via OTLP to Prometheus's native OTLP endpoint.

The Alloy container runs with `pid: host` and mounts `/proc`, `/sys`, and `/` from the host to enable full system visibility.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/host-metrics/config-otel.yaml
================================================
#
# OTel Collector YAML: Host Metrics Collection
#
# Uses the hostmetrics receiver to collect CPU, memory, disk, filesystem,
# network, load, and process metrics -- an OTel-native replacement for
# Prometheus node_exporter -- and pushes them to Prometheus over OTLP/HTTP.
#

extensions:
  # Embedded Alloy engine: loads config.alloy and serves the Alloy HTTP
  # server on port 12345.
  alloyengine:
    flags:
      server.http.listen-addr: 0.0.0.0:12345
    config:
      file: /etc/alloy/config.alloy

receivers:
  hostmetrics:
    collection_interval: 15s
    scrapers:
      # Scrapers that run with their default metric set.
      disk: {}
      filesystem: {}
      load: {}
      network: {}
      # The utilization metrics are switched on explicitly here.
      cpu:
        metrics:
          system.cpu.utilization:
            enabled: true
      memory:
        metrics:
          system.memory.utilization:
            enabled: true
      # Match every process name; the mute_* flags silence per-process
      # read errors (exe / io / user).
      process:
        include:
          match_type: regexp
          names:
            - ".*"
        mute_process_exe_error: true
        mute_process_io_error: true
        mute_process_user_error: true

processors:
  # Detect host metadata automatically
  resourcedetection:
    detectors:
      - env
      - system
    system:
      hostname_sources:
        - "os"

  batch:
    send_batch_size: 512
    timeout: 10s

exporters:
  otlphttp/prometheus:
    endpoint: http://prometheus:9090/api/v1/otlp
    tls:
      insecure: true

service:
  extensions:
    - alloyengine
  pipelines:
    metrics:
      receivers:
        - hostmetrics
      processors:
        - resourcedetection
        - batch
      exporters:
        - otlphttp/prometheus


================================================
FILE: otel-examples/host-metrics/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/host-metrics/docker-compose.coda.yml
================================================
services:
  # Synthetic load: one CPU worker plus one memory worker allocating 64M.
  stress:
    image: polinux/stress@sha256:b6144f84f9c15dac80deb48d3a646b55c7043ab1d83ea0a697c09097aaad21aa
    command:
      - stress
      - --cpu
      - "1"
      - --vm
      - "1"
      - --vm-bytes
      - 64M


================================================
FILE: otel-examples/host-metrics/docker-compose.yml
================================================
# Demo stack: Alloy (OTel engine, hostmetrics) -> Prometheus (OTLP receiver) -> Grafana,
# plus a stress container that generates CPU/memory load.
# The top-level `version` key is intentionally omitted: Compose v2 treats it
# as obsolete and only emits a warning when it is present.

services:
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      # Accept OTLP pushes on /api/v1/otlp (target of the collector exporter).
      - --web.enable-otlp-receiver
      - --enable-feature=native-histograms
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "8888:8888"
      - "12345:12345"   # Alloy UI
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
      # Read-only views of the host so the scrapers see host data rather
      # than the container's own.
      - /proc:/hostfs/proc:ro
      - /sys:/hostfs/sys:ro
      - /:/hostfs:ro
    environment:
      # Point the HOST_* locations at the bind mounts above.
      - HOST_PROC=/hostfs/proc
      - HOST_SYS=/hostfs/sys
      - HOST_ETC=/hostfs/etc
      - HOST_VAR=/hostfs/var
      - HOST_RUN=/hostfs/run
    pid: host
    command: otel --config=/etc/alloy/config-otel.yaml
    depends_on:
      - prometheus

  stress:
    image: polinux/stress@sha256:b6144f84f9c15dac80deb48d3a646b55c7043ab1d83ea0a697c09097aaad21aa
    command: ["stress", "--cpu", "1", "--vm", "1", "--vm-bytes", "64M"]

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    depends_on:
      - prometheus
    # Write the Prometheus datasource provisioning file, then hand over to
    # Grafana's regular startup script.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: otel-examples/host-metrics/prom-config.yaml
================================================
# Prometheus configuration for the host-metrics demo. No scrape_configs are
# defined here; metrics arrive via the OTLP receiver enabled on the command
# line in docker-compose.yml.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
otlp:
  # Resource attributes to promote onto the ingested series.
  promote_resource_attributes:
    - service.instance.id
    - service.name
    - service.namespace
    - service.version
    - deployment.environment
    - host.name
    - os.type
storage:
  tsdb:
    # Window within which out-of-order samples are still accepted.
    out_of_order_time_window: 30m


================================================
FILE: otel-examples/kafka-buffer/README.md
================================================
# Kafka-Buffered Trace Pipeline

Demonstrates using Apache Kafka as a durable buffer in an OpenTelemetry trace pipeline. Alloy runs both the agent tier (OTLP receiver to Kafka) and the gateway tier (Kafka to Tempo) in a single collector instance, showcasing the two-tier architecture pattern.

## What This Demonstrates

- **Kafka as a durable buffer**: Traces are written to Kafka before being exported to Tempo, providing resilience against backend outages
- **Two-tier collector architecture**: The agent tier ingests OTLP and writes to Kafka; the gateway tier reads from Kafka and exports to Tempo
- **Single-collector demo**: Both tiers run in one Alloy instance for simplicity, but in production these would be separate deployments
- **KRaft mode Kafka**: Uses the official Apache Kafka image (`apache/kafka`) in KRaft mode (no ZooKeeper required)
- **Auto topic creation**: The `otlp-traces` topic is created automatically on first write

## Architecture

```
App --OTLP--> Alloy (agent tier) --Kafka--> Alloy (gateway tier) --OTLP--> Tempo
```

In this demo, both tiers are the same Alloy instance with two separate pipelines:

1. **`traces/ingest`**: `otlp` receiver -> `kafka` exporter
2. **`traces/export`**: `kafka` receiver -> `batch` processor -> `otlp/tempo` exporter

## Prerequisites

- Docker and Docker Compose

## Run

```bash
docker compose up -d
```

Wait about 30 seconds for Kafka to initialize before traces start flowing.

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the Alloy UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

Open Grafana at [http://localhost:3000](http://localhost:3000) and go to **Explore > Tempo**.

Search for traces from `kafka-buffer-demo`. You should see traces for HTTP endpoints (`/api/items`, `/api/checkout`, `/api/health`) with database query child spans.

### Demonstrate Resilience

The key benefit of the Kafka buffer is resilience. Try this experiment:

1. Let the demo run for a minute to generate some traces
2. Stop Tempo: `docker compose stop tempo`
3. Wait 30 seconds (traces are buffering in Kafka)
4. Restart Tempo: `docker compose start tempo`
5. Check Grafana -- the buffered traces should appear in Tempo

This works because Kafka retains messages until the consumer (gateway tier) successfully reads them.

## Key Configuration

The `config-otel.yaml` defines:

1. **`kafka` exporter**: Writes OTLP-encoded trace data to the `otlp-traces` Kafka topic
2. **`kafka` receiver**: Reads from the same topic and deserializes traces
3. **Two pipelines**: `traces/ingest` (app -> Kafka) and `traces/export` (Kafka -> Tempo)

The Kafka exporter uses `otlp_proto` encoding, which preserves full trace fidelity through the buffer.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/kafka-buffer/app/Dockerfile
================================================
# Base image tag, digest-pinned by default; override with
# --build-arg PYTHON_VERSION=<tag> (the compose files do this).
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}
WORKDIR /app
# Copy the dependency list on its own so the install layer is reused when
# only app.py changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
CMD ["python", "app.py"]


================================================
FILE: otel-examples/kafka-buffer/app/app.py
================================================
"""
Flask app generating traces for the Kafka buffer demo.

Produces varied HTTP traces that flow through the Alloy pipeline:
  app -> OTLP -> Alloy -> Kafka -> Alloy -> Tempo

A background thread generates continuous load against the Flask endpoints.
"""

import random
import time
import threading

from flask import Flask, jsonify
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.resources import Resource

import os

# Resource attributes attached to every span emitted by this service.
resource = Resource.create({
    "service.name": "kafka-buffer-demo",
    "service.version": "1.0.0",
    "deployment.environment": "demo",
})

# Honor OTEL_EXPORTER_OTLP_ENDPOINT when it is set (docker-compose.coda.yml
# sets it to http://localhost:4317 for host networking); otherwise fall back
# to the Alloy service name used by docker-compose.yml. An empty value is
# treated the same as unset.
otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") or "alloy:4317"

tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True))
)
trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer("kafka-demo")

app = Flask(__name__)
# Wrap the Flask app with the OpenTelemetry Flask instrumentation.
FlaskInstrumentor().instrument_app(app)


@app.route("/api/items", methods=["GET"])
def list_items():
    """Return five placeholder items, wrapped in a simulated DB-query span."""
    with tracer.start_as_current_span("query-items-db") as db_span:
        for key, value in (
            ("db.system", "postgresql"),
            ("db.statement", "SELECT * FROM items LIMIT 20"),
        ):
            db_span.set_attribute(key, value)
        # Pretend the query takes 10-40 ms.
        simulated_latency = random.uniform(0.01, 0.04)
        time.sleep(simulated_latency)

    items = []
    for item_id in range(5):
        items.append({"id": item_id, "name": f"item-{item_id}"})
    return jsonify({"items": items})


@app.route("/api/items/<int:item_id>", methods=["GET"])
def get_item(item_id):
    """Return one placeholder item with a random price.

    The lookup is wrapped in a span that carries the simulated SQL statement
    and the requested item id.
    """
    statement = f"SELECT * FROM items WHERE id = {item_id}"
    with tracer.start_as_current_span("query-single-item") as lookup_span:
        lookup_span.set_attribute("db.system", "postgresql")
        lookup_span.set_attribute("db.statement", statement)
        lookup_span.set_attribute("app.item_id", item_id)
        # Pretend the lookup takes 5-20 ms.
        time.sleep(random.uniform(0.005, 0.02))

    price = round(random.uniform(5, 100), 2)
    return jsonify({"id": item_id, "name": f"item-{item_id}", "price": price})


@app.route("/api/checkout", methods=["POST"])
def checkout():
    """Simulate a checkout inside a "process-checkout" span.

    Roughly one request in ten fails with HTTP 500 and error attributes on
    the span; the rest return HTTP 201 with a random order id.
    """
    with tracer.start_as_current_span("process-checkout") as checkout_span:
        cart_size = random.randint(1, 10)
        checkout_span.set_attribute("app.cart_size", cart_size)
        payment_method = random.choice(["credit_card", "paypal", "apple_pay"])
        checkout_span.set_attribute("app.payment_method", payment_method)
        time.sleep(random.uniform(0.05, 0.15))

        # Simulated payment-gateway failure (about 10% of requests).
        payment_failed = random.random() < 0.1
        if payment_failed:
            checkout_span.set_attribute("error", True)
            checkout_span.set_attribute("error.message", "Payment gateway timeout")
            return jsonify({"error": "Payment failed"}), 500

    order = {"order_id": random.randint(10000, 99999), "status": "confirmed"}
    return jsonify(order), 201


@app.route("/api/health")
def health():
    """Health endpoint; always responds with a status of "ok"."""
    payload = {"status": "ok"}
    return jsonify(payload)


def generate_load():
    """Continuously send randomly chosen requests to the local Flask app.

    Runs forever; started from a daemon thread by the ``__main__`` block.
    Each iteration picks one endpoint, issues the request, and then sleeps
    for 0.5-2.0 seconds. Request failures are ignored on purpose.
    """
    import urllib.error
    import urllib.request

    time.sleep(5)  # Wait for Flask to start
    base = "http://localhost:8080"
    endpoints = [
        ("GET", f"{base}/api/items"),
        ("GET", f"{base}/api/items/1"),
        ("GET", f"{base}/api/items/2"),
        ("GET", f"{base}/api/items/3"),
        ("POST", f"{base}/api/checkout"),
        ("GET", f"{base}/api/health"),
    ]

    while True:
        method, url = random.choice(endpoints)
        try:
            req = urllib.request.Request(url, method=method)
            if method == "POST":
                req.add_header("Content-Type", "application/json")
                req.data = b'{"items": [1, 2, 3]}'
            # Close the response so the connection is released every
            # iteration; the timeout keeps a stalled request from blocking
            # the loop indefinitely.
            with urllib.request.urlopen(req, timeout=10) as resp:
                resp.read()
        except urllib.error.HTTPError as err:
            # Non-2xx answers (e.g. the simulated checkout failures) arrive
            # as exceptions that still hold an open response; release it.
            err.close()
        except Exception:
            # Best-effort load generation: any other failure is ignored.
            pass
        time.sleep(random.uniform(0.5, 2.0))


if __name__ == "__main__":
    # Daemon thread: the load generator never keeps the process alive by itself.
    threading.Thread(target=generate_load, daemon=True).start()
    # Listen on all interfaces, port 8080 (the port generate_load targets).
    app.run(host="0.0.0.0", port=8080)


================================================
FILE: otel-examples/kafka-buffer/app/requirements.txt
================================================
# Python dependencies for app.py (installed by the Dockerfile's pip step).
# NOTE(review): versions are unpinned, so image builds are not reproducible
# over time -- consider pinning.
flask
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
# NOTE(review): app.py does not import the requests instrumentation; this
# entry looks unused -- confirm before removing.
opentelemetry-instrumentation-requests


================================================
FILE: otel-examples/kafka-buffer/config-otel.yaml
================================================
#
# OTel Collector YAML: Kafka-Buffered Pipeline (Agent + Gateway)
#
# Demonstrates a two-tier collector architecture:
#   Agent:   otlp receiver -> kafka exporter (writes to Kafka)
#   Gateway: kafka receiver -> batch -> otlp exporter (reads from Kafka, writes to backends)
#
# This config runs BOTH tiers in a single collector for demo purposes,
# using Kafka as a durable buffer between ingest and export.
#

extensions:
  # Embedded Alloy engine: loads config.alloy and serves the Alloy HTTP
  # server on port 12345.
  alloyengine:
    config:
      file: /etc/alloy/config.alloy
    flags:
      server.http.listen-addr: 0.0.0.0:12345

receivers:
  # Tier 1: Accept OTLP from applications
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

  # Tier 2: Read back from Kafka
  kafka:
    brokers:
      - kafka:9092
    protocol_version: "3.0.0"
    traces:
      topic: otlp-traces

processors:
  batch: {}

exporters:
  # Tier 1: Write to Kafka buffer
  kafka:
    brokers:
      - kafka:9092
    protocol_version: "3.0.0"
    # Per-signal settings, mirroring the receiver's `traces` block. The
    # top-level `topic` / `encoding` keys are deprecated in the Kafka
    # exporter in favour of this form.
    traces:
      topic: otlp-traces
      encoding: otlp_proto

  # Tier 2: Write to Tempo
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

service:
  extensions: [alloyengine]
  pipelines:
    # Agent tier: ingest OTLP and buffer to Kafka
    traces/ingest:
      receivers: [otlp]
      exporters: [kafka]
    # Gateway tier: read from Kafka and export to backend
    traces/export:
      receivers: [kafka]
      processors: [batch]
      exporters: [otlp/tempo]


================================================
FILE: otel-examples/kafka-buffer/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/kafka-buffer/docker-compose.coda.yml
================================================
services:
  # Single-node Kafka (combined broker + controller) on the host network,
  # so every listener address is localhost.
  kafka:
    image: apache/kafka:4.2.0@sha256:9516fb7634bad307d17c33b589fde9023003b0cb761374f500002b980a3149b9
    network_mode: host
    restart: unless-stopped
    environment:
      KAFKA_NODE_ID: "0"
      KAFKA_PROCESS_ROLES: "broker,controller"
      KAFKA_CONTROLLER_QUORUM_VOTERS: "0@localhost:9093"
      KAFKA_LISTENERS: "PLAINTEXT://:9092,CONTROLLER://:9093"
      KAFKA_ADVERTISED_LISTENERS: "PLAINTEXT://localhost:9092"
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: "CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT"
      KAFKA_CONTROLLER_LISTENER_NAMES: "CONTROLLER"
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: "1"
      CLUSTER_ID: "kafka-buffer-demo-cluster-001"

  # Trace-generating Flask app, built from ./app.
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        PYTHON_VERSION: "${PYTHON_VERSION:-3.11-slim}"
    network_mode: host
    restart: unless-stopped
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://localhost:4317"


================================================
FILE: otel-examples/kafka-buffer/docker-compose.yml
================================================
# Demo stack: demo-app -> Alloy (OTel engine) -> Kafka -> Alloy -> Tempo -> Grafana.
# The top-level `version` key is intentionally omitted: Compose v2 treats it
# as obsolete and only emits a warning when it is present.

services:
  # Single-node Kafka acting as both broker and controller.
  kafka:
    image: apache/kafka:4.2.0@sha256:9516fb7634bad307d17c33b589fde9023003b0cb761374f500002b980a3149b9
    ports:
      - "9092:9092"
    environment:
      - KAFKA_NODE_ID=0
      - KAFKA_PROCESS_ROLES=broker,controller
      - KAFKA_CONTROLLER_QUORUM_VOTERS=0@kafka:9093
      - KAFKA_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093
      # Advertised under the compose service name, which is what the
      # collector's kafka receiver/exporter connect to.
      - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
      - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT
      - KAFKA_CONTROLLER_LISTENER_NAMES=CONTROLLER
      - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1
      - CLUSTER_ID=kafka-buffer-demo-cluster-001

  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - "3200:3200"
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "8888:8888"
      - "4317:4317"     # OTLP gRPC
      - "4318:4318"     # OTLP HTTP
      - "12345:12345"   # Alloy UI
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
    command: otel --config=/etc/alloy/config-otel.yaml
    depends_on:
      - kafka
      - tempo

  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - "8080:8080"
    depends_on:
      - alloy

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    depends_on:
      - tempo
    # Write the Tempo datasource provisioning file, then hand over to
    # Grafana's regular startup script.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: otel-examples/kafka-buffer/tempo-config.yaml
================================================
# Tempo configuration for the Kafka-buffer demo (mounted at /etc/tempo.yaml
# by docker-compose.yml).
stream_over_http_enabled: true
server:
  # 3200 is the port published by docker-compose.yml and used by the
  # Grafana Tempo datasource.
  http_listen_port: 3200
  log_level: info
distributor:
  receivers:
    # OTLP ingest; the collector's otlp/tempo exporter targets tempo:4317.
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
ingester:
  max_block_duration: 5m
compactor:
  compaction:
    # 720h = 30 days.
    block_retention: 720h
storage:
  trace:
    # Local-disk backend; WAL and blocks both live under /var/tempo.
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks
overrides:
  defaults: {}


================================================
FILE: otel-examples/multi-pipeline-fanout/README.md
================================================
# Multi-Pipeline Fan-Out

Demonstrates sending the same traces to multiple backends with different processing per destination using the OpenTelemetry forward connector. Full-fidelity traces go to a primary Tempo instance, while sampled and attribute-stripped traces go to a secondary instance. This is a common pattern for migrations and tiered storage strategies.

## What This Demonstrates

- **Forward connector**: The `forward/sampled` connector duplicates trace data from one pipeline into another
- **Fan-out pattern**: A single intake pipeline fans out to two export pipelines with independent processing
- **Probabilistic sampling**: The secondary pipeline only keeps 10% of traces
- **Attribute stripping**: The secondary pipeline removes sensitive/large attributes (user agent, cookies, request body) and truncates remaining attributes to 128 characters
- **Dual Tempo instances**: Two independent Tempo backends receiving different subsets and fidelity levels of the same trace data

## Prerequisites

- Docker and Docker Compose

## Run

```bash
docker compose up -d
```

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the Alloy UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

Open Grafana at [http://localhost:3000](http://localhost:3000).

### Compare Primary vs Secondary

1. Go to **Explore** and select **Tempo Primary** datasource
2. Search for traces from `fanout-demo-app`
3. Pick a trace and note the attributes: full `http.request.header.user_agent`, `http.request.header.cookie`, `http.request.body` values
4. Switch datasource to **Tempo Secondary**
5. Search for the same service -- you will see far fewer traces (only ~10%)
6. On traces that do appear, the user agent, cookie, and request body attributes are gone, and remaining attributes are truncated to 128 characters

### What to Look For

| Aspect              | Tempo Primary                  | Tempo Secondary                  |
|---------------------|-------------------------------|----------------------------------|
| Trace volume        | 100% of traces                | ~10% of traces                   |
| Attribute fidelity  | Full (all attributes present) | Stripped (no UA, cookies, body)  |
| Attribute length    | Unlimited                     | Truncated to 128 chars           |

## Key Configuration

The `config-otel.yaml` defines the following pipelines:

1. **`traces/intake`**: Receives OTLP, batches, then exports to both `otlp/tempo-primary` and `forward/sampled`
2. **`traces/sampled`**: Receives from the forward connector, applies probabilistic sampling (10%), strips attributes, and exports to `otlp/tempo-secondary`

The forward connector (`forward/sampled`) acts as the bridge that duplicates data from the intake pipeline to the sampled pipeline.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/multi-pipeline-fanout/app/Dockerfile
================================================
# Base image tag, digest-pinned by default; override with
# --build-arg PYTHON_VERSION=<tag>.
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}
WORKDIR /app
# Copy the dependency list on its own so the install layer is reused when
# only app.py changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
CMD ["python", "app.py"]


================================================
FILE: otel-examples/multi-pipeline-fanout/app/app.py
================================================
"""
Flask app generating varied traces for the multi-pipeline fan-out demo.

Produces traces with large attribute values, user agents, cookies, and
request bodies to demonstrate how the secondary pipeline strips these
while the primary retains full fidelity.
"""

import random
import time
import threading

from flask import Flask, jsonify, request
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.resources import Resource

# Resource attributes attached to every span emitted by this service.
resource = Resource.create(
    {
        "service.name": "fanout-demo-app",
        "service.version": "1.0.0",
        "deployment.environment": "demo",
    }
)

# Spans are batched and sent over plaintext OTLP/gRPC to the Alloy service.
tracer_provider = TracerProvider(resource=resource)
_span_exporter = OTLPSpanExporter(endpoint="alloy:4317", insecure=True)
tracer_provider.add_span_processor(BatchSpanProcessor(_span_exporter))
trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer("fanout-demo")

app = Flask(__name__)
# Wrap the Flask app with the OpenTelemetry Flask instrumentation.
FlaskInstrumentor().instrument_app(app)

# User-Agent header values; generate_load() picks one at random per request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Safari/605.1.15",
    "Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 Chrome/120.0.0.0 Mobile Safari/537.36",
    "curl/8.4.0",
]

# Cookie header values; generate_load() skips the header when the empty
# string is chosen. The first entry carries a 200-character tracking id, so
# it is a deliberately long value.
# NOTE(review): whether these headers end up as span attributes depends on
# how the Flask instrumentation is configured to capture request headers;
# no such setting is visible in this file -- confirm.
COOKIES = [
    "session=abc123def456; preferences=dark_mode; tracking_id=xx-" + "a" * 200,
    "session=xyz789; cart=item1,item2,item3; locale=en-US",
    "",
]


@app.route("/api/orders", methods=["GET"])
def list_orders():
    """Return five placeholder active orders, wrapped in a simulated DB span."""
    with tracer.start_as_current_span("fetch-orders-from-db") as db_span:
        for key, value in (
            ("db.system", "postgresql"),
            ("db.statement", "SELECT * FROM orders WHERE status = 'active'"),
        ):
            db_span.set_attribute(key, value)
        # Pretend the query takes 10-50 ms.
        simulated_latency = random.uniform(0.01, 0.05)
        time.sleep(simulated_latency)

    orders = []
    for order_id in range(5):
        orders.append({"id": order_id, "status": "active"})
    return jsonify({"orders": orders})


@app.route("/api/orders", methods=["POST"])
def create_order():
    """Simulate inserting an order and respond with HTTP 201.

    The span records a synthetic request body containing a 500-character
    ``notes`` field, so the attribute value is deliberately large.
    """
    notes = "x" * 500
    synthetic_body = '{"product": "widget", "qty": 10, "notes": "' + notes + '"}'

    with tracer.start_as_current_span("insert-order") as insert_span:
        insert_span.set_attribute("db.system", "postgresql")
        insert_span.set_attribute("db.statement", "INSERT INTO orders (product, qty) VALUES ($1, $2)")
        insert_span.set_attribute("http.request.body", synthetic_body)
        # Pretend the insert takes 20-80 ms.
        time.sleep(random.uniform(0.02, 0.08))

    created = {"id": random.randint(1000, 9999), "status": "created"}
    return jsonify(created), 201


@app.route("/api/health")
def health():
    """Health endpoint; always responds with a status of "ok"."""
    payload = {"status": "ok"}
    return jsonify(payload)


def generate_load():
    """Background thread body that sends requests to the Flask app forever.

    Every 0.5-2.0 seconds it picks one of the three API endpoints at random,
    attaches a random User-Agent (and sometimes a Cookie) header, and issues
    the request. Failures are ignored on purpose: this is a best-effort
    traffic generator and must keep running while the server starts up or
    if a single request fails.
    """
    import urllib.request

    time.sleep(5)  # Wait for Flask to start
    base = "http://localhost:8080"
    endpoints = [
        ("GET", f"{base}/api/orders"),
        ("POST", f"{base}/api/orders"),
        ("GET", f"{base}/api/health"),
    ]

    while True:
        method, url = random.choice(endpoints)
        try:
            req = urllib.request.Request(url, method=method)
            # Vary the request headers between calls.
            req.add_header("User-Agent", random.choice(USER_AGENTS))
            cookie = random.choice(COOKIES)
            if cookie:
                req.add_header("Cookie", cookie)
            if method == "POST":
                req.add_header("Content-Type", "application/json")
                req.data = b'{"product": "widget", "qty": 1}'
            # Bound the wait and always close the response so this endless
            # loop neither hangs on a stuck request nor leaks connections.
            with urllib.request.urlopen(req, timeout=5) as response:
                response.read()
        except Exception:
            # Deliberately best-effort: skip this iteration and try again.
            pass
        time.sleep(random.uniform(0.5, 2.0))


if __name__ == "__main__":
    # Start the load generator as a daemon thread, then serve on all
    # interfaces; app.run() blocks for the lifetime of the process.
    threading.Thread(target=generate_load, daemon=True).start()
    app.run(host="0.0.0.0", port=8080)


================================================
FILE: otel-examples/multi-pipeline-fanout/app/requirements.txt
================================================
flask
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
opentelemetry-instrumentation-requests


================================================
FILE: otel-examples/multi-pipeline-fanout/config-otel.yaml
================================================
#
# OTel Collector YAML: Multi-Pipeline Fan-Out
#
# Demonstrates sending the same traces to multiple backends with
# different processing per destination: full-fidelity traces to Tempo,
# and sampled traces with reduced attributes to a second store.
# Common for migrations and tiered storage strategies.
#

# The alloyengine extension loads the (empty) Alloy config and serves the
# Alloy HTTP server/UI on port 12345 alongside this OTel pipeline.
extensions:
  alloyengine:
    config:
      file: /etc/alloy/config.alloy
    flags:
      server.http.listen-addr: 0.0.0.0:12345

# OTLP intake on all interfaces: gRPC on 4317, HTTP on 4318.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

# The forward connector is listed as an exporter of traces/intake and as the
# receiver of traces/sampled, linking the two pipelines.
connectors:
  forward/sampled: {}

processors:
  # Default batching; used by both pipelines
  batch: {}

  # For the sampled pipeline: only keep 10% of traces
  probabilistic_sampler:
    sampling_percentage: 10

  # Strip detailed attributes for the sampled/cheap store
  transform/strip:
    # A failing statement is skipped instead of failing the pipeline
    error_mode: ignore
    trace_statements:
      - context: span
        statements:
          - delete_key(attributes, "http.request.header.user_agent")
          - delete_key(attributes, "http.request.header.cookie")
          - delete_key(attributes, "http.request.body")
          # Cap the remaining attribute values at 128 characters
          - truncate_all(attributes, 128)

exporters:
  # Primary destination: full-fidelity traces, plaintext gRPC to Tempo
  otlp/tempo-primary:
    endpoint: tempo:4317
    tls:
      insecure: true

  # Secondary destination: sampled + stripped traces to the second Tempo
  otlp/tempo-secondary:
    endpoint: tempo-secondary:4317
    tls:
      insecure: true

service:
  extensions:
    - alloyengine
  pipelines:
    # Intake: receive once, export to primary and hand off to the
    # forward connector for the sampled pipeline
    traces/intake:
      receivers:
        - otlp
      processors:
        - batch
      exporters:
        - otlp/tempo-primary
        - forward/sampled
    # Sampled: sample, strip attributes, batch, then export to secondary
    traces/sampled:
      receivers:
        - forward/sampled
      processors:
        - probabilistic_sampler
        - transform/strip
        - batch
      exporters:
        - otlp/tempo-secondary


================================================
FILE: otel-examples/multi-pipeline-fanout/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/multi-pipeline-fanout/docker-compose.coda.yml
================================================
# Override for demo-app only: run it on the host network and restart it
# unless it is explicitly stopped.
services:
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    network_mode: host
    restart: unless-stopped
    environment:
      # On the host network the collector is reached via the 4317 port that
      # the alloy service publishes in docker-compose.yml
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317


================================================
FILE: otel-examples/multi-pipeline-fanout/docker-compose.yml
================================================
version: '3.8'

services:
  # Primary trace store (target of the otlp/tempo-primary exporter)
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command:
      - "-config.file=/etc/tempo.yaml"
    ports:
      - "3200:3200"
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml

  # Secondary trace store: same image and config file, HTTP API published
  # on host port 3201
  tempo-secondary:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command:
      - "-config.file=/etc/tempo.yaml"
    ports:
      - "3201:3200"
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml

  # Prometheus with the remote-write receiver enabled
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --enable-feature=native-histograms
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  # Alloy started in OTel mode with the collector YAML
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "8888:8888"
      - "4317:4317"
      - "4318:4318"
      - "12345:12345"  # Alloy UI
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
    command: otel --config=/etc/alloy/config-otel.yaml
    depends_on:
      - tempo
      - tempo-secondary

  # Flask demo application built from ./app
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        PYTHON_VERSION: ${PYTHON_VERSION:-3.11-slim}
    ports:
      - "8080:8080"
    depends_on:
      - alloy

  # Grafana with three provisioned datasources: Prometheus, Tempo Primary
  # (default) and Tempo Secondary.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    # Anonymous access with the Admin role and basic auth disabled
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    depends_on:
      - tempo
      - tempo-secondary
      - prometheus
    # Replace the image entrypoint: write the datasource provisioning file
    # via a heredoc at container start, then run the image's /run.sh.
    # Everything between the EOF markers is written to ds.yaml verbatim.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo Primary
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        - name: Tempo Secondary
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo-secondary:3200
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: otel-examples/multi-pipeline-fanout/prom-config.yaml
================================================
# No scrape_configs are defined here: in this scenario Prometheus is started
# with --web.enable-remote-write-receiver (see docker-compose.yml) and Tempo's
# metrics generator remote-writes to it.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
storage:
  tsdb:
    # Out-of-order ingestion window of 30 minutes
    out_of_order_time_window: 30m


================================================
FILE: otel-examples/multi-pipeline-fanout/tempo-config.yaml
================================================
# Tempo configuration with local storage.
# NOTE(review): docker-compose.yml mounts this same file into both `tempo`
# and `tempo-secondary`, so both instances remote-write generator metrics to
# Prometheus with identical external labels -- confirm that is intended.
stream_over_http_enabled: true
server:
  http_listen_port: 3200
  log_level: info
# OTLP intake on all interfaces: gRPC 4317, HTTP 4318
distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
ingester:
  max_block_duration: 5m
compactor:
  compaction:
    # 720h = 30 days
    block_retention: 720h
# Metrics derived from traces are remote-written to Prometheus
metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true

# Trace blocks and WAL are kept on the container's local filesystem
storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks

# Enable the metrics-generator processors for all tenants
overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics, local-blocks]
      generate_native_histograms: both


================================================
FILE: otel-examples/ottl-transform/README.md
================================================
# OTTL Transform Cookbook

A cookbook of the most useful OpenTelemetry Transformation Language (OTTL) patterns running in Grafana Alloy's OTel engine. Demonstrates JSON body parsing, severity mapping, attribute promotion, truncation, pattern replacement, and conditional transforms.

## What This Demonstrates

- **JSON body parsing**: Log records arrive with JSON string bodies; OTTL parses them and promotes fields to attributes
- **Severity mapping**: String severity levels ("INFO", "WARN", "ERROR") are mapped to proper OTel severity numbers
- **Attribute cleanup**: Promoted fields like `level` and `timestamp` are deleted after extraction
- **Tier labeling**: Trace spans are automatically tagged with `app.tier=frontend` (when `http.target` is present) or `app.tier=backend` (when `db.system` is present)
- **Attribute truncation**: All span attributes are truncated to 256 characters
- **Resource enrichment**: A `deployment.environment=demo` attribute is added to all trace resources

## Prerequisites

- Docker and Docker Compose

## Run

```bash
docker compose up -d
```

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the Alloy UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

### Logs in Loki

Open Grafana at [http://localhost:3000](http://localhost:3000) and go to **Explore > Loki**.

Query to see parsed JSON attributes:

```logql
{service_name="ottl-demo-app"}
```

You should see that JSON fields from the log body (`order_id`, `message`, `amount`, `error_code`, etc.) have been promoted to log attributes. The `level` and `timestamp` fields should be removed after promotion. Severity should be correctly set (INFO=9, WARN=13, ERROR=17).

### Traces in Tempo

Switch to **Explore > Tempo** and search for traces from `ottl-demo-app`.

Look for:
- `app.tier` label on spans: `frontend` for HTTP spans, `backend` for database spans
- Long attribute values (like `http.user_agent` or `db.connection_string`) truncated to 256 characters
- `deployment.environment=demo` on trace resources

## Key Configuration

The `config-otel.yaml` defines three transform processors:

1. **`transform/parse-logs`**: Parses JSON string bodies with `ParseJSON(body)`, maps severity, and cleans up attributes
2. **`transform/traces`**: Adds tier labels based on attribute presence, truncates all attributes to 256 chars
3. **`transform/resources`**: Adds `deployment.environment=demo` to trace resources

These are wired into separate pipelines for traces and logs.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/ottl-transform/app/Dockerfile
================================================
# Base image tag; the default is pinned by digest and can be overridden with
# --build-arg PYTHON_VERSION=... (docker-compose.yml passes one).
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}
# app.py reports startup and errors with print(); unbuffered stdout makes
# those lines show up in `docker logs` immediately.
ENV PYTHONUNBUFFERED=1
WORKDIR /app
# Copy and install dependencies before the app code so this layer is reused
# when only app.py changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
CMD ["python", "app.py"]


================================================
FILE: otel-examples/ottl-transform/app/app.py
================================================
"""
Demo app that sends "messy" telemetry to exercise OTTL transform patterns.

Sends:
- Log records with JSON string bodies (to test JSON parsing + attribute promotion)
- Log records with string severity fields but no severity_number set
- Traces with varied attributes (http.target, db.system, long values)
"""

import json
import time
import random
import logging

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter

from opentelemetry.sdk.resources import Resource

import os  # imported here so this setup block is self-contained

# Resource attributes shared by the tracer and logger providers.
resource = Resource.create({
    "service.name": "ottl-demo-app",
    "service.version": "1.0.0",
})

# Collector endpoint. Honor OTEL_EXPORTER_OTLP_ENDPOINT when it is set (for
# example by a Compose override that runs this app on the host network) and
# fall back to the Compose service name otherwise. An explicit `endpoint=`
# argument takes precedence over the SDK's own environment lookup, so the
# variable has to be read here for it to have any effect.
OTLP_ENDPOINT = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") or "alloy:4317"

# --- Tracing setup ---
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint=OTLP_ENDPOINT, insecure=True))
)
trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer("ottl-demo")

# --- Logging setup ---
# Records logged through the "ottl-demo" logger are exported over OTLP.
logger_provider = LoggerProvider(resource=resource)
logger_provider.add_log_record_processor(
    BatchLogRecordProcessor(OTLPLogExporter(endpoint=OTLP_ENDPOINT, insecure=True))
)
handler = LoggingHandler(logger_provider=logger_provider)
logger = logging.getLogger("ottl-demo")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)


def send_json_log_records():
    """Log one randomly chosen sample event as a JSON-encoded string.

    The record body is the JSON text itself, which gives the collector's
    OTTL statements a string body to parse.
    """
    sample_events = (
        {"timestamp": "2024-01-15T10:30:00Z", "level": "INFO", "message": "Order processed", "order_id": "ORD-123", "amount": 49.99},
        {"timestamp": "2024-01-15T10:30:01Z", "level": "ERROR", "message": "Payment failed", "order_id": "ORD-456", "error_code": "INSUFFICIENT_FUNDS"},
        {"timestamp": "2024-01-15T10:30:02Z", "level": "WARN", "message": "Inventory low", "product_id": "SKU-789", "remaining": 3},
        {"timestamp": "2024-01-15T10:30:03Z", "level": "INFO", "message": "User login", "user_id": "USR-101", "ip": "192.168.1.42"},
        {"timestamp": "2024-01-15T10:30:04Z", "level": "ERROR", "message": "Database timeout", "query": "SELECT * FROM orders", "duration_ms": 30000},
    )
    event = random.choice(sample_events)
    body = json.dumps(event)
    logger.info(body)


def send_traces():
    """Emit two traces whose attributes exercise the trace transforms.

    The first trace is an HTTP-style span with a nested database-style span;
    both carry one deliberately oversized attribute. The second trace is a
    single HTTP-style span with a random status code.
    """
    # Trace 1: HTTP-style parent span (has http.target).
    with tracer.start_as_current_span("GET /api/orders") as http_span:
        for key, value in (
            ("http.method", "get"),
            ("http.target", "/api/orders?page=1&limit=50"),
            ("http.status_code", 200),
            ("http.user_agent", "Mozilla/5.0 " + "x" * 300),  # oversized on purpose
        ):
            http_span.set_attribute(key, value)
        time.sleep(random.uniform(0.01, 0.05))

        # Nested database-style child span (has db.system).
        with tracer.start_as_current_span("SELECT orders") as query_span:
            for key, value in (
                ("db.system", "postgresql"),
                ("db.statement", "SELECT id, status, amount FROM orders WHERE user_id = $1 ORDER BY created_at DESC LIMIT 50"),
                ("db.name", "shop"),
                ("db.operation", "SELECT"),
                # oversized on purpose
                ("db.connection_string", "host=db.internal port=5432 dbname=shop user=app " + "extra_param=value " * 50),
            ):
                query_span.set_attribute(key, value)
            time.sleep(random.uniform(0.02, 0.08))

    # Trace 2: checkout request with a randomly chosen status code.
    with tracer.start_as_current_span("POST /api/checkout") as checkout_span:
        checkout_span.set_attribute("http.method", "post")
        checkout_span.set_attribute("http.target", "/api/checkout")
        status_code = random.choice([200, 201, 400, 500])
        checkout_span.set_attribute("http.status_code", status_code)
        time.sleep(random.uniform(0.05, 0.15))


def main():
    """Send one log record and one batch of traces every 3 seconds, forever.

    An exception from either sender is printed and the loop carries on.
    """
    print("OTTL demo app started. Sending messy telemetry every 3 seconds...")
    senders = (send_json_log_records, send_traces)
    while True:
        try:
            for send in senders:
                send()
        except Exception as exc:
            print(f"Error sending telemetry: {exc}")
        time.sleep(3)


if __name__ == "__main__":
    main()


================================================
FILE: otel-examples/ottl-transform/app/requirements.txt
================================================
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp-proto-grpc


================================================
FILE: otel-examples/ottl-transform/config-otel.yaml
================================================
#
# OTel Collector YAML: OTTL Transform Cookbook
#
# A "cookbook" of the most useful OTTL transformation patterns:
# JSON parsing, attribute promotion, severity mapping, conditional
# transforms, pattern replacement, and key deletion.
#

# The alloyengine extension loads the (empty) Alloy config and serves the
# Alloy HTTP server/UI on port 12345 alongside this OTel pipeline.
extensions:
  alloyengine:
    config:
      file: /etc/alloy/config.alloy
    flags:
      server.http.listen-addr: 0.0.0.0:12345

# OTLP intake on all interfaces: gRPC on 4317, HTTP on 4318.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  # Transform 1: turn JSON string bodies into attributes, then derive severity
  transform/parse-logs:
    error_mode: ignore
    log_statements:
      - context: log
        statements:
          # Only bodies that are strings starting with "{" are parsed; the
          # resulting keys are merged into the record attributes
          - merge_maps(attributes, ParseJSON(body), "upsert") where IsString(body) and IsMatch(body, "^[{]")
          # Severity text comes straight from the "level" field ...
          - set(severity_text, attributes["level"]) where attributes["level"] != nil
          # ... and the severity number from the matching OTel enum
          # (INFO = 9, WARN = 13, ERROR = 17)
          - set(severity_number, SEVERITY_NUMBER_INFO) where attributes["level"] == "INFO"
          - set(severity_number, SEVERITY_NUMBER_WARN) where attributes["level"] == "WARN"
          - set(severity_number, SEVERITY_NUMBER_ERROR) where attributes["level"] == "ERROR"
          # Drop the fields that were only needed for the steps above
          - delete_key(attributes, "level")
          - delete_key(attributes, "timestamp")

  # Transform 2: Enrich and clean trace attributes
  transform/traces:
    error_mode: ignore
    trace_statements:
      - context: span
        statements:
          # Label each span with an application tier based on which
          # attribute it carries; db.system wins if both are present
          - set(attributes["app.tier"], "frontend") where attributes["http.target"] != nil
          - set(attributes["app.tier"], "backend") where attributes["db.system"] != nil
          # Truncate overly long attribute values
          - truncate_all(attributes, 256)
          # Normalize HTTP method to uppercase ("get" -> "GET"). The previous
          # replace_pattern(..., "^(.*)$", "$$1") replaced the value with
          # itself and never changed the case.
          - set(attributes["http.method"], ConvertCase(attributes["http.method"], "upper")) where attributes["http.method"] != nil

  # Transform 3: Add computed resource attributes
  transform/resources:
    error_mode: ignore
    trace_statements:
      - context: resource
        statements:
          # Stamp every trace resource with a fixed environment name
          - set(attributes["deployment.environment"], "demo")

  # Default batching; last processor in both pipelines
  batch: {}

exporters:
  # Traces: plaintext gRPC to Tempo
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  # Logs: OTLP over HTTP to Loki
  otlphttp/loki:
    endpoint: http://loki:3100/otlp

  # Detailed dump of exported records to the collector's own output
  debug:
    verbosity: detailed

service:
  extensions:
    - alloyengine
  pipelines:
    traces:
      receivers:
        - otlp
      processors:
        - transform/traces
        - transform/resources
        - batch
      exporters:
        - otlp/tempo
    logs:
      receivers:
        - otlp
      processors:
        - transform/parse-logs
        - batch
      exporters:
        - otlphttp/loki
        - debug


================================================
FILE: otel-examples/ottl-transform/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/ottl-transform/docker-compose.coda.yml
================================================
# Override for demo-app only: run it on the host network and restart it
# unless it is explicitly stopped.
services:
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    network_mode: host
    restart: unless-stopped
    environment:
      # On the host network the collector is reached via the 4317 port that
      # the alloy service publishes in docker-compose.yml
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317


================================================
FILE: otel-examples/ottl-transform/docker-compose.yml
================================================
version: '3.8'

services:
  # Log store (target of the otlphttp/loki exporter)
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Trace store (target of the otlp/tempo exporter)
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command:
      - "-config.file=/etc/tempo.yaml"
    ports:
      - "3200:3200"
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml

  # Alloy started in OTel mode with the collector YAML
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "8888:8888"
      - "4317:4317"
      - "4318:4318"
      - "12345:12345"  # Alloy UI
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
    command: otel --config=/etc/alloy/config-otel.yaml
    depends_on:
      - loki
      - tempo

  # Telemetry generator built from ./app
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        PYTHON_VERSION: ${PYTHON_VERSION:-3.11-slim}
    depends_on:
      - alloy

  # Grafana with two provisioned datasources: Loki (default) and Tempo.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    # Anonymous access with the Admin role and basic auth disabled
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    depends_on:
      - loki
      - tempo
    # Replace the image entrypoint: write the datasource provisioning file
    # via a heredoc at container start, then run the image's /run.sh.
    # Everything between the EOF markers is written to ds.yaml verbatim.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: otel-examples/ottl-transform/loki-config.yaml
================================================
# Loki configuration for the demo: authentication off, in-memory ring,
# filesystem storage under /tmp/storage.
# NOTE(review): docker-compose.yml mounts only this config file, so the data
# under /tmp/storage presumably does not outlive the container -- confirm.
auth_enabled: false
server:
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info
common:
  instance_addr: 127.0.0.1
  path_prefix: /tmp/storage
  storage:
    filesystem:
      chunks_directory: /tmp/storage/chunks
      rules_directory: /tmp/storage/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100
limits_config:
  metric_aggregation_enabled: true
# TSDB index on the local filesystem, schema v13, daily index period
schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h
pattern_ingester:
  enabled: true
  metric_aggregation:
    loki_address: localhost:3100
ruler:
  alertmanager_url: http://localhost:9093
frontend:
  encoding: protobuf


================================================
FILE: otel-examples/ottl-transform/tempo-config.yaml
================================================
# Tempo configuration with local storage and no metrics generator.
stream_over_http_enabled: true
server:
  http_listen_port: 3200
  log_level: info
# OTLP intake: gRPC 4317, HTTP 4318.
# NOTE(review): the listeners are given as the "tempo" hostname here, while
# the multi-pipeline-fanout scenario's tempo-config.yaml uses "0.0.0.0" --
# confirm whether the difference is intentional.
distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
ingester:
  max_block_duration: 5m
compactor:
  compaction:
    # 720h = 30 days
    block_retention: 720h
# Trace blocks and WAL are kept on the container's local filesystem
storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks
overrides:
  defaults: {}


================================================
FILE: otel-examples/pii-redaction/README.md
================================================
# PII Redaction

Demonstrates using the OTel Collector **transform processor** with OTTL `replace_pattern` statements to scrub personally identifiable information (credit card numbers, email addresses, IP addresses) from traces and logs before they reach storage backends.

## What This Demonstrates

- **Transform processor** with OTTL expressions for pattern-based redaction
- Scrubbing PII from **trace span attributes** (credit cards, emails, IPs)
- Scrubbing PII from **log record bodies** (credit cards, emails)
- A Flask demo app that intentionally emits telemetry containing sensitive data
- Verifying that redacted data arrives in Tempo and Loki with masked values

## Prerequisites

- Docker and Docker Compose

## Run

```bash
docker compose up -d
```

The demo app automatically generates traffic every 3 seconds -- no manual interaction needed.

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the Alloy UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

1. Open Grafana at [http://localhost:3000](http://localhost:3000) (no login required).

### Check Traces (Tempo)

2. Go to **Explore** and select the **Tempo** datasource.
3. Search for traces from `pii-demo-app`.
4. Open a trace and inspect the `process-order` span attributes. You should see:
   - `user.credit_card` = `****-****-****-****`
   - `user.email` = `***@***.***`
   - `client.ip` = `***.***.***.***`

### Check Logs (Loki)

5. Switch to the **Loki** datasource.
6. Run:

```logql
{service_name="pii-demo-app"}
```

7. Log messages should contain masked values like `Payment processed for card ****-****-****-**** by ***@***.***`.

## Key Configuration

The `config-otel.yaml` defines two transform processors:

- **`transform/traces`** -- applies `replace_pattern` on span attributes to mask credit card numbers, emails, and IP addresses using regex.
- **`transform/logs`** -- applies `replace_pattern` on log bodies to mask credit cards and emails.

Both processors use `error_mode: ignore` so a failed match does not block the pipeline.

The pipeline receives OTLP data on ports 4317 (gRPC) and 4318 (HTTP), processes it through the transform stage, then exports traces to Tempo and logs to Loki.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/pii-redaction/app/Dockerfile
================================================
# Base image tag; the default is pinned by digest and can be overridden with
# --build-arg PYTHON_VERSION=... (docker-compose.yml passes one).
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}
WORKDIR /app
# Copy and install dependencies before the app code so this layer is reused
# when only app.py changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
CMD ["python", "app.py"]


================================================
FILE: otel-examples/pii-redaction/app/app.py
================================================
"""
Flask app that generates traces and logs containing PII data.

The PII (credit cards, emails, IPs) should be redacted by the Alloy
transform processor before reaching Loki and Tempo.
"""

import logging
import threading
import time

import requests
from flask import Flask, jsonify
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.instrumentation.flask import FlaskInstrumentor

import os  # imported here so this setup block is self-contained

# --- Resource ---
resource = Resource.create({
    "service.name": "pii-demo-app",
    "service.version": "1.0.0",
})

# --- Collector endpoint ---
# Honor OTEL_EXPORTER_OTLP_ENDPOINT when it is set (for example by a Compose
# override that runs this app on the host network) and fall back to the
# Compose service name otherwise. An explicit `endpoint=` argument takes
# precedence over the SDK's own environment lookup, so the variable has to be
# read here for it to have any effect.
OTLP_ENDPOINT = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") or "alloy:4317"

# --- Traces ---
trace_exporter = OTLPSpanExporter(endpoint=OTLP_ENDPOINT, insecure=True)
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(BatchSpanProcessor(trace_exporter))
trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer(__name__)

# --- Logs ---
# Records of level INFO and above from the "pii-demo" logger are exported
# over OTLP.
log_exporter = OTLPLogExporter(endpoint=OTLP_ENDPOINT, insecure=True)
logger_provider = LoggerProvider(resource=resource)
logger_provider.add_log_record_processor(BatchLogRecordProcessor(log_exporter))
otel_handler = LoggingHandler(level=logging.INFO, logger_provider=logger_provider)

logger = logging.getLogger("pii-demo")
logger.setLevel(logging.INFO)
logger.addHandler(otel_handler)

# --- Flask App ---
app = Flask(__name__)
FlaskInstrumentor().instrument_app(app)

# Sample PII data used in requests. place_order() cycles through these
# entries in order and copies credit_card, email and ip into span attributes
# and the log message.
ORDERS = [
    {
        "user": "alice",
        "credit_card": "4532-1234-5678-9012",
        "email": "alice@example.com",
        "ip": "192.168.1.100",
    },
    {
        "user": "bob",
        "credit_card": "5425-9876-5432-1098",
        "email": "bob@company.org",
        "ip": "10.0.42.7",
    },
    {
        "user": "charlie",
        # 15 digits grouped 4-6-5, unlike the 4-4-4-4 grouping above
        "credit_card": "3782-822463-10005",
        "email": "charlie@startup.io",
        "ip": "172.16.0.55",
    },
]

# Running count of /order requests; selects the next ORDERS entry.
order_index = 0


@app.route("/order", methods=["GET"])
def place_order():
    """Handle one order using the next sample entry in round-robin order.

    The entry's credit card, email and IP are written both to attributes of
    a "process-order" span and into an INFO log message, so the telemetry
    leaving this app contains raw PII.
    """
    global order_index
    slot = order_index % len(ORDERS)
    order_index += 1
    order = ORDERS[slot]

    card = order["credit_card"]
    email = order["email"]
    ip = order["ip"]
    user = order["user"]

    with tracer.start_as_current_span("process-order") as span:
        # Span attributes containing PII
        for key, value in (
            ("user.credit_card", card),
            ("user.email", email),
            ("client.ip", ip),
            ("order.user", user),
        ):
            span.set_attribute(key, value)

        # Log record with PII in the body
        logger.info(f"Payment processed for card {card} by {email} from {ip}")

        return jsonify({"status": "ok", "user": user})


@app.route("/health", methods=["GET"])
def health():
    """Health endpoint; always answers with a static healthy status."""
    payload = {"status": "healthy"}
    return jsonify(payload)


def traffic_generator():
    """Thread body: request /order on the local server every 3 seconds.

    Starts after a 5 second delay so Flask has time to come up. Request
    errors are swallowed so the loop keeps running.
    """
    order_url = "http://localhost:5000/order"
    startup_delay, interval = 5, 3

    time.sleep(startup_delay)  # Wait for Flask to start
    while True:
        try:
            requests.get(order_url, timeout=5)
        except Exception:
            pass
        time.sleep(interval)


if __name__ == "__main__":
    # Start the traffic generator as a daemon thread, then serve on all
    # interfaces; app.run() blocks for the lifetime of the process.
    threading.Thread(target=traffic_generator, daemon=True).start()
    app.run(host="0.0.0.0", port=5000)


================================================
FILE: otel-examples/pii-redaction/app/requirements.txt
================================================
flask
# app.py imports requests directly for its traffic generator, so declare it
# explicitly instead of relying on another package pulling it in.
requests
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
opentelemetry-instrumentation-requests


================================================
FILE: otel-examples/pii-redaction/config-otel.yaml
================================================
#
# OTel Collector YAML: PII Redaction Pipeline
#
# Demonstrates using the transform processor with OTTL to scrub
# sensitive data (credit cards, emails, IPs) from trace attributes
# and log bodies before export.
#

# The alloyengine extension loads the (empty) Alloy config and serves the
# Alloy HTTP server/UI on port 12345 alongside this OTel pipeline.
extensions:
  alloyengine:
    config:
      file: /etc/alloy/config.alloy
    flags:
      server.http.listen-addr: 0.0.0.0:12345

# OTLP intake on all interfaces: gRPC on 4317, HTTP on 4318.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  # Scrub PII from trace span attributes
  transform/traces:
    error_mode: ignore
    trace_statements:
      - context: span
        statements:
          # Mask 16-digit card numbers (4-4-4-4 grouping, optional separators)
          - replace_pattern(attributes["user.credit_card"], "\\d{4}[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}", "****-****-****-****")
          # Mask 15-digit card numbers (4-6-5 grouping, optional separators),
          # such as the "3782-822463-10005" value the demo app sends; the
          # 16-digit pattern above does not match these
          - replace_pattern(attributes["user.credit_card"], "\\d{4}[- ]?\\d{6}[- ]?\\d{5}", "****-****-****-****")
          # Mask email addresses
          - replace_pattern(attributes["user.email"], "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "***@***.***")
          # Mask IP addresses
          - replace_pattern(attributes["client.ip"], "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "***.***.***.***")

  # Scrub PII from log bodies
  transform/logs:
    error_mode: ignore
    log_statements:
      - context: log
        statements:
          # Mask 16-digit card numbers (4-4-4-4 grouping) in log body
          - replace_pattern(body, "\\d{4}[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}", "****-****-****-****")
          # Mask 15-digit card numbers (4-6-5 grouping) in log body; the
          # 16-digit pattern above does not match these
          - replace_pattern(body, "\\d{4}[- ]?\\d{6}[- ]?\\d{5}", "****-****-****-****")
          # Mask email addresses in log body
          - replace_pattern(body, "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "***@***.***")
          # Mask IP addresses in log body (the demo app's message ends with
          # "from <ip>"); runs after the card rules so card digits are
          # already gone
          - replace_pattern(body, "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "***.***.***.***")

  batch: {}

exporters:
  # Traces: plaintext gRPC to Tempo
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  # Logs: OTLP over HTTP to Loki
  otlphttp/loki:
    endpoint: http://loki:3100/otlp

  # Detailed dump of exported records to the collector's own output
  debug:
    verbosity: detailed

service:
  extensions:
    - alloyengine
  pipelines:
    traces:
      receivers:
        - otlp
      processors:
        - transform/traces
        - batch
      exporters:
        - otlp/tempo
    logs:
      receivers:
        - otlp
      processors:
        - transform/logs
        - batch
      exporters:
        - otlphttp/loki
        - debug


================================================
FILE: otel-examples/pii-redaction/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/pii-redaction/docker-compose.coda.yml
================================================
# Defines only the demo-app service, attached to the host network and pointed
# at an OTLP endpoint on localhost:4317.
services:
  demo-app:
    build:
      context: ./app
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    restart: unless-stopped
    network_mode: host
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://localhost:4317"


================================================
FILE: otel-examples/pii-redaction/docker-compose.yml
================================================
# Compose Specification file. The legacy top-level "version" key is omitted:
# Docker Compose v2 ignores it and warns that it is obsolete.
# Port mappings are quoted so they are always read as strings.

services:
  # Log storage; receives OTLP logs from Alloy on port 3100.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Trace storage; only the HTTP query port is published to the host.
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - "3200:3200/tcp"
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml

  # Alloy started with the "otel" subcommand and the OTel YAML pipeline.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "8888:8888"
      - "4317:4317/tcp"
      - "4318:4318/tcp"
      - "12345:12345"   # Alloy UI
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
    command: otel --config=/etc/alloy/config-otel.yaml
    depends_on:
      - loki
      - tempo

  # Demo application built from ./app.
  demo-app:
    build:
      context: ./app
      args:
        PYTHON_VERSION: ${PYTHON_VERSION:-3.11-slim}
    depends_on:
      - alloy

  # Grafana with anonymous admin access; the entrypoint writes the datasource
  # provisioning file and then hands over to the image's /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    depends_on:
      - loki
      - tempo
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: otel-examples/pii-redaction/loki-config.yaml
================================================
# Loki settings for this example: filesystem storage under /tmp/storage and an
# in-memory ring kvstore.
# NOTE(review): auth_enabled=false presumably means requests need no tenant
# header -- confirm against the Loki documentation.
auth_enabled: false
server:
  # HTTP port published by docker-compose.yml and used by both the
  # otlphttp/loki exporter and the Grafana Loki datasource.
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info
common:
  instance_addr: 127.0.0.1
  # docker-compose.yml mounts only the config file into the Loki container,
  # so data written under this path lives in the container filesystem.
  path_prefix: /tmp/storage
  storage:
    filesystem:
      chunks_directory: /tmp/storage/chunks
      rules_directory: /tmp/storage/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100
limits_config:
  metric_aggregation_enabled: true
# Single schema period: TSDB index on filesystem object storage, schema v13,
# one index table per 24h.
schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h
pattern_ingester:
  enabled: true
  metric_aggregation:
    # Points back at this Loki instance's own HTTP port.
    loki_address: localhost:3100
ruler:
  # NOTE(review): no Alertmanager service is defined in docker-compose.yml --
  # this URL looks like a placeholder; confirm.
  alertmanager_url: http://localhost:9093
frontend:
  encoding: protobuf


================================================
FILE: otel-examples/pii-redaction/tempo-config.yaml
================================================
# Tempo settings for this example: OTLP ingest, local block storage.
stream_over_http_enabled: true
server:
  # HTTP port published by docker-compose.yml and used by the Grafana Tempo
  # datasource.
  http_listen_port: 3200
  log_level: info
distributor:
  receivers:
    otlp:
      protocols:
        # The otlp/tempo exporter in config-otel.yaml sends to tempo:4317.
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
ingester:
  max_block_duration: 5m
compactor:
  compaction:
    block_retention: 720h
storage:
  trace:
    backend: local
    # docker-compose.yml mounts only the config file into the Tempo
    # container, so these paths live in the container filesystem.
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks
overrides:
  defaults: {}


================================================
FILE: otel-examples/resource-enrichment/README.md
================================================
# Resource Enrichment

Automatically enrich all telemetry signals with host, OS, and container metadata using the Alloy OTel pipeline -- without changing application code.

## What This Demonstrates

- **`resourcedetection` processor** with `env`, `system`, and `docker` detectors to discover environment metadata
- **`resource` processor** to add custom attributes (`deployment.environment`, `service.namespace`)
- How the collector adds context that apps do not set themselves (hostname, OS type, architecture)
- **Debug exporter** with `detailed` verbosity to inspect enriched resource attributes

## Prerequisites

- Docker and Docker Compose

## Run

```bash
docker compose up -d
```

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the Alloy UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

Open Grafana at [http://localhost:3000](http://localhost:3000).

### Check enriched traces in Tempo

1. Go to Explore > Tempo.
2. Search for traces from `enrichment-demo`.
3. Click on any trace and expand the resource attributes. You should see attributes the app did **not** set:
   - `host.name` -- the collector container's hostname
   - `os.type` -- detected OS
   - `host.arch` -- CPU architecture
   - `deployment.environment` = `demo`
   - `service.namespace` = `otel-examples`

### Check enriched metrics in Prometheus

1. Go to Explore > Prometheus.
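2. Query `app_requests_total` -- the metric labels should include `deployment_environment` and `service_namespace`. Only the resource attributes listed under `promote_resource_attributes` in `prom-config.yaml` are promoted to metric labels; other detected attributes (for example `host.name`) are not.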

### Inspect debug exporter output

```bash
docker compose logs alloy
```

Look for the `debug` exporter output showing the full resource with detected attributes attached.

### Check the Alloy OTel pipeline

Visit the Alloy OTel HTTP server at [http://localhost:8888](http://localhost:8888).

## Key Configuration

The `config-otel.yaml` pipeline uses two processors:

1. **`resourcedetection`** -- Auto-detects environment metadata:
   - `env` detector: reads `OTEL_RESOURCE_ATTRIBUTES` environment variable
   - `system` detector: discovers `host.name`, `os.type`, `host.arch`
   - `docker` detector: discovers container metadata (requires Docker socket mount)
   - `override: false` ensures app-set attributes are not overwritten

2. **`resource`** -- Adds static attributes:
   - `deployment.environment` = `demo`
   - `service.namespace` = `otel-examples`
   - Uses the `upsert` action: each attribute is inserted when it is missing and overwritten when it already exists

Note: The Alloy container mounts `/var/run/docker.sock` read-only to enable the Docker detector.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/resource-enrichment/app/Dockerfile
================================================
# Base image tag, overridable at build time (the compose files pass
# PYTHON_VERSION as a build arg). The default is pinned by digest.
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}
WORKDIR /app
# Copy the dependency list on its own first so the install layer is reused
# when only app.py changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
CMD ["python", "app.py"]


================================================
FILE: otel-examples/resource-enrichment/app/app.py
================================================
"""
Demo Flask app for the resource-enrichment scenario.

A simple app that generates traces and metrics WITHOUT setting host/container
metadata. The Alloy OTel pipeline uses resourcedetection + resource processors
to automatically enrich all signals with environment attributes.
"""

import os
import random
import threading
import time

from flask import Flask, jsonify
from opentelemetry import trace, metrics
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.resources import Resource

# --- OTel Setup (minimal resource - no host/container info) ---
# Only the service identity is declared here; host, OS and container
# attributes are deliberately left for the collector to add.
resource = Resource.create({
    "service.name": "enrichment-demo",
    "service.version": "1.0.0",
})

# Collector endpoint. Both compose files set OTEL_EXPORTER_OTLP_ENDPOINT
# (http://alloy:4317 on the compose network, http://localhost:4317 with host
# networking), so honour it instead of hard-coding the compose service name.
# Falls back to the previous hard-coded value when the variable is unset or
# empty.
OTLP_ENDPOINT = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") or "alloy:4317"

# Traces: spans are batched and sent over plaintext OTLP/gRPC.
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint=OTLP_ENDPOINT, insecure=True))
)
trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer(__name__)

# Metrics: exported every 10 seconds to the same endpoint.
metric_reader = PeriodicExportingMetricReader(
    OTLPMetricExporter(endpoint=OTLP_ENDPOINT, insecure=True),
    export_interval_millis=10000,
)
meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
metrics.set_meter_provider(meter_provider)
meter = metrics.get_meter(__name__)

# Custom metrics recorded by the request handlers below.
request_counter = meter.create_counter("app.requests", description="Total requests")
request_duration = meter.create_histogram("app.request.duration", unit="ms", description="Request duration")

# --- Flask App ---
app = Flask(__name__)
FlaskInstrumentor().instrument_app(app)


@app.route("/api/users")
def list_users():
    """Serve a randomly sized list of mock users.

    Runs inside a "fetch-users" span carrying the generated user count,
    sleeps 10-100 ms to simulate work, then records one request and its
    duration in milliseconds.
    """
    with tracer.start_as_current_span("fetch-users") as span:
        started_at = time.time()
        total = random.randint(1, 50)
        span.set_attribute("user.count", total)
        time.sleep(random.uniform(0.01, 0.1))

        request_counter.add(1, {"endpoint": "/api/users", "method": "GET"})
        elapsed_ms = (time.time() - started_at) * 1000
        request_duration.record(elapsed_ms, {"endpoint": "/api/users"})

        users = [f"user-{i}" for i in range(total)]
        return jsonify({"users": users})


@app.route("/api/items")
def list_items():
    """Serve a randomly sized list of mock items.

    Runs inside a "fetch-items" span carrying the generated item count,
    sleeps 10-150 ms to simulate work, then records one request and its
    duration in milliseconds.
    """
    with tracer.start_as_current_span("fetch-items") as span:
        started_at = time.time()
        total = random.randint(1, 100)
        span.set_attribute("item.count", total)
        time.sleep(random.uniform(0.01, 0.15))

        request_counter.add(1, {"endpoint": "/api/items", "method": "GET"})
        elapsed_ms = (time.time() - started_at) * 1000
        request_duration.record(elapsed_ms, {"endpoint": "/api/items"})

        items = [f"item-{i}" for i in range(total)]
        return jsonify({"items": items})


@app.route("/health")
def health():
    """Report a static healthy status as JSON."""
    payload = {"status": "healthy"}
    return jsonify(payload)


def load_generator():
    """Call the app's own API forever so telemetry flows without clients.

    Waits 5 seconds, then every 2 seconds requests a randomly chosen endpoint
    on http://localhost:8080. Any exception raised by a request is
    discarded, so a failed call never stops the loop.
    """
    import requests

    origin = "http://localhost:8080"
    paths = ["/api/users", "/api/items"]
    time.sleep(5)

    while True:
        try:
            requests.get(origin + random.choice(paths), timeout=5)
        except Exception:
            # Best-effort traffic: ignore the failure and try again later.
            pass
        time.sleep(2)


if __name__ == "__main__":
    # Start the self-traffic generator as a daemon thread, then serve the
    # Flask app in the foreground on all interfaces, port 8080.
    threading.Thread(target=load_generator, daemon=True).start()
    app.run(host="0.0.0.0", port=8080)


================================================
FILE: otel-examples/resource-enrichment/app/requirements.txt
================================================
# Dependencies installed by the Dockerfile (pip install -r requirements.txt).
# No versions are pinned.
flask
requests
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
# NOTE(review): app.py imports only the Flask instrumentation -- confirm
# whether the requests instrumentation below is still needed.
opentelemetry-instrumentation-requests


================================================
FILE: otel-examples/resource-enrichment/config-otel.yaml
================================================
#
# OTel Collector YAML: Resource Enrichment
#
# Demonstrates using the resourcedetection processor to automatically
# discover and attach environment metadata (host, OS, Docker container)
# to all telemetry signals without any app-level changes.
#

# Alloy engine extension: points at the companion Alloy config file and sets
# the Alloy HTTP listen address (port 12345).
extensions:
  alloyengine:
    config:
      file: /etc/alloy/config.alloy
    flags:
      server.http.listen-addr: 0.0.0.0:12345

# OTLP intake on all interfaces: gRPC on 4317, HTTP on 4318.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  # Discover environment metadata with the env, system and docker detectors.
  resourcedetection:
    detectors:
      - env
      - system
      - docker
    system:
      hostname_sources:
        - "os"
      resource_attributes:
        host.name:
          enabled: true
        os.type:
          enabled: true
        host.arch:
          enabled: true
    docker:
      resource_attributes:
        host.name:
          enabled: true
        os.type:
          enabled: true
    timeout: 5s
    # Keep attributes the application already set.
    override: false

  # Static attributes stamped on every resource.
  resource:
    attributes:
      - key: deployment.environment
        value: demo
        action: upsert
      - key: service.namespace
        value: otel-examples
        action: upsert

  batch: {}

exporters:
  # Traces go to Tempo over OTLP/gRPC without TLS.
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  # Metrics go to Prometheus' OTLP HTTP endpoint.
  otlphttp/prometheus:
    endpoint: http://prometheus:9090/api/v1/otlp
    tls:
      insecure: true

  # Prints exported telemetry with detailed verbosity.
  debug:
    verbosity: detailed

service:
  extensions:
    - alloyengine
  pipelines:
    # Both pipelines enrich first (detection, then static attributes) and
    # batch last; only traces are also sent to the debug exporter.
    traces:
      receivers:
        - otlp
      processors:
        - resourcedetection
        - resource
        - batch
      exporters:
        - otlp/tempo
        - debug
    metrics:
      receivers:
        - otlp
      processors:
        - resourcedetection
        - resource
        - batch
      exporters:
        - otlphttp/prometheus


================================================
FILE: otel-examples/resource-enrichment/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/resource-enrichment/docker-compose.coda.yml
================================================
# Defines only the demo-app service, attached to the host network and pointed
# at an OTLP endpoint on localhost:4317.
services:
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        PYTHON_VERSION: ${PYTHON_VERSION:-3.11-slim}
    restart: unless-stopped
    network_mode: host
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://localhost:4317"
      OTEL_SERVICE_NAME: enrichment-demo


================================================
FILE: otel-examples/resource-enrichment/docker-compose.yml
================================================
# Compose Specification file. The legacy top-level "version" key is omitted:
# Docker Compose v2 ignores it and warns that it is obsolete.
# Port mappings are quoted so they are always read as strings.

services:
  # Prometheus for metrics storage (with OTLP receiver)
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --web.enable-otlp-receiver
      - --enable-feature=native-histograms
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090/tcp"
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  # Tempo for trace storage
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - "3200:3200/tcp"
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml

  # Grafana for visualization.
  # The Prometheus datasource declares an explicit uid because the Tempo
  # datasource's serviceMap refers to it by uid ('Prometheus'); without it
  # Grafana assigns a generated uid and that reference resolves to nothing.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          uid: Prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        EOF
        /run.sh
    depends_on:
      - prometheus
      - tempo

  # Alloy in OTel engine mode (with Docker socket for container detection)
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    command: otel --config=/etc/alloy/config-otel.yaml
    ports:
      - "8888:8888"     # OTel engine HTTP server
      - "4317:4317"     # OTLP gRPC
      - "4318:4318"     # OTLP HTTP
      - "12345:12345"   # Alloy UI
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
      - /var/run/docker.sock:/var/run/docker.sock:ro
    depends_on:
      - prometheus
      - tempo

  # Demo app with minimal resource attributes (collector enriches them)
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - "8080:8080"
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=enrichment-demo
    depends_on:
      - alloy


================================================
FILE: otel-examples/resource-enrichment/prom-config.yaml
================================================
# Prometheus configuration for this example. There is no scrape_configs
# section here; docker-compose.yml starts Prometheus with the OTLP and
# remote-write receivers enabled.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
otlp:
  # Resource attributes promoted onto OTLP-ingested metrics. The last two are
  # the ones the `resource` processor in config-otel.yaml adds.
  # NOTE(review): presumably each becomes a label with dots turned into
  # underscores (e.g. deployment_environment) -- confirm against the
  # Prometheus OTLP documentation.
  promote_resource_attributes:
    - service.instance.id
    - service.name
    - service.namespace
    - service.version
    - deployment.environment
storage:
  tsdb:
    # NOTE(review): presumably lets samples up to 30 minutes out of order be
    # ingested -- confirm.
    out_of_order_time_window: 30m


================================================
FILE: otel-examples/resource-enrichment/tempo-config.yaml
================================================
# Tempo settings for this example: OTLP ingest, local block storage, and a
# metrics generator that remote-writes to Prometheus.
stream_over_http_enabled: true
server:
  # HTTP port published by docker-compose.yml and used by the Grafana Tempo
  # datasource.
  http_listen_port: 3200
  log_level: info
distributor:
  receivers:
    otlp:
      protocols:
        # The otlp/tempo exporter in config-otel.yaml sends to tempo:4317.
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
ingester:
  max_block_duration: 5m
compactor:
  compaction:
    block_retention: 720h
metrics_generator:
  registry:
    # Labels attached to the generated metrics.
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    # Target is the Prometheus service; docker-compose.yml starts it with
    # --web.enable-remote-write-receiver.
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true

storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      # Processors enabled by default for the metrics generator.
      processors: [service-graphs, span-metrics, local-blocks]
      generate_native_histograms: both


================================================
FILE: otel-examples/routing-multi-tenant/README.md
================================================
# Routing Multi-Tenant

Demonstrates using the OTel Collector **forward connector** and **filter processor** to route logs from different tenants into separate Loki organizations. A single OTLP intake pipeline fans out to per-tenant pipelines, each filtering by a `tenant` resource attribute and exporting with the correct `X-Scope-OrgID` header.

## What This Demonstrates

- **Forward connector** to fan out logs from one pipeline into multiple downstream pipelines
- **Filter processor** to keep only logs matching a specific tenant
- **Resource processor** to enrich logs with per-tenant attributes
- **Multi-tenant Loki** with `auth_enabled: true` and `X-Scope-OrgID` header routing
- Querying isolated tenant data in Grafana using separate datasources

## Prerequisites

- Docker and Docker Compose

## Run

```bash
docker compose up -d
```

The log generator automatically sends logs for both tenants every 2 seconds.

## Alloy UI

The Alloy pipeline debugging UI is available at [http://localhost:12345](http://localhost:12345). This is enabled by the `alloyengine` extension in `config-otel.yaml`, which runs the Alloy UI alongside the OTel pipeline.

If you prefer a pure OTel config without the Alloy UI, remove the `extensions` block and the `extensions: [alloyengine]` line from `config-otel.yaml`.

## Explore

1. Open Grafana at [http://localhost:3000](http://localhost:3000) (no login required).
2. Go to **Explore**.

### Query team-a logs

3. Select the **Loki (team-a)** datasource and run:

```logql
{service_name="frontend-service"}
```

You should only see logs from team-a (frontend-service messages).

### Query team-b logs

4. Switch to the **Loki (team-b)** datasource and run:

```logql
{service_name="order-service"}
```

You should only see logs from team-b (order-service messages).

### Verify isolation

5. Confirm that team-a's datasource cannot see team-b's logs and vice versa -- this is enforced by Loki's multi-tenant `X-Scope-OrgID` header.

## Key Configuration

The `config-otel.yaml` uses a two-stage architecture built from three pipelines:

1. **Intake pipeline** (`logs/intake`) -- receives all OTLP logs and exports to two forward connectors (`forward/team-a` and `forward/team-b`).
2. **Per-tenant pipelines** (`logs/team-a`, `logs/team-b`) -- each receives from its forward connector, applies a filter processor that drops logs not matching the tenant, enriches with a resource processor, and exports to a tenant-specific Loki exporter with the appropriate `X-Scope-OrgID` header.

The filter processors use `resource.attributes["tenant"] != "team-a"` (and `team-b`) to drop non-matching logs, effectively routing each tenant's data to its own Loki organization.

## Stop

```bash
docker compose down
```


================================================
FILE: otel-examples/routing-multi-tenant/app/generate_logs.py
================================================
"""
Multi-tenant log generator using OTel SDK.

Alternates between sending logs with resource attribute tenant="team-a"
and tenant="team-b" via OTLP gRPC to alloy:4317.
"""

import logging
import time
import random

from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter

# Sample log bodies emitted for tenant "team-a" (picked at random by main()).
TEAM_A_MESSAGES = [
    "Team A: Deployed frontend v2.3.1 to production",
    "Team A: User authentication service healthy",
    "Team A: CDN cache invalidation completed",
    "Team A: A/B test experiment-42 started for 10% of users",
    "Team A: Search index rebuild finished in 23s",
    "Team A: Rate limiter triggered for IP range 10.0.0.0/8",
]

# Sample log bodies emitted for tenant "team-b" (picked at random by main()).
TEAM_B_MESSAGES = [
    "Team B: Payment gateway latency increased to 450ms",
    "Team B: Inventory sync completed for warehouse-west",
    "Team B: Order fulfillment pipeline processed 1,247 orders",
    "Team B: Database replica lag at 120ms",
    "Team B: Shipping label API returned 503, retrying",
    "Team B: Nightly report generation started",
]

# Severity pool for random.choice(); INFO is listed twice, so it is picked
# twice as often as each other level.
LEVELS = [logging.DEBUG, logging.INFO, logging.INFO, logging.WARNING, logging.ERROR]


def create_logger(tenant: str, service_name: str) -> logging.Logger:
    """Create an OTel-instrumented logger for a specific tenant.

    Each call builds its own LoggerProvider whose resource carries
    ``service.name`` and ``tenant``, and attaches it to the stdlib logger
    named ``tenant-<tenant>``.

    The OTLP endpoint is read from ``OTEL_EXPORTER_OTLP_ENDPOINT`` (set by
    docker-compose.coda.yml, where the generator runs on the host network)
    and falls back to ``alloy:4317`` when the variable is unset or empty.

    Args:
        tenant: Value stored in the ``tenant`` resource attribute.
        service_name: Value stored in the ``service.name`` resource attribute.

    Returns:
        A DEBUG-level ``logging.Logger`` that exports records over OTLP/gRPC.
    """
    import os  # stdlib; only needed for the endpoint lookup below

    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") or "alloy:4317"

    resource = Resource.create({
        "service.name": service_name,
        "tenant": tenant,
    })
    exporter = OTLPLogExporter(endpoint=endpoint, insecure=True)
    provider = LoggerProvider(resource=resource)
    provider.add_log_record_processor(BatchLogRecordProcessor(exporter))

    handler = LoggingHandler(level=logging.DEBUG, logger_provider=provider)
    logger = logging.getLogger(f"tenant-{tenant}")
    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)
    return logger


def main():
    """Emit one random log per tenant, alternating forever.

    After a 3-second start-up delay, each cycle logs a team-a message, waits
    one second, logs a team-b message and waits one second more. Level and
    message are drawn at random for every record.
    """
    print("Starting multi-tenant log generator...")
    time.sleep(3)  # Wait for Alloy to be ready

    # (logger, message pool) per tenant, in emission order.
    tenants = (
        (create_logger("team-a", "frontend-service"), TEAM_A_MESSAGES),
        (create_logger("team-b", "order-service"), TEAM_B_MESSAGES),
    )

    while True:
        for tenant_logger, message_pool in tenants:
            severity = random.choice(LEVELS)
            text = random.choice(message_pool)
            tenant_logger.log(severity, text)

            time.sleep(1)


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: otel-examples/routing-multi-tenant/app/requirements.txt
================================================
# Dependencies of generate_logs.py; the compose files install them with pip
# each time the log-generator container starts. No versions are pinned.
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp-proto-grpc


================================================
FILE: otel-examples/routing-multi-tenant/config-otel.yaml
================================================
#
# OTel Collector YAML: Multi-Tenant Routing
#
# Demonstrates using the forward connector to fan out logs into
# multiple pipelines, then filter processors to route by tenant
# attribute. Each tenant gets its own processing and Loki org ID.
#
# Available connectors in Alloy OTel Engine: count, grafanacloud,
# servicegraph, spanmetrics, forward.
#

# Alloy engine extension: points at the companion Alloy config file and sets
# the Alloy HTTP listen address (port 12345).
extensions:
  alloyengine:
    config:
      file: /etc/alloy/config.alloy
    flags:
      server.http.listen-addr: 0.0.0.0:12345

# OTLP intake on all interfaces: gRPC on 4317, HTTP on 4318.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

connectors:
  # One forward connector per tenant; each feeds that tenant's pipeline.
  forward/team-a: {}
  forward/team-b: {}

processors:
  batch: {}

  # The condition names the records to DROP: everything whose "tenant"
  # resource attribute is not team-a, which leaves only team-a logs.
  filter/team-a:
    error_mode: ignore
    logs:
      log_record:
        - resource.attributes["tenant"] != "team-a"

  # Same idea for team-b: drop everything that is not team-b.
  filter/team-b:
    error_mode: ignore
    logs:
      log_record:
        - resource.attributes["tenant"] != "team-b"

  # Stamp the team resource attribute on team-a logs.
  resource/team-a:
    attributes:
      - key: team
        value: team-a
        action: upsert

  # Stamp the team resource attribute on team-b logs.
  resource/team-b:
    attributes:
      - key: team
        value: team-b
        action: upsert

exporters:
  # Both exporters target the same Loki OTLP endpoint; only the
  # X-Scope-OrgID header differs.
  otlphttp/loki-team-a:
    endpoint: http://loki:3100/otlp
    headers:
      X-Scope-OrgID: team-a

  otlphttp/loki-team-b:
    endpoint: http://loki:3100/otlp
    headers:
      X-Scope-OrgID: team-b

service:
  extensions:
    - alloyengine
  pipelines:
    # Stage 1: receive everything and hand a copy to each tenant connector.
    logs/intake:
      receivers:
        - otlp
      exporters:
        - forward/team-a
        - forward/team-b
    # Stage 2 (team-a): filter, enrich, batch, export.
    logs/team-a:
      receivers:
        - forward/team-a
      processors:
        - filter/team-a
        - resource/team-a
        - batch
      exporters:
        - otlphttp/loki-team-a
    # Stage 2 (team-b): filter, enrich, batch, export.
    logs/team-b:
      receivers:
        - forward/team-b
      processors:
        - filter/team-b
        - resource/team-b
        - batch
      exporters:
        - otlphttp/loki-team-b


================================================
FILE: otel-examples/routing-multi-tenant/config.alloy
================================================
// Minimal Alloy config to enable the Alloy UI alongside the OTel Engine.
// The OTel pipeline is defined in config-otel.yaml.


================================================
FILE: otel-examples/routing-multi-tenant/docker-compose.coda.yml
================================================
# Defines only the log-generator service, attached to the host network and
# pointed at an OTLP endpoint on localhost:4317.
services:
  log-generator:
    image: python:${PYTHON_VERSION:-3.11-slim}
    restart: unless-stopped
    network_mode: host
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://localhost:4317"
    volumes:
      - ./app/generate_logs.py:/app/generate_logs.py
      - ./app/requirements.txt:/app/requirements.txt
    # Install dependencies at container start, then run the generator.
    command:
      - sh
      - -c
      - "pip install -r /app/requirements.txt && python /app/generate_logs.py"


================================================
FILE: otel-examples/routing-multi-tenant/docker-compose.yml
================================================
# Compose Specification file. The legacy top-level "version" key is omitted:
# Docker Compose v2 ignores it and warns that it is obsolete.
# Port mappings are quoted so they are always read as strings.

services:
  # Multi-tenant Loki (auth_enabled: true in loki-config.yaml).
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Alloy started with the "otel" subcommand and the OTel YAML pipeline.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "8888:8888"
      - "4317:4317/tcp"
      - "4318:4318/tcp"
      - "12345:12345"   # Alloy UI
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
      - ./config.alloy:/etc/alloy/config.alloy
    command: otel --config=/etc/alloy/config-otel.yaml
    depends_on:
      - loki

  # Installs its dependencies at container start, then runs the generator.
  log-generator:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./app/generate_logs.py:/app/generate_logs.py
      - ./app/requirements.txt:/app/requirements.txt
    command:
      - sh
      - -c
      - "pip install -r /app/requirements.txt && python /app/generate_logs.py"
    depends_on:
      - alloy

  # Grafana with one Loki datasource per tenant; each sends its own
  # X-Scope-OrgID header.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    depends_on:
      - loki
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki (team-a)
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
          jsonData:
            httpHeaderName1: X-Scope-OrgID
          secureJsonData:
            httpHeaderValue1: team-a
        - name: Loki (team-b)
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
          jsonData:
            httpHeaderName1: X-Scope-OrgID
          secureJsonData:
            httpHeaderValue1: team-b
        EOF
        /run.sh


================================================
FILE: otel-examples/routing-multi-tenant/loki-config.yaml
================================================
# Loki settings for the multi-tenant example: filesystem storage under
# /tmp/storage and an in-memory ring kvstore.
# Multi-tenant mode: tenants are selected by the X-Scope-OrgID header, which
# the Alloy exporters and the Grafana datasources both send.
auth_enabled: true
server:
  # HTTP port published by docker-compose.yml and used by the otlphttp
  # exporters and the Grafana datasources.
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info
common:
  instance_addr: 127.0.0.1
  # docker-compose.yml mounts only the config file into the Loki container,
  # so data written under this path lives in the container filesystem.
  path_prefix: /tmp/storage
  storage:
    filesystem:
      chunks_directory: /tmp/storage/chunks
      rules_directory: /tmp/storage/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100
limits_config:
  metric_aggregation_enabled: true
# Single schema period: TSDB index on filesystem object storage, schema v13,
# one index table per 24h.
schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h
pattern_ingester:
  enabled: true
  metric_aggregation:
    # Points back at this Loki instance's own HTTP port.
    loki_address: localhost:3100
ruler:
  # NOTE(review): no Alertmanager service is defined in docker-compose.yml --
  # this URL looks like a placeholder; confirm.
  alertmanager_url: http://localhost:9093
frontend:
  encoding: protobuf


================================================
FILE: otel-metrics-pipeline/README.md
================================================
# OTel Metrics Pipeline

Demonstrates a full OpenTelemetry metrics pipeline through Grafana Alloy: a Python application generates OTLP metrics which flow through Alloy (with batching and attribute transformation) into Prometheus, and are visualized in Grafana.

## Overview

The pipeline includes:
- **Python demo app** -- generates counters, histograms, and up-down counters via the OpenTelemetry SDK, sending them as OTLP/gRPC to Alloy.
- **Grafana Alloy** -- receives OTLP metrics, batches them, applies a transform processor (adds a `deployment.environment` resource attribute), and exports via OTLP/HTTP to Prometheus.
- **Prometheus** -- ingests metrics through its native OTLP receiver with native histogram support enabled.
- **Grafana** -- auto-provisioned with a Prometheus datasource for exploring the metrics.

## Running the Demo

1. Clone the repository:
   ```
   git clone https://github.com/grafana/alloy-scenarios.git
   cd alloy-scenarios
   ```

2. Navigate to this example directory:
   ```
   cd otel-metrics-pipeline
   ```

3. Run using Docker Compose:
   ```
   docker compose up -d
   ```

   Or use the centralized image management:
   ```
   cd ..
   ./run-example.sh otel-metrics-pipeline
   ```

4. Access the services:
   - **Grafana**: http://localhost:3000
   - **Alloy UI**: http://localhost:12345
   - **Prometheus**: http://localhost:9090

## What to Expect

After a few seconds the demo app begins emitting metrics. You can explore them in several ways:

- **Prometheus** -- navigate to http://localhost:9090 and query for metrics such as `app_requests_total`, `app_errors_total`, `app_request_duration_milliseconds`, or `app_active_users`. Note that OTLP metric names are translated to Prometheus conventions (dots become underscores, units are appended as suffixes).
- **Grafana Explore** -- open http://localhost:3000/explore, select the Prometheus datasource, and build PromQL queries against the ingested metrics.
- **Alloy pipeline UI** -- visit http://localhost:12345 to inspect the live component graph showing the receiver, batch processor, transform processor, and exporter.

## Metrics Generated

| Metric | Type | Description |
|---|---|---|
| `app.requests.total` | Counter | Total HTTP requests by endpoint, method, and status |
| `app.errors.total` | Counter | Total errors by endpoint |
| `app.request.duration` | Histogram | Request latency in milliseconds |
| `app.active_users` | UpDownCounter | Current active users by region |

## Architecture

```
┌─────────────┐  OTLP/gRPC   ┌───────────────┐  OTLP/HTTP  ┌────────────┐
│  Python App  │─────────────▶│  Grafana Alloy │────────────▶│ Prometheus │
│ (metrics gen)│   :4317      │  (batch +      │   :9090     │            │
└─────────────┘               │   transform)   │             └─────┬──────┘
                              └───────────────┘                    │
                                   :12345                          │
                                 (Alloy UI)                        ▼
                                                             ┌──────────┐
                                                             │ Grafana  │
                                                             │  :3000   │
                                                             └──────────┘
```


================================================
FILE: otel-metrics-pipeline/app/main.py
================================================
import time
import random

from opentelemetry import metrics
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource

# --- OpenTelemetry SDK wiring -------------------------------------------------
# The exporter is created without arguments; the compose files set
# OTEL_EXPORTER_OTLP_ENDPOINT, which is presumably where it gets its target.
provider = MeterProvider(
    resource=Resource.create({"service.name": "demo-metrics-app"}),
    metric_readers=[
        # Export the collected metrics every 5000 ms.
        PeriodicExportingMetricReader(OTLPMetricExporter(), export_interval_millis=5000),
    ],
)
metrics.set_meter_provider(provider)
meter = metrics.get_meter(__name__)

# --- Instruments: one of each metric type used by the demo --------------------
request_counter = meter.create_counter(
    "app.requests.total", description="Total requests", unit="requests"
)
error_counter = meter.create_counter(
    "app.errors.total", description="Total errors", unit="errors"
)
latency_histogram = meter.create_histogram(
    "app.request.duration", description="Request duration", unit="ms"
)
active_users = meter.create_up_down_counter(
    "app.active_users", description="Active users"
)

# Value pools for the simulated traffic. "200" appears four times so that
# successful responses dominate the random picks.
ENDPOINTS = ["/api/users", "/api/orders", "/api/products", "/health"]
METHODS = ["GET", "POST"]
STATUSES = ["200", "200", "200", "200", "404", "500"]


def simulate_tick():
    """Record one simulated request plus one active-user adjustment."""
    endpoint = random.choice(ENDPOINTS)
    method = random.choice(METHODS)
    status = random.choice(STATUSES)
    failed = status == "500"

    request_counter.add(1, {"endpoint": endpoint, "method": method, "status": status})
    if failed:
        error_counter.add(1, {"endpoint": endpoint})

    # Failed requests are simulated as slow (500-2000 ms); all others take 5-500 ms.
    if failed:
        latency = random.uniform(500, 2000)
    else:
        latency = random.uniform(5, 500)
    latency_histogram.record(latency, {"endpoint": endpoint, "method": method})

    # Nudge the active-user gauge up, down, or not at all for a random region.
    delta = random.choice([-1, 0, 1])
    region = random.choice(["us-east", "eu-west"])
    active_users.add(delta, {"region": region})


print("Starting OTLP metrics generator...")
while True:
    simulate_tick()
    time.sleep(1)


================================================
FILE: otel-metrics-pipeline/config-otel.yaml
================================================
#
# OTel Collector YAML Configuration for OpenTelemetry Metrics Pipeline
#
# This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine.
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
#

# Accept OTLP from the demo app over gRPC (4317) and HTTP (4318) on all
# interfaces.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  # Batch processor with no overrides (empty mapping).
  batch: {}

  # Stamp every resource with deployment.environment="demo".
  transform:
    error_mode: ignore
    metric_statements:
      - context: resource
        statements:
          - set(attributes["deployment.environment"], "demo")

exporters:
  # Push metrics to the prometheus service's OTLP path over plain HTTP.
  otlphttp/prometheus:
    endpoint: http://prometheus:9090/api/v1/otlp
    tls:
      insecure: true

service:
  pipelines:
    # Same wiring as config.alloy: receiver -> batch -> transform -> exporter.
    metrics:
      receivers: [otlp]
      processors: [batch, transform]
      exporters: [otlphttp/prometheus]


================================================
FILE: otel-metrics-pipeline/config.alloy
================================================
// Enable Alloy's live debugging feature for this pipeline.
livedebugging {
	enabled = true
}

// Receive OTLP metrics from the demo app.
// Both protocol blocks are left empty, so no listen address is overridden
// here; the compose file publishes 4317 (gRPC) and 4318 (HTTP).
otelcol.receiver.otlp "default" {
	http { }

	grpc { }

	output {
		metrics = [otelcol.processor.batch.default.input]
	}
}

// Batch metrics for efficient export, then hand them to the transform step.
otelcol.processor.batch "default" {
	output {
		metrics = [otelcol.processor.transform.default.input]
	}
}

// Transform metric attributes: set deployment.environment="demo" on every
// resource before export.
otelcol.processor.transform "default" {
	error_mode = "ignore"

	metric_statements {
		context    = "resource"
		statements = [
			"set(attributes[\"deployment.environment\"], \"demo\")",
		]
	}

	output {
		metrics = [otelcol.exporter.otlphttp.prometheus.input]
	}
}

// Export metrics to the prometheus service's OTLP path over plain HTTP.
otelcol.exporter.otlphttp "prometheus" {
	client {
		endpoint = "http://prometheus:9090/api/v1/otlp"

		tls {
			insecure = true
		}
	}
}


================================================
FILE: otel-metrics-pipeline/docker-compose-otel.yml
================================================
# OTel Engine Override
#
# Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML
# config (config-otel.yaml) instead of the Alloy-syntax config.alloy file.
# This file only makes sense layered on top of docker-compose.yml.
#
# Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
# Stop:  docker compose -f docker-compose.yml -f docker-compose-otel.yml down
#

services:
  alloy:
    # Start the OTel Engine with the collector-style config.
    command: otel --config=/etc/alloy/config-otel.yaml
    volumes:
      # Make the collector-style config available inside the container.
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
    ports:
      - 8888:8888      # OTel Engine HTTP server


================================================
FILE: otel-metrics-pipeline/docker-compose.coda.yml
================================================
# Variant of the demo app service that runs on the host network.
services:
  app:
    image: python:${PYTHON_VERSION:-3.11-slim}
    # Host networking: the OTLP endpoint below is reached via localhost.
    network_mode: host
    restart: unless-stopped
    volumes:
      - ./app:/app
    working_dir: /app
    # Dependencies are installed on every container start, then main.py runs.
    command: sh -c "pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-grpc && python3 main.py"
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317


================================================
FILE: otel-metrics-pipeline/docker-compose.yml
================================================

services:
  # Python app that generates OTLP metrics.
  # Dependencies are pip-installed on every container start, then main.py runs.
  app:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./app:/app
    working_dir: /app
    command: sh -c "pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-grpc && python3 main.py"
    environment:
      # Send OTLP to the alloy service's gRPC port on the compose network.
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
    depends_on:
      - alloy

  # Alloy for telemetry pipeline (runs config.alloy mounted from this folder)
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345      # Alloy HTTP server
      - 4317:4317        # OTLP gRPC
      - 4318:4318        # OTLP HTTP
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - prometheus

  # Prometheus for metrics storage.
  # The OTLP receiver flag is what exposes the /api/v1/otlp path that Alloy
  # exports to.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --web.enable-otlp-receiver
      - --enable-feature=native-histograms
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - 9090:9090/tcp
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  # Grafana for visualization.
  # Anonymous access is enabled with the Admin role (demo convenience only).
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # The entrypoint writes a Prometheus datasource provisioning file and then
    # hands over to the image's /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh
    depends_on:
      - prometheus


================================================
FILE: otel-metrics-pipeline/prom-config.yaml
================================================
# No scrape jobs are defined: in this scenario metrics arrive via OTLP push
# from Alloy.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

otlp:
  # Resource attributes to promote on ingested OTLP metrics.
  # deployment.environment is the attribute added by Alloy's transform
  # processor.
  promote_resource_attributes:
    - service.instance.id
    - service.name
    - service.namespace
    - service.version
    - deployment.environment

storage:
  tsdb:
    # Out-of-order window of 30 minutes for incoming samples.
    out_of_order_time_window: 30m


================================================
FILE: otel-span-metrics/README.md
================================================
# OTel Span Metrics (RED Metrics from Traces)

This scenario demonstrates how to generate **RED metrics** (Request rate, Error rate, Duration) from OpenTelemetry traces using Grafana Alloy's `otelcol.connector.spanmetrics` component.

## Overview

Instead of relying on Tempo's built-in metrics generator, this approach uses Alloy's spanmetrics connector to derive metrics directly from trace spans in the telemetry pipeline. This gives you fine-grained control over which dimensions are extracted and how histograms are configured.

### Architecture

```
Flask App ---(OTLP/gRPC)---> Alloy ---> Tempo (traces)
                                |
                                +---> spanmetrics connector ---> Prometheus (RED metrics)
```

### What Gets Generated

The `otelcol.connector.spanmetrics` component produces the following metrics from every span:

- **`duration_milliseconds`** - Histogram of span durations (for latency/duration analysis)
- **`calls`** - Counter of span calls, with `status_code` label (for request rate and error rate)

Additional dimensions extracted: `http.method`, `http.status_code`.

## Running

```bash
# From repo root
./run-example.sh otel-span-metrics

# Or directly
cd otel-span-metrics && docker compose up -d
```

## Accessing the UIs

| Service    | URL                        |
|------------|----------------------------|
| Grafana    | http://localhost:3000      |
| Alloy      | http://localhost:12345     |
| Prometheus | http://localhost:9090      |
| Tempo      | http://localhost:3200      |
| Demo App   | http://localhost:5000      |

## Exploring the Metrics

Once the scenario is running and the load generator has been active for a minute or so, open Grafana and navigate to the **Explore** page with the **Prometheus** datasource. Try these queries:

```promql
# Request rate by service and span name
rate(duration_milliseconds_count[5m])

# Error rate (spans with error status)
rate(calls{status_code="STATUS_CODE_ERROR"}[5m])

# P95 latency by span name
histogram_quantile(0.95, rate(duration_milliseconds_bucket[5m]))
```

## Stopping

```bash
cd otel-span-metrics && docker compose down
```


================================================
FILE: otel-span-metrics/app/load.py
================================================
"""Load generator: continuously sends GET requests to the demo Flask app."""
import random
import time

import requests

# Target URLs on the demo app. The first two are fast; the last is the slow one.
endpoints = ["http://app:5000/", "http://app:5000/api/data", "http://app:5000/api/slow"]

while True:
    # Mostly hit the fast endpoints; roughly 10% of requests go to the slow one.
    url = random.choice(endpoints[:2])
    if random.random() < 0.1:
        url = endpoints[2]

    try:
        requests.get(url, timeout=5)
    except requests.RequestException as exc:
        # Best-effort load: the app may still be starting or may time out.
        # Only request failures are caught, so Ctrl-C / SystemExit still stop
        # the loop instead of being silently swallowed.
        print(f"request to {url} failed: {exc}", flush=True)

    # Random pause between requests to avoid a perfectly regular load pattern.
    time.sleep(random.uniform(0.5, 2.0))


================================================
FILE: otel-span-metrics/app/main.py
================================================
from flask import Flask, jsonify
import random, time

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource

# Identify this process as "demo-app" in the resource attached to its spans.
resource = Resource.create({"service.name": "demo-app"})
provider = TracerProvider(resource=resource)
# No endpoint is passed here; the compose files set
# OTEL_EXPORTER_OTLP_ENDPOINT, which the exporter presumably picks up.
exporter = OTLPSpanExporter()
provider.add_span_processor(BatchSpanProcessor(exporter))
trace.set_tracer_provider(provider)
# Tracer used by the route handlers below to create their spans.
tracer = trace.get_tracer(__name__)

# Flask application whose routes are registered below.
app = Flask(__name__)

@app.route("/")
def index():
    """Fast endpoint: sleep 10-50 ms inside an "index" span and report ok."""
    with tracer.start_as_current_span("index"):
        pause = random.uniform(0.01, 0.05)
        time.sleep(pause)
        body = {"status": "ok"}
        return jsonify(body)

@app.route("/api/data")
def get_data():
    """Data endpoint: sleep 20-100 ms in a "get-data" span; fail about 10% of calls."""
    with tracer.start_as_current_span("get-data"):
        pause = random.uniform(0.02, 0.1)
        time.sleep(pause)

        # Roughly one call in ten raises inside the span to simulate errors.
        should_fail = random.random() < 0.1
        if should_fail:
            raise Exception("Random error")

        body = {"data": [1, 2, 3]}
        return jsonify(body)

@app.route("/api/slow")
def slow():
    """Slow endpoint: sleep 0.5-2.0 s inside a "slow-operation" span."""
    with tracer.start_as_current_span("slow-operation"):
        pause = random.uniform(0.5, 2.0)
        time.sleep(pause)
        body = {"status": "done"}
        return jsonify(body)

# When run as a script, serve on all interfaces on port 5000 (the port the
# compose file publishes and the load generator targets).
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)


================================================
FILE: otel-span-metrics/app/requirements.txt
================================================
flask
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp-proto-grpc


================================================
FILE: otel-span-metrics/config-otel.yaml
================================================
#
# OTel Collector YAML Configuration for Span Metrics (RED Metrics from Traces)
#
# This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine.
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
#

# Accept OTLP from the demo app over gRPC (4317) and HTTP (4318) on all
# interfaces.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  # Batch processor with no overrides (empty mapping).
  batch: {}

# The spanmetrics connector is an exporter of the traces pipeline and a
# receiver of the metrics pipeline (see service.pipelines below).
connectors:
  spanmetrics:
    histogram:
      # Explicit-bucket histogram with no bucket overrides.
      explicit: {}
    # Span attributes requested as extra metric dimensions.
    # NOTE(review): the demo app's hand-written spans do not set http.method
    # or http.status_code -- confirm these dimensions are actually populated.
    dimensions:
      - name: http.method
      - name: http.status_code
    metrics_flush_interval: 5s

exporters:
  # Generated metrics go to the prometheus service's OTLP path over plain HTTP.
  otlphttp/prometheus:
    endpoint: http://prometheus:9090/api/v1/otlp
    tls:
      insecure: true

  # Traces go to the tempo service's OTLP gRPC port without TLS.
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

service:
  pipelines:
    # Traces fan out to both the connector and Tempo.
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [spanmetrics, otlp/tempo]
    # Metrics produced by the connector are forwarded to Prometheus.
    metrics:
      receivers: [spanmetrics]
      exporters: [otlphttp/prometheus]


================================================
FILE: otel-span-metrics/config.alloy
================================================
// Enable Alloy's live debugging feature for this pipeline.
livedebugging {
	enabled = true
}

// Receive OTel traces from the demo app.
// Both protocol blocks are left empty, so no listen address is overridden
// here; the compose file publishes 4317 (gRPC) and 4318 (HTTP).
otelcol.receiver.otlp "default" {
	http { }

	grpc { }

	output {
		traces = [otelcol.processor.batch.default.input]
	}
}

// Batch traces for efficiency, then fan them out to both the spanmetrics
// connector and the Tempo exporter.
otelcol.processor.batch "default" {
	output {
		traces = [
			otelcol.connector.spanmetrics.default.input,
			otelcol.exporter.otlp.tempo.input,
		]
	}
}

// Generate RED metrics from spans.
// The histogram uses explicit buckets with no bucket overrides.
// NOTE(review): the demo app's hand-written spans do not set http.method or
// http.status_code -- confirm these dimensions are actually populated.
otelcol.connector.spanmetrics "default" {
	histogram {
		explicit { }
	}

	dimension {
		name = "http.method"
	}

	dimension {
		name = "http.status_code"
	}

	metrics_flush_interval = "5s"

	output {
		metrics = [otelcol.exporter.otlphttp.prometheus.input]
	}
}

// Send RED metrics to the prometheus service's OTLP path over plain HTTP.
otelcol.exporter.otlphttp "prometheus" {
	client {
		endpoint = "http://prometheus:9090/api/v1/otlp"

		tls {
			insecure = true
		}
	}
}

// Send traces to the tempo service's OTLP gRPC port without TLS.
otelcol.exporter.otlp "tempo" {
	client {
		endpoint = "tempo:4317"

		tls {
			insecure = true
		}
	}
}


================================================
FILE: otel-span-metrics/docker-compose-otel.yml
================================================
# OTel Engine Override
#
# Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML
# config (config-otel.yaml) instead of the Alloy-syntax config.alloy file.
# This file only makes sense layered on top of docker-compose.yml.
#
# Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
# Stop:  docker compose -f docker-compose.yml -f docker-compose-otel.yml down
#

services:
  alloy:
    # Start the OTel Engine with the collector-style config.
    command: otel --config=/etc/alloy/config-otel.yaml
    volumes:
      # Make the collector-style config available inside the container.
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
    ports:
      - 8888:8888      # OTel Engine HTTP server


================================================
FILE: otel-span-metrics/docker-compose.coda.yml
================================================
# Variants of the demo app and load generator that run on the host network.
services:
  app:
    image: python:${PYTHON_VERSION:-3.11-slim}
    # Host networking: the OTLP endpoint below is reached via localhost.
    network_mode: host
    restart: unless-stopped
    volumes:
      - ./app:/app
    working_dir: /app
    # Dependencies are installed on every container start, then main.py runs.
    command: sh -c "pip install -r requirements.txt && python main.py"
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317

  load:
    image: python:${PYTHON_VERSION:-3.11-slim}
    # NOTE(review): load.py targets http://app:5000/...; with host networking
    # the "app" hostname may not resolve -- confirm this works as intended.
    network_mode: host
    restart: unless-stopped
    volumes:
      - ./app:/app
    working_dir: /app
    command: sh -c "pip install requests && python load.py"


================================================
FILE: otel-span-metrics/docker-compose.yml
================================================

services:
  # Python Flask app that generates traces.
  # Dependencies are pip-installed on every container start, then main.py runs.
  app:
    image: python:${PYTHON_VERSION:-3.11-slim}
    ports:
      - 5000:5000/tcp
    volumes:
      - ./app:/app
    working_dir: /app
    command: sh -c "pip install -r requirements.txt && python main.py"
    environment:
      # Send OTLP to the alloy service's gRPC port on the compose network.
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
    depends_on:
      - alloy

  # Load generator to continuously hit the app endpoints.
  # Shares the ./app folder with the app service but runs load.py instead.
  load:
    image: python:${PYTHON_VERSION:-3.11-slim}
    volumes:
      - ./app:/app
    working_dir: /app
    command: sh -c "pip install requests && python load.py"
    depends_on:
      - app

  # Alloy for telemetry pipeline with spanmetrics connector
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345      # Alloy HTTP server
      - 4317:4317        # OTLP gRPC
      - 4318:4318        # OTLP HTTP
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - prometheus
      - tempo

  # Prometheus for metrics collection.
  # The OTLP receiver flag is what exposes the /api/v1/otlp path that Alloy
  # exports to.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --web.enable-otlp-receiver
      - --enable-feature=native-histograms
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - 9090:9090/tcp
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  # Tempo for trace storage
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - 3200:3200/tcp
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml
      # Store the WAL and blocks (both under /var/tempo in tempo-config.yaml)
      # on the named volume whose ownership tempo-init sets to 10001:10001.
      - tempo-data:/var/tempo
    depends_on:
      # Start only after the one-shot chown has finished, so Tempo never sees
      # the volume before its ownership is fixed.
      tempo-init:
        condition: service_completed_successfully
      memcached:
        condition: service_started

  # Init container to set up Tempo storage directories.
  # Runs as root and only executes `chown 10001:10001 /var/tempo` on the
  # tempo-data volume, then exits.
  tempo-init:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    user: root
    entrypoint:
      - "chown"
      - "10001:10001"
      - "/var/tempo"
    volumes:
      - tempo-data:/var/tempo

  # Memcached instance referenced by tempo-config.yaml (memcached:11211).
  memcached:
    image: memcached:1.6@sha256:277e0c4f249b118e95ab10e535bae2fa1af772271d9152f3468e58d59348db56
    container_name: memcached
    ports:
      - "11211:11211"
    # NOTE(review): confirm this image actually reads the MEMCACHED_* variables
    # below; if it does not, pass the equivalent settings via `command`.
    environment:
      - MEMCACHED_MAX_MEMORY=64m
      - MEMCACHED_THREADS=4

  # Grafana for visualization.
  # Anonymous access is enabled with the Admin role (demo convenience only).
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - 3000:3000/tcp
    # The entrypoint writes the datasource provisioning file and then hands
    # over to the image's /run.sh. The Prometheus datasource gets an explicit
    # uid so that the Tempo datasource's serviceMap.datasourceUid reference
    # resolves instead of pointing at a uid that does not exist.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          uid: Prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        EOF
        /run.sh
    depends_on:
      - prometheus
      - tempo

# Named volume mounted at /var/tempo by the tempo-init service.
volumes:
  tempo-data:


================================================
FILE: otel-span-metrics/prom-config.yaml
================================================
# No scrape jobs are defined: in this scenario metrics arrive via OTLP push
# from Alloy's spanmetrics connector.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

otlp:
  # Resource attributes to promote on ingested OTLP metrics.
  promote_resource_attributes:
    - service.instance.id
    - service.name
    - service.namespace
    - service.version

storage:
  tsdb:
    # Out-of-order window of 30 minutes for incoming samples.
    out_of_order_time_window: 30m


================================================
FILE: otel-span-metrics/tempo-config.yaml
================================================
# Tempo configuration for the span-metrics scenario.
stream_over_http_enabled: true
server:
  # Port published by the compose file and used by the Grafana Tempo datasource.
  http_listen_port: 3200
  log_level: info

# Cache for the frontend-search role, backed by the memcached service.
cache:
  background:
    writeback_goroutines: 5
  caches:
  - roles:
    - frontend-search
    memcached:
      addresses: dns+memcached:11211

query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
    metadata_slo:
        duration_slo: 5s
        throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 100ms
  metrics:
    max_duration: 200h
    query_backend_after: 5m
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09

# Receivers exposed by the distributor. In this scenario Alloy exports traces
# to the OTLP gRPC endpoint (tempo:4317); the other receivers are enabled but
# nothing in the compose file sends to them.
distributor:
  receivers:
    jaeger:
      protocols:
        thrift_http:
          endpoint: "tempo:14268"
        grpc:
          endpoint: "tempo:14250"
        thrift_binary:
          endpoint: "tempo:6832"
        thrift_compact:
          endpoint: "tempo:6831"
    zipkin:
      endpoint: "tempo:9411"
    otlp:
      protocols:
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
    opencensus:
      endpoint: "tempo:55678"

ingester:
  max_block_duration: 5m

compactor:
  compaction:
    block_retention: 720h

# Local-disk storage: both the WAL and the blocks live under /var/tempo.
storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks


================================================
FILE: otel-tail-sampling/README.md
================================================
# OpenTelemetry Tail Sampling with Grafana Alloy

This example demonstrates how to implement tail sampling for OpenTelemetry traces using Grafana Alloy, allowing you to intelligently filter and sample traces based on various criteria.

## Overview

The example includes:

- A Python Flask application that automatically generates different types of traces in the background
- Grafana Alloy configured with tail sampling policies and transform processor
- Tempo for trace storage and querying
- Prometheus for metrics collection
- Grafana for visualization
- Live debugging for monitoring the sampling process

## Running the Demo

1. Clone the repository:
   ```
   git clone https://github.com/grafana/alloy-scenarios.git
   cd alloy-scenarios
   ```

2. Navigate to this example directory:
   ```
   cd otel-tail-sampling
   ```

3. Run using Docker Compose:
   ```
   docker compose up -d
   ```
   
   Or use the centralized image management:
   ```
   cd ..
   ./run-example.sh otel-tail-sampling
   ```

4. Access the demo application at http://localhost:8080
5. Access Grafana at http://localhost:3000
6. Access Prometheus at http://localhost:9090
7. Access Alloy's live debugging endpoint at http://localhost:12345/debug/livedebugging

## What to Expect

The demo application automatically generates various types of traces in the background:

- **Simple Traces**: Basic single-span traces
- **Nested Traces**: Traces with parent-child relationships
- **Error Traces**: Traces containing errors
- **High Latency Traces**: Traces with randomized execution times of 3-10 seconds; those over 5 seconds match the latency sampling policy
- **Delayed Chain Traces**: Service chains with Service D consistently having high latency (3-4 seconds)

You can also manually trigger trace generation using the web UI. The application will continuously generate a mix of these trace types in the background at random intervals.

## Processing Pipeline

This example demonstrates a more complex trace processing pipeline with the following components:

> Note: In this pipeline the tail sampling processor is placed before the batch processor. This ensures that trace spans are presented to the tail sampler as early as possible, so that a decision period includes all relevant spans for a trace. Batching before the sampler can delay spans so that they arrive after a sampling decision has already been made (the decision period starts once the first span for a trace has been seen). This can lead to incorrect decisions being made, and starts to rely on a cache being enabled for future sampling decisions.

1. **OTLP Receiver**: Receives traces from the application via gRPC or HTTP
2. **Tail Sampling Processor**: Applies sampling policies based on trace properties
3. **Batch Processor**: Groups spans for efficient processing
4. **OTLP Exporter**: Sends sampled traces to Tempo

## Tail Sampling Configuration

This example uses Alloy's `otelcol.processor.tail_sampling` processor, which makes sampling decisions based on the entire trace, not just individual spans. This allows for more intelligent sampling based on trace-wide properties.

> Note: Tempo indexes on trace IDs and span IDs, not resource attributes. Make sure you only send one copy of each span to Tempo. If duplicate spans are sent (for example, the same spans under both the `raw-traces` and `trace-demo-tail-sampled` service names), then when requesting trace IDs or carrying out TraceQL queries, the returned traces will consist of whichever duplicate span is encountered first. This means that subsequent queries will potentially not yield the same result, and that the service names for spans in the same trace could be a mix of `raw-traces` and `trace-demo-tail-sampled`, or a trace could appear to be sampled when it was in fact unsampled, or vice versa. To ensure consistency, only one set of spans with a unique span ID and trace ID should be emitted to Tempo.

The tail sampling configuration includes the following policies:

1. **Attribute-Based Sampling**: Samples traces with a specific attribute value
   ```
   policy {
     name = "test-attribute-policy"
     type = "string_attribute"
     
     string_attribute {
       key    = "test_attr_key_1"
       values = ["test_attr_val_1"]
     }
   }
   ```

2. **Error Sampling**: Always samples traces with ERROR status
   ```
   policy {
     name = "error-policy"
     type = "status_code"
     
     status_code {
       status_codes = ["ERROR"]
     }
   }
   ```

3. **Latency-Based Sampling**: Samples traces that exceed a latency threshold
   ```
   policy {
     name = "latency-policy"
     type = "latency"
     
     latency {
       threshold_ms = 5000  // 5 seconds
     }
   }
   ```

4. **Numerical Range Sampling**: Samples traces with a numeric attribute in a specific range
   ```
   policy {
     name = "numeric-policy"
     type = "numeric_attribute"
     
     numeric_attribute {
       key       = "key1"
       min_value = 70
       max_value = 100
     }
   }
   ```

5. **URL-Based Filtering**: Excludes health check and metrics endpoints
   ```
   policy {
     name = "url-filter-policy"
     type = "string_attribute"
     
     string_attribute {
       key             = "http.url"
       values          = ["/health", "/metrics"]
       invert_match    = true
     }
   }
   ```

6. **Probabilistic Sampling**: Samples a percentage of remaining traces
   ```
   policy {
     name = "probabilistic-policy"
     type = "probabilistic"
     
     probabilistic {
       sampling_percentage = 10
     }
   }
   ```

## Live Debugging

This example enables Alloy's live debugging feature, which provides real-time insights into the sampling process:

```
livedebugging {
  enabled = true
}
```

Access the live debugging interface at http://localhost:12345 to see:

- Current processing pipeline state
- Trace sampling decisions in real-time
- Policy hit counts and performance metrics
- Throughput statistics

## Sampling Implications

With tail sampling enabled in this example:

- All error traces are preserved for troubleshooting
- High latency traces (>5s) are kept for performance analysis
- Traces with specific attribute values used for monitoring are retained
- Health check and metrics endpoints are filtered out to reduce noise
- A small percentage of other traces are kept for baseline monitoring
- Traces not matching any criteria are dropped, reducing storage needs
- Raw traces are stored with a different service name for comparison

## Viewing Traces in Grafana

To view the sampled traces:

1. Open Grafana (http://localhost:3000)
2. Navigate to Explore
3. Select the Tempo data source
4. Use the Search tab to find traces based on various criteria

## Sample Queries

Try these queries in Grafana's Tempo Explorer:

- Find all traces for the sampled service:
  ```
  {resource.service.name="trace-demo-tail-sampled"}
  ```

- Find error traces:
  ```
  {status=error}
  ```

- Find high latency traces:
  ```
  {duration>5s}
  ```

- Find traces with a specific attribute:
  ```
  {span.test_attr_key_1="test_attr_val_1"}
  ```
  
- Find traces with Service D bottleneck:
  ```
  {span.service.latency="high" && span.latency.category="bottleneck"}
  ```

## Customizing

You can modify the `config.alloy` file to adjust the sampling policies:

- Change the decision wait time to balance memory usage vs. complete trace visibility
- Adjust the sampling thresholds to capture more or fewer traces
- Add additional sampling policies based on your specific needs
- Modify the existing policies to match your application's attributes
- Update the transform processor to add or modify different attributes

## Further Resources

- [Grafana Alloy Tail Sampling Documentation](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.tail_sampling/)
- [Grafana Alloy Transform Processor Documentation](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.transform/)
- [OpenTelemetry Tail Sampling Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor)
- [Live Debugging in Grafana Alloy](https://grafana.com/docs/alloy/latest/debug-alloy-flow/) 

================================================
FILE: otel-tail-sampling/app/Dockerfile
================================================
# Base image tag, pinned by digest; can be overridden at build time with
# --build-arg PYTHON_VERSION=...
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}

WORKDIR /app

# Install dependencies before copying the app so this layer is reused when
# only app.py changes. --no-cache-dir keeps pip's download cache out of the
# image layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

CMD ["python", "app.py"]

================================================
FILE: otel-tail-sampling/app/app.py
================================================
import os
import random
import time
import threading
import logging
import uuid
from flask import Flask, request
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.resources import Resource, SERVICE_NAME
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
import requests
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configure the tracer; every span is reported under this service name.
resource = Resource.create(attributes={
    SERVICE_NAME: "trace-demo-tail-sampled"
})
trace.set_tracer_provider(TracerProvider(resource=resource))

# Configure the OTLP gRPC exporter.
# OTEL_EXPORTER_OTLP_ENDPOINT is honored when set; otherwise fall back to the
# Alloy service on the compose network. The endpoint is a bare host:port URL:
# the "/v1/traces" path belongs to OTLP/HTTP and has no meaning for gRPC.
otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") or "http://alloy:4317"
otlp_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
# A batch size of 1 presumably gets each span to Alloy's tail sampler as early
# as possible instead of holding it back in a larger batch.
span_processor = BatchSpanProcessor(span_exporter=otlp_exporter, max_export_batch_size=1)
trace.get_tracer_provider().add_span_processor(span_processor)

# Create a tracer
tracer = trace.get_tracer(__name__)

# Create a propagator for handling trace context
propagator = TraceContextTextMapPropagator()

# Create a Flask application
app = Flask(__name__)

# Instrument Flask
FlaskInstrumentor().instrument_app(app)

# Instrument requests
RequestsInstrumentor().instrument()

# Background trace generation functions
def generate_simple_trace():
    """Emit one single-span trace named "simple-operation".

    About 30% of these spans carry test_attr_key_1="test_attr_val_1"; the rest
    carry "other_value".
    """
    with tracer.start_as_current_span("simple-operation") as current:
        current.set_attribute("operation.type", "simple")
        current.set_attribute("operation.value", random.randint(1, 100))

        # Sampling-relevant attribute: pick the matching value ~30% of the time.
        if random.random() < 0.3:
            attr_value = "test_attr_val_1"
        else:
            attr_value = "other_value"
        current.set_attribute("test_attr_key_1", attr_value)

        time.sleep(0.1)  # Simulate work
        logger.info("Generated simple trace")

def generate_nested_trace():
    """Emit a parent span with two children, the second of which has a grandchild.

    Each level sleeps 50 ms to simulate work. The parent carries a random
    numeric ``key1`` and the first child a random string ``key2``.
    """
    with tracer.start_as_current_span("parent-operation") as root:
        root.set_attribute("operation.type", "parent")
        # Numeric attribute in 1..100, for numeric-attribute sampling
        root.set_attribute("key1", random.randint(1, 100))
        time.sleep(0.05)

        with tracer.start_as_current_span("child-operation-1") as first_child:
            first_child.set_attribute("operation.type", "child")
            first_child.set_attribute("child.number", 1)
            # String attribute, "value1" about half of the time
            if random.random() < 0.5:
                key2_value = "value1"
            else:
                key2_value = "other_value"
            first_child.set_attribute("key2", key2_value)
            time.sleep(0.05)

        with tracer.start_as_current_span("child-operation-2") as second_child:
            second_child.set_attribute("operation.type", "child")
            second_child.set_attribute("child.number", 2)
            time.sleep(0.05)

            with tracer.start_as_current_span("grandchild-operation") as leaf:
                leaf.set_attribute("operation.type", "grandchild")
                time.sleep(0.05)

        logger.info("Generated nested trace")

def generate_error_trace():
    """Emit an "error-operation" span that records an exception and ERROR status.

    The failure is simulated; the exception is caught here, attached to the
    span, and never propagates to the caller.
    """
    with tracer.start_as_current_span("error-operation") as span:
        span.set_attribute("operation.type", "error")
        try:
            # Simulate an error. Raised explicitly (instead of assigning the
            # result of 1 / 0 to an unused local) with the same type and message.
            raise ZeroDivisionError("division by zero")
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            logger.info("Generated error trace")

def generate_high_latency_trace():
    """Emit a "high-latency-operation" span that lasts a random 3-10 seconds."""
    with tracer.start_as_current_span("high-latency-operation") as slow_span:
        slow_span.set_attribute("operation.type", "high-latency")

        # Pick the delay first so it can be recorded on the span before sleeping
        latency = random.uniform(3.0, 10.0)
        slow_span.set_attribute("latency.seconds", latency)

        # Simulate high latency work
        time.sleep(latency)
        logger.info(f"Generated high latency trace with {latency:.2f}s delay")

def generate_delayed_chain_trace():
    """Generate a chain of service calls with service D having high latency.

    No HTTP requests are made: services A through E are simulated in-process as
    spans nested one inside the next under a "delayed-chain-root" span, all
    tagged with the same random request id. Any exception is logged and
    swallowed, so the function always returns None.
    """
    try:
        with tracer.start_as_current_span("delayed-chain-root") as span:
            span.set_attribute("operation.step", "start")
            span.set_attribute("operation.type", "delayed-chain")
            
            # Start the chain with Service A
            # One request id shared by every hop (stored as a string attribute)
            req_id = random.randint(1000, 9999)
            
            # Instead of making HTTP calls in the background, simulate the chain directly
            with tracer.start_as_current_span("service-a-handler") as span_a:
                span_a.set_attribute("service", "A")
                span_a.set_attribute("request.id", str(req_id))
                span_a.set_attribute("service.latency", "normal")
                span_a.set_attribute("http.url", "/delayed/service-a")
                time.sleep(0.1)  # Normal latency
                
                with tracer.start_as_current_span("service-b-handler") as span_b:
                    span_b.set_attribute("service", "B")
                    span_b.set_attribute("request.id", str(req_id))
                    span_b.set_attribute("service.latency", "normal")
                    span_b.set_attribute("http.url", "/delayed/service-b")
                    time.sleep(0.15)  # Normal latency
                    
                    with tracer.start_as_current_span("service-c-handler") as span_c:
                        span_c.set_attribute("service", "C")
                        span_c.set_attribute("request.id", str(req_id))
                        span_c.set_attribute("service.latency", "normal")
                        span_c.set_attribute("http.url", "/delayed/service-c")
                        time.sleep(0.2)  # Normal latency
                        
                        with tracer.start_as_current_span("service-d-handler") as span_d:
                            span_d.set_attribute("service", "D")
                            span_d.set_attribute("request.id", str(req_id))
                            span_d.set_attribute("service.latency", "high")
                            span_d.set_attribute("latency.category", "bottleneck")
                            span_d.set_attribute("http.url", "/delayed/service-d")
                            
                            # This service consistently has high latency (3-4 seconds)
                            delay = random.uniform(3.0, 4.0)
                            span_d.set_attribute("latency.seconds", delay)
                            time.sleep(delay)  # High latency
                            
                            # E starts only after D's delay has elapsed, so D's
                            # span duration includes both the delay and E.
                            with tracer.start_as_current_span("service-e-handler") as span_e:
                                span_e.set_attribute("service", "E")
                                span_e.set_attribute("request.id", str(req_id))
                                span_e.set_attribute("service.latency", "normal")
                                span_e.set_attribute("http.url", "/delayed/service-e")
                                time.sleep(0.1)  # Normal latency
            
            # Logged while the root span is still open, after all nested spans have ended
            logger.info("Generated delayed chain trace with high latency in Service D")
    except Exception as e:
        logger.error(f"Error generating delayed chain trace: {e}")

# Tracers for the simulated services, keyed by service name. Built lazily and
# cached so each simulated service gets one TracerProvider for the whole process
# instead of a new one on every generated trace.
_SERVICE_TRACERS = {}


def _get_service_tracer(service_name, tracer_name):
    """Return a cached tracer whose spans report ``service_name`` as service.name."""
    service_tracer = _SERVICE_TRACERS.get(service_name)
    if service_tracer is None:
        # service.name is a resource attribute, and the resource belongs to the
        # TracerProvider; trace.get_tracer() accepts no `resource` argument. Each
        # simulated service therefore needs its own provider. All providers share
        # the module-level span processor (and so the same OTLP exporter);
        # shutdown_on_exit=False avoids registering an extra exit hook per
        # provider for that shared processor.
        provider = TracerProvider(
            resource=Resource.create(attributes={SERVICE_NAME: service_name}),
            shutdown_on_exit=False,
        )
        provider.add_span_processor(span_processor)
        # setdefault keeps the first tracer if two threads race to create one.
        service_tracer = _SERVICE_TRACERS.setdefault(
            service_name, provider.get_tracer(tracer_name)
        )
    return service_tracer


# New function for generating true multi-service traces
def generate_multi_service_trace_bg():
    """Generate a trace that spans multiple services with true service.name differentiation.

    Simulates a login flow in-process: web-ui -> api-gateway -> auth-service ->
    user-service -> db-service, followed by a notification-service span under
    the api-gateway span. The spans nest through the current context, so they
    belong to one trace even though each comes from a different provider.

    Returns:
        The 8-character transaction id stamped on every span, or None if
        anything failed (the error is logged, not raised).
    """
    try:
        # Short id attached to every span so the whole flow can be searched for
        transaction_id = str(uuid.uuid4())[:8]
        logger.info(f"Generating multi-service trace. Transaction ID: {transaction_id}")

        # Simulate a microservice architecture with:
        # 1. Frontend service (web-ui)
        # 2. API Gateway (api-gateway)
        # 3. Authentication service (auth-service)
        # 4. User service (user-service)
        # 5. Notification service (notification-service)
        # 6. Database service (db-service)

        # One tracer per service, each backed by its own service.name resource
        web_ui_tracer = _get_service_tracer("web-ui", "web-ui-tracer")
        api_gw_tracer = _get_service_tracer("api-gateway", "api-gw-tracer")
        auth_tracer = _get_service_tracer("auth-service", "auth-tracer")
        user_tracer = _get_service_tracer("user-service", "user-tracer")
        notif_tracer = _get_service_tracer("notification-service", "notif-tracer")
        db_tracer = _get_service_tracer("db-service", "db-tracer")

        # 1. Frontend service (web-ui) - User logs in
        with web_ui_tracer.start_as_current_span("login-page-render") as web_span:
            web_span.set_attribute("component", "web-ui")
            web_span.set_attribute("transaction.id", transaction_id)
            web_span.set_attribute("user.action", "login")
            web_span.set_attribute("http.method", "GET")
            web_span.set_attribute("http.url", "/login")
            time.sleep(0.1)

            # 2. Send login request to API Gateway
            with api_gw_tracer.start_as_current_span("api-gateway-login-handler") as api_span:
                api_span.set_attribute("component", "api-gateway")
                api_span.set_attribute("transaction.id", transaction_id)
                api_span.set_attribute("endpoint", "/api/v1/login")
                api_span.set_attribute("http.method", "POST")
                time.sleep(0.15)

                # 3. API Gateway calls Authentication Service
                with auth_tracer.start_as_current_span("authenticate-user") as auth_span:
                    auth_span.set_attribute("component", "auth-service")
                    auth_span.set_attribute("transaction.id", transaction_id)
                    auth_span.set_attribute("auth.method", "password")
                    time.sleep(0.2)

                    # 4. Auth service calls User Service to retrieve user details
                    with user_tracer.start_as_current_span("get-user-details") as user_span:
                        user_span.set_attribute("component", "user-service")
                        user_span.set_attribute("transaction.id", transaction_id)
                        user_span.set_attribute("user.id", f"user_{random.randint(1000, 9999)}")

                        # 5. User service calls DB Service
                        with db_tracer.start_as_current_span("db-query") as db_span:
                            db_span.set_attribute("component", "db-service")
                            db_span.set_attribute("transaction.id", transaction_id)
                            db_span.set_attribute("db.operation", "SELECT")
                            db_span.set_attribute("db.table", "users")

                            # Randomly introduce database latency (about 30% of queries)
                            if random.random() < 0.3:
                                delay = random.uniform(0.5, 1.5)
                                db_span.set_attribute("db.latency", delay)
                                db_span.set_attribute("latency.category", "slow-query")
                                time.sleep(delay)
                            else:
                                time.sleep(0.1)

                # 6. After successful login, send notification
                with notif_tracer.start_as_current_span("send-login-notification") as notif_span:
                    notif_span.set_attribute("component", "notification-service")
                    notif_span.set_attribute("transaction.id", transaction_id)
                    notif_span.set_attribute("notification.type", "login_alert")
                    notif_span.set_attribute("notification.channel", random.choice(["email", "sms", "push"]))
                    time.sleep(0.15)

        logger.info(f"Generated multi-service trace with transaction ID: {transaction_id}")
        return transaction_id
    except Exception as e:
        logger.error(f"Error generating multi-service trace: {e}")
        return None

def generate_trace_batch():
    """Run a weighted-random selection of 3-8 trace generators back to back."""
    # Each generator is paired with its selection weight
    weighted_generators = [
        (generate_simple_trace, 0.20),
        (generate_nested_trace, 0.20),
        (generate_error_trace, 0.15),
        (generate_high_latency_trace, 0.1),
        (generate_delayed_chain_trace, 0.15),
        (generate_multi_service_trace_bg, 0.2),
    ]
    candidates = [generator for generator, _ in weighted_generators]
    weights = [weight for _, weight in weighted_generators]

    # Generate 3-8 traces per batch
    batch_size = random.randint(3, 8)
    for _ in range(batch_size):
        chosen = random.choices(candidates, weights=weights, k=1)[0]
        chosen()
        # Small delay between traces
        time.sleep(random.uniform(0.1, 0.5))

def trace_generator_thread():
    """Background thread that generates traces at regular intervals.

    Loops forever: generates one batch, then sleeps a random 5-15 seconds.
    Any exception raised inside an iteration is logged and followed by a
    5-second pause, so the loop itself never terminates.
    """
    while True:
        try:
            generate_trace_batch()
            # Wait between 5-15 seconds before generating the next batch
            delay = random.uniform(5, 15)
            logger.info(f"Next trace batch in {delay:.2f} seconds")
            time.sleep(delay)
        except Exception as e:
            # Log and keep going rather than letting the thread die
            logger.error(f"Error in trace generation: {e}")
            time.sleep(5)  # Wait before retrying

# API endpoints
@app.route('/')
def home():
    """Serve a static HTML index page linking to the manual trace-trigger endpoints."""
    return """
    <h1>OpenTelemetry Tail Sampling Demo</h1>
    <p>This app demonstrates OpenTelemetry tracing with Tail Sampling using Grafana Alloy.</p>
    <p>The app automatically generates various types of traces in the background.</p>
    <p>You can also trigger trace generation manually using these endpoints:</p>
    <ul>
        <li><a href="/simple">Simple Trace</a></li>
        <li><a href="/nested">Nested Trace</a></li>
        <li><a href="/error">Error Trace</a></li>
        <li><a href="/high-latency">High Latency Trace</a></li>
        <li><a href="/chain">Chain of Services</a></li>
        <li><a href="/delayed-chain">Delayed Chain (with Service D having high latency)</a></li>
        <li><a href="/multi-service">Multi-Service Trace (with different service.name values)</a></li>
        <li><a href="/batch">Generate Trace Batch</a></li>
    </ul>
    """

@app.route('/simple')
def simple_trace():
    """Trigger one simple trace on demand and acknowledge it."""
    generate_simple_trace()
    return dict(status="ok", message="Simple trace generated")

@app.route('/nested')
def nested_trace():
    """Trigger one nested trace on demand and acknowledge it."""
    generate_nested_trace()
    return dict(status="ok", message="Nested trace generated")

@app.route('/error')
def error_trace():
    """Trigger one error trace on demand; the HTTP response itself is still "ok"."""
    generate_error_trace()
    return dict(status="ok", message="Error trace generated")

@app.route('/high-latency')
def high_latency_trace():
    """Trigger one high-latency trace; the request blocks until the generator returns."""
    generate_high_latency_trace()
    return dict(status="ok", message="High latency trace generated")

@app.route('/batch')
def batch_trace():
    """Trigger a full trace batch; the request blocks until the batch has finished."""
    generate_trace_batch()
    return dict(status="ok", message="Trace batch generated")

@app.route('/multi-service')
def multi_service_trace():
    """Trigger one multi-service trace and report its transaction id.

    The generator returns None when it fails, so that case is reported as an
    error instead of an "ok" response carrying a null transaction id.
    """
    transaction_id = generate_multi_service_trace_bg()
    if transaction_id is None:
        return {"status": "error", "message": "Failed to generate multi-service trace"}
    return {
        "status": "ok",
        "message": "Multi-service trace generated",
        "transaction_id": transaction_id,
        "services": ["web-ui", "api-gateway", "auth-service", "user-service", "notification-service", "db-service"]
    }

@app.route('/chain')
def chain_trace():
    """Start a B -> C chain by calling this app's own /service/b endpoint over HTTP.

    Any failure (connection error, timeout, non-JSON reply) is recorded on the
    root span and reported as an error payload rather than raised.
    """
    with tracer.start_as_current_span("chain-root") as span:
        span.set_attribute("operation.step", "start")

        # Simulate a chain of service calls
        try:
            # Call ourselves to simulate microservice calls
            # In a real world example these would be different services
            service_b_url = f"http://localhost:8080/service/b?id={random.randint(1000, 9999)}"
            # requests waits indefinitely unless a timeout is given; bound the
            # wait so a stuck downstream hop cannot hang this request forever.
            response = requests.get(service_b_url, timeout=10)
            return {"status": "ok", "message": "Chain trace generated", "data": response.json()}
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return {"status": "error", "message": "Failed to complete chain"}

@app.route('/service/b')
def service_b():
    """Simulated service B: does 100 ms of work, then calls /service/c with the same id."""
    req_id = request.args.get('id', 'unknown')
    # Plain string: the span name has no placeholders, so no f-string is needed
    with tracer.start_as_current_span("service-b-handler") as span:
        span.set_attribute("service", "B")
        span.set_attribute("request.id", req_id)
        span.set_attribute("http.url", "/service/b")  # For URL-based sampling
        time.sleep(0.1)  # Simulate work

        # Call service C
        service_c_url = f"http://localhost:8080/service/c?id={req_id}"
        # Bound the wait: requests has no default timeout
        response = requests.get(service_c_url, timeout=10)
        return {"status": "ok", "message": "Service B completed", "data": response.json()}

@app.route('/service/c')
def service_c():
    """Simulated service C: does 150 ms of work and fails randomly about 20% of the time.

    A random failure marks the span as ERROR and returns an error payload; no
    exception is raised.
    """
    req_id = request.args.get('id', 'unknown')
    # Plain string: the span name has no placeholders, so no f-string is needed
    with tracer.start_as_current_span("service-c-handler") as span:
        span.set_attribute("service", "C")
        span.set_attribute("request.id", req_id)
        span.set_attribute("http.url", "/service/c")  # For URL-based sampling
        time.sleep(0.15)  # Simulate work

        # Randomly fail sometimes to show error traces
        if random.random() < 0.2:  # 20% chance of failure
            span.set_status(trace.StatusCode.ERROR, "Random failure")
            return {"status": "error", "message": "Service C failed randomly"}

        return {"status": "ok", "message": "Service C completed successfully"}

# Add the delayed chain implementation
@app.route('/delayed-chain')
def delayed_chain_trace_endpoint():
    """Start the A -> B -> C -> D -> E chain by calling /delayed/service-a over HTTP.

    Any failure (connection error, timeout, non-JSON reply) is recorded on the
    root span and reported as an error payload rather than raised.
    """
    with tracer.start_as_current_span("delayed-chain-root") as span:
        span.set_attribute("operation.step", "start")
        span.set_attribute("operation.type", "delayed-chain")

        try:
            # Start the chain with Service A
            service_a_url = f"http://localhost:8080/delayed/service-a?id={random.randint(1000, 9999)}"
            # requests has no default timeout. The chain deliberately spends
            # 3-4 s in service D, so the bound is set well above that.
            response = requests.get(service_a_url, timeout=30)
            return {
                "status": "ok",
                "message": "Delayed chain trace generated",
                "data": response.json()
            }
        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return {"status": "error", "message": "Failed to complete delayed chain"}

@app.route('/delayed/service-a')
def delayed_service_a():
    """Simulated service A of the delayed chain: 100 ms of work, then calls service B."""
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-a-handler") as span:
        span.set_attribute("service", "A")
        span.set_attribute("request.id", req_id)
        span.set_attribute("service.latency", "normal")
        span.set_attribute("http.url", "/delayed/service-a")
        time.sleep(0.1)  # Normal latency

        # Call service B
        service_b_url = f"http://localhost:8080/delayed/service-b?id={req_id}"
        # requests has no default timeout; downstream service D sleeps 3-4 s,
        # so the bound is set well above that.
        response = requests.get(service_b_url, timeout=30)
        return {"status": "ok", "message": "Service A completed", "data": response.json()}

@app.route('/delayed/service-b')
def delayed_service_b():
    """Simulated service B of the delayed chain: 150 ms of work, then calls service C."""
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-b-handler") as span:
        span.set_attribute("service", "B")
        span.set_attribute("request.id", req_id)
        span.set_attribute("service.latency", "normal")
        span.set_attribute("http.url", "/delayed/service-b")
        time.sleep(0.15)  # Normal latency

        # Call service C
        service_c_url = f"http://localhost:8080/delayed/service-c?id={req_id}"
        # requests has no default timeout; downstream service D sleeps 3-4 s,
        # so the bound is set well above that.
        response = requests.get(service_c_url, timeout=30)
        return {"status": "ok", "message": "Service B completed", "data": response.json()}

@app.route('/delayed/service-c')
def delayed_service_c():
    """Simulated service C of the delayed chain: 200 ms of work, then calls slow service D."""
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-c-handler") as span:
        span.set_attribute("service", "C")
        span.set_attribute("request.id", req_id)
        span.set_attribute("service.latency", "normal")
        span.set_attribute("http.url", "/delayed/service-c")
        time.sleep(0.2)  # Normal latency

        # Call the slow service D
        service_d_url = f"http://localhost:8080/delayed/service-d?id={req_id}"
        # requests has no default timeout; service D sleeps 3-4 s before
        # replying, so the bound is set well above that.
        response = requests.get(service_d_url, timeout=30)
        return {"status": "ok", "message": "Service C completed", "data": response.json()}

@app.route('/delayed/service-d')
def delayed_service_d():
    """Simulated bottleneck service D: sleeps a random 3-4 s, then calls service E."""
    req_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-d-handler") as span:
        span.set_attribute("service", "D")
        span.set_attribute("request.id", req_id)
        span.set_attribute("service.latency", "high")
        span.set_attribute("latency.category", "bottleneck")
        span.set_attribute("http.url", "/delayed/service-d")

        # This service consistently has high latency (3-4 seconds)
        delay = random.uniform(3.0, 4.0)
        span.set_attribute("latency.seconds", delay)
        time.sleep(delay)  # High latency

        # Call final service E
        service_e_url = f"http://localhost:8080/delayed/service-e?id={req_id}"
        # requests has no default timeout; bound the wait on the last hop
        response = requests.get(service_e_url, timeout=30)
        return {"status": "ok", "message": "Service D completed (with delay)", "data": response.json()}

@app.route('/delayed/service-e')
def delayed_service_e():
    """Simulated service E, the last hop of the delayed chain: 100 ms of work, no onward call."""
    request_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-e-handler") as handler_span:
        handler_span.set_attribute("service", "E")
        handler_span.set_attribute("request.id", request_id)
        handler_span.set_attribute("service.latency", "normal")
        handler_span.set_attribute("http.url", "/delayed/service-e")
        # Normal latency
        time.sleep(0.1)

        return dict(status="ok", message="Service E completed (chain end)")

if __name__ == '__main__':
    # Launch the trace generator in the background. It is a daemon thread, so
    # it does not keep the process alive once the Flask server stops.
    trace_thread = threading.Thread(daemon=True, target=trace_generator_thread)
    trace_thread.start()

    logger.info("Starting the application with background trace generation")
    # Listen on all interfaces, port 8080
    app.run(port=8080, host='0.0.0.0')

================================================
FILE: otel-tail-sampling/app/requirements.txt
================================================
flask
requests
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
opentelemetry-instrumentation-requests

================================================
FILE: otel-tail-sampling/config-otel.yaml
================================================
#
# OTel Collector YAML Configuration for Tail Sampling
#
# This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine.
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
#

receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  # Tail Sampling: wait for complete traces before making sampling decisions
  tail_sampling:
    decision_wait: 10s
    num_traces: 100
    expected_new_traces_per_sec: 10
    policies:
      # Policy 1: Always sample traces with a specific attribute value
      - name: test-attribute-policy
        type: string_attribute
        string_attribute:
          key: test_attr_key_1
          values: [test_attr_val_1]

      # Policy 2: Sample error traces
      - name: error-policy
        type: status_code
        status_code:
          status_codes: [ERROR]

      # Policy 3: Sample high latency traces (> 5s)
      - name: latency-policy
        type: latency
        latency:
          threshold_ms: 5000

      # Policy 4: Sample traces matching a numeric attribute range
      - name: numeric-policy
        type: numeric_attribute
        numeric_attribute:
          key: key1
          min_value: 70
          max_value: 100

      # Policy 5: URL-based policy to filter out health checks; invert_match flips
      # the match, so it applies to traces whose http.url is NOT /health or /metrics
      - name: url-filter-policy
        type: string_attribute
        string_attribute:
          key: http.url
          values: ["/health", "/metrics"]
          invert_match: true

      # Policy 6: Probabilistic sampling as a fallback (sample 10% of remaining traces)
      - name: probabilistic-policy
        type: probabilistic
        probabilistic:
          sampling_percentage: 10

  batch: {}

exporters:
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [tail_sampling, batch]
      exporters: [otlp/tempo]


================================================
FILE: otel-tail-sampling/config.alloy
================================================
/*
 * Alloy Configuration for OpenTelemetry Trace Collection with Tail Sampling
 */

// Receive OpenTelemetry traces
otelcol.receiver.otlp "default" {
  http {}
  grpc {}

  output {
    traces = [otelcol.processor.tail_sampling.default.input]
  }
}

// Tail Sampling processor
otelcol.processor.tail_sampling "default" {
  // Wait time to make a sampling decision
  decision_wait = "10s"
  
  // Number of traces kept in memory
  num_traces = 100
  
  // Expected new traces per second
  expected_new_traces_per_sec = 10
  
  // Policy 1: Always sample traces with a specific attribute value
  policy {
    name = "test-attribute-policy"
    type = "string_attribute"
    
    string_attribute {
      key    = "test_attr_key_1"
      values = ["test_attr_val_1"]
    }
  }
  
  // Policy 2: Sample error traces
  policy {
    name = "error-policy"
    type = "status_code"
    
    status_code {
      status_codes = ["ERROR"]
    }
  }
  
  // Policy 3: Sample high latency traces
  policy {
    name = "latency-policy"
    type = "latency"
    
    latency {
      threshold_ms = 5000  // 5 seconds
    }
  }
  
  // Policy 4: Sample traces matching a numeric attribute range
  policy {
    name = "numeric-policy"
    type = "numeric_attribute"
    
    numeric_attribute {
      key       = "key1"
      min_value = 70
      max_value = 100
    }
  }
  
  // Policy 5: URL-based policy to filter out health checks
  policy {
    name = "url-filter-policy"
    type = "string_attribute"
    
    string_attribute {
      key             = "http.url"
      values          = ["/health", "/metrics"]
      invert_match    = true  // Sample everything EXCEPT these URLs
    }
  }
  
  // Policy 6: Probabilistic sampling as a fallback (sample 10% of remaining traces)
  policy {
    name = "probabilistic-policy"
    type = "probabilistic"
    
    probabilistic {
      sampling_percentage = 10
    }
  }
  
  output {
    traces = [otelcol.processor.batch.default.input]
  }
}

// Batch processor to improve performance
otelcol.processor.batch "default" {
  output {
    traces = [otelcol.exporter.otlp.tempo.input]
  }
}

// Send sampled traces to Tempo
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo:4317"
    tls {
      insecure = true
    }
  }
} 

livedebugging {
  enabled = true
}

================================================
FILE: otel-tail-sampling/docker-compose-otel.yml
================================================
# OTel Engine Override
#
# Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config
# instead of the River/HCL config.alloy file.
#
# Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
# Stop:  docker compose -f docker-compose.yml -f docker-compose-otel.yml down
#

services:
  alloy:
    command: otel --config=/etc/alloy/config-otel.yaml
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
    ports:
      - 8888:8888      # OTel Engine HTTP server


================================================
FILE: otel-tail-sampling/docker-compose.coda.yml
================================================
# Override for the demo app that runs it on the host's network stack.
services:
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    # NOTE(review): with host networking the container presumably cannot resolve
    # the compose service name `alloy` used in the endpoint below -- confirm how
    # `alloy` is expected to resolve in this environment.
    network_mode: host
    restart: unless-stopped
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=demo-service
      - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo


================================================
FILE: otel-tail-sampling/docker-compose.yml
================================================
version: '3.8'

services:
  # Prometheus for metrics collection
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --web.enable-otlp-receiver
      - --enable-feature=native-histograms
      - --enable-feature=exemplar-storage
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - 9090:9090/tcp
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  # Tempo for tracing
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - 3200:3200/tcp    # tempo
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml
    depends_on:
      - prometheus

  # Grafana for visualization
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
      - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app
    ports:
      - 3000:3000/tcp
    # Writes the datasource provisioning file at startup, then hands over to
    # Grafana's normal entrypoint. The Prometheus datasource is given an explicit
    # uid because Tempo's serviceMap.datasourceUid refers to a datasource by uid,
    # not by name; without it the service map link points at nothing.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          uid: Prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        EOF
        /run.sh
    depends_on:
      - prometheus
      - tempo

  # Alloy for telemetry pipeline and tail sampling
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345      # Alloy HTTP server
      - 4317:4317/tcp    # OTLP gRPC
      - 4318:4318/tcp    # OTLP HTTP
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy

  # Demo app that generates OpenTelemetry traces
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8080:8080
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=demo-service
      - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo 

================================================
FILE: otel-tail-sampling/prom-config.yaml
================================================
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: otel-tail-sampling/tempo-config.yaml
================================================
stream_over_http_enabled: true
server:
  http_listen_port: 3200
  log_level: info


# Cache configuration: a memcached-backed cache assigned to the
# `frontend-search` role.
# NOTE(review): the docker-compose.yml for this scenario defines no `memcached`
# service, so `dns+memcached:11211` presumably cannot resolve -- confirm whether
# the service should be added or this block dropped.
cache:
  background:
    writeback_goroutines: 5
  caches:
  - roles:
    - frontend-search  
    memcached: 
      addresses: dns+memcached:11211

query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
    metadata_slo:
        duration_slo: 5s
        throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 100ms
  metrics:
    max_duration: 200h                # maximum duration of a metrics query, increase for local setups
    query_backend_after: 5m
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09

distributor:
  receivers:                           # this configuration will listen on all ports and protocols that tempo is capable of.
    jaeger:                            # the receivers all come from the OpenTelemetry collector.  more configuration information can
      protocols:                       # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
        thrift_http:                   #
          endpoint: "tempo:14268"      # for a production deployment you should only enable the receivers you need!
        grpc:
          endpoint: "tempo:14250"
        thrift_binary:
          endpoint: "tempo:6832"
        thrift_compact:
          endpoint: "tempo:6831"
    zipkin:
      endpoint: "tempo:9411"
    otlp:
      protocols:
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
    opencensus:
      endpoint: "tempo:55678"

ingester:
  # Cut the head block when this much time passes. This is set for demo
  # purposes and should probably be left alone normally.
  max_block_duration: 5m

compactor:
  compaction:
    # Overall Tempo trace retention (720h = 30 days). Set for demo purposes.
    block_retention: 720h

# Metrics generator: derives metrics from ingested traces and pushes them to
# Prometheus. Which processors actually run is selected in `overrides` below.
metrics_generator:
  registry:
    # Labels attached to the generated metrics.
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    # Local write-ahead log for generated metrics before they are shipped.
    path: /var/tempo/generator/wal
    remote_write:
      # Prometheus remote-write endpoint; exemplars are sent along.
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    # NOTE(review): presumably the on-disk location used by the local_blocks
    # processor configured below -- confirm.
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true

# Trace storage: everything is kept on the local filesystem under /var/tempo.
storage:
  trace:
    # Backend configuration to use.
    backend: local
    wal:
      # Where to store the WAL locally.
      path: /var/tempo/wal
    local:
      # Where completed blocks are stored.
      path: /var/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      # Enables the metrics generator by listing the processors to run.
      processors: [service-graphs, span-metrics, local-blocks]
      # NOTE(review): "both" presumably emits classic and native histograms;
      # verify the receiving Prometheus is set up to ingest native histograms.
      generate_native_histograms: both
      

================================================
FILE: otel-tracing-service-graphs/README.md
================================================
# Alloy Service Graphs with OpenTelemetry

This example demonstrates how to use Grafana Alloy to generate service graphs from OpenTelemetry traces and send them to Prometheus via OTLP HTTP, instead of relying on Tempo's built-in metrics generator.

## Overview

The example includes:

- A sample Python Flask application that generates various types of traces
- Grafana Alloy as the telemetry pipeline with service graph generation
- Tempo for trace storage and querying (without metrics generation)
- Prometheus with OTLP receiver enabled for metrics collection
- Memcached for Tempo caching
- Grafana for visualization

## Running the Demo

1. Clone the repository:
   ```
   git clone https://github.com/grafana/alloy-scenarios.git
   cd alloy-scenarios
   ```

2. Navigate to this example directory:
   ```
   cd otel-tracing-service-graphs
   ```

3. Run using Docker Compose:
   ```
   docker compose up -d
   ```
   
   Or use the centralized image management:
   ```
   cd ..
   ./run-example.sh otel-tracing-service-graphs
   ```

4. Access the demo application at http://localhost:8080
5. Access Grafana at http://localhost:3000
6. Access Prometheus at http://localhost:9090

## What to Expect

The demo application provides several endpoints that generate different types of traces:

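- **/simple**: Generates a simple trace with a single span
- **/nested**: Generates a trace with nested spans (parent-child relationships)
- **/error**: Generates a trace that includes an error
- **/chain**: Simulates a chain of service calls to demonstrate distributed tracing
- **/delayed-chain**: Simulates a five-hop chain of calls (services A to E) in which service D adds 3-4 seconds of latency
- **/multi-service**: Generates one trace whose spans report six different `service.name` values (web-ui, api-gateway, auth-service, user-service, notification-service, db-service)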

After accessing these endpoints, you can view the traces and service graphs in Grafana.

## Alloy Service Graph Generation

This example demonstrates using Alloy's `otelcol.connector.servicegraph` component to generate service graphs from traces, which offers several advantages over using Tempo's built-in metrics generator:

1. **More Flexibility**: Alloy's service graph connector allows for customization of dimensions and collection intervals
2. **Pipeline Integration**: The service graph metrics can be part of a larger telemetry pipeline with additional processing
3. **Reduced Load on Tempo**: By offloading the service graph generation to Alloy, Tempo can focus on trace storage and querying

The key component in the Alloy configuration is:

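```
otelcol.connector.servicegraph "default" {
  metrics_flush_interval = "10s"
  dimensions = ["service.name", "http.method"]

  // Configure the span store for better pairing
  store {
    max_items = 5000
    ttl = "30s"
  }

  output {
    metrics = [otelcol.exporter.otlphttp.prometheus.input]
  }
}
```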

## Prometheus OTLP Integration

This example uses Prometheus's OTLP HTTP receiver endpoint. This approach has several benefits:

1. **Native OTLP Integration**: Uses the OpenTelemetry Protocol directly between Alloy and Prometheus
2. **Simplified Configuration**: Uses Prometheus's built-in OTLP receiver without needing special ports
3. **Better Metadata Handling**: Resource attributes from OTLP are properly promoted to Prometheus labels

The OTLP HTTP exporter configuration in Alloy is:

```
otelcol.exporter.otlphttp "prometheus" {
  client {
    endpoint = "http://prometheus:9090/api/v1/otlp"
    tls {
      insecure = true
    }
  }
}
```

And in Prometheus, we've enabled the OTLP receiver and configured resource attributes to be promoted to labels:

```
otlp:
  promote_resource_attributes:
    - service.instance.id
    - service.name
    - service.namespace
    - service.version
    - deployment.environment
    # ...and more relevant attributes
```

## Viewing Service Graphs

To view the service graph:

1. Open Grafana (http://localhost:3000)
2. Navigate to Explore
3. Select the Tempo data source
4. Click on the "Service Graph" tab
5. You should see a visual representation of the relationships between services

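The service graph metrics are stored in Prometheus under the following names:
- `traces_service_graph_request_total`: Total number of calls between services
- `traces_service_graph_request_failed_total`: Total number of failed calls between services
- `traces_service_graph_request_server_seconds` and `traces_service_graph_request_client_seconds`: Histograms of request latency as seen from the server side and the client side

The metrics are segmented by service name and HTTP method (the connector's `dimensions` setting), so you can break call rates and latencies down per service and per method.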

## Architecture

```
┌────────────┐     ┌──────────────────────┐      ┌───────┐      ┌─────────┐
│ Demo App   │────▶│ Alloy                │─────▶│ Tempo │─────▶│ Grafana │
│ (OTel SDK) │     │ ┌──────────────────┐ │      │       │      │         │
└────────────┘     │ │Service Graph Gen.│ │      └───────┘      └─────────┘
                   │ └────────┬─────────┘ │                          ▲
                   └──────────┼───────────┘                          │
                              │                                      │
                              ▼                                      │
                        ┌──────────┐                                 │
                        │Prometheus│─────────────────────────────────┘
                        │  (OTLP)  │
                        └──────────┘
```

In this architecture:
1. The Demo App generates traces using the OpenTelemetry SDK and sends them to Alloy
2. Alloy processes the traces and:
   - Generates service graph metrics using the servicegraph connector
   - Forwards the raw traces to Tempo
3. Service graph metrics are sent to Prometheus via OTLP HTTP
4. Grafana queries both Tempo for traces and Prometheus for service graph metrics

## Customizing

The Alloy configuration can be further customized to add:
- Additional processors for trace data
- Filtering based on service names or other attributes
- Custom dimensions for the service graph metrics (currently service name and HTTP method)
- Additional metrics exporters for different backend systems 


================================================
FILE: otel-tracing-service-graphs/app/Dockerfile
================================================
# Base image. The default is pinned by digest; it can be overridden at build
# time through the PYTHON_VERSION build argument.
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}

WORKDIR /app

# Install dependencies before copying the application code so this layer stays
# cached when only app.py changes. --no-cache-dir keeps pip's download cache
# out of the image layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

CMD ["python", "app.py"]

================================================
FILE: otel-tracing-service-graphs/app/app.py
================================================
import os
import random
import time
import uuid
from flask import Flask, request
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
import requests

# Configure the global tracer provider. Spans created through `tracer` below
# are reported with service.name "trace-demo".
resource = Resource.create(attributes={
    SERVICE_NAME: "trace-demo"
})
trace.set_tracer_provider(TracerProvider(resource=resource))

# Configure the OTLP/gRPC exporter that ships spans to the Alloy container.
# The endpoint is passed explicitly, so the exporter is built from this literal
# rather than from OTEL_EXPORTER_OTLP_ENDPOINT.
# NOTE(review): "/v1/traces" is the OTLP/HTTP path; the gRPC exporter
# presumably only uses host:port -- confirm and drop the path if so.
otlp_exporter = OTLPSpanExporter(endpoint="http://alloy:4317/v1/traces", insecure=True)
# max_export_batch_size=1 limits every export to a single span (presumably so
# spans show up quickly in the demo). `span_processor` is also reused by
# generate_multi_service_trace() for its per-service providers.
span_processor = BatchSpanProcessor(span_exporter=otlp_exporter, max_export_batch_size=1)
trace.get_tracer_provider().add_span_processor(span_processor)

# Tracer used by the route handlers for their manually created spans.
tracer = trace.get_tracer(__name__)

# Create the Flask application that serves the demo endpoints.
app = Flask(__name__)

# Instrument Flask (incoming requests).
FlaskInstrumentor().instrument_app(app)

# Instrument requests (outgoing HTTP calls made by the chain endpoints).
RequestsInstrumentor().instrument()

@app.route('/')
def home():
    """Serve the landing page that links to every trace-generating endpoint."""
    landing_page = """
    <h1>OpenTelemetry Service Graph Demo</h1>
    <p>This app demonstrates OpenTelemetry tracing with Grafana Alloy and service graph generation.</p>
    <ul>
        <li><a href="/simple">Simple Trace</a></li>
        <li><a href="/nested">Nested Trace</a></li>
        <li><a href="/error">Error Trace</a></li>
        <li><a href="/chain">Chain of Services</a></li>
        <li><a href="/delayed-chain">Delayed Chain (with Service D having high latency)</a></li>
        <li><a href="/multi-service">Multi-Service Trace (with different service.name values)</a></li>
    </ul>
    """
    return landing_page

@app.route('/simple')
def simple_trace():
    """Create one "simple-operation" span tagged with a random value.

    Returns a dict that Flask serialises as the JSON response.
    """
    with tracer.start_as_current_span("simple-operation") as current:
        current.set_attribute("operation.type", "simple")
        random_value = random.randint(1, 100)
        current.set_attribute("operation.value", random_value)
        # Stand-in for real work so the span has a visible duration.
        time.sleep(0.1)
    return {"status": "ok", "message": "Simple trace generated"}

@app.route('/nested')
def nested_trace():
    """Create a three-level span tree: a parent, two children and a grandchild.

    The grandchild is nested under the second child. Every level sleeps for the
    same short interval to simulate work.
    """
    work = 0.05  # seconds of simulated work per span

    with tracer.start_as_current_span("parent-operation") as root:
        root.set_attribute("operation.type", "parent")
        time.sleep(work)

        with tracer.start_as_current_span("child-operation-1") as first_child:
            first_child.set_attribute("operation.type", "child")
            first_child.set_attribute("child.number", 1)
            time.sleep(work)

        with tracer.start_as_current_span("child-operation-2") as second_child:
            second_child.set_attribute("operation.type", "child")
            second_child.set_attribute("child.number", 2)
            time.sleep(work)

            with tracer.start_as_current_span("grandchild-operation") as leaf:
                leaf.set_attribute("operation.type", "grandchild")
                time.sleep(work)

    return {"status": "ok", "message": "Nested trace generated"}

@app.route('/error')
def error_trace():
    """Create an "error-operation" span that records a deliberate exception.

    The span gets an exception event and an ERROR status. The response itself
    is a regular JSON body describing the simulated error.
    """
    with tracer.start_as_current_span("error-operation") as span:
        span.set_attribute("operation.type", "error")
        try:
            # Simulate an error; same exception type and message as `1 / 0`.
            raise ZeroDivisionError("division by zero")
        except ZeroDivisionError as exc:
            span.record_exception(exc)
            span.set_status(trace.StatusCode.ERROR, str(exc))
    return {"status": "error", "message": "Error trace generated"}

@app.route('/chain')
def chain_trace():
    """Start the /chain demo by calling /service/b on this same app.

    Any exception raised while calling the downstream route or decoding its
    JSON is recorded on the "chain-root" span and reported as an error payload.
    """
    with tracer.start_as_current_span("chain-root") as root_span:
        root_span.set_attribute("operation.step", "start")

        # The "services" are routes of this very application; in a real system
        # they would be separate deployments.
        request_id = random.randint(1000, 9999)
        try:
            downstream = requests.get(f"http://localhost:8080/service/b?id={request_id}")
            payload = downstream.json()
        except Exception as exc:
            root_span.record_exception(exc)
            root_span.set_status(trace.StatusCode.ERROR, str(exc))
            return {"status": "error", "message": "Failed to complete chain"}

        return {"status": "ok", "message": "Chain trace generated", "data": payload}

@app.route('/service/b')
def service_b():
    """Middle hop of the /chain demo: simulate work, then call /service/c.

    Reads the request id from the `id` query parameter (default "unknown") and
    returns service C's JSON nested under "data".
    """
    req_id = request.args.get('id', 'unknown')
    # Plain string literal: the span name has no placeholders.
    with tracer.start_as_current_span("service-b-handler") as span:
        span.set_attribute("service", "B")
        span.set_attribute("request.id", req_id)
        time.sleep(0.1)  # Simulate work

        # Call service C
        service_c_url = f"http://localhost:8080/service/c?id={req_id}"
        response = requests.get(service_c_url)
        return {"status": "ok", "message": "Service B completed", "data": response.json()}

@app.route('/service/c')
def service_c():
    """Final hop of the /chain demo: simulate work and fail about 20% of the time.

    On a simulated failure the span status is set to ERROR and the returned
    dict carries status "error"; otherwise a success payload is returned.
    """
    req_id = request.args.get('id', 'unknown')
    # Plain string literal: the span name has no placeholders.
    with tracer.start_as_current_span("service-c-handler") as span:
        span.set_attribute("service", "C")
        span.set_attribute("request.id", req_id)
        time.sleep(0.15)  # Simulate work

        # Randomly fail sometimes to show error traces
        if random.random() < 0.2:  # 20% chance of failure
            span.set_status(trace.StatusCode.ERROR, "Random failure")
            return {"status": "error", "message": "Service C failed randomly"}

        return {"status": "ok", "message": "Service C completed successfully"}

# New delayed chain implementation
@app.route('/delayed-chain')
def delayed_chain_trace():
    """Start the delayed chain by calling /delayed/service-a on this same app.

    Any exception raised by the downstream call or its JSON decoding is recorded
    on the "delayed-chain-root" span and turned into an error payload.
    """
    with tracer.start_as_current_span("delayed-chain-root") as root_span:
        root_span.set_attribute("operation.step", "start")
        root_span.set_attribute("operation.type", "delayed-chain")

        request_id = random.randint(1000, 9999)
        entry_url = f"http://localhost:8080/delayed/service-a?id={request_id}"
        try:
            # Service A is the first hop of the chain.
            chain_result = requests.get(entry_url).json()
        except Exception as exc:
            root_span.record_exception(exc)
            root_span.set_status(trace.StatusCode.ERROR, str(exc))
            return {"status": "error", "message": "Failed to complete delayed chain"}

        return {
            "status": "ok",
            "message": "Delayed chain trace generated",
            "data": chain_result,
        }

@app.route('/delayed/service-a')
def delayed_service_a():
    """First hop of the delayed chain: 0.1 s of simulated work, then call service B."""
    request_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-a-handler") as hop_span:
        for key, value in (
            ("service", "A"),
            ("client.service.name", "frontend"),
            ("request.id", request_id),
            ("service.latency", "normal"),
            ("http.method", "GET"),
        ):
            hop_span.set_attribute(key, value)
        time.sleep(0.1)  # normal latency

        # Hand over to service B and nest its answer in our response.
        next_hop = requests.get(f"http://localhost:8080/delayed/service-b?id={request_id}")
        return {"status": "ok", "message": "Service A completed", "data": next_hop.json()}

@app.route('/delayed/service-b')
def delayed_service_b():
    """Second hop of the delayed chain: 0.15 s of simulated work, then call service C."""
    request_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-b-handler") as hop_span:
        for key, value in (
            ("service", "B"),
            ("client.service.name", "service-a"),
            ("request.id", request_id),
            ("service.latency", "normal"),
            ("http.method", "GET"),
        ):
            hop_span.set_attribute(key, value)
        time.sleep(0.15)  # normal latency

        # Hand over to service C and nest its answer in our response.
        next_hop = requests.get(f"http://localhost:8080/delayed/service-c?id={request_id}")
        return {"status": "ok", "message": "Service B completed", "data": next_hop.json()}

@app.route('/delayed/service-c')
def delayed_service_c():
    """Third hop of the delayed chain: 0.2 s of simulated work, then call the slow service D."""
    request_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-c-handler") as hop_span:
        for key, value in (
            ("service", "C"),
            ("client.service.name", "service-b"),
            ("request.id", request_id),
            ("service.latency", "normal"),
            ("http.method", "GET"),
        ):
            hop_span.set_attribute(key, value)
        time.sleep(0.2)  # normal latency

        # Service D is the deliberately slow hop of this chain.
        next_hop = requests.get(f"http://localhost:8080/delayed/service-d?id={request_id}")
        return {"status": "ok", "message": "Service C completed", "data": next_hop.json()}

@app.route('/delayed/service-d')
def delayed_service_d():
    """Bottleneck hop of the delayed chain: sleep 3-4 s, then call service E.

    The chosen delay is recorded on the span as "latency.seconds".
    """
    request_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-d-handler") as hop_span:
        for key, value in (
            ("service", "D"),
            ("client.service.name", "service-c"),
            ("request.id", request_id),
            ("service.latency", "high"),
            ("latency.category", "bottleneck"),
            ("http.method", "GET"),
        ):
            hop_span.set_attribute(key, value)

        # This hop is consistently slow: a uniformly random 3-4 second stall.
        stall = random.uniform(3.0, 4.0)
        hop_span.set_attribute("latency.seconds", stall)
        time.sleep(stall)

        # Final hop of the chain.
        next_hop = requests.get(f"http://localhost:8080/delayed/service-e?id={request_id}")
        return {"status": "ok", "message": "Service D completed (with delay)", "data": next_hop.json()}

@app.route('/delayed/service-e')
def delayed_service_e():
    """Last hop of the delayed chain: 0.1 s of simulated work, no further calls."""
    request_id = request.args.get('id', 'unknown')
    with tracer.start_as_current_span("service-e-handler") as hop_span:
        for key, value in (
            ("service", "E"),
            ("client.service.name", "service-d"),
            ("request.id", request_id),
            ("service.latency", "normal"),
            ("http.method", "GET"),
        ):
            hop_span.set_attribute(key, value)
        time.sleep(0.1)  # normal latency

    return {"status": "ok", "message": "Service E completed (chain end)"}

@app.route('/multi-service')
def multi_service_trace():
    """Generate the multi-service trace and report its transaction id.

    "transaction_id" is whatever generate_multi_service_trace() returns, so it
    is None when trace generation failed.
    """
    txn_id = generate_multi_service_trace()
    simulated_services = [
        "web-ui",
        "api-gateway",
        "auth-service",
        "user-service",
        "notification-service",
        "db-service",
    ]
    response_body = {
        "status": "ok",
        "message": "Multi-service trace generated",
        "transaction_id": txn_id,
        "services": simulated_services,
    }
    return response_body
# code fixed thanks to @hedss
# Tracers for the simulated services, keyed by service.name. They are created
# on first use and then reused, so repeated /multi-service requests do not keep
# allocating new TracerProvider objects.
_multi_service_tracers = {}


def _get_service_tracer(service_name, tracer_name):
    """Return the cached tracer whose resource reports `service_name`, creating it on first use."""
    cached = _multi_service_tracers.get(service_name)
    if cached is None:
        provider = TracerProvider(
            resource=Resource.create(attributes={SERVICE_NAME: service_name})
        )
        # Share the module-level span processor so every simulated service
        # exports through the same OTLP exporter.
        provider.add_span_processor(span_processor)
        cached = provider.get_tracer(tracer_name)
        _multi_service_tracers[service_name] = cached
    return cached


def generate_multi_service_trace():
    """Generate a trace that spans multiple services with true service.name differentiation.

    Six services (web-ui, api-gateway, auth-service, user-service,
    notification-service, db-service) are simulated inside this process. Each
    has its own tracer, and the spans are nested as alternating CLIENT/SERVER
    pairs so that consecutive services appear as caller and callee.

    Returns:
        The 8-character transaction id stamped on every span, or None if any
        exception occurred (the error is printed, not raised).
    """
    try:
        # Create a unique transaction ID for correlating spans
        transaction_id = str(uuid.uuid4())[:8]

        # One tracer per simulated service, each bound to its own resource
        web_ui_tracer = _get_service_tracer("web-ui", "web-ui-tracer")
        api_gw_tracer = _get_service_tracer("api-gateway", "api-gw-tracer")
        auth_tracer = _get_service_tracer("auth-service", "auth-tracer")
        user_tracer = _get_service_tracer("user-service", "user-tracer")
        notif_tracer = _get_service_tracer("notification-service", "notif-tracer")
        db_tracer = _get_service_tracer("db-service", "db-tracer")

        # 1. Frontend service (web-ui) - User logs in
        with web_ui_tracer.start_as_current_span("login-page-render", kind=trace.SpanKind.SERVER) as web_span:
            web_span.set_attribute("component", "web-ui")
            web_span.set_attribute("transaction.id", transaction_id)
            web_span.set_attribute("user.action", "login")
            web_span.set_attribute("http.method", "GET")
            web_span.set_attribute("http.url", "/login")
            time.sleep(0.1)

            # 2. Send login request to API Gateway
            with web_ui_tracer.start_as_current_span("api-gateway-request", kind=trace.SpanKind.CLIENT) as web_client_span:
                web_client_span.set_attribute("component", "web-ui")
                web_client_span.set_attribute("transaction.id", transaction_id)
                web_client_span.set_attribute("http.method", "POST")
                web_client_span.set_attribute("http.url", "/api/v1/login")

                # API Gateway receives the request
                with api_gw_tracer.start_as_current_span("api-gateway-login-handler", kind=trace.SpanKind.SERVER) as api_span:
                    api_span.set_attribute("component", "api-gateway")
                    api_span.set_attribute("transaction.id", transaction_id)
                    api_span.set_attribute("endpoint", "/api/v1/login")
                    api_span.set_attribute("http.method", "POST")
                    time.sleep(0.15)

                    # 3. API Gateway calls Authentication Service
                    with api_gw_tracer.start_as_current_span("auth-service-request", kind=trace.SpanKind.CLIENT) as api_client_span:
                        api_client_span.set_attribute("component", "api-gateway")
                        api_client_span.set_attribute("transaction.id", transaction_id)
                        api_client_span.set_attribute("http.method", "POST")
                        api_client_span.set_attribute("http.url", "/auth/authenticate")

                        # Auth service receives the request
                        with auth_tracer.start_as_current_span("authenticate-user", kind=trace.SpanKind.SERVER) as auth_span:
                            auth_span.set_attribute("component", "auth-service")
                            auth_span.set_attribute("transaction.id", transaction_id)
                            auth_span.set_attribute("auth.method", "password")
                            time.sleep(0.2)

                            # 4. Auth service calls User Service
                            with auth_tracer.start_as_current_span("user-service-request", kind=trace.SpanKind.CLIENT) as auth_client_span:
                                auth_client_span.set_attribute("component", "auth-service")
                                auth_client_span.set_attribute("transaction.id", transaction_id)
                                auth_client_span.set_attribute("http.method", "GET")
                                auth_client_span.set_attribute("http.url", "/user/details")

                                # User service receives the request
                                with user_tracer.start_as_current_span("get-user-details", kind=trace.SpanKind.SERVER) as user_span:
                                    user_span.set_attribute("component", "user-service")
                                    user_span.set_attribute("transaction.id", transaction_id)
                                    user_span.set_attribute("user.id", f"user_{random.randint(1000, 9999)}")

                                    # 5. User service calls DB Service
                                    with user_tracer.start_as_current_span("db-service-request", kind=trace.SpanKind.CLIENT) as user_client_span:
                                        user_client_span.set_attribute("component", "user-service")
                                        user_client_span.set_attribute("transaction.id", transaction_id)
                                        user_client_span.set_attribute("db.operation", "SELECT")
                                        user_client_span.set_attribute("db.table", "users")

                                        # DB service receives the request
                                        with db_tracer.start_as_current_span("db-query", kind=trace.SpanKind.SERVER) as db_span:
                                            db_span.set_attribute("component", "db-service")
                                            db_span.set_attribute("transaction.id", transaction_id)
                                            db_span.set_attribute("db.operation", "SELECT")
                                            db_span.set_attribute("db.table", "users")

                                            # Randomly introduce database latency (30% of calls)
                                            if random.random() < 0.3:
                                                delay = random.uniform(0.5, 1.5)
                                                db_span.set_attribute("db.latency", delay)
                                                db_span.set_attribute("latency.category", "slow-query")
                                                time.sleep(delay)
                                            else:
                                                time.sleep(0.1)

                    # 6. After successful login, send notification
                    with api_gw_tracer.start_as_current_span("notification-service-request", kind=trace.SpanKind.CLIENT) as notif_client_span:
                        notif_client_span.set_attribute("component", "api-gateway")
                        notif_client_span.set_attribute("transaction.id", transaction_id)
                        notif_client_span.set_attribute("http.method", "POST")
                        notif_client_span.set_attribute("http.url", "/notifications/send")

                        # Notification service receives the request
                        with notif_tracer.start_as_current_span("send-login-notification", kind=trace.SpanKind.SERVER) as notif_span:
                            notif_span.set_attribute("component", "notification-service")
                            notif_span.set_attribute("transaction.id", transaction_id)
                            notif_span.set_attribute("notification.type", "login_alert")
                            notif_span.set_attribute("notification.channel", random.choice(["email", "sms", "push"]))
                            time.sleep(0.15)

        return transaction_id
    except Exception as e:
        # Best effort: the demo endpoint still answers when trace generation fails.
        print(f"Error generating multi-service trace: {e}")
        return None

# Start Flask's built-in server when the module is run directly. It listens on
# all interfaces; the chain endpoints call back into this app on
# localhost:8080, so the port here must stay in sync with those URLs.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080) 

================================================
FILE: otel-tracing-service-graphs/app/requirements.txt
================================================
flask
requests
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
opentelemetry-instrumentation-requests

================================================
FILE: otel-tracing-service-graphs/config-otel.yaml
================================================
#
# OTel Collector YAML Configuration for Service Graph Generation
#
# This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine.
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
#

# Accept OTLP over both transports on all interfaces, using the standard
# OTLP ports (4317 gRPC, 4318 HTTP).
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

# Batch processor with default settings.
processors:
  batch: {}

# Service graph connector: consumes traces and produces service graph metrics.
# It is wired in as an exporter of the traces pipeline and as the receiver of
# the metrics pipeline (see `service.pipelines`).
connectors:
  servicegraph:
    metrics_flush_interval: 10s
    # Span attributes added as extra dimensions on the generated metrics.
    dimensions:
      - service.name
      - http.method
    # Span store settings (same values as the `store` block in config.alloy).
    store:
      max_items: 5000
      ttl: 30s

exporters:
  # Service graph metrics go to Prometheus' OTLP HTTP endpoint.
  otlphttp/prometheus:
    endpoint: http://prometheus:9090/api/v1/otlp
    tls:
      insecure: true

  # Traces go to Tempo over OTLP gRPC, without TLS.
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

service:
  pipelines:
    # Traces are batched, then fanned out to the service graph connector and
    # to Tempo.
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [servicegraph, otlp/tempo]
    # Metrics produced by the connector are forwarded to Prometheus.
    metrics:
      receivers: [servicegraph]
      exporters: [otlphttp/prometheus]


================================================
FILE: otel-tracing-service-graphs/config.alloy
================================================
/*
 * Alloy Configuration for OpenTelemetry Trace Collection with Service Graph Generation
 */

// Receive OpenTelemetry traces over OTLP. The empty `http {}` and `grpc {}`
// blocks enable both transports with their default settings. Only traces are
// wired up; they are handed to the batch processor.
otelcol.receiver.otlp "default" {
  http {}
  grpc {}

  output {
    traces = [otelcol.processor.batch.default.input]
  }
}

// Batch processor to improve performance. Each batch is delivered to two
// consumers: the service graph connector (which derives metrics) and the
// Tempo exporter (which stores the raw traces).
otelcol.processor.batch "default" {
  output {
    traces = [
      otelcol.connector.servicegraph.default.input,
      otelcol.exporter.otlp.tempo.input,
    ]
  }
}

// Service graph generator: turns incoming traces into service graph metrics
// and sends them to the Prometheus OTLP exporter. Metrics are flushed every
// 10 seconds and carry service.name and http.method as extra dimensions.
otelcol.connector.servicegraph "default" {
  metrics_flush_interval = "10s"
  dimensions = ["service.name", "http.method"]
  
  // Configure the span store for better pairing
  store {
    max_items = 5000
    ttl = "30s"
  }
  
  output {
    metrics = [otelcol.exporter.otlphttp.prometheus.input]
  }
}

// Send service graph metrics to Prometheus via OTLP over HTTP. The endpoint is
// the base URL of Prometheus' OTLP receiver.
otelcol.exporter.otlphttp "prometheus" {
  client {
    endpoint = "http://prometheus:9090/api/v1/otlp"
    tls {
      insecure = true
    }
  }
}

// Send traces to Tempo for storage and visualization
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo:4317"
    tls {
      insecure = true
    }
  }
} 

livedebugging {
  enabled = true
}

================================================
FILE: otel-tracing-service-graphs/docker-compose-otel.yml
================================================
# OTel Engine Override
#
# Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config
# instead of the River/HCL config.alloy file.
#
# Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
# Stop:  docker compose -f docker-compose.yml -f docker-compose-otel.yml down
#

services:
  # Start Alloy in OTel Engine mode with the collector-style YAML config.
  alloy:
    command: otel --config=/etc/alloy/config-otel.yaml
    volumes:
      - ./config-otel.yaml:/etc/alloy/config-otel.yaml
    ports:
      - "4317:4317" # OTLP gRPC
      - "4318:4318" # OTLP HTTP
      - "8888:8888" # OTel Engine HTTP server

  # Point the demo app at the standard OTLP gRPC port.
  demo-app:
    environment:
      - OTEL_SERVICE_NAME=demo-service
      - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317


================================================
FILE: otel-tracing-service-graphs/docker-compose.coda.yml
================================================
services:
  # Demo app that generates OpenTelemetry traces; built from ./app and run on
  # the host network.
  demo-app:
    restart: unless-stopped
    network_mode: host
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    environment:
      - OTEL_SERVICE_NAME=demo-service
      - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo
      # NOTE(review): 12345 is the Alloy HTTP server port in docker-compose.yml,
      # while docker-compose-otel.yml sends OTLP to 4317 - confirm this port and
      # that the `alloy` hostname resolves on the host network.
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:12345


================================================
FILE: otel-tracing-service-graphs/docker-compose.yml
================================================
version: '3.8'

services:
  # Prometheus: metrics storage for this scenario
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    ports:
      - "9090:9090/tcp"
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      # Accept pushed data: remote write (used by tempo-config.yaml) and
      # OTLP (used by the Alloy otlphttp exporter).
      - --web.enable-remote-write-receiver
      - --web.enable-otlp-receiver
      # Feature flags
      - --enable-feature=native-histograms
      - --enable-feature=exemplar-storage

  # Tempo: trace backend, configured by ./tempo-config.yaml
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    depends_on:
      - prometheus
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml
    command:
      - -config.file=/etc/tempo.yaml
    ports:
      - "3200:3200/tcp" # Tempo HTTP port (http_listen_port in tempo-config.yaml)

  # Memcached: referenced by tempo-config.yaml as the frontend-search cache
  memcached:
    image: memcached:1.6.40@sha256:572b011ce33954ee809066d8cecbeb3ec98912109ee3be3663a3197425fd81ac
    container_name: memcached
    ports:
      - "11211:11211"
    # The official memcached image is configured through command-line flags;
    # it does not read MEMCACHED_* environment variables, so the limits are
    # passed as flags here.
    command:
      - memcached
      - --memory-limit=64 # maximum memory usage, in megabytes
      - --threads=4       # number of threads to use


  # Grafana for visualization.
  # The entrypoint writes a datasource provisioning file (Prometheus + Tempo)
  # and then hands over to Grafana's regular /run.sh start script.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
      - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app
    ports:
      - "3000:3000/tcp"
    # The Prometheus datasource gets an explicit uid so that the Tempo
    # datasource's serviceMap.datasourceUid resolves to it; without a pinned
    # uid Grafana generates one and the service graph link points at nothing.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          uid: Prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        EOF
        /run.sh
    depends_on:
      - prometheus
      - tempo

  # Alloy: runs the trace pipeline and service graph generation from config.alloy
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: >-
      run
      --server.http.listen-addr=0.0.0.0:12345
      --storage.path=/var/lib/alloy/data
      /etc/alloy/config.alloy
    ports:
      - "12345:12345" # Alloy HTTP server

  # Demo app that generates OpenTelemetry traces (built from ./app)
  demo-app:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - "8080:8080"
    environment:
      # Export to Alloy's OTLP gRPC receiver. Port 12345 is the Alloy HTTP
      # server (see --server.http.listen-addr on the alloy service), not an
      # OTLP endpoint. This matches the endpoint used in docker-compose-otel.yml.
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=demo-service
      - OTEL_RESOURCE_ATTRIBUTES=service.name=demo-service,service.version=1.0.0,deployment.environment=demo

================================================
FILE: otel-tracing-service-graphs/prom-config.yaml
================================================
global:
  evaluation_interval: 15s
  scrape_interval: 15s

# Out-of-order sample window for the TSDB.
storage:
  tsdb:
    out_of_order_time_window: 30m

# OTLP ingestion settings.
otlp:
  # Resource attributes promoted to labels on ingested series.
  promote_resource_attributes:
    # Service identity
    - service.instance.id
    - service.name
    - service.namespace
    - service.version
    # Cloud / container / deployment
    - cloud.availability_zone
    - cloud.region
    - container.name
    - deployment.environment
    - deployment.environment.name
    # Kubernetes
    - k8s.cluster.name
    - k8s.container.name
    - k8s.cronjob.name
    - k8s.daemonset.name
    - k8s.deployment.name
    - k8s.job.name
    - k8s.namespace.name
    - k8s.pod.name
    - k8s.replicaset.name
    - k8s.statefulset.name


================================================
FILE: otel-tracing-service-graphs/tempo-config.yaml
================================================
stream_over_http_enabled: true

server:
  http_listen_port: 3200
  log_level: info

# Cache for the frontend-search role, backed by the memcached service.
cache:
  background:
    writeback_goroutines: 5
  caches:
    - roles:
        - frontend-search
      memcached:
        addresses: dns+memcached:11211

query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
    metadata_slo:
      duration_slo: 5s
      throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 100ms
  metrics:
    # Maximum duration of a metrics query; increased for local setups.
    max_duration: 200h
    query_backend_after: 5m
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09

# This configuration listens on all ports and protocols that Tempo is capable
# of. The receivers all come from the OpenTelemetry Collector; more
# configuration information can be found there:
# https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
# For a production deployment you should only enable the receivers you need!
distributor:
  receivers:
    jaeger:
      protocols:
        thrift_http:
          endpoint: "tempo:14268"
        grpc:
          endpoint: "tempo:14250"
        thrift_binary:
          endpoint: "tempo:6832"
        thrift_compact:
          endpoint: "tempo:6831"
    zipkin:
      endpoint: "tempo:9411"
    otlp:
      protocols:
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
    opencensus:
      endpoint: "tempo:55678"

ingester:
  # Cut the head block when this much time passes. Set for demo purposes;
  # should probably be left alone normally.
  max_block_duration: 5m

compactor:
  compaction:
    # Overall Tempo trace retention. Set for demo purposes.
    block_retention: 720h

metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true

storage:
  trace:
    # Backend configuration to use.
    backend: local
    wal:
      # Where to store the WAL locally.
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      # Enables the metrics generator.
      processors:
        - local-blocks
      

================================================
FILE: postgres-monitoring/README.md
================================================
# PostgreSQL Monitoring with Grafana Alloy

This scenario demonstrates how to monitor a PostgreSQL database using Grafana Alloy's built-in `prometheus.exporter.postgres` component. Alloy scrapes PostgreSQL server metrics and forwards them to Prometheus via remote write. Grafana is pre-configured with Prometheus as a datasource so you can explore the collected metrics immediately.

## Prerequisites

- Docker
- Docker Compose
- Git

## Getting Started

```bash
git clone https://github.com/grafana/alloy-scenarios.git
cd alloy-scenarios/postgres-monitoring
docker compose up -d
```

To use the centralized image versions from the repo root:

```bash
cd alloy-scenarios
./run-example.sh postgres-monitoring
```

## Access Points

| Service    | URL                        |
|------------|----------------------------|
| Grafana    | http://localhost:3000      |
| Alloy UI   | http://localhost:12345     |
| Prometheus | http://localhost:9090      |

Grafana is configured with anonymous admin access enabled, so no login is required.

## What to Expect

Once the stack is running, Alloy connects to the PostgreSQL instance and begins collecting metrics via the `prometheus.exporter.postgres` component. These metrics are scraped every 15 seconds and forwarded to Prometheus.

Metrics you can explore in Grafana include:

- **pg_up** -- Whether the PostgreSQL instance is reachable
- **pg_stat_database_\*** -- Database-level statistics (transactions committed, rolled back, rows fetched, inserted, updated, deleted, deadlocks, temp files, etc.)
- **pg_stat_bgwriter_\*** -- Background writer statistics (buffers written, checkpoints, etc.)
- **pg_settings_\*** -- PostgreSQL server configuration settings exposed as metrics
- **pg_stat_activity_\*** -- Connection and session activity
- **pg_locks_\*** -- Lock statistics by mode

### Exploring Metrics

1. Open **Grafana** at http://localhost:3000
2. Go to **Explore** and select the **Prometheus** datasource
3. Search for metrics starting with `pg_` to browse all available PostgreSQL metrics

### Debugging the Pipeline

1. Open the **Alloy UI** at http://localhost:12345
2. Navigate to the component graph to see the pipeline: `prometheus.exporter.postgres` -> `prometheus.scrape` -> `prometheus.remote_write`
3. Use the **Live Debugging** feature (enabled in the config) to inspect data flowing through each component

## Stopping the Scenario

```bash
docker compose down
```


================================================
FILE: postgres-monitoring/config.alloy
================================================
// PostgreSQL metrics pipeline:
//   prometheus.exporter.postgres -> prometheus.scrape -> prometheus.remote_write

// Destination: the local Prometheus instance, via remote write.
prometheus.remote_write "default" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}

// Built-in postgres exporter, pointed at the scenario's database.
prometheus.exporter.postgres "example" {
	data_source_names = [
		"postgresql://alloy:alloy@postgres:5432/alloy?sslmode=disable",
	]
}

// Scrape the exporter's targets every 15 seconds and forward the samples.
prometheus.scrape "postgres" {
	scrape_interval = "15s"

	targets    = prometheus.exporter.postgres.example.targets
	forward_to = [prometheus.remote_write.default.receiver]
}

// Turn on live debugging for this Alloy instance.
livedebugging {
	enabled = true
}


================================================
FILE: postgres-monitoring/docker-compose.coda.yml
================================================
services:
  # PostgreSQL instance to be monitored (user, password and database: alloy).
  postgres:
    image: postgres:18@sha256:78481659c47e862334611ccdaf7c369c986b3046da9857112f3b309114a65fb4
    ports:
      - "5432:5432"
    environment:
      POSTGRES_DB: alloy
      POSTGRES_USER: alloy
      POSTGRES_PASSWORD: alloy
    # Healthy once pg_isready succeeds for the alloy user.
    healthcheck:
      test:
        - CMD-SHELL
        - pg_isready -U alloy
      interval: 5s
      timeout: 5s
      retries: 5


================================================
FILE: postgres-monitoring/docker-compose.yml
================================================
services:
  # PostgreSQL instance to be monitored (user, password and database: alloy).
  postgres:
    image: postgres:18@sha256:78481659c47e862334611ccdaf7c369c986b3046da9857112f3b309114a65fb4
    ports:
      - "5432:5432"
    environment:
      POSTGRES_DB: alloy
      POSTGRES_USER: alloy
      POSTGRES_PASSWORD: alloy
    # Healthy once pg_isready succeeds for the alloy user.
    healthcheck:
      test:
        - CMD-SHELL
        - pg_isready -U alloy
      interval: 5s
      timeout: 5s
      retries: 5

  # Prometheus: receives the metrics Alloy pushes via remote write.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    ports:
      - "9090:9090"
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --web.enable-remote-write-receiver

  # Alloy: started only after the postgres healthcheck passes.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    depends_on:
      postgres:
        condition: service_healthy
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: >-
      run
      --server.http.listen-addr=0.0.0.0:12345
      --storage.path=/var/lib/alloy/data
      /etc/alloy/config.alloy
    ports:
      - "12345:12345"

  # Grafana: the entrypoint writes the Prometheus datasource provisioning
  # file, then hands over to the regular /run.sh start script.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    ports:
      - "3000:3000/tcp"
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_BASIC_ENABLED=false
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: postgres-monitoring/prom-config.yaml
================================================
# Global Prometheus intervals; no scrape jobs are defined in this file.
global:
  evaluation_interval: 15s
  scrape_interval: 15s


================================================
FILE: rabbitmq-monitoring/README.md
================================================
# RabbitMQ Monitoring with Grafana Alloy

This scenario demonstrates RabbitMQ observability with a single Alloy pipeline:

- **Metrics** - `prometheus.scrape` collects RabbitMQ's built-in `/metrics` endpoint from the `rabbitmq_prometheus` plugin and remote-writes the samples to Prometheus.
- **Logs** - `loki.source.docker` tails the RabbitMQ container logs from the Docker socket and sends them to Loki.

## Architecture

- **RabbitMQ** - the monitored broker, running the management and Prometheus plugins
- **loadgen** - a small RabbitMQ PerfTest publisher that creates the durable `alloy-sample` queue and publishes one persistent message per second
- **Grafana Alloy** - scrapes broker metrics, collects broker container logs, and forwards both signals
- **Loki / Prometheus / Grafana** - local backends and visualization, with datasources auto-provisioned

## Running

```bash
# From this directory
docker compose up -d

# Or from the repo root using centralized image versions
./run-example.sh rabbitmq-monitoring
```

## Accessing

- **Grafana**: http://localhost:3000 (no login required)
- **Alloy UI**: http://localhost:12345
- **Prometheus**: http://localhost:9090
- **Loki**: http://localhost:3100
- **RabbitMQ Management UI**: http://localhost:15672 (`guest` / `guest`)
- **RabbitMQ Prometheus endpoint**: http://localhost:15692/metrics

## Trying It Out

Within about 30 seconds, open Grafana Explore and run these queries.

### Metrics

```promql
rabbitmq_up
```

```promql
rabbitmq_queue_messages{queue="alloy-sample"}
```

```promql
rabbitmq_channels
```

The scenario sets `prometheus.return_per_object_metrics = true` so queue-level labels are visible on `/metrics`.

### Logs

```logql
{job="rabbitmq"}
```

```logql
{job="rabbitmq"} |~ "accepting AMQP connection|authenticated and granted access"
```

RabbitMQ logs connection lifecycle events by default. Channel counts are best checked with metrics:

```promql
rabbitmq_channels
```

## Key Configuration

- `enabled_plugins` enables `rabbitmq_management` and `rabbitmq_prometheus`.
- `rabbitmq.conf` sends debug-level console logs to Docker and returns per-object queue metrics from `/metrics`.
- `config.alloy` keeps the metrics and logs pipelines separate and labels RabbitMQ logs as `job="rabbitmq"`.

## Stopping

```bash
docker compose down -v
```


================================================
FILE: rabbitmq-monitoring/config.alloy
================================================
// RabbitMQ monitoring pipeline.
//   Metrics: prometheus.scrape -> prometheus.remote_write
//   Logs:    discovery.docker -> loki.source.docker (with relabel rules) -> loki.write

// Turn on live debugging for this Alloy instance.
livedebugging {
	enabled = true
}

// ---------------------------------------------------------------------------
// Metrics
// ---------------------------------------------------------------------------

// Destination for metrics: local Prometheus via remote write.
prometheus.remote_write "local" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}

// Scrape RabbitMQ's Prometheus endpoint every 15 seconds.
prometheus.scrape "rabbitmq" {
	scrape_interval = "15s"
	forward_to      = [prometheus.remote_write.local.receiver]

	targets = [
		{
			__address__ = "rabbitmq:15692",
			job         = "rabbitmq",
		},
	]
}

// ---------------------------------------------------------------------------
// Logs
// ---------------------------------------------------------------------------

// Destination for logs: local Loki.
loki.write "local" {
	endpoint {
		url = "http://loki:3100/loki/api/v1/push"
	}
}

// Discover containers through the Docker socket.
discovery.docker "linux" {
	host = "unix:///var/run/docker.sock"
}

// Relabel rules that select and label the RabbitMQ container.
discovery.relabel "rabbitmq_logs" {
	targets = discovery.docker.linux.targets

	// Keep only the container named rabbitmq-monitoring-rabbitmq.
	rule {
		action        = "keep"
		source_labels = ["__meta_docker_container_name"]
		regex         = "/rabbitmq-monitoring-rabbitmq"
	}

	// Set a fixed job label.
	rule {
		target_label = "job"
		replacement  = "rabbitmq"
	}

	// Copy the container name, without its leading slash, into container_name.
	rule {
		source_labels = ["__meta_docker_container_name"]
		regex         = "/(.*)"
		target_label  = "container_name"
	}
}

// Read container logs from the Docker socket and forward them to Loki.
loki.source.docker "rabbitmq" {
	host          = "unix:///var/run/docker.sock"
	targets       = discovery.docker.linux.targets
	relabel_rules = discovery.relabel.rabbitmq_logs.rules
	forward_to    = [loki.write.local.receiver]
}


================================================
FILE: rabbitmq-monitoring/docker-compose.coda.yml
================================================
services:
  # RabbitMQ broker; plugins and broker settings are mounted read-only.
  rabbitmq:
    image: rabbitmq:${RABBITMQ_VERSION:-4.3.0-management}
    container_name: rabbitmq-monitoring-rabbitmq
    hostname: rabbitmq
    environment:
      - RABBITMQ_DEFAULT_USER=guest
      - RABBITMQ_DEFAULT_PASS=guest
    volumes:
      - ./enabled_plugins:/etc/rabbitmq/enabled_plugins:ro
      - ./rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro
    ports:
      - "5672:5672"
      - "15672:15672"
      - "15692:15692"
    healthcheck:
      test:
        - CMD
        - rabbitmq-diagnostics
        - -q
        - ping
      interval: 10s
      timeout: 5s
      retries: 12

  # PerfTest load generator; started once the broker is healthy.
  loadgen:
    image: pivotalrabbitmq/perf-test:${RABBITMQ_PERF_TEST_VERSION:-2.24.0}
    container_name: rabbitmq-monitoring-loadgen
    restart: unless-stopped
    depends_on:
      rabbitmq:
        condition: service_healthy
    # One producer, no consumers, rate 1, size 256, persistent flag,
    # publishing to the alloy-sample queue.
    command:
      - --uri
      - amqp://guest:guest@rabbitmq:5672
      - --queue
      - alloy-sample
      - --producers
      - "1"
      - --consumers
      - "0"
      - --rate
      - "1"
      - --size
      - "256"
      - --flag
      - persistent
      - --id
      - alloy-rabbitmq-demo


================================================
FILE: rabbitmq-monitoring/docker-compose.yml
================================================
services:
  # RabbitMQ broker; plugins and broker settings are mounted read-only.
  rabbitmq:
    image: rabbitmq:${RABBITMQ_VERSION:-4.3.0-management}
    container_name: rabbitmq-monitoring-rabbitmq
    hostname: rabbitmq
    environment:
      - RABBITMQ_DEFAULT_USER=guest
      - RABBITMQ_DEFAULT_PASS=guest
    volumes:
      - ./enabled_plugins:/etc/rabbitmq/enabled_plugins:ro
      - ./rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro
    ports:
      - "5672:5672"
      - "15672:15672"
      - "15692:15692"
    healthcheck:
      test:
        - CMD
        - rabbitmq-diagnostics
        - -q
        - ping
      interval: 10s
      timeout: 5s
      retries: 12

  # PerfTest load generator; started once the broker is healthy.
  loadgen:
    image: pivotalrabbitmq/perf-test:${RABBITMQ_PERF_TEST_VERSION:-2.24.0}
    container_name: rabbitmq-monitoring-loadgen
    restart: unless-stopped
    depends_on:
      rabbitmq:
        condition: service_healthy
    # One producer, no consumers, rate 1, size 256, persistent flag,
    # publishing to the alloy-sample queue.
    command:
      - --uri
      - amqp://guest:guest@rabbitmq:5672
      - --queue
      - alloy-sample
      - --producers
      - "1"
      - --consumers
      - "0"
      - --rate
      - "1"
      - --size
      - "256"
      - --flag
      - persistent
      - --id
      - alloy-rabbitmq-demo

  # Alloy: mounts the Docker socket read-only for container log collection.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    container_name: rabbitmq-monitoring-alloy
    depends_on:
      rabbitmq:
        condition: service_healthy
      loki:
        condition: service_started
      prometheus:
        condition: service_started
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: >-
      run
      --server.http.listen-addr=0.0.0.0:12345
      --storage.path=/var/lib/alloy/data
      /etc/alloy/config.alloy
    ports:
      - "12345:12345"

  # Loki: log backend.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    container_name: rabbitmq-monitoring-loki
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml
    ports:
      - "3100:3100"

  # Prometheus: receives the metrics Alloy pushes via remote write.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    container_name: rabbitmq-monitoring-prometheus
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --web.enable-remote-write-receiver
    ports:
      - "9090:9090"

  # Grafana: the entrypoint writes the Loki and Prometheus datasource
  # provisioning file, then hands over to the regular /run.sh start script.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    container_name: rabbitmq-monitoring-grafana
    ports:
      - "3000:3000/tcp"
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_BASIC_ENABLED=false
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          access: proxy
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: rabbitmq-monitoring/enabled_plugins
================================================
[rabbitmq_management,rabbitmq_prometheus].


================================================
FILE: rabbitmq-monitoring/loki-config.yaml
================================================
# Authentication is disabled for this local demo.
auth_enabled: false

server:
  http_listen_port: 3100

# Shared settings: in-memory ring, one replica, data under /tmp/loki.
common:
  path_prefix: /tmp/loki
  replication_factor: 1
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

# TSDB index on the local filesystem, schema v13.
schema_config:
  configs:
    - from: 2020-05-15
      schema: v13
      store: tsdb
      object_store: filesystem
      index:
        prefix: index_
        period: 24h

storage_config:
  filesystem:
    directory: /tmp/loki/chunks
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache

ingester:
  max_chunk_age: 5m

pattern_ingester:
  enabled: true


================================================
FILE: rabbitmq-monitoring/prom-config.yaml
================================================
# Global Prometheus intervals; no scrape jobs are defined in this file.
global:
  evaluation_interval: 15s
  scrape_interval: 15s


================================================
FILE: rabbitmq-monitoring/rabbitmq.conf
================================================
# Logging: single-line, debug-level output on the console; no log file.
log.file = false
log.console = true
log.console.level = debug
log.console.formatter.single_line = on

# Return per-object metrics from the Prometheus endpoint.
prometheus.return_per_object_metrics = true


================================================
FILE: redis-monitoring/README.md
================================================
# Redis Monitoring with Grafana Alloy

This scenario demonstrates how to monitor a Redis instance using Grafana Alloy's built-in `prometheus.exporter.redis` component.

## Architecture

- **Redis** - The monitored Redis instance
- **Grafana Alloy** - Collects Redis metrics via `prometheus.exporter.redis` and remote writes them to Prometheus
- **Prometheus** - Stores the scraped metrics
- **Grafana** - Visualizes Redis metrics (auto-provisioned with Prometheus datasource)

## Running

```bash
# From this directory
docker compose up -d

# Or from the repo root using centralized image versions
./run-example.sh redis-monitoring
```

## Accessing

- **Grafana**: http://localhost:3000 (no login required)
- **Alloy UI**: http://localhost:12345
- **Prometheus**: http://localhost:9090

## Key Metrics

Once running, you can query Redis metrics in Grafana or Prometheus. Some useful metrics include:

- `redis_up` - Whether Redis is reachable
- `redis_connected_clients` - Number of connected clients
- `redis_used_memory_bytes` - Memory usage
- `redis_commands_total` - Total commands processed
- `redis_keyspace_hits_total` / `redis_keyspace_misses_total` - Cache hit ratio

## Stopping

```bash
docker compose down
```


================================================
FILE: redis-monitoring/config.alloy
================================================
// Redis metrics pipeline:
//   prometheus.exporter.redis -> prometheus.scrape -> prometheus.remote_write

// Destination: Prometheus, via remote write.
prometheus.remote_write "default" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}

// Built-in Redis exporter, pointed at the scenario's Redis instance.
prometheus.exporter.redis "default" {
	redis_addr = "redis:6379"
}

// Scrape the exporter's targets and forward the samples.
prometheus.scrape "redis" {
	forward_to = [prometheus.remote_write.default.receiver]
	targets    = prometheus.exporter.redis.default.targets
}

// Turn on live debugging for this Alloy instance.
livedebugging {
	enabled = true
}


================================================
FILE: redis-monitoring/docker-compose.coda.yml
================================================
services:
  # Redis instance to be monitored, pinned by image digest.
  redis:
    ports:
      - "6379:6379"
    image: redis:8@sha256:0c341492924cad6f5483f9133e43bd6c51ecdecbcadfac5b51657393b6a7936c


================================================
FILE: redis-monitoring/docker-compose.yml
================================================
services:
  # Redis instance to be monitored, pinned by image digest.
  redis:
    ports:
      - "6379:6379"
    image: redis:8@sha256:0c341492924cad6f5483f9133e43bd6c51ecdecbcadfac5b51657393b6a7936c

  # Prometheus: receives the metrics Alloy pushes via remote write.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    ports:
      - "9090:9090"
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --web.enable-remote-write-receiver

  # Grafana: the entrypoint writes the Prometheus datasource provisioning
  # file, then hands over to the regular /run.sh start script.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    ports:
      - "3000:3000/tcp"
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_BASIC_ENABLED=false
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  # Alloy: started after redis and prometheus.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    depends_on:
      - redis
      - prometheus
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: >-
      run
      --server.http.listen-addr=0.0.0.0:12345
      --storage.path=/var/lib/alloy/data
      /etc/alloy/config.alloy
    ports:
      - "12345:12345"


================================================
FILE: redis-monitoring/prom-config.yaml
================================================
# Global Prometheus intervals; no scrape jobs are defined in this file.
global:
  evaluation_interval: 15s
  scrape_interval: 15s


================================================
FILE: renovate.json
================================================
{
  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
  "description": "Local additive config — extends whatever org-level renovate config the bot is configured with. Tracks the centralized version pins in image-versions.env so they stay current alongside the docker-compose fallback defaults.",
  "customManagers": [
    {
      "customType": "regex",
      "description": "Bump every VERSION variable in image-versions.env. Each line is preceded by a `# renovate: datasource=… depName=…` comment that tells the bot what the variable refers to.",
      "managerFilePatterns": [
        "/^image-versions\\.env$/"
      ],
      "matchStrings": [
        "# renovate: datasource=(?<datasource>.+?) depName=(?<depName>.+?)\\s+\\w+_VERSION=(?<currentValue>.+)"
      ]
    },
    {
      "customType": "regex",
      "description": "Bump grafana/k8s-monitoring chart version pinned in k8s/*/README.md install commands. The other two charts in each scenario (backend + grafana) are intentionally unpinned (`helm install` resolves latest at run time), so only k8s-monitoring needs tracking.",
      "managerFilePatterns": [
        "/^k8s/.+/README\\.md$/"
      ],
      "matchStrings": [
        "grafana/k8s-monitoring --version \"(?<currentValue>[^\"]+)\""
      ],
      "datasourceTemplate": "helm",
      "depNameTemplate": "k8s-monitoring",
      "registryUrlTemplate": "https://grafana.github.io/helm-charts"
    },
    {
      "customType": "regex",
      "description": "Sync ${*_VERSION:-default} fallbacks in every docker-compose file alongside image-versions.env updates. Captures depName from the image reference itself (e.g. `image: nginx/nginx-prometheus-exporter:${NGINX_EXPORTER_VERSION:-1.4.2}` → depName=nginx/nginx-prometheus-exporter, currentValue=1.4.2). One rule covers every variable — adding a new VERSION var to image-versions.env requires no change here as long as the compose line follows the convention `image: <depName>:${<NAME>_VERSION:-<value>}`. Renovate's docker-compose manager treats `${VAR}` substitution as a templated reference and won't update inline fallback defaults — without this customManager, the env file moves but the fallbacks drift, breaking `docker compose up` for users who don't pass --env-file.",
      "managerFilePatterns": [
        "/docker-compose(\\.coda)?\\.ya?ml$/"
      ],
      "matchStrings": [
        "image:\\s*(?<depName>\\S+?):\\$\\{[A-Z_]+_VERSION:-(?<currentValue>[^}]+)\\}"
      ],
      "datasourceTemplate": "docker"
    }
  ]
}


================================================
FILE: routing/README.MD
================================================
# Example scenario for Grafana Alloy routing

Simple example for cases where a singular source [e.g. OCP cluster log forwarder] is sending logs from multiple applications, and you need to route it to different loki instances [or in the example case, same loki instance, but different tenants].

The example covers an if-else scenario, where looking at the logs contents, the `tenantKey` loki request metadata is populated, for the logs to be stored in the appropriate tenant.

In this setup, alloy is a single instance that receives logs from multiple OCP clusters [test and prod].

It checks the origin of the log [via the `hostname` field in the request] and afterwards checks the type of log [via the custom `message.logger` field].


Diagram:
```
									- - - Loki tenant test app
	OCP - - -						|
	[test]	|						| - - Loki tenant test audit
			---> Alloy ---> Loki ---
			|						| - - Loki tenant prod app
	OCP - - - 						|
	[prod]							- - - Loki tenant prod audit
```	


## Testing
The provided Docker Compose file has a local promtail-alloy-loki-grafana setup to mimic a real environment [using promtail to mimic an OCP log forwarder]. Logs are stored in a local MinIO S3 bucket.

Use `docker compose up` to bring the system up. [Note: if MinIO fails to start, create a `.customData` directory next to the compose file.]

Put your log entries [one per line, each ending with a newline] inside the `support/promtail/myCustomLog.txt` file. Promtail tails this file and the entries are automatically pushed to Loki via Alloy.

Open grafana [localhost:3000], login with default admin [`admin/admin`] and go to `Explore`.

You will have multiple loki datasources; use the query `{job="myApp"}`. 

Depending on the log content you put in the custom log file, the log will be stored in the appropriate loki tenant and is fetchable using the correct datasource.


## Message examples

Test app message [goes to test app tenant]

`{"@timestamp":"2025-10-31T09:40:25.229103272Z","hostname":"tos-worker-002.tos.example.come","level":"info","log_source":"container","log_type":"application","message":"{\"timestamp\":\"2025-10-31 10:40:25.229\",\"level\":\"INFO\",\"thread\":\"http-nio-8080-exec-5\",\"mdc\":{\"correlationId\":\"test\",\"uniqueTrackingId\":\"test\"},\"logger\":\"app\",\"message\":\"[START] msisdn=38763383706\",\"context\":\"default\"}"}`

Test audit message [goes to test audit tenant]

`{"@timestamp":"2025-10-31T09:40:25.229103272Z","hostname":"tos-worker-002.tos.example.come","level":"info","log_source":"container","log_type":"application","message":"{\"timestamp\":\"2025-10-31 10:40:25.229\",\"level\":\"INFO\",\"thread\":\"http-nio-8080-exec-5\",\"mdc\":{\"correlationId\":\"test\",\"uniqueTrackingId\":\"test\"},\"logger\":\"audit\",\"message\":\"[START] msisdn=38763383706\",\"context\":\"default\"}"}`

Prod app message [goes to app tenant]

`{"@timestamp":"2025-10-31T09:40:25.229103272Z","hostname":"pos-worker-001.pos.example.come","level":"info","log_source":"container","log_type":"application","message":"{\"timestamp\":\"2025-10-31 10:40:25.229\",\"level\":\"INFO\",\"thread\":\"http-nio-8080-exec-5\",\"mdc\":{\"correlationId\":\"test\",\"uniqueTrackingId\":\"test\"},\"logger\":\"app\",\"message\":\"[START] msisdn=38763383706\",\"context\":\"default\"}"}`

Prod audit message [goes to audit tenant]

`{"@timestamp":"2025-10-31T09:40:25.229103272Z","hostname":"pos-worker-001.pos.example.come","level":"info","log_source":"container","log_type":"application","message":"{\"timestamp\":\"2025-10-31 10:40:25.229\",\"level\":\"INFO\",\"thread\":\"http-nio-8080-exec-5\",\"mdc\":{\"correlationId\":\"test\",\"uniqueTrackingId\":\"test\"},\"logger\":\"audit\",\"message\":\"[START] msisdn=38763383706\",\"context\":\"default\"}"}`


================================================
FILE: routing/config.alloy
================================================
// Listen for log push requests coming from the OCP log forwarder
// (Promtail plays that role in the local docker compose setup).
loki.source.api "listener" {
    http {
        // listen_address is intentionally left unset, so the component default applies.
        //listen_address = "" //defaults to all/localhost
        listen_port    = 3005
    }

	// Every received entry first gets the default label, then is routed by tenant.
	forward_to = [loki.process.default_values.receiver]
}

// Tag every entry with a static label so that logs which passed through
// Alloy can be recognised as such later on.
loki.process "default_values" {
	forward_to = [loki.process.redirect_env.receiver]

	stage.static_labels {
		values = {source = "grafana-alloy"}
	}
}

// Route each entry to a Loki tenant.
//
// 1. The top-level `hostname` field of the JSON log line decides the
//    environment: hostnames starting with "tos" are the test cluster,
//    everything else is treated as production.
// 2. Inside each environment the entry goes to the "app" tenant by default and
//    is moved to the "audit" tenant when the nested `message.logger` field
//    equals "audit".
//
// stage.match selectors only see labels, which is why the extracted values are
// promoted to labels (hostnameLabel, loggerLabel) before each match.
loki.process "redirect_env" {
	// Pull the origin hostname out of the JSON log line.
	stage.json {
		expressions = {extractedHostname = "hostname"}
	}

	// Expose it as a label so the match selectors below can use it.
	stage.labels {
		values = {hostnameLabel = "extractedHostname"}
	}

	stage.match {
		pipeline_name = "Send to test tenants if tos source"
		selector      = "{hostnameLabel =~ \"tos.*\"}"

		// Default tenant for the test environment.
		stage.tenant {
			value = "test_loki_app"
		}

		// Take the main payload (a JSON document stored as a string).
		stage.json {
			expressions = {payload = "message"}
		}

		// Extract the logger property from the payload.
		stage.json {
			source      = "payload"
			expressions = {logger = "logger"}
		}

		// Set it as a label; the match selector works only with labels.
		stage.labels {
			values = {loggerLabel = "logger"}
		}

		// Route only 'audit' logger types to the audit tenant
		// [the rest stays on the default app tenant].
		stage.match {
			pipeline_name = "Audit log routing"
			selector      = "{loggerLabel = \"audit\"}"

			stage.tenant {
				value = "test_loki_audit"
			}
		}
	}

	stage.match {
		pipeline_name = "Otherwise it is production - re check the logic above"
		selector      = "{hostnameLabel !~ \"tos.*\"}"

		// Default tenant for production.
		stage.tenant {
			value = "loki_app"
		}

		// Take the main payload (a JSON document stored as a string).
		stage.json {
			expressions = {payload = "message"}
		}

		// Extract the logger property from the payload.
		stage.json {
			source      = "payload"
			expressions = {logger = "logger"}
		}

		// Set it as a label; the match selector works only with labels.
		stage.labels {
			values = {loggerLabel = "logger"}
		}

		// Route only 'audit' logger types to the audit tenant
		// [the rest stays on the default app tenant].
		stage.match {
			pipeline_name = "Audit log routing"
			selector      = "{loggerLabel = \"audit\"}"

			stage.tenant {
				value = "loki_audit"
			}
		}
	}

	forward_to = [loki.write.loki_default.receiver]
}

// Push the processed entries to Loki. The tenant is the one chosen per entry
// by the stage.tenant stages in loki.process "redirect_env".
//
// The URL targets the `loki` service of the bundled docker compose file;
// change host and port when pointing this config at another Loki instance.
loki.write "loki_default" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

================================================
FILE: routing/docker-compose.yaml
================================================
services:
  # S3-compatible object store used by Loki (see support/loki/server.yaml).
  minio:
    image: "minio/minio:RELEASE.2024-10-29T16-01-48Z@sha256:ebd2af76d40ff25ccc630533615f7ccd55fbe83d629a4b7c7a1b6311c1af3d6c"
    restart: "unless-stopped"
    # Create the directory backing the "loki" bucket before starting the server.
    entrypoint:
      - "sh"
      - "-euc"
      - "mkdir -p /data/loki && /usr/bin/docker-entrypoint.sh minio server --quiet --address 0.0.0.0:9000 --console-address ':9001' /data"
    volumes:
      - "./.customData/minio:/data"
    # These credentials must match access_key_id / secret_access_key in the
    # Loki config.
    environment:
      - "MINIO_ROOT_USER=myuser"
      - "MINIO_ROOT_PASSWORD=mypass"
    ports:
      - "9000:9000"
      - "9001:9001"

  loki:
    image: "grafana/loki:latest@sha256:73e905b51a7f917f7a1075e4be68759df30226e03dcb3cd2213b989cc0dc8eb4"
    restart: "unless-stopped"
    command: "-config.file=/etc/loki/server.yml"
    volumes:
      # The file in the repository is named server.yaml; mounting a
      # non-existent ./support/loki/server.yml would make Docker create an
      # empty directory in its place and Loki would fail to read its config.
      - "./support/loki/server.yaml:/etc/loki/server.yml"
    ports:
      - "3100:3100"
      # Container port only; Docker picks the host port.
      - "7946"
    depends_on:
      - "minio"

  grafana:
    image: "grafana/grafana:latest@sha256:0f86bada30d65ef9d0183b90c1e2682ac92d53d95da8bed322b984ea78a4a73a"
    restart: "unless-stopped"
    user: '0'
    volumes:
      # One provisioned Loki datasource per tenant.
      - "./support/grafana/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml"
      - "./.customData/grafana:/var/lib/grafana"
    ports:
      - "3000:3000"
    depends_on:
      - "loki"

  alloy:
    image: "grafana/alloy:latest@sha256:51aeb9d829239345070619dad3edd6873186f913c84f45b365b74574fcb38ec0"
    restart: "unless-stopped"
    command: "run --server.http.listen-addr=0.0.0.0:3000 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy"
    volumes:
      - "./config.alloy:/etc/alloy/config.alloy"
    ports:
      # Host 3005 -> Alloy's own HTTP server (container port 3000, see command).
      - "3005:3000"
      # Host 3015 -> loki.source.api listener (listen_port 3005 in config.alloy).
      - "3015:3005"
    depends_on:
      - "loki"

  # Local app that generates logs: Promtail tails myCustomLog.txt and pushes
  # the lines to Alloy (see support/promtail/promtail-config.yml).
  promtail:
    image: "grafana/promtail:latest@sha256:6cfa64ec432b24a912d640e2edb940eeae2666f61861a66c121d763dd7241381"
    volumes:
      - "./support/promtail/promtail-config.yml:/etc/promtail/config.yml"
      - "./support/promtail/myCustomLog.txt:/var/log/myCustomLog.txt"
    ports:
      - "9080:9080"
    depends_on:
      - "alloy"

================================================
FILE: routing/support/grafana/datasources.yml
================================================
# Grafana datasource provisioning: one Loki datasource per tenant.
# All datasources point at the same Loki instance and differ only in the
# X-Scope-OrgID header value they send.
apiVersion: 1

datasources:
  # Tenant "fake": not one of the tenants assigned by routing/config.alloy.
  - name: "Loki"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "loki"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'fake'

  # Production application logs (tenant "loki_app").
  - name: "Loki app"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "lokiApp"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'loki_app'

  # Production audit logs (tenant "loki_audit").
  - name: "Loki audit"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "lokiAudit"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'loki_audit'

  # Test application logs (tenant "test_loki_app").
  - name: "Loki test app"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "lokiTestApp"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'test_loki_app'

  # Test audit logs (tenant "test_loki_audit").
  - name: "Loki test audit"
    type: "loki"
    access: "proxy"
    orgId: 1
    url: "http://loki:3100"
    basicAuth: false
    isDefault: false
    version: 1
    editable: false
    apiVersion: 1
    uid: "lokiTestAudit"
    jsonData:
      httpHeaderName1: 'X-Scope-OrgID'
    secureJsonData:
      httpHeaderValue1: 'test_loki_audit'


================================================
FILE: routing/support/loki/server.yaml
================================================
# Loki configuration for the routing scenario.
# Multi-tenancy is switched on: writes get their tenant from the stage.tenant
# stages in routing/config.alloy, reads from the X-Scope-OrgID header that each
# provisioned Grafana datasource sends.
auth_enabled: true

server:
  http_listen_address: 0.0.0.0
  grpc_listen_address: 0.0.0.0
  http_listen_port: 3100
  grpc_listen_port: 9095
  log_level: "info"

# Object storage: the MinIO container from docker-compose.yaml. Credentials and
# bucket name must match MINIO_ROOT_USER / MINIO_ROOT_PASSWORD and the
# /data/loki directory created by the MinIO entrypoint.
common:
  path_prefix: "/loki/data"
  storage:
    s3:
      endpoint: "minio:9000"
      insecure: true
      bucketnames: "loki"
      access_key_id: "myuser"
      secret_access_key: "mypass"
      s3forcepathstyle: true

# "loki" in join_members is the compose service name of this very container.
memberlist:
  dead_node_reclaim_time: "30s"
  gossip_to_dead_nodes_time: "15s"
  left_ingesters_timeout: "30s"
  gossip_interval: "2s"
  bind_port: 7946
  bind_addr:
    - "0.0.0.0"
  join_members:
    - "loki"

# Short idle/age periods (1m) so chunks are flushed quickly in this demo.
ingester:
  lifecycler:
    join_after: "10s"
    observe_period: "5s"
    ring:
      replication_factor: 1
      kvstore:
        store: "memberlist"
    final_sleep: "0s"
  chunk_idle_period: "1m"
  wal:
    enabled: true
    dir: "/loki/wal"
  max_chunk_age: "1m"
  chunk_retain_period: "30s"
  chunk_encoding: "snappy"
  chunk_target_size: 1.572864e+06
  chunk_block_size: 262144
  flush_op_timeout: "10s"

schema_config:
  configs:
    - from: "2020-08-01"
      store: "tsdb"
      object_store: "s3"
      schema: "v13"
      index:
        prefix: "index_"
        period: "24h"

# NOTE(review): schema_config selects the "tsdb" index store, but only
# boltdb_shipper directories are configured here - confirm whether a
# tsdb_shipper section was intended.
storage_config:
  boltdb_shipper:
    active_index_directory: "/tmp/index"
    cache_location: "/tmp/boltdb-cache"

limits_config:
  max_cache_freshness_per_query: "10m"
  reject_old_samples: true
  reject_old_samples_max_age: "30m"
  split_queries_by_interval: "15m"
  ingestion_rate_mb: 10
  ingestion_burst_size_mb: 20

# NOTE(review): retention is configured both here and under `compactor` below -
# confirm which of the two the pinned Loki version actually honours.
table_manager:
  retention_deletes_enabled: true
  retention_period: "336h"

query_range:
  max_retries: 5
  align_queries_with_step: true
  parallelise_shardable_queries: true
  cache_results: true

frontend:
  log_queries_longer_than: "5s"
  compress_responses: true
  max_outstanding_per_tenant: 2048

query_scheduler:
  max_outstanding_requests_per_tenant: 1024

querier:
  query_ingesters_within: "2h"

# NOTE(review): recent Loki releases are believed to require
# `delete_request_store` when retention_enabled is true - verify against the
# pinned image version.
compactor:
  working_directory: "/tmp/compactor"
  retention_enabled: true
  compaction_interval: 30m
  retention_delete_delay: 1h
  retention_delete_worker_count: 150

================================================
FILE: routing/support/promtail/myCustomLog.txt
================================================
############################################################################
#find this in grafana via - {job="myApp"} |= ``
############################################################################
############################################################################
2025-10-15 - MY APP - Started logging by custom means...


================================================
FILE: routing/support/promtail/promtail-config.yml
================================================
# Promtail stands in for the OCP log forwarder: it tails one local file and
# pushes every new line to Alloy's loki.source.api listener.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

# Read offsets are kept under /tmp inside the container.
positions:
  filename: /tmp/positions.yaml

# Push target: the `alloy` compose service, port 3005 (listen_port of
# loki.source.api "listener" in routing/config.alloy).
clients:
  - url: http://alloy:3005/loki/api/v1/push

scrape_configs:
  - job_name: logs
    static_configs:
    - targets:
        - localhost
      labels:
        # Query the results in Grafana with {job="myApp"}.
        job: myApp
        # Path inside the container; docker-compose.yaml mounts
        # ./support/promtail/myCustomLog.txt here.
        __path__: /var/log/myCustomLog.txt
        

================================================
FILE: run-example.sh
================================================
#!/bin/bash
# Start one scenario's Docker Compose stack using the image versions pinned in
# image-versions.env.
#
# Usage: ./run-example.sh <example-directory>
#
# Run this from the repository root: image-versions.env is looked up in the
# current working directory and <example-directory> is resolved relative to it.
set -euo pipefail

# Usage check
if [ $# -lt 1 ]; then
    echo "Usage: $0 <example-directory>"
    echo "Available examples:"
    ls -d */ | grep -v "k8s\|img\|.git" | tr -d '/'
    exit 1
fi

EXAMPLE_DIR=$1

# Check if the example directory exists
if [ ! -d "$EXAMPLE_DIR" ]; then
    echo "Error: Example directory '$EXAMPLE_DIR' not found."
    exit 1
fi

# Check if a docker-compose file exists in the example directory.
# Some scenarios use .yaml instead of .yml; accept either.
if [ ! -f "$EXAMPLE_DIR/docker-compose.yml" ] && [ ! -f "$EXAMPLE_DIR/docker-compose.yaml" ]; then
    echo "Error: No docker-compose.yml or docker-compose.yaml found in '$EXAMPLE_DIR'."
    exit 1
fi

# Locate the image versions file. An absolute path is used so that it still
# resolves after the `cd` below, whatever the nesting depth of the example
# directory (a relative "../image-versions.env" only works one level deep).
ENV_FILE="$PWD/image-versions.env"
if [ ! -f "$ENV_FILE" ]; then
    echo "Error: image-versions.env file not found."
    exit 1
fi

# Run docker-compose in the example directory with the environment variables
echo "Starting example: $EXAMPLE_DIR"
(cd "$EXAMPLE_DIR" && docker compose --env-file "$ENV_FILE" up -d)

echo "Example started successfully."
echo "Access Grafana at http://localhost:3000"
echo "To stop the example, run: cd $EXAMPLE_DIR && docker compose down"

================================================
FILE: self-monitoring/README.md
================================================
# Self-Monitoring with Grafana Alloy

This example demonstrates how to configure Grafana Alloy to monitor itself, collecting both its own metrics and logs alongside other Docker containers.

## Prerequisites
- Docker
- Docker Compose
- Git

## Running the Demo

### Step 1: Clone the repository
```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

### Step 2: Deploy the monitoring stack
```bash
cd alloy-scenarios/self-monitoring
docker-compose up -d
```

### Step 3: Access Grafana Alloy UI
Open your browser and go to `http://localhost:12345`. 

### Step 4: Access Prometheus UI
Open your browser and go to `http://localhost:9090`.

### Step 5: Access Loki
Loki is available at `http://localhost:3100`.

## What This Demo Shows

This scenario demonstrates:

- **Metrics Collection**: Using `prometheus.exporter.self` to export Alloy's own internal metrics
- **Log Collection**: Using `loki.source.docker` to collect logs from all Docker containers, including Alloy itself
- **Service Discovery**: Automatic discovery of Docker containers with proper labeling
- **Remote Write**: Sending metrics to Prometheus and logs to Loki

## Key Configuration Elements

### Self-Monitoring Metrics

The `prometheus.exporter.self` component exposes Alloy's internal metrics:
- Memory usage
- CPU utilization
- Component health
- Scrape statistics

### Docker Log Collection

The configuration automatically discovers and collects logs from all Docker containers running on the host, including:
- Alloy's own logs
- Prometheus logs
- Loki logs
- Any other containers running on the same Docker host


================================================
FILE: self-monitoring/config.alloy
================================================


// Expose Alloy's own internal metrics as scrape targets (default settings).
prometheus.exporter.self "integrations_alloy_health" { }

// Attach fixed `instance` and `container` labels to the self-exporter targets
// before they are scraped.
discovery.relabel "integrations_alloy_health" {
	targets = prometheus.exporter.self.integrations_alloy_health.targets

	// instance = hostname of the machine/container Alloy runs in.
	rule {
		target_label = "instance"
		replacement  = constants.hostname
	}

	// container = "alloy"; the Docker log pipeline below also writes a
	// `container` label, derived from the container name.
	rule {
		target_label = "container"
		replacement  = "alloy"
	}
}

// Scrape the relabeled self-exporter targets under the job name
// "integrations/alloy" and hand the samples to the metric relabel stage.
prometheus.scrape "integrations_alloy_health" {
	job_name   = "integrations/alloy"
	targets    = discovery.relabel.integrations_alloy_health.output
	forward_to = [prometheus.relabel.integrations_alloy_health.receiver]
}

// Metric relabeling hook. No rules are defined, so samples are passed on to
// remote_write as they are; add `rule` blocks here to drop or rewrite series.
prometheus.relabel "integrations_alloy_health" {
	forward_to = [prometheus.remote_write.default.receiver]
}

// Send the scraped metrics to the `prometheus` compose service, which is
// started with --web.enable-remote-write-receiver.
prometheus.remote_write "default" {
	endpoint {
		url = "http://prometheus:9090/api/v1/write"
	}
}


// ###############################
// #### Logging Configuration ####
// ###############################

// Discover Docker containers and extract metadata.
// The socket path is the one mounted into the Alloy container by
// docker-compose.yaml.
discovery.docker "linux" {
  host = "unix:///var/run/docker.sock"
}

// Relabel rules for Docker log targets. Only the exported `rules` are used
// (by loki.source.docker below), so the component's own target list is empty.
discovery.relabel "logs_integrations_docker" {
	targets = []

	// Extract the docker-compose service name from container names like
	// /grafana-pathfinder-app-alloy-1 -> alloy
	rule {
		source_labels = ["__meta_docker_container_name"]
		regex         = "^/(?:.+-)?([^-]+)-(?:\\d+)$"
		target_label  = "container"
	}

	// Stamp every log stream with the hostname Alloy runs on.
	rule {
		target_label = "instance"
		replacement  = constants.hostname
	}
}


// Tail the logs of every discovered container (Alloy's own included), apply
// the relabel rules defined above and forward the entries to Loki.
loki.source.docker "default" {
  host          = "unix:///var/run/docker.sock"
  targets       = discovery.docker.linux.targets
  relabel_rules = discovery.relabel.logs_integrations_docker.rules
  forward_to    = [loki.write.local.receiver]
}

// Push collected logs to the `loki` compose service.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

================================================
FILE: self-monitoring/docker-compose.yaml
================================================
# Self-monitoring stack: Prometheus and Loki as backends, Alloy as collector.
# Image lines stay unquoted on purpose: the repository's version tooling
# matches the `image: <name>:${<NAME>_VERSION:-<default>}` form.
services:
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - "--web.enable-remote-write-receiver"
      - "--enable-feature=native-histograms"
      - "--config.file=/etc/prometheus/prometheus.yml"
    ports:
      - "9090:9090/tcp"

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command: "-config.file=/etc/loki/local-config.yaml"
    volumes:
      - "./loki-config.yaml:/etc/loki/local-config.yaml"
    ports:
      - "3100:3100"

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    command: "run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy"
    volumes:
      - "./config.alloy:/etc/alloy/config.alloy"
      # Docker socket: used for container discovery and log collection.
      - "/var/run/docker.sock:/var/run/docker.sock"
    ports:
      # Alloy HTTP server
      - "12345:12345"

================================================
FILE: self-monitoring/loki-config.yaml
================================================
# This is a complete configuration to deploy Loki backed by the filesystem.
# The index will be shipped to the storage via tsdb-shipper.

auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

distributor:
  otlp_config:
    # List of default otlp resource attributes to be picked as index labels
    # CLI flag: -distributor.otlp.default_resource_attributes_as_index_labels
    #
    # Each attribute must be its own list item. Written as one space-separated
    # scalar inside [ ], YAML yields a single-element list whose only entry is
    # the whole string, which matches no resource attribute.
    default_resource_attributes_as_index_labels:
      - service.name
      - service.namespace
      - service.instance.id
      - deployment.environment
      - deployment.environment.name
      - cloud.region
      - cloud.availability_zone
      - k8s.cluster.name
      - k8s.namespace.name
      - k8s.container.name
      - container.name
      - k8s.replicaset.name
      - k8s.deployment.name
      - k8s.statefulset.name
      - k8s.daemonset.name
      - k8s.cronjob.name
      - k8s.job.name

# HTTP API port; published as 3100 by docker-compose.yaml and used in Alloy's
# push URL.
server:
  http_listen_port: 3100

# Ring state is kept in memory with replication factor 1; all Loki data lives
# under /tmp/loki inside the container.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# TSDB index (schema v13), chunks on the local filesystem, one index table per
# 24h.
schema_config:
  configs:
    - from: 2020-05-15
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

# Turns on Loki's pattern ingester component.
pattern_ingester:
  enabled: true


================================================
FILE: snmp/Readme.md
================================================
# Monitoring Linux with Alloy

Grafana Alloy can be used to monitor Linux servers and containers. In this guide, we will show you how to deploy Grafana Alloy in a Docker environment to monitor Linux system metrics and logs. The setup consists of:
* Node Exporter metrics for system performance monitoring
* System logs collection with Loki

## Prerequisites

* Git - You will need Git to clone the repository.
* Docker and Docker Compose - This tutorial uses Docker to host Grafana, Loki, Prometheus, and Alloy.
* Linux environment - Either a Linux host running Docker or a Linux VM.

## About this Demo

This demo runs Alloy in a container alongside Grafana, Prometheus, and Loki, creating a self-contained monitoring stack. The Alloy container acts as a "fake Linux server" to demonstrate monitoring capabilities out of the box.

In a production environment, you would typically install Alloy directly on each Linux server you want to monitor.

## Step 1: Clone the Repository

Clone the repository to your machine:

```bash
git clone https://github.com/grafana/alloy-scenarios.git
cd alloy-scenarios/linux
```

## Step 2: Deploy the Monitoring Stack

Use Docker Compose to deploy Grafana, Loki, Prometheus, and Alloy:

```bash
docker-compose up -d
```

You can check the status of the containers:

```bash
docker ps
```

Grafana should be running on [http://localhost:3000](http://localhost:3000).

## Step 3: Explore the Monitoring Data

Once the stack is running, you can explore the collected metrics and logs:

1. Access Grafana at [http://localhost:3000](http://localhost:3000) (default credentials are admin/admin)
2. Import the Node Exporter dashboard to visualize system metrics:
   - Go to Dashboards → Import
   - Upload the JSON file from [here](https://grafana.com/api/dashboards/1860/revisions/37/download)
   - Select the Prometheus data source and click Import

This community dashboard provides comprehensive system metrics including CPU, memory, disk, and network usage.

## Step 4: Viewing Logs

Open your browser and go to [http://localhost:3000/a/grafana-lokiexplore-app](http://localhost:3000/a/grafana-lokiexplore-app). This will take you to the Loki explorer in Grafana.

## Deploying on Bare Metal

To monitor actual Linux servers in production, you would:

1. Install Alloy directly on each Linux server

2. Modify the `config.alloy` file to point to your Prometheus and Loki instances:
   ```
   prometheus.remote_write "local" {
     endpoint {
       url = "http://localhost:9090/api/v1/write"
     }
   }
   
   loki.write "local" {
     endpoint {
       url = "http://localhost:3100/loki/api/v1/push"
     }
   }
   ```

3. Run Alloy as a service:
   ```bash
   sudo alloy run /path/to/config.alloy
   ```

## Configuration Customization

The included `config.alloy` file sets up:

1. Node Exporter integration to collect system metrics
2. Log collection from system logs and journal
3. Relabeling rules to organize metrics and logs
4. Remote write endpoints for Prometheus and Loki

You can customize which collectors are enabled/disabled and adjust scrape intervals in the configuration file.

## Troubleshooting

If you encounter issues:

* Check container logs: `docker-compose logs`
* Verify Alloy is running: `docker-compose ps`
* Ensure ports are not conflicting with existing services
* Review the Alloy configuration in `config.alloy`


================================================
FILE: snmp/config.alloy
================================================
// --- Remote Write to Prometheus ---
// Target is the `prometheus` compose service, which is started with
// --web.enable-remote-write-receiver.
prometheus.remote_write "remote" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}

// --- SNMP Exporter Configuration ---
// Embedded snmp_exporter: walks the device at `address` using the module and
// auth definitions from snmp.yml and exposes the result as a scrape target.
prometheus.exporter.snmp "snmp_exporter" {
    // Path inside the Alloy container; docker-compose.yml mounts ./snmp.yml at
    // /etc/alloy/snmp.yml.
    config_file = "/etc/alloy/snmp.yml"

    target "tm" {
        address     = "snmpd"
        // Module name as defined under `modules:` in snmp.yml.
        module      = "CISCO"
        // Must be spelled exactly like the label of the walk_param block below.
        walk_params = "cisco"
        labels = {
            "ilo_node" = "switch",
        }
    }

    // Per-target SNMP walk tuning, referenced by name from the target above.
    walk_param "cisco" {
        retries = 2
        timeout = "30s"
    }
}

// --- SNMP Scrape Configuration ---
// Give all SNMP targets a fixed `job` label so their series are easy to select
// in Prometheus (the previous value "smpt" was a typo for "snmp").
discovery.relabel "snmp_targets" {
  targets = prometheus.exporter.snmp.snmp_exporter.targets
  rule {
    target_label = "job"
    replacement  = "snmp"
  }
}

// Scrape the relabeled SNMP targets every 30s and forward the samples to
// Prometheus via remote write.
prometheus.scrape "snmp_targets" {
  scrape_interval = "30s"
  targets         = discovery.relabel.snmp_targets.output
  forward_to      = [prometheus.remote_write.remote.receiver]
}

// --- Enable Live Debugging ---
// An empty livedebugging block leaves the feature at its default; it has to be
// switched on explicitly, as syslog/config.alloy does.
livedebugging {
  enabled = true
}


================================================
FILE: snmp/docker-compose.yml
================================================
version: '3.8'

# SNMP scenario stack. Image lines stay unquoted on purpose: the repository's
# version tooling matches the `image: <name>:${<NAME>_VERSION:-<default>}` form.
services:
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command: "-config.file=/etc/loki/local-config.yaml"
    volumes:
      - "./loki-config.yaml:/etc/loki/local-config.yaml"
    ports:
      - "3100:3100/tcp"

  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - "--web.enable-remote-write-receiver"
      - "--config.file=/etc/prometheus/prometheus.yml"
    volumes:
      - "./prom-config.yaml:/etc/prometheus/prometheus.yml"
    ports:
      - "9090:9090/tcp"

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - "GF_AUTH_ANONYMOUS_ORG_ROLE=Admin"
      - "GF_AUTH_ANONYMOUS_ENABLED=true"
      - "GF_AUTH_BASIC_ENABLED=false"
    ports:
      - "3000:3000/tcp"
    # Write the datasource provisioning file at start-up, then hand over to
    # Grafana's regular start script.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    command: "run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy"
    volumes:
      - "./config.alloy:/etc/alloy/config.alloy"
      - "./snmp.yml:/etc/alloy/snmp.yml"
    ports:
      - "12345:12345"


================================================
FILE: snmp/loki-config.yaml
================================================

# This is a complete configuration to deploy Loki backed by the filesystem.
# The index will be shipped to the storage via tsdb-shipper.

auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

distributor:
  otlp_config:
    # List of default otlp resource attributes to be picked as index labels
    # CLI flag: -distributor.otlp.default_resource_attributes_as_index_labels
    #
    # Each attribute must be its own list item. Written as one space-separated
    # scalar inside [ ], YAML yields a single-element list whose only entry is
    # the whole string, which matches no resource attribute.
    default_resource_attributes_as_index_labels:
      - service.name
      - service.namespace
      - service.instance.id
      - deployment.environment
      - deployment.environment.name
      - cloud.region
      - cloud.availability_zone
      - k8s.cluster.name
      - k8s.namespace.name
      - k8s.container.name
      - container.name
      - k8s.replicaset.name
      - k8s.deployment.name
      - k8s.statefulset.name
      - k8s.daemonset.name
      - k8s.cronjob.name
      - k8s.job.name


# HTTP API port; published as 3100 by docker-compose.yml.
server:
  http_listen_port: 3100

# Ring state is kept in memory with replication factor 1; all Loki data lives
# under /tmp/loki inside the container.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# TSDB index (schema v13), chunks on the local filesystem, one index table per
# 24h.
schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

# Turns on Loki's pattern ingester component.
pattern_ingester:
  enabled: true


================================================
FILE: snmp/prom-config.yaml
================================================
# Prometheus only acts as a remote-write receiver in this scenario: no
# scrape_configs are defined, metrics arrive from Alloy.
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: snmp/snmp.yml
================================================
# snmp_exporter configuration loaded by prometheus.exporter.snmp in
# snmp/config.alloy.
modules:
  # Referenced from config.alloy as module = "CISCO".
  CISCO:
    # OID subtree to walk on the device.
    walk:
      - 1.4.6.1.4.3.9.9.244.1.2.1.1.7
    # How walked OIDs are turned into Prometheus metrics.
    metrics:
      - name: ifInterface
        oid: 1.4.6.1.4.3.9.9.244.1.2.1.1.7
        type: gauge
        help: A unique value, greater than zero, for each interface
        indexes:
          - labelname: ifInterface
            type: gauge
# SNMP credentials. NOTE(review): <community> looks like a placeholder that
# must be replaced with the device's real community string - confirm.
auths:
  public_v1:
    community: <community>
    security_level: noAuthNoPriv
    version: 1
  public_v2:
    community: <community>
    security_level: noAuthNoPriv
    version: 2


================================================
FILE: syslog/README.md
================================================
# Syslog Scenario

This scenario demonstrates how to use rsyslog and Alloy to monitor syslog messages that are not RFC5424 compliant. Alloy by itself does not support non-RFC5424-compliant syslog messages. However, we can use rsyslog to convert them into RFC5424-compliant messages before they reach Alloy.

## Running the Demo

### Step 1: Clone the repository
```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

### Step 2: Deploy the monitoring stack
```bash
cd alloy-scenarios/syslog
docker-compose up -d
```

### Step 3: Access Grafana Alloy UI
Open your browser and go to `http://localhost:12345`. 

### Step 4: Access Grafana UI
Open your browser and go to `http://localhost:3000`.


================================================
FILE: syslog/config.alloy
================================================


// Turn on live debugging for this Alloy instance. The compose file starts
// Alloy with --stability.level=experimental.
livedebugging {
  enabled = true
}

// Syslog receiver with two listeners; every message is forwarded to Loki.
loki.source.syslog "local" {
  forward_to = [loki.write.local.receiver]

  // Port 51893: no protocol is set, so the component default applies.
  // rsyslog.conf forwards to this port over TCP.
  listener {
    address = "0.0.0.0:51893"
    labels  = {component = "loki.source.syslog", protocol = "tcp"}
  }

  // Port 51898: UDP listener.
  listener {
    address  = "0.0.0.0:51898"
    protocol = "udp"
    labels   = {component = "loki.source.syslog", protocol = "udp"}
  }
}

// Push received syslog messages to the `loki` compose service.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

================================================
FILE: syslog/docker-compose.coda.yml
================================================
# Reduced variant of docker-compose.yml: only the rsyslog relay and the syslog
# simulator are defined, and rsyslog has no depends_on entry.
# NOTE(review): presumably Alloy, Loki and Grafana are provided by the Coda
# environment this file is meant for - confirm.
services:
  # Receives the simulator's UDP messages and forwards them as configured in
  # rsyslog.conf.
  rsyslog:
    image: rsyslog/syslog_appliance_alpine:latest@sha256:c0dd7cad9ff3234967ff59879590175b7590e8a5f5621ec49a85aff546b44a3b
    container_name: rsyslog
    ports:
      - "514:514/udp"
      - "514:514/tcp"
    volumes:
      - ./rsyslog.conf:/etc/rsyslog.conf

  # Runs syslog_simulator.py, which sends messages to SYSLOG_HOST:SYSLOG_PORT.
  syslog-simulator:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: syslog-simulator
    volumes:
      - ./syslog_simulator.py:/syslog_simulator.py
    environment:
      - SYSLOG_HOST=rsyslog
      - SYSLOG_PORT=514
    depends_on:
      - rsyslog
    command: ["python3", "/syslog_simulator.py"]


================================================
FILE: syslog/docker-compose.yml
================================================
version: '3.8'

# Syslog scenario: simulator -> rsyslog (UDP 514) -> Alloy (TCP 51893) -> Loki.
services:
  # Rsyslog service
  rsyslog:
    image: rsyslog/syslog_appliance_alpine:latest@sha256:c0dd7cad9ff3234967ff59879590175b7590e8a5f5621ec49a85aff546b44a3b
    container_name: rsyslog
    ports:
      - "514:514/udp"     # Standard syslog UDP port
      - "514:514/tcp"     # Standard syslog TCP port (if needed)
    volumes:
      - ./rsyslog.conf:/etc/rsyslog.conf    # Custom rsyslog configuration
    depends_on:
      - alloy

  # Syslog simulator using a Python script
  syslog-simulator:
    image: python:${PYTHON_VERSION:-3.11-slim}
    container_name: syslog-simulator
    volumes:
      - ./syslog_simulator.py:/syslog_simulator.py  # Syslog simulator script
    environment:
      - SYSLOG_HOST=rsyslog
      - SYSLOG_PORT=514
    depends_on:
      - rsyslog
    command: ["python3", "/syslog_simulator.py"]

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - "12345:12345"       # Alloy HTTP server / UI
      - "51893:51893"       # loki.source.syslog TCP listener
      # loki.source.syslog UDP listener. Without the /udp suffix Docker
      # publishes the port as TCP and the listener is unreachable from the host.
      - "51898:51898/udp"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      - ./logs:/tmp/app-logs/
    command: run --server.http.listen-addr=0.0.0.0:12345 --stability.level=experimental  --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - loki

  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    # Write the Loki datasource provisioning file at start-up, then hand over
    # to Grafana's regular start script.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        EOF
        /run.sh

volumes:
  rsyslog_data:


================================================
FILE: syslog/loki-config.yaml
================================================

# This is a complete configuration to deploy Loki backed by the filesystem.
# The index will be shipped to the storage via tsdb-shipper.

# Single-tenant mode: no tenant header is required for pushes or queries.
auth_enabled: false

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

# HTTP API port; published as 3100 by docker-compose.yml and used in Alloy's
# push URL.
server:
  http_listen_port: 3100

# Ring state is kept in memory with replication factor 1; all Loki data lives
# under /tmp/loki inside the container.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# TSDB index (schema v13), chunks on the local filesystem, one index table per
# 24h.
schema_config:
  configs:
  - from: 2020-05-15
    store: tsdb
    object_store: filesystem
    schema: v13
    index:
      prefix: index_
      period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

# Turns on Loki's pattern ingester component.
pattern_ingester:
  enabled: true

================================================
FILE: syslog/rsyslog.conf
================================================
# Load necessary modules
module(load="imudp")   # For receiving UDP messages
# Listen for incoming syslog messages on UDP port 514 (the port the simulator
# sends to).
input(type="imudp" port="514")

# Forward every message (all facilities, all severities) to Alloy's syslog
# listener over TCP, re-formatted with the RSYSLOG_SyslogProtocol23Format
# template so that Alloy receives RFC5424-style messages.
*.* action(type="omfwd" target="alloy" port="51893" protocol="tcp" Template="RSYSLOG_SyslogProtocol23Format")

================================================
FILE: syslog/syslog_simulator.py
================================================
"""Send simulated syslog messages over UDP at random intervals.

Each message has the shape
``<PRI>VERSION TIMESTAMP HOSTNAME APP-NAME PROCID MSGID STRUCTURED-DATA MSG``.
The target is taken from the SYSLOG_HOST and SYSLOG_PORT environment
variables (defaults: ``localhost`` and ``514``). The script loops forever.
"""
import socket
import time
import os
import random
from datetime import datetime, timezone

# Get the target host and port from environment variables
syslog_host = os.getenv('SYSLOG_HOST', 'localhost')
syslog_port = int(os.getenv('SYSLOG_PORT', '514'))

# Create a UDP socket
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

# Define log levels and messages
log_levels = ["INFO", "WARNING", "ERROR", "DEBUG", "CRITICAL"]
messages = [
    "System started successfully",
    "User login successful",
    "Configuration loaded",
    "Connection to database failed",
    "Data processed successfully",
    "Invalid API request received",
    "Memory usage high",
    "Disk space low",
    "Unknown error occurred",
    "Service restarted",
]

# Generate and send syslog messages every few seconds
while True:
    # UTC timestamp with a literal "Z" suffix. datetime.utcnow() is deprecated
    # since Python 3.12, so take an aware "now" in UTC instead; the formatted
    # string is the same.
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    log_level = random.choice(log_levels)
    message_text = random.choice(messages)
    pid = random.randint(100, 999)  # Simulate random process IDs
    app_name = "MyApp"
    hostname = socket.gethostname()
    msgid = '-'
    structured_data = '-'
    # Include the log level in the message body. The PRI value below is
    # fixed at 34 for every message, so the level only appears in the text.
    message_body = f"{log_level}: {message_text}"
    # Assemble the syslog line: PRI, version 1, then the header fields
    message = f"<34>1 {timestamp} {hostname} {app_name} {pid} {msgid} {structured_data} {message_body}"
    sock.sendto(message.encode(), (syslog_host, syslog_port))
    print(f"Sent syslog message to {syslog_host}:{syslog_port} - {message_body}")
    time.sleep(random.randint(3, 8))  # Send a message every 3-8 seconds


================================================
FILE: systemd-journal/README.md
================================================
# systemd journal to Loki — focused filtering recipes

A focused logs-only scenario for shipping a Linux host's systemd journal to Loki, with filtering and label promotion tuned for keeping the index lean and queries fast.

## How this differs from `linux/`

| Aspect | `linux/` (existing) | `systemd-journal/` (this) |
|---|---|---|
| Scope | Metrics + journal + flat files (full Linux observability suite) | **Journal only** — focused scenario |
| Pipeline | Pass-through ingest, all units, all priorities | **Drops noisy units + drops info/debug priorities** |
| Stack | Prom + Loki + Grafana + node_exporter | **Loki + Grafana only** |
| Labels promoted | none specifically | `unit`, `priority`, `hostname` |
| Demo intent | "monitor a Linux box end-to-end" | "show advanced journal filtering recipes" |

If you want general-purpose Linux observability, use `linux/`. If you specifically need journal filtering recipes (drop noisy units, drop low-priority entries, label by unit/priority for fast filtering), this scenario is the minimal moving-parts version.

## Linux host required

`loki.source.journal` reads `/var/log/journal` and `/run/log/journal`. **These directories only exist on Linux hosts running systemd**. On macOS or Windows Docker Desktop:

- The bind mounts will resolve to empty directories (Docker creates them silently).
- Alloy will start cleanly but the source will sit idle with no journal entries.
- The scenario is functionally a no-op — there's no synthesised journal to fall back to.

To exercise the scenario fully you need:
- A Linux host (bare metal, VM, WSL2 with systemd, or a Linux VM on macOS such as OrbStack / Lima / multipass).
- `systemd` writing journals to `/var/log/journal` (persistent) or `/run/log/journal` (volatile). Most distros ship with at least the volatile journal active.

## Running

On a Linux host:

```bash
cd systemd-journal
docker compose up -d
```

Wait ~10 seconds, then open Grafana.

## Accessing

- **Grafana**: http://localhost:3000 (no login required)
- **Alloy UI**: http://localhost:12345 — confirm components are healthy and use livedebugging to inspect entries flowing through each stage
- **Loki API**: http://localhost:3100

## Trying it out

Generate some journal traffic on the Linux host:

```bash
# Trigger a notice
logger -p user.notice "test from systemd-journal scenario"

# Trigger an error
logger -p user.err "this is a test error"

# Tickle a service unit to produce events
sudo systemctl restart cron 2>/dev/null || sudo systemctl restart crond
```

Then in Grafana Explore on Loki:

```logql
# All journal entries (after filtering)
{job="systemd-journal"}

# Errors only
{job="systemd-journal", priority=~"err|crit|alert|emerg"}

# A specific unit
{job="systemd-journal", unit="ssh.service"}

# A specific host (useful when shipping from many)
{job="systemd-journal", hostname="my-server"}

# All recent NetworkManager events
{job="systemd-journal", unit="NetworkManager.service"}
```

## What's filtered out

The pipeline drops these at the Alloy side:

| Filter | What it drops | Why |
|---|---|---|
| `{unit=~"systemd-logind.service\|systemd-tmpfiles-clean.service\|cron.service"}` | Login session housekeeping, tmpfile cleanup, every cron tick | High-volume, low-signal in dev/ops dashboards |
| `{priority=~"info\|debug"}` | LOG_INFO and LOG_DEBUG entries | Keep `notice` and above |

To stop dropping one of these, edit the matching `stage.match` block in `config.alloy` and remove the corresponding entry from its `selector` regex.

## Why run Alloy as root

The Alloy container runs with `user: "0:0"`. On most Linux distros, `/var/log/journal/*.journal` files are owned by `root:systemd-journal` with mode 0640. Reading them requires either being root or a member of the `systemd-journal` group. Running Alloy as root inside a container with a read-only bind-mount keeps things simple for a demo. In production, prefer running the Alloy native package as a service — it joins the right groups automatically.

## Stopping

```bash
docker compose down -v
```

## Customization ideas

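- **Promote more journal fields**: extend the `loki.relabel "journal"` block with additional rules, e.g. `__journal__pid` → `pid`, `__journal__exe` → `exe`, `__journal__cmdline` → `cmdline`. Keep in mind that high-cardinality fields such as PIDs and command lines produce many distinct label values, which works against the lean-index goal of this scenario.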
- **Per-environment unit filters**: maintain different `stage.match` regexes for prod vs dev.
- **Forward errors only**: add a `stage.match` keeping only `priority=~"err|crit|alert|emerg"` if you want a focused error stream.
- **Multi-host fan-in**: deploy this on every Linux host with the same `loki.write` URL pointing at a central Loki cluster.


================================================
FILE: systemd-journal/config.alloy
================================================
// systemd journal → Loki, with filtering recipes.
//
// Demonstrates three patterns the broader `linux/` scenario doesn't:
//   1. Promoting useful journal fields (`unit`, `priority`, `hostname`)
//      to Loki labels via `loki.relabel`.
//   2. Dropping noisy systemd units that flood the journal but rarely
//      carry useful signal.
//   3. Dropping low-priority entries (info/debug) at ingestion time
//      to keep Loki cardinality and storage low.
//
// Linux-host only — `loki.source.journal` reads /var/log/journal,
// which doesn't exist on macOS or Windows. See README for details.

// Enable live debugging so entries flowing through each component can be
// inspected from the Alloy UI (see the README's "Accessing" section).
livedebugging { enabled = true }

// Translate the journal's underscore-prefixed metadata into clean
// Loki label names. The journal exposes a lot of fields; we promote
// only a few useful ones.
//
// This component acts purely as a container for rules: `forward_to` is
// empty, and `loki.source.journal "host"` consumes the exported `rules`
// through its `relabel_rules` argument.
loki.relabel "journal" {
	// Nothing is forwarded from here; only the exported rules are used.
	forward_to = []

	// systemd unit field -> `unit` label.
	rule {
		source_labels = ["__journal__systemd_unit"]
		target_label  = "unit"
	}

	// Priority keyword field -> `priority` label (the values matched later
	// in loki.process, e.g. "info" and "debug").
	rule {
		source_labels = ["__journal_priority_keyword"]
		target_label  = "priority"
	}

	// Hostname field -> `hostname` label.
	rule {
		source_labels = ["__journal__hostname"]
		target_label  = "hostname"
	}
}

// Read journal entries, apply the relabel rules above, tag everything with
// job="systemd-journal", and hand the entries to the filtering pipeline.
loki.source.journal "host" {
	// Directory the journal is read from (bind-mounted from the host in
	// docker-compose.yml).
	// NOTE(review): only /var/log/journal is configured here, although
	// docker-compose.yml also mounts /run/log/journal — confirm whether
	// hosts with a volatile-only journal are meant to be covered.
	path          = "/var/log/journal"
	// presumably bounds how far back entries are read — confirm against the
	// loki.source.journal reference.
	max_age       = "12h"
	relabel_rules = loki.relabel.journal.rules
	labels        = { job = "systemd-journal" }
	forward_to    = [loki.process.journal.receiver]
}

// Filtering pipeline: two drop stages, then forward what remains to
// loki.write "local".
loki.process "journal" {
	// Drop high-volume units that rarely carry actionable signal in a
	// generic dev/ops dashboard. Tune this list to your environment.
	// NOTE(review): the dots in the unit names are unescaped regex
	// wildcards, and `crond.service` (mentioned in the README's restart
	// example) is not in this list — confirm both are intended.
	stage.match {
		selector = `{unit=~"systemd-logind.service|systemd-tmpfiles-clean.service|cron.service"}`
		action   = "drop"
	}

	// Drop low-priority entries (info / debug). Keep notice and above.
	// Adjust if you want to keep info messages.
	stage.match {
		selector = `{priority=~"info|debug"}`
		action   = "drop"
	}

	forward_to = [loki.write.local.receiver]
}

// Push the filtered entries to the Loki container from docker-compose.yml
// (service name `loki`, port 3100).
loki.write "local" {
	endpoint {
		url = "http://loki:3100/loki/api/v1/push"
	}
}


================================================
FILE: systemd-journal/docker-compose.yml
================================================
services:

  # Loki: the log store that Alloy pushes to and Grafana queries.
  # The image tag can be overridden with GRAFANA_LOKI_VERSION.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    ports:
      # Loki HTTP API, published on the host.
      - "3100:3100/tcp"
    volumes:
      # Local config file, mounted at the path given to -config.file below.
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

  # Grafana with anonymous access enabled at the Admin role and basic auth
  # disabled, so the demo needs no login.
  # The image tag can be overridden with GRAFANA_VERSION.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    # Custom entrypoint: write a datasource provisioning file that registers
    # Loki (http://loki:3100) as the default, non-editable datasource, then
    # run /run.sh.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  # Alloy: reads the host journal and pushes filtered entries to Loki
  # (pipeline defined in config.alloy).
  # The image tag can be overridden with GRAFANA_ALLOY_VERSION.
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    # Run as root so Alloy can read /var/log/journal — the journal files
    # are owned by root:systemd-journal with mode 0640 on most distros.
    user: "0:0"
    ports:
      # Alloy HTTP server / UI (matches --server.http.listen-addr below).
      - "12345:12345"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
      # Bind-mount the host's journal read-only. On Linux hosts this
      # exposes the actual systemd journal. On macOS/Windows the path
      # doesn't exist and Docker creates an empty directory; Alloy
      # will run but the source will report "no journal entries".
      - /var/log/journal:/var/log/journal:ro
      - /run/log/journal:/run/log/journal:ro
    # Listen on all interfaces so the published port is reachable from the
    # host; no volume is mounted at the --storage.path directory.
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - loki


================================================
FILE: systemd-journal/loki-config.yaml
================================================
# Loki configuration for the systemd-journal scenario, with all state kept
# on the local filesystem under /tmp/loki.

# Authentication is turned off for this local demo.
auth_enabled: false

# Feature switches applied through the limits section.
limits_config:
  allow_structured_metadata: true
  volume_enabled: true

# HTTP listener; config.alloy's loki.write and the Grafana datasource both
# point at http://loki:3100.
server:
  http_listen_port: 3100

# Settings shared by all Loki components: an in-memory ring key-value store,
# a replication factor of 1, and /tmp/loki as the base path for local state.
common:
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory
  replication_factor: 1
  path_prefix: /tmp/loki

# A single schema period starting 2020-05-15: TSDB index, filesystem object
# store, schema v13, index tables prefixed "index_" with a 24h period.
schema_config:
  configs:
    - from: 2020-05-15
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

# On-disk locations for the TSDB index, its cache and the chunk files.
# docker-compose.yml mounts only this config file into the Loki container,
# so these paths live in the container's own filesystem.
storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

# Turn on the pattern ingester.
pattern_ingester:
  enabled: true

# NOTE(review): presumably shortens how long chunks stay open before being
# flushed, which suits a short-lived demo — confirm against the Loki docs.
ingester:
  max_chunk_age: 5m


================================================
FILE: trace-delivery/README.md
================================================
# Trace Delivery Demo

This scenario demonstrates how distributed tracing works using a realistic sofa delivery workflow. It shows the journey of a sofa order from the shop to the customer's house, passing through multiple services.

## Overview

The demo includes five interconnected services simulating a sofa ordering and delivery process:

1. **Sofa Shop** - Where customers browse sofas and place orders
2. **Sofa Factory** - Manufactures the ordered sofas with detailed assembly steps
3. **Global Distribution Center** - Handles global logistics and shipping
4. **Local Distribution Center** - Manages local delivery logistics
5. **Customer House** - The final destination for delivery

Each service generates spans as part of a complete trace that follows the sofa from order to delivery. This demo includes three main scenarios:

1. **Successful Delivery** - A complete, happy-path delivery with no issues
2. **Failed Delivery** - Simulated failures at different points in the delivery process
3. **Latency Issues** - Abnormal delays in one service affecting the entire delivery process

## Architecture

```
┌────────────┐     ┌──────────────┐     ┌─────────────────────┐     ┌──────────────────┐     ┌────────────────┐
│  Sofa Shop │────▶│ Sofa Factory │────▶│ Global Distribution │────▶│ Local Distribution│────▶│ Customer House │
└────────────┘     └──────────────┘     └─────────────────────┘     └──────────────────┘     └────────────────┘
                                                                            │
                                                                            │
                                                                            ▼
                                                                     ┌────────────┐
                                                                     │ Sofa Shop  │
                                                                     └────────────┘
                                                                     (notification)
```

All services are instrumented with OpenTelemetry to generate traces, which are collected by Grafana Alloy and visualized in Grafana via Tempo.

## Demo Features

- **Realistic Business Process**: Simulates a real-world business workflow with multiple services and dependencies
- **Trace Context Propagation**: Demonstrates how trace context is passed between services
- **Background Trace Generation**: Automatically generates traces for all scenarios periodically
- **Nested Spans**: Shows detailed manufacturing steps with nested spans and span events
- **Bidirectional Communication**: Local Distribution center notifies the Shop when delivery is dispatched
- **Error Cases**: Shows how errors are recorded and propagated in traces with exceptions
- **Latency Visualization**: Illustrates how performance bottlenecks appear in traces
- **Span Events**: Each service adds detailed span events to provide context for operations
- **Tail Sampling**: Demonstrates tail sampling policies that focus on errors, latency issues, and specific order attributes
- **Service Graph**: Visualizes the connections between services 

## Running the Demo

1. Clone the repository:
   ```
   git clone https://github.com/grafana/alloy-scenarios.git
   cd alloy-scenarios
   ```

2. Navigate to this example directory:
   ```
   cd trace-delivery
   ```

3. Run using Docker Compose:
   ```
   docker compose up -d
   ```
   
   Or, from the repository root, use the centralized image-management script:
   ```
   cd ..
   ./run-example.sh trace-delivery
   ```

4. Access the Sofa Shop at http://localhost:8080

## Demo Scenarios

### 1. Successful Delivery

Navigate to http://localhost:8080/demo/success to trigger a successful delivery flow, which will:
- Create an order for a Classic Comfort sofa
- Process it through all stages of the delivery pipeline
- Show the detailed manufacturing steps with nested spans
- Have the Local Distribution center notify the Shop of the dispatch
- Complete delivery successfully
- Generate a full trace that can be examined in Grafana

### 2. Failed Delivery

Navigate to http://localhost:8080/demo/failure to simulate a failure scenario, which will:
- Create an order for a Luxury Lounge sofa
- Simulate a failure at one of the services (factory by default)
- Record an actual exception in the trace with detailed error information
- Generate an error trace that will be sampled by the error policy

You can change where the failure occurs by adding a query parameter:
- http://localhost:8080/demo/failure?service=sofa-factory
- http://localhost:8080/demo/failure?service=global-distribution
- http://localhost:8080/demo/failure?service=local-distribution

### 3. Latency Issues

Navigate to http://localhost:8080/demo/latency to simulate a latency scenario, which will:
- Create an order for a Limited Edition Designer sofa
- Introduce significant latency in one service (factory by default)
- Add span events explaining the cause of the latency
- Demonstrate how tail sampling captures high-latency traces

You can change where the latency occurs by adding a query parameter:
- http://localhost:8080/demo/latency?service=sofa-factory
- http://localhost:8080/demo/latency?service=global-distribution
- http://localhost:8080/demo/latency?service=local-distribution

## Background Trace Generation

The demo automatically generates traces in the background to populate your trace data:
- Successful delivery traces (70% of background traces)
- Failure scenarios (15% of background traces)
- Latency scenarios (15% of background traces)

This helps ensure you have data to analyze without having to manually trigger scenarios.

## Viewing Traces

1. Open Grafana at http://localhost:3000
2. Navigate to Explore
3. Select Tempo as the data source
4. Click on the "Search" tab and select filters like:
   - `delivery.status = "failed"` to see failed deliveries
   - `sofa.model = "limited-edition"` to see traces for limited edition sofas
   - `customer.type = "vip"` to see VIP customer orders
   - `background = true` to see background-generated traces
   - `scenario = "delivery-failure"` to see failure scenarios
5. Or explore the service graph by clicking the "Service Graph" tab

## Span Events

Each span in the trace contains detailed events providing context about what's happening:
- **Manufacturing**: Events for each assembly step like frame construction, spring installation, etc.
- **Distribution**: Events for package preparation, routing, loading, etc.
- **Delivery**: Events for delivery dispatched, delivered, etc.
- **Failure**: Detailed information about what went wrong and where
- **Latency**: Information about delays and their causes

## Tail Sampling Policies

This demo configures Grafana Alloy with six tail sampling policies:

1. **Failed Delivery Policy**: Captures all traces with `delivery.status = "failed"`
2. **Error Policy**: Samples traces with errors
3. **Latency Policy**: Samples traces exceeding 5 seconds in duration
4. **VIP Customer Policy**: Samples all orders from VIP customers
5. **Limited Edition Policy**: Samples all orders for limited edition sofas
6. **Probabilistic Policy**: Samples 20% of all remaining traces

These policies ensure important traces (errors, performance issues, VIP customers) are retained while still sampling a representative subset of normal traffic.

## Troubleshooting

If you encounter issues:

1. **Missing services**: Ensure all containers are running with `docker compose ps`
2. **Network issues**: Check if services can communicate with each other
3. **Trace data missing**: Verify Alloy and Tempo are configured properly
4. **Service failures**: Check logs with `docker compose logs <service-name>`

## Customizing the Demo

You can modify the demo in several ways:

- Edit `app.py` to change service behavior, add new features, or adjust timing
- Modify `config.alloy` to change sampling policies or add new connectors
- Edit the failure and latency settings in `app.py` (the `failure_scenarios` and `latency_scenarios` dictionaries) to increase or decrease error and delay rates
- Add new sofa models or customer types to expand the demo

## Learning from the Demo

This demo helps understand:

1. How distributed tracing works across multiple services
2. How trace context is propagated through HTTP requests
3. How nested spans create a hierarchical view of operations
4. How span events provide detailed context about operations
5. How to use tail sampling to focus on important traces
6. How to troubleshoot errors and performance issues using traces
7. How service graphs visualize the relationships between services 

================================================
FILE: trace-delivery/app/Dockerfile
================================================
# Base image is pinned by digest; the tag (and digest) can be overridden at
# build time with --build-arg PYTHON_VERSION=...
ARG PYTHON_VERSION=3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2
FROM python:${PYTHON_VERSION}

WORKDIR /app

# Install dependencies before copying the sources so this layer is reused
# until requirements.txt changes. --no-cache-dir keeps pip's download cache
# out of the image.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY *.py ./

# Default port; app.py reads SERVICE_PORT. The service name is not set here:
# app.py reads it from OTEL_SERVICE_NAME, which is expected to be passed as
# an environment variable at run time.
ENV SERVICE_PORT=8080
# Do not buffer stdout/stderr, so log lines show up immediately.
ENV PYTHONUNBUFFERED=1

CMD ["python", "app.py"]

================================================
FILE: trace-delivery/app/app.py
================================================
import os
import random
import time
import uuid
import logging
import threading
from flask import Flask, request, jsonify
import requests
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Get environment variables
# The same app.py runs as every service in the demo; service_name selects
# which endpoints respond (each handler compares against it).
service_name = os.environ.get('OTEL_SERVICE_NAME', 'unknown-service')
service_port = int(os.environ.get('SERVICE_PORT', '8080'))

# Configure the tracer
resource = Resource.create()  # Use OTEL_RESOURCE_ATTRIBUTES environment variable
trace.set_tracer_provider(TracerProvider(resource=resource))

# Configure the OTLP exporter
# No endpoint is passed here; presumably it comes from the standard
# OTEL_EXPORTER_OTLP_* environment variables — confirm in docker-compose.
otlp_exporter = OTLPSpanExporter()
span_processor = BatchSpanProcessor(span_exporter=otlp_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)

# Create a tracer
tracer = trace.get_tracer(__name__)

# Create a propagator for handling trace context
# Used by handlers to inject the current trace context into outgoing
# request headers.
propagator = TraceContextTextMapPropagator()

# Create Flask application
# Both Flask (incoming requests) and the requests library (outgoing calls)
# are instrumented.
app = Flask(__name__)
FlaskInstrumentor().instrument_app(app)
RequestsInstrumentor().instrument()

# Sofa catalogue: one entry per model that can be ordered.
sofa_models = [
    {
        "id": "classic-001",
        "name": "Classic Comfort",
        "price": 899.99,
        "production_time": 2,
    },
    {
        "id": "modern-002",
        "name": "Modern Minimalist",
        "price": 1299.99,
        "production_time": 3,
    },
    {
        "id": "luxury-003",
        "name": "Luxury Lounge",
        "price": 2499.99,
        "production_time": 5,
    },
    {
        "id": "sectional-004",
        "name": "Sectional Supreme",
        "price": 1899.99,
        "production_time": 4,
    },
    {
        "id": "limited-edition",
        "name": "Limited Edition Designer",
        "price": 4999.99,
        "production_time": 7,
    },
]

# Customer tiers an order can be placed under.
customer_types = [
    "regular",
    "premium",
    "vip",
]

# Named locations for each distribution tier.
distribution_centers = {
    "global": [
        "New York",
        "Shanghai",
        "Berlin",
        "Sydney",
    ],
    "local": [
        "North District",
        "South District",
        "East District",
        "West District",
    ],
}

# Per-service failure settings: a probability and the error message to report.
failure_scenarios = {
    "sofa-factory": {
        "probability": 0.2,
        "message": "Production line issue: Unable to complete sofa manufacturing",
    },
    "global-distribution": {
        "probability": 0.15,
        "message": "Item lost in global distribution center",
    },
    "local-distribution": {
        "probability": 0.1,
        "message": "Delivery vehicle breakdown",
    },
}

# Per-service latency settings: trigger probability, delay bounds in seconds
# (passed to time.sleep), and the reason recorded on the span.
latency_scenarios = {
    "sofa-factory": {
        "probability": 0.1,
        "min_delay": 5,
        "max_delay": 8,
        "message": "Production backlog causing delays",
    },
    "global-distribution": {
        "probability": 0.1,
        "min_delay": 6,
        "max_delay": 10,
        "message": "Customs inspection delay",
    },
    "local-distribution": {
        "probability": 0.1,
        "min_delay": 3,
        "max_delay": 7,
        "message": "Traffic congestion affecting local delivery",
    },
}

# Build a new order identifier
def generate_order_id():
    """Return ``ORD-`` followed by the first eight hex digits of a random UUID, upper-cased."""
    suffix = uuid.uuid4().hex[:8].upper()
    return "ORD-" + suffix

# Pick one element of a sequence at random
def random_item(items):
    """Return a randomly chosen element of ``items`` (delegates to ``random.choice``)."""
    picked = random.choice(items)
    return picked

# Decide whether this service must simulate a failure for the given order
def should_fail(service_name, order):
    """Return True when ``order`` explicitly asks ``service_name`` to fail.

    A failure is requested when the order's ``failure_service`` equals
    ``service_name`` and the order is either a failure demo
    (``demo == "failure"``) or a background failure scenario
    (``scenario == "delivery-failure"``). No probability is consulted, so
    regular orders never fail.
    """
    targets_this_service = order.get("failure_service") == service_name
    if not targets_this_service:
        return False

    is_failure_demo = order.get("demo") == "failure"
    is_background_failure = order.get("scenario") == "delivery-failure"
    return is_failure_demo or is_background_failure

# Randomly inject a delay for services that have a latency scenario
def maybe_add_latency(service_name, span):
    """Possibly sleep to simulate latency and record it on ``span``.

    If ``service_name`` has an entry in ``latency_scenarios`` and a random
    draw falls below that entry's ``probability``, a delay is drawn uniformly
    between ``min_delay`` and ``max_delay``, stored on the span as
    ``latency.seconds`` / ``latency.reason``, and slept for.

    Returns ``(True, delay, reason)`` when a delay was applied, otherwise
    ``(False, None, None)``.
    """
    scenario = latency_scenarios.get(service_name)
    if scenario is None:
        return (False, None, None)

    if not random.random() < scenario["probability"]:
        return (False, None, None)

    pause = random.uniform(scenario["min_delay"], scenario["max_delay"])
    cause = scenario["message"]
    span.set_attribute("latency.seconds", pause)
    span.set_attribute("latency.reason", cause)
    time.sleep(pause)
    return (True, pause, cause)

# LANDING PAGE (full index on the sofa shop, short banner elsewhere)
@app.route('/')
def home():
    """Serve the landing page.

    Every service other than ``sofa-shop`` returns a one-line banner naming
    itself; the shop returns an HTML index of its endpoints and demo links.
    """
    if service_name != "sofa-shop":
        return f"<h1>{service_name} service</h1><p>This service is part of the trace delivery demo.</p>"

    return """
        <h1>Sofa Shop - Trace Delivery Demo</h1>
        <p>Welcome to our sofa shop! Here you can order sofas and track their delivery through our system.</p>
        <h2>Endpoints:</h2>
        <ul>
            <li><a href="/catalog">View Catalog</a></li>
            <li><a href="/order">Place New Order</a> (random sofa)</li>
            <li><a href="/order-status?order_id=ORD-12345678">Check Order Status</a> (replace with your order ID)</li>
        </ul>
        <h2>Demo Scenarios:</h2>
        <ul>
            <li><a href="/demo/success">Successful Delivery Demo</a></li>
            <li><a href="/demo/failure">Failed Delivery Demo</a></li>
            <li><a href="/demo/latency">Delivery with Latency Demo</a></li>
        </ul>
        """

# CATALOG ENDPOINT - SHOP SERVICE
@app.route('/catalog')
def catalog():
    """Return the sofa catalogue as JSON; only the ``sofa-shop`` service serves it (404 elsewhere)."""
    if service_name == "sofa-shop":
        with tracer.start_as_current_span("view-catalog") as catalog_span:
            catalog_span.set_attribute("action", "view-catalog")
            return jsonify({"sofas": sofa_models})

    return jsonify({"error": f"Not available in {service_name}"}), 404

# ORDER ENDPOINT - SHOP SERVICE
@app.route('/order')
def place_order():
    """Create a random order and forward it to the factory for manufacturing.

    Only served by ``sofa-shop`` (404 elsewhere). Picks a random sofa model
    and customer type, records them on a ``place-order`` span, and POSTs the
    order to ``<SERVICE_FACTORY_URL>/manufacture``.

    Returns a JSON confirmation when the factory answers 200, otherwise a
    JSON error with status 500 (also when the request raises).
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404
    
    with tracer.start_as_current_span("place-order") as span:
        # Generate order data
        order_id = generate_order_id()
        sofa = random_item(sofa_models)
        customer_type = random_item(customer_types)
        
        # Set span attributes
        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa["id"])
        span.set_attribute("sofa.name", sofa["name"])
        span.set_attribute("sofa.price", sofa["price"])
        span.set_attribute("customer.type", customer_type)
        span.set_attribute("action", "place-order")
        
        # Create order
        order = {
            "order_id": order_id,
            "sofa": sofa,
            "customer_type": customer_type,
            "timestamp": time.time()
        }
        
        logger.info(f"New order placed: {order_id} for {sofa['name']}")
        
        # Forward to factory for manufacturing
        try:
            factory_url = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081')
            # Inject the current trace context into the outgoing headers so
            # the factory's spans join this trace.
            headers = {}
            propagator.inject(headers)
            
            # NOTE(review): no timeout is passed, so this call waits for as
            # long as the factory takes to respond.
            response = requests.post(
                f"{factory_url}/manufacture",
                json=order,
                headers=headers
            )
            
            if response.status_code == 200:
                # The parsed body is not used below; a body that fails to
                # parse raises and is handled by the except branch.
                result = response.json()
                return jsonify({
                    "message": "Order placed successfully!",
                    "order_id": order_id,
                    "sofa": sofa["name"],
                    "customer_type": customer_type,
                    "status": "manufacturing"
                })
            else:
                # Any non-200 answer from the factory marks the span as an
                # error and is reported to the caller as a 500.
                span.set_status(trace.StatusCode.ERROR)
                return jsonify({"error": "Failed to process order at factory", "details": response.text}), 500
        
        except Exception as e:
            # Record the exception on the span and report a 500.
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return jsonify({"error": f"Failed to connect to factory: {str(e)}"}), 500

# ORDER STATUS ENDPOINT - SHOP SERVICE
@app.route('/order-status')
def check_order_status():
    """Report a (simulated) status for the order given by ``?order_id=``.

    Only served by ``sofa-shop`` (404 elsewhere); a missing ``order_id``
    yields 400. There is no order store in this demo, so the status is
    picked at random from a fixed list.
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    requested_id = request.args.get('order_id')
    if not requested_id:
        return jsonify({"error": "No order ID provided"}), 400

    with tracer.start_as_current_span("check-order-status") as status_span:
        status_span.set_attribute("order.id", requested_id)
        status_span.set_attribute("action", "check-order-status")

        # No database lookup here: choose one of the possible delivery
        # stages at random.
        possible_statuses = [
            "manufactured",
            "picked up",
            "in global distribution",
            "in local distribution",
            "out for delivery",
            "delivered",
        ]
        current_status = random_item(possible_statuses)

        payload = {
            "order_id": requested_id,
            "status": current_status,
            "last_update": time.time()
        }
        return jsonify(payload)

# DELIVERY NOTIFICATION ENDPOINT - SHOP SERVICE
@app.route('/delivery-notification', methods=['POST'])
def delivery_notification():
    """Accept a delivery notification (JSON body) and acknowledge it.

    Only served by ``sofa-shop`` (404 elsewhere). Reads ``order_id``,
    ``notification_type`` and ``delivery_time`` from the body, records them
    on a ``process-delivery-notification`` span with two span events, logs
    the notification, and returns a JSON success message.
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404
    
    # NOTE(review): the body is used without validation; fields missing from
    # it come back as None from .get() and are passed on as-is.
    notification = request.json
    order_id = notification.get("order_id")
    notification_type = notification.get("notification_type")
    delivery_time = notification.get("delivery_time")
    
    with tracer.start_as_current_span("process-delivery-notification") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("notification.type", notification_type)
        span.set_attribute("action", "process-notification")
        
        # Add a span event for processing the notification
        span.add_event("notification_received", {
            "order_id": order_id,
            "notification_type": notification_type,
            "timestamp": time.time()
        })
        
        # In a real app, we would update the order status in the database
        # For this demo, we'll just log it
        logger.info(f"Notification received: Order {order_id} has been {notification_type} at {delivery_time}")
        
        # Simulate update to database or other processing
        time.sleep(0.1)
        
        # Add span event for completing notification processing
        span.add_event("notification_processed", {
            "order_id": order_id,
            "success": True,
            "timestamp": time.time()
        })
        
        return jsonify({
            "status": "success",
            "message": f"Notification for order {order_id} processed successfully",
            "notification_type": notification_type
        })

# MANUFACTURE ENDPOINT - FACTORY SERVICE
@app.route('/manufacture', methods=['POST'])
def manufacture():
    """Build the ordered sofa and hand it to global distribution (sofa-factory only).

    The work runs inside a ``manufacture-sofa`` span with child spans for each
    assembly step. A simulated failure or extra latency may be injected first.
    Afterwards the order is POSTed to the distribution service's ``/pickup``
    endpoint, with outgoing headers filled by ``propagator.inject``.

    Returns:
        JSON with ``order_id``, ``status: "manufactured"`` and ``next_step``
        on success; a 404 JSON error when this process is not
        ``sofa-factory``; a 500 JSON error on a simulated failure, a non-200
        pickup response, or an exception raised while calling distribution.
    """
    if service_name != "sofa-factory":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    order = request.json
    order_id = order.get("order_id")
    sofa = order.get("sofa", {})
    is_background = order.get("background", False)

    with tracer.start_as_current_span("manufacture-sofa") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa.get("id", "unknown"))
        span.set_attribute("sofa.name", sofa.get("name", "unknown"))
        span.set_attribute("action", "manufacture")
        span.set_attribute("background", is_background)

        # Add a span event for manufacture start
        span.add_event("manufacture_started", {
            "order_id": order_id,
            "timestamp": time.time(),
            "sofa_model": sofa.get("name", "unknown")
        })

        # Check for simulated failure
        if should_fail(service_name, order):
            error_message = failure_scenarios[service_name]["message"]
            logger.error(f"Manufacturing failure for order {order_id}: {error_message}")
            span.set_attribute("error", True)
            span.set_attribute("error.message", error_message)
            span.set_attribute("delivery.status", "failed")

            # Add span event for the failure
            span.add_event("manufacture_failed", {
                "error": error_message,
                "timestamp": time.time()
            })

            # Raise and catch a real exception so the recorded exception
            # carries a traceback in the trace
            try:
                raise Exception(f"Manufacturing process failed: {error_message}")
            except Exception as e:
                span.record_exception(e)
                span.set_status(trace.StatusCode.ERROR, str(e))

            return jsonify({"error": error_message}), 500

        # Add latency if applicable
        if order.get("demo") == "latency" and order.get("latency_service") == "sofa-factory":
            # For demo, explicitly add latency
            delay = random.uniform(5, 8)
            reason = "Production backlog causing delays"
            span.set_attribute("latency.seconds", delay)
            span.set_attribute("latency.reason", reason)
            time.sleep(delay)
            latency_result = (True, delay, reason)
        else:
            # Check for random latency; the result is read below as
            # (applied, delay, reason)
            latency_result = maybe_add_latency(service_name, span)

        # If latency was added, record the event
        if latency_result[0]:
            delay = latency_result[1]
            reason = latency_result[2]
            span.add_event("manufacture_delayed", {
                "delay_seconds": delay,
                "reason": reason,
                "timestamp": time.time()
            })

        # Create nested spans for the assembly process
        # 1. Frame construction
        with tracer.start_as_current_span("frame-construction") as frame_span:
            frame_span.set_attribute("order.id", order_id)
            frame_span.set_attribute("assembly.step", "frame")
            frame_span.set_attribute("material", "hardwood")

            # Simulate work
            time.sleep(0.2)

            frame_span.add_event("frame_completed", {
                "timestamp": time.time(),
                "quality_check": "passed"
            })

        # 2. Spring installation
        with tracer.start_as_current_span("spring-installation") as spring_span:
            spring_span.set_attribute("order.id", order_id)
            spring_span.set_attribute("assembly.step", "springs")
            spring_span.set_attribute("spring.count", 24)

            # Simulate work
            time.sleep(0.15)

            spring_span.add_event("springs_installed", {
                "timestamp": time.time(),
                "tension_test": "passed"
            })

        # 3. Cushion preparation
        with tracer.start_as_current_span("cushion-preparation") as cushion_span:
            cushion_span.set_attribute("order.id", order_id)
            cushion_span.set_attribute("assembly.step", "cushions")

            # Sub-step: foam cutting
            with tracer.start_as_current_span("foam-cutting") as foam_span:
                foam_span.set_attribute("material", "memory foam")
                foam_span.set_attribute("density", "high")
                time.sleep(0.1)

            # Sub-step: fabric cutting
            with tracer.start_as_current_span("fabric-cutting") as fabric_span:
                fabric_span.set_attribute("material", "premium leather" if sofa.get("id") == "luxury-003" else "fabric")
                time.sleep(0.1)

            # Sub-step: cushion assembly
            with tracer.start_as_current_span("cushion-assembly") as assembly_span:
                assembly_span.set_attribute("components", "foam + fabric + zippers")
                time.sleep(0.15)

            cushion_span.add_event("cushions_completed", {
                "timestamp": time.time()
            })

        # 4. Final assembly
        with tracer.start_as_current_span("final-assembly") as final_span:
            final_span.set_attribute("order.id", order_id)
            final_span.set_attribute("assembly.step", "final")

            # Simulate work
            time.sleep(0.25)

            final_span.add_event("assembly_completed", {
                "timestamp": time.time(),
                "inspector": f"Inspector #{random.randint(1, 10)}"
            })

        # Simulate manufacturing time (in addition to the assembly steps)
        production_time = sofa.get("production_time", 3)
        time.sleep(production_time / 20)  # Scale down for demo purposes

        # Add event for manufacturing completion
        span.add_event("manufacture_completed", {
            "order_id": order_id,
            "timestamp": time.time(),
            "quality_check": "passed",
            "inspector_id": f"QA-{random.randint(100, 999)}"
        })

        logger.info(f"Completed manufacturing for order {order_id}")

        # Request pickup from global distribution
        try:
            distribution_url = os.environ.get('SERVICE_DISTRIBUTION_URL', 'http://global-distribution:8082')
            headers = {}
            propagator.inject(headers)

            response = requests.post(
                f"{distribution_url}/pickup",
                json=order,
                headers=headers
            )

            if response.status_code == 200:
                # The downstream body is not used, so it is deliberately not
                # parsed: a 200 with an unexpected body is still a success.
                return jsonify({
                    "order_id": order_id,
                    "status": "manufactured",
                    "next_step": "global distribution"
                })

            error_message = f"Global distribution pickup failed: {response.text}"
            # Attach the reason to the span status, as the other error paths do
            span.set_status(trace.StatusCode.ERROR, error_message)
            span.set_attribute("delivery.status", "failed")
            return jsonify({"error": error_message}), 500

        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            span.set_attribute("delivery.status", "failed")
            return jsonify({"error": f"Failed to connect to global distribution: {str(e)}"}), 500

# PICKUP ENDPOINT - GLOBAL DISTRIBUTION SERVICE
@app.route('/pickup', methods=['POST'])
def global_pickup():
    """Process an order at a global distribution centre (global-distribution only).

    The work runs inside a ``global-distribution-pickup`` span with child
    spans for inventory and logistics. A simulated failure or extra latency
    may be injected first. Afterwards the order is POSTed to the local
    distribution service's ``/deliver`` endpoint, with outgoing headers filled
    by ``propagator.inject``.

    Returns:
        JSON with ``order_id``, ``status: "in global distribution"`` and
        ``next_step`` on success; a 404 JSON error when this process is not
        ``global-distribution``; a 500 JSON error on a simulated failure, a
        non-200 downstream response, or an exception raised while calling
        local distribution.
    """
    if service_name != "global-distribution":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    order = request.json
    order_id = order.get("order_id")
    sofa = order.get("sofa", {})

    with tracer.start_as_current_span("global-distribution-pickup") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa.get("id", "unknown"))
        distribution_center = random_item(distribution_centers["global"])
        span.set_attribute("distribution.center", distribution_center)
        span.set_attribute("action", "global-pickup")

        # Add event for starting the pickup process
        span.add_event("global_pickup_started", {
            "order_id": order_id,
            "distribution_center": distribution_center,
            "timestamp": time.time()
        })

        # Check for simulated failure
        if should_fail(service_name, order):
            error_message = failure_scenarios[service_name]["message"]
            logger.error(f"Global distribution failure for order {order_id}: {error_message}")
            span.set_attribute("error", True)
            span.set_attribute("error.message", error_message)
            span.set_attribute("delivery.status", "failed")

            # Add event for the failure
            span.add_event("global_pickup_failed", {
                "error": error_message,
                "timestamp": time.time()
            })

            # Raise and catch a real exception so the recorded exception
            # carries a traceback in the trace
            try:
                raise Exception(f"Global distribution failed: {error_message}")
            except Exception as e:
                span.record_exception(e)
                span.set_status(trace.StatusCode.ERROR, str(e))

            return jsonify({"error": error_message}), 500

        # Add latency if applicable
        if order.get("demo") == "latency" and order.get("latency_service") == "global-distribution":
            # For demo, explicitly add latency
            delay = random.uniform(6, 10)
            reason = "Customs inspection delay"
            span.set_attribute("latency.seconds", delay)
            span.set_attribute("latency.reason", reason)
            time.sleep(delay)
            latency_result = (True, delay, reason)
        else:
            # Check for random latency; the result is read below as
            # (applied, delay, reason)
            latency_result = maybe_add_latency(service_name, span)

        # If latency was added, record the event
        if latency_result[0]:
            delay = latency_result[1]
            reason = latency_result[2]
            span.add_event("global_pickup_delayed", {
                "delay_seconds": delay,
                "reason": reason,
                "timestamp": time.time()
            })

        # Create nested spans for logistics operations
        with tracer.start_as_current_span("inventory-processing") as inventory_span:
            inventory_span.set_attribute("order.id", order_id)
            inventory_span.set_attribute("operation", "inventory")
            inventory_span.set_attribute("location", distribution_center)

            # Simulate inventory processing
            time.sleep(0.1)

            inventory_span.add_event("inventory_processed", {
                "warehouse": f"{distribution_center}-{random.randint(1, 5)}",
                "timestamp": time.time()
            })

        with tracer.start_as_current_span("global-logistics") as logistics_span:
            logistics_span.set_attribute("order.id", order_id)
            logistics_span.set_attribute("operation", "logistics")

            # Simulate logistics processing
            time.sleep(0.2)

            # Select random transport type
            transport = random.choice(["air", "sea", "road", "rail"])
            logistics_span.set_attribute("transport.type", transport)

            logistics_span.add_event("transport_arranged", {
                "type": transport,
                "carrier": f"Carrier-{random.randint(100, 999)}",
                "timestamp": time.time()
            })

        # Simulate processing time
        time.sleep(0.3)

        # Add event for successful pickup
        span.add_event("global_pickup_completed", {
            "order_id": order_id,
            "distribution_center": distribution_center,
            "timestamp": time.time()
        })

        logger.info(f"Global distribution processed order {order_id}")

        # Forward to local distribution
        try:
            local_url = os.environ.get('SERVICE_LOCAL_URL', 'http://local-distribution:8083')
            headers = {}
            propagator.inject(headers)

            response = requests.post(
                f"{local_url}/deliver",
                json=order,
                headers=headers
            )

            if response.status_code == 200:
                # The downstream body is not used, so it is deliberately not
                # parsed: a 200 with an unexpected body is still a success.
                return jsonify({
                    "order_id": order_id,
                    "status": "in global distribution",
                    "next_step": "local distribution"
                })

            error_message = f"Local distribution handoff failed: {response.text}"
            # Attach the reason to the span status, as the other error paths do
            span.set_status(trace.StatusCode.ERROR, error_message)
            span.set_attribute("delivery.status", "failed")
            return jsonify({"error": error_message}), 500

        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            span.set_attribute("delivery.status", "failed")
            return jsonify({"error": f"Failed to connect to local distribution: {str(e)}"}), 500

# DELIVER ENDPOINT - LOCAL DISTRIBUTION SERVICE
@app.route('/deliver', methods=['POST'])
def local_deliver():
    """Dispatch an order from a local distribution centre (local-distribution only).

    The work runs inside a ``local-distribution-delivery`` span with child
    spans for packaging and route planning. A simulated failure or extra
    latency may be injected first. The handler then sends a best-effort
    "delivery_dispatched" notification to the shop and finally POSTs the order
    to the customer service's ``/receive`` endpoint.

    Returns:
        JSON with ``order_id``, ``status: "out for delivery"`` and
        ``next_step`` on success; a 404 JSON error when this process is not
        ``local-distribution``; a 500 JSON error on a simulated failure, a
        non-200 customer response, or an exception raised while calling the
        customer service. A failed shop notification never fails the request.
    """
    if service_name != "local-distribution":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    order = request.json
    order_id = order.get("order_id")
    sofa = order.get("sofa", {})

    with tracer.start_as_current_span("local-distribution-delivery") as span:
        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa.get("id", "unknown"))
        distribution_center = random_item(distribution_centers["local"])
        span.set_attribute("distribution.center", distribution_center)
        span.set_attribute("action", "local-delivery")

        # Add event for starting local delivery
        span.add_event("local_delivery_started", {
            "order_id": order_id,
            "distribution_center": distribution_center,
            "timestamp": time.time()
        })

        # Check for simulated failure
        if should_fail(service_name, order):
            error_message = failure_scenarios[service_name]["message"]
            logger.error(f"Local distribution failure for order {order_id}: {error_message}")
            span.set_attribute("error", True)
            span.set_attribute("error.message", error_message)
            span.set_attribute("delivery.status", "failed")

            # Add event for the failure
            span.add_event("local_delivery_failed", {
                "error": error_message,
                "timestamp": time.time()
            })

            # Raise and catch a real exception so the recorded exception
            # carries a traceback in the trace
            try:
                raise Exception(f"Local delivery failed: {error_message}")
            except Exception as e:
                span.record_exception(e)
                span.set_status(trace.StatusCode.ERROR, str(e))

            return jsonify({"error": error_message}), 500

        # Add latency if applicable
        if order.get("demo") == "latency" and order.get("latency_service") == "local-distribution":
            # For demo, explicitly add latency
            delay = random.uniform(3, 7)
            reason = "Traffic congestion affecting local delivery"
            span.set_attribute("latency.seconds", delay)
            span.set_attribute("latency.reason", reason)
            time.sleep(delay)
            latency_result = (True, delay, reason)
        else:
            # Check for random latency; the result is read below as
            # (applied, delay, reason)
            latency_result = maybe_add_latency(service_name, span)

        # If latency was added, record the event
        if latency_result[0]:
            delay = latency_result[1]
            reason = latency_result[2]
            span.add_event("local_delivery_delayed", {
                "delay_seconds": delay,
                "reason": reason,
                "timestamp": time.time()
            })

        # Create nested spans for local delivery operations
        with tracer.start_as_current_span("package-preparation") as prep_span:
            prep_span.set_attribute("order.id", order_id)
            prep_span.set_attribute("operation", "package-prep")

            # Simulate packaging operations
            time.sleep(0.15)

            prep_span.add_event("package_prepared", {
                "packaging_type": "heavy-duty",
                "timestamp": time.time()
            })

        with tracer.start_as_current_span("delivery-route-planning") as route_span:
            route_span.set_attribute("order.id", order_id)
            route_span.set_attribute("operation", "route-planning")

            # Simulate route planning
            time.sleep(0.15)

            # Pick random delivery details (also reused in the shop
            # notification further down)
            vehicle = random.choice(["van", "truck", "specialized transport"])
            route_span.set_attribute("delivery.vehicle", vehicle)
            driver = f"Driver-{random.randint(100, 999)}"
            route_span.set_attribute("delivery.driver", driver)

            route_span.add_event("route_planned", {
                "vehicle": vehicle,
                "driver": driver,
                "estimated_arrival": time.time() + 3600,  # 1 hour from now
                "timestamp": time.time()
            })

        # Simulate processing time
        time.sleep(0.4)

        # Add event for successfully loaded for delivery
        span.add_event("local_delivery_loaded", {
            "order_id": order_id,
            "distribution_center": distribution_center,
            "timestamp": time.time()
        })

        logger.info(f"Local distribution processed order {order_id}")

        # Notify the shop that the order has been dispatched for delivery
        with tracer.start_as_current_span("notify-shop-delivery-dispatched") as notify_span:
            notify_span.set_attribute("order.id", order_id)
            notify_span.set_attribute("action", "notify-shop")

            # Create the notification
            notification = {
                "order_id": order_id,
                "sofa": sofa,
                "customer_type": order.get("customer_type", "regular"),
                "dispatch_time": time.time(),
                "notification_type": "delivery_dispatched",
                "vehicle": vehicle,
                "driver": driver,
                "distribution_center": distribution_center
            }

            # Send notification to shop. The base URL is configurable like the
            # other hops; the default keeps the previously hard-coded address.
            # NOTE(review): SERVICE_SHOP_URL follows the SERVICE_*_URL naming
            # used in this file - confirm against the deployment config.
            shop_base_url = os.environ.get('SERVICE_SHOP_URL', 'http://sofa-shop:8080')
            shop_url = f"{shop_base_url}/delivery-notification"
            headers = {}
            propagator.inject(headers)

            notify_span.add_event("sending_notification", {
                "target": "sofa-shop",
                "notification_type": "delivery_dispatched",
                "timestamp": time.time()
            })

            # Try to send the notification - don't fail the whole delivery if this fails
            try:
                notify_response = requests.post(
                    shop_url,
                    json=notification,
                    headers=headers,
                    timeout=1  # Short timeout so we don't block if shop is down
                )
                # A 4xx/5xx reply means the shop did not accept the
                # notification; raise so it is recorded as a failure below
                # instead of as "notification_sent" with success=True.
                notify_response.raise_for_status()
                notify_span.add_event("notification_sent", {
                    "success": True,
                    "timestamp": time.time()
                })
            except Exception as notify_err:
                logger.warning(f"Failed to notify shop of dispatch: {str(notify_err)}")
                notify_span.record_exception(notify_err)
                notify_span.set_status(trace.StatusCode.ERROR, str(notify_err))
                notify_span.add_event("notification_failed", {
                    "success": False,
                    "error": str(notify_err),
                    "timestamp": time.time()
                })

        # Deliver to customer
        try:
            customer_url = os.environ.get('SERVICE_CUSTOMER_URL', 'http://customer-house:8084')
            headers = {}
            propagator.inject(headers)

            response = requests.post(
                f"{customer_url}/receive",
                json=order,
                headers=headers
            )

            if response.status_code == 200:
                # The downstream body is not used, so it is deliberately not
                # parsed: a 200 with an unexpected body is still a success.
                span.add_event("local_delivery_completed", {
                    "order_id": order_id,
                    "timestamp": time.time()
                })
                return jsonify({
                    "order_id": order_id,
                    "status": "out for delivery",
                    "next_step": "customer delivery"
                })

            error_message = f"Customer delivery failed: {response.text}"
            # Attach the reason to the span status, as the other error paths do
            span.set_status(trace.StatusCode.ERROR, error_message)
            span.set_attribute("delivery.status", "failed")
            return jsonify({"error": error_message}), 500

        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            span.set_attribute("delivery.status", "failed")
            return jsonify({"error": f"Failed to connect to customer house: {str(e)}"}), 500

# RECEIVE ENDPOINT - CUSTOMER HOUSE SERVICE
@app.route('/receive', methods=['POST'])
def customer_receive():
    """Last hop of the delivery chain: the customer's house accepts the sofa.

    Served only when this process runs as ``customer-house``; any other
    service answers with a 404 JSON error. The handler records a
    ``customer-house-receive`` span, pauses briefly to simulate the hand-over,
    and replies with the delivery time and a random satisfaction score.
    """
    if service_name != "customer-house":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    payload = request.json
    order_id = payload.get("order_id")
    sofa_info = payload.get("sofa", {})
    customer_type = payload.get("customer_type", "regular")

    with tracer.start_as_current_span("customer-house-receive") as receive_span:
        receive_span.set_attribute("order.id", order_id)
        receive_span.set_attribute("sofa.model", sofa_info.get("id", "unknown"))
        receive_span.set_attribute("customer.type", customer_type)
        receive_span.set_attribute("action", "customer-receive")
        receive_span.set_attribute("delivery.status", "delivered")

        # Mark the hand-over on the span
        delivered_event = {
            "order_id": order_id,
            "timestamp": time.time(),
            "customer_type": customer_type
        }
        receive_span.add_event("sofa_delivered", delivered_event)

        # Stand-in for the time the final delivery takes
        time.sleep(0.2)

        logger.info(f"Order {order_id} successfully delivered to customer")

        # VIP customers draw from a higher satisfaction range than everyone else
        if customer_type == "vip":
            satisfaction = random.randint(85, 100)
        else:
            satisfaction = random.randint(70, 95)

        receipt = {
            "order_id": order_id,
            "status": "delivered",
            "delivery_time": time.time(),
            "satisfaction": satisfaction
        }
        return jsonify(receipt)

# DEMO ENDPOINTS - SHOP SERVICE
@app.route('/demo/success')
def demo_success():
    """Start a demo order that is expected to succeed (sofa-shop only).

    Builds an order for the first entry of ``sofa_models`` with
    ``"demo": "success"`` and POSTs it to the factory's ``/manufacture``
    endpoint inside a ``demo-success-flow`` span.

    Returns:
        JSON with the order id, sofa name and the trace id as a 32-character
        lowercase hex string when the factory answers 200; a 404 JSON error
        when this process is not ``sofa-shop``; otherwise a 500 JSON error.
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    # Flag success mode in this process's environment. This does not reach
    # other service processes; downstream services only see the order payload.
    os.environ["FORCE_SUCCESS"] = "true"

    with tracer.start_as_current_span("demo-success-flow") as span:
        # Use a predefined sofa for the demo
        order_id = generate_order_id()
        sofa = sofa_models[0]  # Classic sofa
        customer_type = "regular"

        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa["id"])
        span.set_attribute("sofa.name", sofa["name"])
        span.set_attribute("customer.type", customer_type)
        span.set_attribute("demo", "success-flow")

        # Create order
        order = {
            "order_id": order_id,
            "sofa": sofa,
            "customer_type": customer_type,
            "timestamp": time.time(),
            "demo": "success"
        }

        logger.info(f"Demo success flow initiated: {order_id}")

        # Forward to factory for manufacturing
        try:
            factory_url = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081')
            headers = {}
            propagator.inject(headers)

            response = requests.post(
                f"{factory_url}/manufacture",
                json=order,
                headers=headers
            )

            if response.status_code == 200:
                # The raw trace id is a 128-bit integer. Render it as the
                # 32-digit hex string that trace backends use for lookup;
                # an integer this large is also unsafe for JSON consumers
                # that parse numbers as doubles.
                trace_id_hex = format(span.get_span_context().trace_id, "032x")
                return jsonify({
                    "message": "Success demo initiated!",
                    "order_id": order_id,
                    "sofa": sofa["name"],
                    "trace_id": trace_id_hex
                })
            else:
                return jsonify({"error": "Demo failed to start", "details": response.text}), 500

        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            return jsonify({"error": f"Demo failed to start: {str(e)}"}), 500

@app.route('/demo/failure')
def demo_failure_endpoint():
    """HTTP entry point for the failure demo; delegates to ``demo_failure``."""
    outcome = demo_failure()
    return outcome

def demo_failure(failure_service=None, is_background=False):
    """Start an order that is meant to fail in one service (sofa-shop only).

    Args:
        failure_service: Name of the service that should fail. When falsy, the
            ``service`` query parameter of the current request is used,
            defaulting to ``"sofa-factory"``.
        is_background: When true, the span is named
            ``background-failure-scenario`` and the function returns ``None``
            instead of a Flask response once the order has been sent.

    Returns:
        ``None`` for background runs. Otherwise a JSON payload with the order
        id, sofa name, failing service and the trace id as a 32-character hex
        string; a 404 JSON error when this process is not ``sofa-shop``; or a
        500 JSON error when the factory request raises.
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    # Set environment variables for this specific demo
    os.environ["FORCE_FAILURE"] = "true"
    # Resolve the target once and keep it in a local. os.environ is
    # process-wide, so re-reading it later could pick up a value written by a
    # concurrent call and leave the span, order and log out of step.
    target_service = failure_service or request.args.get('service', 'sofa-factory')
    os.environ["FAILURE_SERVICE"] = target_service

    with tracer.start_as_current_span("background-failure-scenario" if is_background else "demo-failure-flow") as span:
        # Use a predefined sofa for the demo
        order_id = generate_order_id()
        sofa = sofa_models[2]  # Luxury sofa
        customer_type = "premium"

        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa["id"])
        span.set_attribute("sofa.name", sofa["name"])
        span.set_attribute("customer.type", customer_type)
        span.set_attribute("demo", "failure-flow")
        span.set_attribute("background", is_background)
        span.set_attribute("scenario", "delivery-failure")
        span.set_attribute("failure_service", target_service)

        # Create order
        order = {
            "order_id": order_id,
            "sofa": sofa,
            "customer_type": customer_type,
            "timestamp": time.time(),
            "demo": "failure",
            "background": is_background,
            "scenario": "delivery-failure",
            "failure_service": target_service
        }

        logger.info(f"{'Background' if is_background else 'Demo'} failure flow initiated: {order_id} (failure in {target_service})")

        # Forward to factory for manufacturing
        try:
            factory_url = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081')
            headers = {}
            propagator.inject(headers)

            # The reply is intentionally ignored: a failure somewhere in the
            # chain is the expected outcome of this demo.
            requests.post(
                f"{factory_url}/manufacture",
                json=order,
                headers=headers
            )

            if is_background:
                return None
            else:
                # Hex-encode the 128-bit trace id so it survives JSON number
                # parsing and matches the form used to look traces up.
                return jsonify({
                    "message": "Failure demo initiated!",
                    "order_id": order_id,
                    "sofa": sofa["name"],
                    "failure_service": target_service,
                    "trace_id": format(span.get_span_context().trace_id, "032x")
                })

        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            if is_background:
                logger.error(f"Background demo failed to start: {str(e)}")
                return None
            else:
                return jsonify({"error": f"Demo failed to start: {str(e)}"}), 500

@app.route('/demo/latency')
def demo_latency_endpoint():
    """HTTP entry point for the latency demo; delegates to ``demo_latency``."""
    outcome = demo_latency()
    return outcome

def demo_latency(latency_service=None, is_background=False):
    """Start an order that is meant to be slow in one service (sofa-shop only).

    Args:
        latency_service: Name of the service that should add latency. When
            falsy, the ``service`` query parameter of the current request is
            used, defaulting to ``"sofa-factory"``.
        is_background: When true, the span is named
            ``background-latency-scenario`` and the function returns ``None``
            instead of a Flask response once the order has been sent.

    Returns:
        ``None`` for background runs. Otherwise a JSON payload with the order
        id, sofa name, slow service and the trace id as a 32-character hex
        string; a 404 JSON error when this process is not ``sofa-shop``; or a
        500 JSON error when the factory request raises.
    """
    if service_name != "sofa-shop":
        return jsonify({"error": f"Not available in {service_name}"}), 404

    # Set environment variables for this specific demo
    os.environ["FORCE_LATENCY"] = "true"
    # Resolve the target once and keep it in a local. os.environ is
    # process-wide, so re-reading it later could pick up a value written by a
    # concurrent call and leave the span, order and log out of step.
    target_service = latency_service or request.args.get('service', 'sofa-factory')
    os.environ["LATENCY_SERVICE"] = target_service

    with tracer.start_as_current_span("background-latency-scenario" if is_background else "demo-latency-flow") as span:
        # Use a predefined sofa for the demo
        order_id = generate_order_id()
        sofa = sofa_models[4]  # Limited edition
        customer_type = "vip"

        span.set_attribute("order.id", order_id)
        span.set_attribute("sofa.model", sofa["id"])
        span.set_attribute("sofa.name", sofa["name"])
        span.set_attribute("customer.type", customer_type)
        span.set_attribute("demo", "latency-flow")
        span.set_attribute("background", is_background)
        span.set_attribute("scenario", "delivery-latency")
        span.set_attribute("latency_service", target_service)

        # Create order; the "demo" and "latency_service" keys are what the
        # downstream handlers in this file check before sleeping
        order = {
            "order_id": order_id,
            "sofa": sofa,
            "customer_type": customer_type,
            "timestamp": time.time(),
            "demo": "latency",
            "background": is_background,
            "scenario": "delivery-latency",
            "latency_service": target_service
        }

        logger.info(f"{'Background' if is_background else 'Demo'} latency flow initiated: {order_id} (latency in {target_service})")

        # Forward to factory for manufacturing
        try:
            factory_url = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081')
            headers = {}
            propagator.inject(headers)

            # The reply is not inspected; the demo only needs the trace to be
            # produced.
            requests.post(
                f"{factory_url}/manufacture",
                json=order,
                headers=headers
            )

            if is_background:
                return None
            else:
                # Hex-encode the 128-bit trace id so it survives JSON number
                # parsing and matches the form used to look traces up.
                return jsonify({
                    "message": "Latency demo initiated!",
                    "order_id": order_id,
                    "sofa": sofa["name"],
                    "latency_service": target_service,
                    "trace_id": format(span.get_span_context().trace_id, "032x")
                })

        except Exception as e:
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            if is_background:
                logger.error(f"Background demo failed to start: {str(e)}")
                return None
            else:
                return jsonify({"error": f"Demo failed to start: {str(e)}"}), 500

# Background trace generation functions
def generate_random_trace():
    """Emit one synthetic background trace from the shop service.

    Returns immediately unless this process is the ``sofa-shop`` service.
    Otherwise one scenario is drawn at random (weights: 70% normal order,
    15% error demo, 15% latency demo) and executed:

    * ``normal``  - builds an order inside a ``background-successful-order``
      span and POSTs it to the factory's ``/manufacture`` endpoint, with the
      current trace context injected into the request headers.
    * ``error``   - delegates to ``demo_failure`` for a randomly chosen
      failure service.
    * ``latency`` - delegates to ``demo_latency`` for a randomly chosen
      latency service.

    Any exception raised while running the scenario is logged and swallowed.
    """
    if service_name != "sofa-shop":
        return  # Only the shop originates background traces

    # Weighted draw of the scenario to run this time around.
    picked = random.choices(
        ["normal", "error", "latency"],
        weights=[0.7, 0.15, 0.15],
        k=1
    )[0]

    try:
        if picked == "error":
            broken_service = random.choice(list(failure_scenarios.keys()))
            demo_failure(failure_service=broken_service, is_background=True)

        elif picked == "latency":
            slow_service = random.choice(list(latency_scenarios.keys()))
            demo_latency(latency_service=slow_service, is_background=True)

        elif picked == "normal":
            new_order_id = generate_order_id()
            chosen_sofa = random_item(sofa_models)
            buyer_kind = random_item(customer_types)

            with tracer.start_as_current_span("background-successful-order") as order_span:
                # Describe the order on the span.
                order_span.set_attribute("order.id", new_order_id)
                order_span.set_attribute("sofa.model", chosen_sofa["id"])
                order_span.set_attribute("sofa.name", chosen_sofa["name"])
                order_span.set_attribute("sofa.price", chosen_sofa["price"])
                order_span.set_attribute("customer.type", buyer_kind)
                order_span.set_attribute("action", "place-order")
                order_span.set_attribute("background", True)
                order_span.set_attribute("scenario", "successful-delivery")

                # Mark the moment the order was created.
                creation_event = {
                    "order_id": new_order_id,
                    "timestamp": time.time(),
                    "customer_type": buyer_kind,
                    "scenario": "successful-delivery"
                }
                order_span.add_event("order_created", creation_event)

                # Payload handed to the factory.
                payload = {
                    "order_id": new_order_id,
                    "sofa": chosen_sofa,
                    "customer_type": buyer_kind,
                    "timestamp": time.time(),
                    "background": True,
                    "scenario": "successful-delivery"
                }

                logger.info(f"Background successful order placed: {new_order_id} for {chosen_sofa['name']}")

                # Carry the trace context across the HTTP hop to the factory.
                outgoing_headers = {}
                propagator.inject(outgoing_headers)
                factory_base = os.environ.get('SERVICE_FACTORY_URL', 'http://sofa-factory:8081')

                requests.post(
                    f"{factory_base}/manufacture",
                    json=payload,
                    headers=outgoing_headers
                )

    except Exception as exc:
        logger.error(f"Error generating background trace: {str(exc)}")

def trace_generator_thread():
    """Run forever, producing a background trace and then pausing.

    Each iteration generates one random trace (only when this process is the
    ``sofa-shop`` service), then sleeps for a random 10-20 seconds. If
    anything in the iteration raises, the error is logged and the loop
    resumes after a fixed 10 second pause.
    """
    while True:
        try:
            # Only the sofa-shop service originates random traces
            if service_name == "sofa-shop":
                generate_random_trace()

            # Pause a random 10-20 seconds before the next trace
            pause_seconds = random.uniform(10, 20)
            logger.info(f"Next background trace in {pause_seconds:.2f} seconds")
            time.sleep(pause_seconds)
        except Exception as exc:
            logger.error(f"Error in trace generation thread: {exc}")
            time.sleep(10)  # Back off before the next attempt

if __name__ == '__main__':
    # Script entry point: announce the service, optionally start the
    # background trace generator, then serve HTTP on all interfaces.
    logger.info(f"Starting {service_name} service on port {service_port}")

    # Only the sofa-shop service runs the generator. The thread is a daemon,
    # so it does not keep the process alive once the Flask server stops.
    if service_name == "sofa-shop":
        trace_thread = threading.Thread(
            daemon=True,
            target=trace_generator_thread,
        )
        trace_thread.start()
        logger.info("Started background trace generator")

    app.run(port=service_port, host='0.0.0.0')

================================================
FILE: trace-delivery/app/requirements.txt
================================================
flask
requests
opentelemetry-api
opentelemetry-sdk
opentelemetry-exporter-otlp
opentelemetry-instrumentation-flask
opentelemetry-instrumentation-requests

================================================
FILE: trace-delivery/config-otel.yaml
================================================
#
# OTel Collector YAML Configuration for Sofa Delivery Trace Demo
#
# This is the OTel-native equivalent of config.alloy for use with the Alloy OTel Engine.
# Run with: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
#

# Accept OTLP from the demo services on both transports.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: "0.0.0.0:4317"
      http:
        endpoint: "0.0.0.0:4318"

# Group spans into batches before they are exported.
processors:
  batch: {}

# Forward to Tempo over OTLP/gRPC without TLS.
exporters:
  otlp/tempo:
    endpoint: "tempo:4317"
    tls:
      insecure: true

# Single traces pipeline: otlp -> batch -> otlp/tempo.
service:
  pipelines:
    traces:
      receivers:
        - otlp
      processors:
        - batch
      exporters:
        - otlp/tempo


================================================
FILE: trace-delivery/config.alloy
================================================
/*
 * Alloy Configuration for Sofa Delivery Trace Demo
 */

// Receive OpenTelemetry traces.
// The empty http/grpc blocks turn on both OTLP listeners with their default
// settings. Only the traces output is wired up; no metrics or logs targets
// are configured here.
otelcol.receiver.otlp "default" {
  http {}
  grpc {}

  output {
    traces = [otelcol.processor.batch.default.input]
  }
}

// Batch processor to improve performance.
// No batching arguments are set, so the component's defaults apply; batches
// are handed to the Tempo exporter below.
otelcol.processor.batch "default" {
  output {
    traces = [otelcol.exporter.otlp.tempo.input]
  }
}


// Send traces to Tempo's OTLP gRPC endpoint.
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo:4317"
    tls {
      // Plaintext connection to Tempo (no TLS).
      insecure = true
    }
  }
}

// Turn on Alloy's live debugging feature for this instance.
livedebugging {
  enabled = true
} 

================================================
FILE: trace-delivery/docker-compose-otel.yml
================================================
# OTel Engine Override
#
# Uses Alloy's experimental OTel Engine to run a standard OTel Collector YAML config
# instead of the River/HCL config.alloy file.
#
# Usage: docker compose -f docker-compose.yml -f docker-compose-otel.yml up -d
# Stop:  docker compose -f docker-compose.yml -f docker-compose-otel.yml down
#

services:
  alloy:
    # Start Alloy's OTel Engine with the Collector-style YAML config instead
    # of the command defined in docker-compose.yml.
    command:
      - otel
      - --config=/etc/alloy/config-otel.yaml
    volumes:
      - "./config-otel.yaml:/etc/alloy/config-otel.yaml"
    ports:
      - "8888:8888"  # OTel Engine HTTP server


================================================
FILE: trace-delivery/docker-compose.coda.yml
================================================
# Application-only variant of the trace-delivery stack: it defines the five
# demo services (all built from ./app) but no Alloy, Tempo, Prometheus, or
# Grafana containers.
# NOTE(review): every service exports OTLP to http://alloy:4317, so a host
# named `alloy` must be reachable from outside this file — presumably supplied
# by the Coda environment; confirm.
#
# The SERVICE_*_URL variables chain the services together:
#   sofa-shop -> sofa-factory -> global-distribution -> local-distribution -> customer-house
# and depends_on follows the same chain, so downstream services start first.
services:
  # Sofa Shop Service
  sofa-shop:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8080:8080
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=sofa-shop
      - OTEL_RESOURCE_ATTRIBUTES=service.name=sofa-shop,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_FACTORY_URL=http://sofa-factory:8081
    depends_on:
      - sofa-factory
    restart: on-failure

  # Sofa Factory Service
  sofa-factory:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8081:8081
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=sofa-factory
      - OTEL_RESOURCE_ATTRIBUTES=service.name=sofa-factory,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_PORT=8081
      - SERVICE_DISTRIBUTION_URL=http://global-distribution:8082
    depends_on:
      - global-distribution
    restart: on-failure

  # Global Distribution Service
  global-distribution:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8082:8082
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=global-distribution
      - OTEL_RESOURCE_ATTRIBUTES=service.name=global-distribution,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_PORT=8082
      - SERVICE_LOCAL_URL=http://local-distribution:8083
    depends_on:
      - local-distribution
    restart: on-failure

  # Local Distribution Service
  local-distribution:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8083:8083
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=local-distribution
      - OTEL_RESOURCE_ATTRIBUTES=service.name=local-distribution,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_PORT=8083
      - SERVICE_CUSTOMER_URL=http://customer-house:8084
    depends_on:
      - customer-house
    restart: on-failure

  # Customer House Service (end of the chain: no downstream URL, no depends_on)
  customer-house:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8084:8084
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=customer-house
      - OTEL_RESOURCE_ATTRIBUTES=service.name=customer-house,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_PORT=8084
    restart: on-failure


================================================
FILE: trace-delivery/docker-compose.yml
================================================
version: '3.8'

services:
  # Prometheus for metrics collection.
  # Tempo's metrics generator remote-writes into this instance
  # (see remote_write in tempo-config.yaml).
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      # Accept pushed samples on /api/v1/write.
      - --web.enable-remote-write-receiver
      # Accept metrics pushed over OTLP.
      - --web.enable-otlp-receiver
      - --enable-feature=native-histograms
      # Store exemplars (Tempo sends them: send_exemplars is true in tempo-config.yaml).
      - --enable-feature=exemplar-storage
      - --config.file=/etc/prometheus/prometheus.yml
    ports:
      - 9090:9090/tcp
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml

  # Memcached backs Tempo's frontend-search cache
  # (tempo-config.yaml points at dns+memcached:11211).
  memcached:
    image: memcached:1.6.40@sha256:572b011ce33954ee809066d8cecbeb3ec98912109ee3be3663a3197425fd81ac
    container_name: memcached
    ports:
      - "11211:11211"
    # The official memcached image does not read MEMCACHED_* environment
    # variables, so the limits are passed as command-line flags instead.
    command:
      - memcached
      - --memory-limit=64  # Maximum cache memory, in megabytes
      - --threads=4        # Number of worker threads

  # Tempo for tracing.
  # Only the HTTP API port (3200) is published to the host; Alloy reaches the
  # OTLP receiver at tempo:4317 over the Compose network (see config.alloy).
  tempo:
    image: grafana/tempo:${GRAFANA_TEMPO_VERSION:-2.10.4}
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - 3200:3200/tcp    # tempo
    volumes:
      - ./tempo-config.yaml:/etc/tempo.yaml
    # Prometheus is the metrics-generator remote-write target and memcached
    # is the search cache, so both are started first.
    depends_on:
      - prometheus
      - memcached

  # Grafana for visualization.
  # Anonymous access is enabled with the Admin role, so no login is needed.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
      - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app
    ports:
      - 3000:3000/tcp
    # The entrypoint writes the datasource provisioning file and then hands
    # over to Grafana's normal startup script (/run.sh).
    #
    # The Prometheus datasource declares an explicit uid because the Tempo
    # datasource's serviceMap refers to it by UID; without it Grafana would
    # generate a random UID and the reference would not resolve.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          uid: Prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Tempo
          type: tempo
          access: proxy
          orgId: 1
          url: http://tempo:3200
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
          jsonData:
            serviceMap:
              datasourceUid: 'Prometheus'
            nodeGraph:
              enabled: true
        EOF
        /run.sh
    depends_on:
      - prometheus
      - tempo

  # Alloy for telemetry pipeline.
  # Receives OTLP from the demo services and forwards traces to Tempo
  # (pipeline defined in config.alloy).
  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    ports:
      - 12345:12345      # Alloy HTTP server
      - 4317:4317/tcp    # OTLP gRPC (used by our services)
      - 4318:4318/tcp    # OTLP HTTP (used by our services)
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    # Listen on all interfaces so the HTTP server is reachable through the published port.
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    depends_on:
      - tempo
      - prometheus

  # Sofa Shop Service: entry point of the delivery chain.
  # Forwards orders to the factory at SERVICE_FACTORY_URL and exports OTLP to Alloy.
  # NOTE(review): SERVICE_PORT is not set for this service while 8080 is
  # published — presumably the app defaults to 8080; confirm in app.py.
  sofa-shop:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8080:8080
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=sofa-shop
      - OTEL_RESOURCE_ATTRIBUTES=service.name=sofa-shop,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_FACTORY_URL=http://sofa-factory:8081
    depends_on:
      - alloy
      - sofa-factory
    restart: on-failure

  # Sofa Factory Service: second hop of the delivery chain.
  # Built from the same ./app image as the other services; the environment
  # below gives it its name, its port (8081), and the next hop's URL.
  sofa-factory:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8081:8081
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=sofa-factory
      - OTEL_RESOURCE_ATTRIBUTES=service.name=sofa-factory,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_PORT=8081
      - SERVICE_DISTRIBUTION_URL=http://global-distribution:8082
    depends_on:
      - alloy
      - global-distribution
    restart: on-failure

  # Global Distribution Service: third hop of the delivery chain.
  # Listens on 8082 and is configured with the local distribution service's
  # URL as its next hop.
  global-distribution:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8082:8082
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=global-distribution
      - OTEL_RESOURCE_ATTRIBUTES=service.name=global-distribution,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_PORT=8082
      - SERVICE_LOCAL_URL=http://local-distribution:8083
    depends_on:
      - alloy
      - local-distribution
    restart: on-failure

  # Local Distribution Service: fourth hop of the delivery chain.
  # Listens on 8083 and is configured with the customer house service's URL
  # as its next hop.
  local-distribution:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8083:8083
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=local-distribution
      - OTEL_RESOURCE_ATTRIBUTES=service.name=local-distribution,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_PORT=8083
      - SERVICE_CUSTOMER_URL=http://customer-house:8084
    depends_on:
      - alloy
      - customer-house
    restart: on-failure

  # Customer House Service: final hop of the delivery chain.
  # Listens on 8084; no downstream service URL is configured for it.
  customer-house:
    build:
      context: ./app
      dockerfile: Dockerfile
      args:
        - PYTHON_VERSION=${PYTHON_VERSION:-3.11-slim}
    ports:
      - 8084:8084
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317
      - OTEL_SERVICE_NAME=customer-house
      - OTEL_RESOURCE_ATTRIBUTES=service.name=customer-house,service.version=1.0.0,deployment.environment=delivery-demo
      - SERVICE_PORT=8084
    depends_on:
      - alloy
    restart: on-failure 

================================================
FILE: trace-delivery/prom-config.yaml
================================================
# Prometheus configuration for the trace-delivery demo.
# No scrape_configs are defined: this instance only receives pushed data
# (remote write and OTLP are enabled by flags in docker-compose.yml).
global:
  scrape_interval: 15s
  evaluation_interval: 15s

otlp:
  # Recommended attributes to be promoted to labels.
  promote_resource_attributes:
    - service.name
    - service.namespace
    - service.version
    - deployment.environment

storage:
  tsdb:
    # Accept samples that arrive out of order, up to 30 minutes behind.
    out_of_order_time_window: 30m 

================================================
FILE: trace-delivery/tempo-config.yaml
================================================
# Tempo configuration for the trace-delivery demo: local block storage,
# memcached-backed search cache, and a metrics generator that remote-writes
# to Prometheus.
stream_over_http_enabled: true
server:
  http_listen_port: 3200               # published as 3200 in docker-compose.yml; Grafana queries it here
  log_level: info


# Cache used by the query frontend for search; served by the memcached container.
cache:
  background:
    writeback_goroutines: 5
  caches:
  - roles:
    - frontend-search  
    memcached: 
      addresses: dns+memcached:11211

# SLO thresholds for the query frontend (1.073741824e+09 is 2^30).
query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
    metadata_slo:
        duration_slo: 5s
        throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 100ms
  metrics:
    max_duration: 200h                # maximum duration of a metrics query, increase for local setups
    query_backend_after: 5m
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09

# This configuration will listen on all ports and protocols that Tempo is
# capable of. The receivers all come from the OpenTelemetry Collector; more
# configuration information can be found there:
# https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
#
# For a production deployment you should only enable the receivers you need!
# In this scenario Alloy sends to the OTLP gRPC receiver (tempo:4317).
distributor:
  receivers:
    jaeger:
      protocols:
        thrift_http:
          endpoint: "tempo:14268"
        grpc:
          endpoint: "tempo:14250"
        thrift_binary:
          endpoint: "tempo:6832"
        thrift_compact:
          endpoint: "tempo:6831"
    zipkin:
      endpoint: "tempo:9411"
    otlp:
      protocols:
        grpc:
          endpoint: "tempo:4317"
        http:
          endpoint: "tempo:4318"
    opencensus:
      endpoint: "tempo:55678"

ingester:
  max_block_duration: 5m               # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally

compactor:
  compaction:
    block_retention: 720h                # overall Tempo trace retention. set for demo purposes

# Metrics derived from traces are pushed to Prometheus via remote write,
# with exemplars so metrics can link back to traces.
metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true

storage:
  trace:
    backend: local                     # backend configuration to use
    wal:
      path: /var/tempo/wal             # where to store the wal locally
    local:
      path: /var/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator
      generate_native_histograms: both
      

================================================
FILE: vault-secrets/README.md
================================================
# Vault secrets with Grafana Alloy

Demonstrates Alloy's [`remote.vault`](https://grafana.com/docs/alloy/latest/reference/components/remote/remote.vault/) component pulling `prometheus.remote_write` basic_auth credentials from HashiCorp Vault at runtime, and shows that rotating the Vault secret is picked up without restarting Alloy.

## Overview

| Service       | Role                                                                |
| ------------- | ------------------------------------------------------------------- |
| `vault`       | HashiCorp Vault in dev mode. Boots, then seeds `secret/alloy/remote-write` from its entrypoint; the healthcheck only passes once the seeded secret is readable. |
| `nginx-auth`  | Basic-auth reverse proxy in front of Prometheus's remote-write API. |
| `prometheus`  | Receives remote-writes from Alloy.                                  |
| `grafana`     | Pre-provisioned with Prometheus as the default datasource.          |
| `alloy`       | Scrapes its own `/metrics` and remote-writes via `nginx-auth`, with `basic_auth` credentials sourced from Vault. |

```
                                              ┌─────────────┐
                                  reread 30s  │             │
                ┌──── remote.vault ◀──────────│    Vault    │
                │   (auth.token)              │             │
                ▼                             └─────────────┘
            ┌────────┐                              ▲
            │ Alloy  │ scrape self → remote_write   │ vault kv put
            └────────┘    (basic_auth from Vault)   │ via rotate.sh
                │                                   │
                ▼                                   │
        ┌─────────────────┐   updated htpasswd     │
        │ nginx-auth      │◀────────────────────────┘
        │ (basic_auth)    │       via rotate.sh
        └─────────────────┘
                │
                ▼
          ┌────────────┐
          │ Prometheus │
          └────────────┘
                ▲
                │
          ┌────────────┐
          │  Grafana   │
          └────────────┘
```

## Running

```bash
docker compose up -d
# or, from the repo root:
./run-example.sh vault-secrets
```

| Service     | URL                                            |
| ----------- | ---------------------------------------------- |
| Grafana     | <http://localhost:3000>                        |
| Alloy UI    | <http://localhost:12345>                       |
| Prometheus  | <http://localhost:9090>                        |
| Vault       | <http://localhost:8200> (token: `root-token-for-demo`) |
| nginx-auth  | <http://localhost:8080> (basic-auth required)  |

## What to expect on a fresh boot

1. Watch nginx accept Alloy's writes:

   ```bash
   docker compose logs --tail=20 nginx-auth
   ```

   You should see `200` responses with `user=alloy`.

2. Confirm the seeded secret in Vault:

   ```bash
   docker exec -e VAULT_ADDR=http://127.0.0.1:8200 \
     -e VAULT_TOKEN=root-token-for-demo \
     vault-secrets-vault vault kv get secret/alloy/remote-write
   ```

3. Inspect the Alloy pipeline at <http://localhost:12345> — `prometheus.remote_write.via_nginx` should be healthy with no last-error.

4. Verify metrics flowed to Prometheus:

   ```bash
   curl -s 'http://localhost:9090/api/v1/query?query=up' | jq '.data.result'
   ```

## Demonstrating credential rotation

The interesting moment is the `401 → 200` transition: rotating nginx's htpasswd makes Alloy fail auth immediately, then Alloy recovers automatically once the Vault secret is updated and `remote.vault` re-reads (≤ 30 s).

```bash
# Step 1 — rotate htpasswd, reload nginx. Alloy starts 401-ing.
./rotate.sh htpasswd hunter2

# Watch nginx logs for 401s with user=-
docker compose logs -f nginx-auth

# Step 2 — update Vault to the new value. Alloy catches up within
# reread_frequency (30s) and goes back to 200 with user=alloy.
./rotate.sh vault hunter2

# Or do both in one go with a built-in 5s gap to make the 401 window
# observable:
./rotate.sh both rotated-password
```

You can also rotate Vault directly without the helper:

```bash
docker exec -e VAULT_ADDR=http://127.0.0.1:8200 \
  -e VAULT_TOKEN=root-token-for-demo \
  vault-secrets-vault \
  vault kv put secret/alloy/remote-write username=alloy password=hunter2
```

## Inspecting Vault

```bash
# Read the current secret
docker exec -e VAULT_ADDR=http://127.0.0.1:8200 \
  -e VAULT_TOKEN=root-token-for-demo \
  vault-secrets-vault vault kv get secret/alloy/remote-write

# Open the UI
open http://localhost:8200
# Token: root-token-for-demo
```

## Notes and caveats

- **Root token is hardcoded.** `root-token-for-demo` is fine for a demo, never for production. The real-world swap-in is `auth.approle` (with a wrapped role-id/secret-id) or `auth.kubernetes` — same component, different `auth.*` block.
- **`convert.nonsensitive` on `basic_auth.username`.** `remote.vault.creds.data.username` is a `Secret`; `basic_auth.username` expects a plain `string`, so it has to be unwrapped. `basic_auth.password` accepts `Secret` directly, so it doesn't need the conversion. Forgetting `convert.nonsensitive` on the username is the single most common mistake — the error is "expected string, got secret" at config load.
- **nginx is the source of truth for the credential.** If you update Vault but forget to update the htpasswd file, Alloy will 401 forever — that's the deliberate demo property, not a bug.
- **Vault dev-mode is in-memory.** A `docker compose down` followed by `up` resets the secret to `initial-password`.
- **Production caveat for the basic-auth path itself:** `Authorization: Basic …` is base64-encoded, not encrypted. In production this hop must be TLS — out of scope for this demo.

## Stopping

```bash
docker compose down --remove-orphans
```


================================================
FILE: vault-secrets/auth/htpasswd
================================================
alloy:$2y$05$yXToETJn9D.sOxFM3036b.l2/FkJU1iN2CIuWYAqIIgT7xSMDvJtO


================================================
FILE: vault-secrets/config.alloy
================================================
// vault-secrets scenario
//
// remote.vault pulls remote_write basic_auth credentials from HashiCorp
// Vault at runtime. reread_frequency makes Alloy pick up rotated values
// without a restart — see README for the rotation demo.

// Turn on Alloy's live debugging feature for this instance.
livedebugging {
	enabled = true
}

// Reads the remote_write credentials from Vault and re-reads them every 30s,
// so a rotated secret reaches downstream components without a restart.
remote.vault "creds" {
	server = "http://vault:8200"
	// path = the KV mount; key = the secret path within that mount.
	// Alloy handles the KV v2 /data/ prefix internally.
	path = "secret"
	key  = "alloy/remote-write"

	reread_frequency = "30s"

	// Demo-only: hardcoded root token matching the Vault dev server in
	// docker-compose.yml. Never do this outside a demo.
	auth.token {
		token = "root-token-for-demo"
	}
}

// Expose Alloy's own metrics as a scrape target.
prometheus.exporter.self "self" {}

// Scrape Alloy's own metrics every 10s and hand them to remote_write.
prometheus.scrape "self" {
	targets         = prometheus.exporter.self.self.targets
	forward_to      = [prometheus.remote_write.via_nginx.receiver]
	scrape_interval = "10s"
}

// Push to Prometheus through the basic-auth nginx proxy.
prometheus.remote_write "via_nginx" {
	endpoint {
		url = "http://nginx-auth/api/v1/write"

		basic_auth {
			// Vault values are secrets; username expects a plain string,
			// so it is unwrapped. password accepts the secret as-is.
			username = convert.nonsensitive(remote.vault.creds.data.username)
			password = remote.vault.creds.data.password
		}
	}
}


================================================
FILE: vault-secrets/docker-compose.yml
================================================
# vault-secrets scenario: Alloy reads remote_write credentials from Vault and
# pushes its own metrics to Prometheus through a basic-auth nginx proxy.
services:
  vault:
    image: hashicorp/vault:${VAULT_VERSION:-2.0.0}
    container_name: vault-secrets-vault
    ports:
      - "8200:8200"
    # Used by the vault CLI calls in the entrypoint and the healthcheck below.
    environment:
      VAULT_ADDR: http://127.0.0.1:8200
      VAULT_TOKEN: root-token-for-demo
    # Start dev-mode in the background, wait for readiness, then seed
    # secret/alloy/remote-write. The final `wait` keeps the entrypoint shell
    # (and therefore the container) alive until the Vault server exits.
    # `$$` is Compose escaping for a literal `$` in the shell script.
    entrypoint:
      - sh
      - -euc
      - |
        vault server -dev \
          -dev-listen-address=0.0.0.0:8200 \
          -dev-root-token-id=root-token-for-demo &
        VAULT_PID=$$!
        until vault status >/dev/null 2>&1; do sleep 1; done
        vault kv put secret/alloy/remote-write \
          username=alloy \
          password=initial-password
        echo "seeded secret/alloy/remote-write"
        wait $$VAULT_PID
    healthcheck:
      # Pass only once the secret has been seeded — otherwise Alloy may
      # start before the KV write lands and fail its first reread.
      test: ["CMD", "sh", "-c", "vault kv get secret/alloy/remote-write >/dev/null 2>&1"]
      interval: 5s
      timeout: 3s
      retries: 20

  # Basic-auth reverse proxy in front of Prometheus's remote-write endpoint.
  # The htpasswd file is bind-mounted from ./auth so rotate.sh can rewrite it
  # on the host and then reload nginx.
  nginx-auth:
    image: nginx:${NGINX_VERSION:-1.30-alpine}
    container_name: vault-secrets-nginx-auth
    ports:
      - "8080:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
      - ./auth/htpasswd:/etc/nginx/htpasswd:ro
    depends_on:
      - prometheus

  # Receives Alloy's remote writes (forwarded by nginx-auth).
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    container_name: vault-secrets-prometheus
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  # Grafana with anonymous Admin access; the entrypoint writes the Prometheus
  # datasource provisioning file and then runs the stock /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    container_name: vault-secrets-grafana
    environment:
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_BASIC_ENABLED=false
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh

  alloy:
    image: grafana/alloy:${GRAFANA_ALLOY_VERSION:-v1.16.1}
    container_name: vault-secrets-alloy
    ports:
      - "12345:12345"
    volumes:
      - ./config.alloy:/etc/alloy/config.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
    # Wait for Vault to be healthy (i.e. the secret is seeded) before starting;
    # the other two only need to have been started.
    depends_on:
      vault:
        condition: service_healthy
      nginx-auth:
        condition: service_started
      prometheus:
        condition: service_started


================================================
FILE: vault-secrets/nginx.conf
================================================
# Basic-auth reverse proxy in front of Prometheus's remote-write endpoint.
worker_processes 1;
events { worker_connections 1024; }

http {
    # Access log format that records the authenticated user and the response
    # status, so the 401 -> 200 transition in the rotation demo is visible.
    log_format auth '$remote_addr user=$remote_user [$time_local] '
                    '"$request" $status $body_bytes_sent';
    # Log to the container's stdout/stderr so `docker compose logs` shows it.
    access_log /dev/stdout auth;
    error_log  /dev/stderr warn;

    upstream prom {
        server prometheus:9090;
    }

    server {
        listen 80;
        server_name _;

        # Remote-write endpoint: requires credentials from the mounted
        # htpasswd file, then proxies to Prometheus.
        location /api/v1/write {
            auth_basic           "alloy-remote-write";
            auth_basic_user_file /etc/nginx/htpasswd;

            proxy_pass         http://prom/api/v1/write;
            proxy_http_version 1.1;
            proxy_set_header   Host $host;
            proxy_set_header   X-Forwarded-For $remote_addr;
        }

        # Unauthenticated liveness endpoint; kept out of the access log.
        location = /healthz {
            access_log off;
            return 200 "ok\n";
        }
    }
}


================================================
FILE: vault-secrets/prom-config.yaml
================================================
global:
  scrape_interval: 15s
  evaluation_interval: 15s


================================================
FILE: vault-secrets/rotate.sh
================================================
#!/usr/bin/env bash
# Demo helper for the vault-secrets scenario.
#
# Usage:
#   ./rotate.sh htpasswd <new-password>   # update nginx htpasswd + reload
#   ./rotate.sh vault    <new-password>   # update the Vault secret
#   ./rotate.sh both     <new-password>   # do both, with a 5s gap so the
#                                         # 401 window is visible

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

# Positional arguments default to empty so `set -u` does not abort before the
# usage check below can print a helpful message.
cmd=${1:-}
pw=${2:-}

# Both the sub-command and the new password are required.
if [[ -z "$cmd" || -z "$pw" ]]; then
  echo "usage: rotate.sh htpasswd|vault|both <new-password>" >&2
  exit 2
fi

# Run from the script's own directory so the relative auth/htpasswd path resolves.
cd "$(dirname "$0")"

# Regenerate auth/htpasswd with a bcrypt hash of $pw for user "alloy", then
# reload nginx so it picks up the new credentials.
#
# The new entry is captured into a variable before the file is touched: a
# direct `docker run ... > auth/htpasswd` truncates the file before docker
# runs, so a failed run would leave nginx with an empty credentials file.
rotate_htpasswd() {
  local entry

  echo ">> generating new bcrypt entry for alloy"
  # -n: print to stdout, -b: take the password from the command line,
  # -B: use bcrypt, -C 5: bcrypt cost.
  # Declared separately from the assignment so `set -e` sees a docker failure.
  entry=$(docker run --rm httpd:2.4-alpine htpasswd -nbB -C 5 alloy "$pw")
  if [[ -z "$entry" ]]; then
    echo "htpasswd produced no output; leaving auth/htpasswd untouched" >&2
    return 1
  fi

  # Overwrite in place rather than renaming a temp file over it: the file is
  # bind-mounted into the nginx container, and replacing it with a new file
  # would leave the container looking at the old one.
  printf '%s\n' "$entry" > auth/htpasswd

  echo ">> reloading nginx"
  docker exec vault-secrets-nginx-auth nginx -s reload
}

# Write username=alloy and the new password ($pw) to the Vault secret that
# Alloy reads, using the vault CLI inside the running Vault container.
rotate_vault() {
  # Address and token for the vault CLI inside the container.
  local -a cli_env=(
    -e VAULT_ADDR=http://127.0.0.1:8200
    -e VAULT_TOKEN=root-token-for-demo
  )
  local secret_path=secret/alloy/remote-write

  echo ">> writing new credentials to Vault"
  docker exec "${cli_env[@]}" vault-secrets-vault \
    vault kv put "$secret_path" username=alloy "password=$pw"
}

# Dispatch on the requested sub-command.
case "$cmd" in
  htpasswd) rotate_htpasswd ;;
  vault)    rotate_vault ;;
  both)
    # Rotate nginx first and Vault second, pausing in between so the window
    # where Alloy's old credentials are rejected shows up in the nginx logs.
    rotate_htpasswd
    echo ">> nginx flipped; Alloy will 401 until Vault catches up. Sleeping 5s..."
    sleep 5
    rotate_vault
    ;;
  *)
    # Same exit code as the usage error above.
    echo "unknown command: $cmd" >&2
    exit 2
    ;;
esac


================================================
FILE: windows/README.md
================================================
# Monitoring Windows with Alloy

Grafana Alloy can be used to monitor Windows servers and desktops. In this guide we will show you how to install Grafana Alloy on a Windows machine and how to configure it to monitor the following system attributes:
* Windows Performance Metrics
* Windows Event Logs

## Prerequisites

* Git - You will need Git to clone the repository.
* Docker - In this tutorial we assume you are using Docker desktop for Windows. This is where we host Grafana, Loki and Prometheus. Note that you can also install native Windows versions of Grafana, Loki and Prometheus if you prefer or host them on a Linux server.
* Windows Server or Desktop - We will be monitoring a Windows machine, so you will need a Windows server or desktop to monitor.
* Admin access to the Windows machine - You will need admin access to the Windows machine to install Grafana Alloy and configure it to collect metrics and logs.

## Step 1: Clone the Repository

Clone the repository to your Windows machine.

```bash
git clone https://github.com/grafana/alloy-scenarios.git
```

## Step 2: Deploy Grafana, Loki and Prometheus

First, you need to deploy Grafana, Loki and Prometheus on your Windows machine. Within this tutorial, we have included a docker-compose file that will deploy Grafana, Loki and Prometheus on your Windows machine.

```bash
cd alloy-scenarios/windows
docker-compose up -d
```

You can check the status of the containers by running the following command:

```bash
docker ps
```
Grafana should be running on [http://localhost:3000](http://localhost:3000).

## Step 3: Install Grafana Alloy

Follow the instructions in the [Grafana Alloy documentation](https://grafana.com/docs/alloy/latest/set-up/install/windows/) to install Grafana Alloy on your Windows machine.

Recommended steps:
* Install Grafana Alloy as a Windows service.
* Use Windows Installer to install Grafana Alloy.

Make sure to also check out the [Grafana Alloy configuration](https://grafana.com/docs/alloy/latest/set-up/configuration/) documentation.

Personal recommendation: If you would like to see the Alloy UI from a remote machine you need to change the run arguments of the Grafana Alloy service. To do this:

1. Open Registry Editor.
2. Navigate to `HKEY_LOCAL_MACHINE\SOFTWARE\GrafanaLabs\Alloy`.
3. Double click on `Arguments`
4. Change the contents to the following:
```
run
C:\Program Files\GrafanaLabs\Alloy\config.alloy
--storage.path=C:\ProgramData\GrafanaLabs\Alloy\data
--server.http.listen-addr=0.0.0.0:12345
```
5. Restart the Grafana Alloy service. (Search for `Services` in the start menu, find `Grafana Alloy`, right click and restart)

You should now be able to access the Alloy UI from a remote machine by going to `http://<windows-machine-ip>:12345`.

## Step 4: Configure Grafana Alloy to Monitor Windows

Now that you have Grafana Alloy installed, you need to configure it to monitor your Windows machine. Grafana Alloy will currently be running a default configuration file. This needs to be replaced with the `config.alloy` file that is included in the `alloy-scenarios/windows` directory. To do this: 
1. Stop the Grafana Alloy service.
2. Replace the `config.alloy` file in `C:\Program Files\GrafanaLabs\Alloy` with the `config.alloy` file from the `alloy-scenarios/windows` directory.
3. Start the Grafana Alloy service.
4. Open your browser and go to `http://localhost:12345` to access the Alloy UI.

## Step 5: Viewing the Windows Performance Metrics and Event Logs

You will now be able to view the Windows Performance Metrics and Event Logs in Grafana:

* Open your browser and go to [http://localhost:3000/explore/metrics](http://localhost:3000/explore/metrics). This will take you to the metrics explorer in Grafana.

* Open your browser and go to [http://localhost:3000/a/grafana-lokiexplore-app](http://localhost:3000/a/grafana-lokiexplore-app). This will take you to the Loki explorer in Grafana.


================================================
FILE: windows/config.alloy
================================================
// ####################################
// Windows Server Metrics Configuration
// ####################################

// Windows exporter limited to the collectors listed here (one per line so
// additions and removals diff cleanly).
prometheus.exporter.windows "default" {
	enabled_collectors = [
		"cpu",
		"cs",
		"logical_disk",
		"net",
		"os",
		"service",
		"system",
		"memory",
		"scheduled_task",
		"tcp",
	]
}

// Scrape the targets exported by prometheus.exporter.windows.default and
// forward the samples to prometheus.remote_write.demo.
prometheus.scrape "example" {
	forward_to = [prometheus.remote_write.demo.receiver]
	targets    = prometheus.exporter.windows.default.targets
}

// Send the scraped samples to the remote-write URL on localhost:9090.
prometheus.remote_write "demo" {
	endpoint {
		url = "http://localhost:9090/api/v1/write"
	}
}

// ####################################
// Windows Server Logs Configuration
// ####################################

// Read the "Application" event log, keep each event's own timestamp, and
// hand the entries to the shared loki.process.endpoint pipeline.
loki.source.windowsevent "application" {
	forward_to             = [loki.process.endpoint.receiver]
	use_incoming_timestamp = true
	eventlog_name          = "Application"
}

// Read the "System" event log, keep each event's own timestamp, and hand
// the entries to the shared loki.process.endpoint pipeline.
loki.source.windowsevent "System" {
	forward_to             = [loki.process.endpoint.receiver]
	use_incoming_timestamp = true
	eventlog_name          = "System"
}

// Shared pipeline for both event-log sources. Stages run top to bottom;
// the processed entries are forwarded to loki.write.endpoint.
loki.process "endpoint" {
	// Pull the listed top-level keys out of the JSON entry. An empty
	// expression means "look up the key of the same name".
	stage.json {
		expressions = {
			message       = "",
			Overwritten   = "",
			source        = "",
			computer      = "",
			eventRecordID = "",
			channel       = "",
			component_id  = "",
			execution     = "",
		}
	}

	// Second JSON pass over the extracted "execution" value to reach its
	// nested fields (e.g. processId, processName).
	stage.json {
		source      = "execution"
		expressions = {
			processId   = "",
			processName = "",
		}
	}

	// Attach these extracted values as structured metadata; the two
	// execution_* entries are renamed from processId / processName.
	stage.structured_metadata {
		values = {
			"eventRecordID"         = "",
			"channel"               = "",
			"component_id"          = "",
			"execution_processId"   = "processId",
			"execution_processName" = "processName",
		}
	}

	// Parse the extracted "message" value with the event-log message stage,
	// allowing it to overwrite values that were extracted earlier.
	stage.eventlogmessage {
		source             = "message"
		overwrite_existing = true
	}

	// Use the extracted "source" value as the service_name label.
	stage.labels {
		values = {
			"service_name" = "source",
		}
	}

	// Make the extracted "message" value the log line that is shipped.
	stage.output {
		source = "message"
	}

	forward_to = [loki.write.endpoint.receiver]
}


// Push processed log entries to the Loki push API on localhost:3100.
loki.write "endpoint" {
	endpoint {
		url = "http://localhost:3100/loki/api/v1/push"
	}
}

// Turn on live debugging in the Alloy UI. Declaring the block is not
// enough on its own: `enabled` defaults to false, so it is set explicitly.
livedebugging {
	enabled = true
}

================================================
FILE: windows/docker-compose.yml
================================================
# No top-level `version` key: the Compose Specification treats it as obsolete.
services:

  # Loki, started with the config file mounted from this directory and
  # published on host port 3100.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command:
      - -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    ports:
      - "3100:3100/tcp"


  # Prometheus, started with the remote-write receiver flag and the config
  # file mounted from this directory; published on host port 9090.
  prometheus:
    image: prom/prometheus:${PROMETHEUS_VERSION:-v3.11.3}
    command:
      - --web.enable-remote-write-receiver
      - --config.file=/etc/prometheus/prometheus.yml
    volumes:
      - ./prom-config.yaml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090/tcp"


  # Grafana with anonymous Admin access. The entrypoint script writes the
  # datasource provisioning file (Loki + Prometheus, Prometheus as default)
  # and then runs the image's /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    environment:
      GF_AUTH_ANONYMOUS_ORG_ROLE: Admin
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_BASIC_ENABLED: "false"
    ports:
      - "3000:3000/tcp"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: false
          version: 1
          editable: false
        - name: Prometheus
          type: prometheus
          orgId: 1
          url: http://prometheus:9090
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: windows/loki-config.yaml
================================================

# Complete Loki configuration that uses the local filesystem as its backend.
# The index is shipped to storage via the tsdb shipper.

# Authentication is switched off.
auth_enabled: false

server:
  http_listen_port: 3100

common:
  path_prefix: /tmp/loki
  replication_factor: 1
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory

# One schema period: TSDB index on the filesystem, schema v13, 24h index tables.
schema_config:
  configs:
    - from: 2020-05-15
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

# Everything lives under /tmp/loki, the same prefix as common.path_prefix.
storage_config:
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache
  filesystem:
    directory: /tmp/loki/chunks

# Structured metadata and volume are both enabled.
limits_config:
  allow_structured_metadata: true
  volume_enabled: true

pattern_ingester:
  enabled: true

# Note: max_chunk_age is set far lower than the default expected value.
# This scenario is used within the LogCLI demo, which needs a short flush time,
# e.g. to show how
# logcli stats --since 24h '{service_name="Delivery World", package_size="Large"}'
# works.
ingester:
  max_chunk_age: 5m # Should be 2 hours

================================================
FILE: windows/prom-config.yaml
================================================
# Global Prometheus settings. This file defines no scrape jobs; the compose
# file starts Prometheus with --web.enable-remote-write-receiver.
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).


================================================
FILE: windows-events/README.md
================================================
# Windows Security Event Logs with Grafana Alloy

A focused logs-only scenario for shipping the **Windows Security event channel** to Loki, with filtering and field-extraction tuned for SOC-style queries (logon attempts, privilege escalation, account changes).

## How this differs from the [`windows/`](../windows/) scenario

| Aspect | `windows/` (broad) | `windows-events/` (this) |
|---|---|---|
| Channels | Application + System + Performance metrics | **Security** only |
| Processing | Pass-through with basic JSON parsing | **Drops noise event IDs** + extracts security-specific fields as labels |
| Backend | Loki + Prometheus + Grafana | **Loki + Grafana** (no metrics) |
| Demo intent | "ship Windows logs to Loki" | "make Security events queryable for SOC use cases" |

If you want general-purpose Windows monitoring, use `windows/`. If you specifically care about Security audit events, use this one.

## Prerequisites

- A Windows host (Server or Desktop) with admin access — `loki.source.windowsevent` reads from the Windows Event Log API and only runs on Windows.
- Docker Desktop for Windows (or any Linux machine you can reach over the network) for the Loki/Grafana backend.
- Git, to clone the repo.

## Step 1 — Backend (Loki + Grafana)

On the machine that will host the backend (the Windows host itself, or any Linux machine):

```bash
git clone https://github.com/grafana/alloy-scenarios.git
cd alloy-scenarios/windows-events
docker compose up -d
```

Grafana is on `http://<backend-host>:3000` with the Loki datasource already provisioned.

## Step 2 — Install Alloy on the Windows host

Follow the [Windows install guide](https://grafana.com/docs/alloy/latest/set-up/install/windows/). Recommended: Windows Installer + Windows Service.

If your backend is on a different machine than the Windows host, edit the `loki.write` URL in this directory's `config.alloy` from `http://localhost:3100` to `http://<backend-host>:3100` before you copy the file into place in Step 3.

## Step 3 — Replace the Alloy config

1. Stop the `Grafana Alloy` Windows service.
2. Replace `C:\Program Files\GrafanaLabs\Alloy\config.alloy` with the [`config.alloy`](./config.alloy) from this directory.
3. Start the service.
4. Open `http://localhost:12345` to confirm the components load without error.

## Step 4 — Generate Security events

To see traffic, trigger some auditable actions on the Windows host:

- **Failed logon (4625)**: try to log in with a wrong password from a remote machine, or run `runas /user:fakeuser cmd` and enter a wrong password.
- **Successful logon (4624)**: log out and back in, or open a new RDP session.
- **User created (4720)**: `net user testuser P@ssw0rd /add` from an admin shell.
- **Privilege use (4672)**: any action requiring Administrator elevation.

Some of these only generate events if the corresponding **audit policy** is enabled. Check `auditpol /get /category:*` on the Windows host; enable additional audit policies via `auditpol /set /subcategory:"<name>" /success:enable /failure:enable` if needed.

## Step 5 — Query in Grafana

```logql
# All Security events
{eventlog_name="Security"}

# Failed logons
{eventlog_name="Security", event_id="4625"}

# Successful logons by a specific user
{eventlog_name="Security", event_id="4624", target_user_name="alice"}

# All events affecting a specific user account
{eventlog_name="Security", target_user_name="alice"}

# Recent privileged-operation events
{eventlog_name="Security", event_id=~"4672|4673"}
```

The promoted labels are `event_id`, `subject_user_name`, `target_user_name`, and `logon_type`. Other event fields (computer, eventRecordID, channel) are kept as **structured metadata** — queryable with a label filter after the stream selector (for example `{eventlog_name="Security"} | computer="WIN-01"`), with no `| json` parser needed and without inflating the label index.

## What's filtered out

The pipeline drops these event IDs at the Alloy side:

| Event ID | Description | Why dropped |
|---|---|---|
| 4658 | Handle to an object was closed | Pairs with 4656/4663; on its own rarely actionable |
| 4690 | Attempt to duplicate a handle to an object | Audit noise |
| 4674 | Operation attempted on a privileged object | Fires for routine privileged ops |
| 5379 | Credential Manager credentials were read | Frequent false-positive in normal use |

If you want one of these back, edit `stage.match` in `config.alloy` to remove the corresponding ID from the `event_id=~"…"` regex.

## Stopping

```bash
docker compose down -v
```

Stop the Alloy Windows service separately if you no longer want it running.


================================================
FILE: windows-events/config.alloy
================================================
// ###################################################################
// Windows Security Event Log → Loki, with filtering and field labels
// ###################################################################
//
// Differs from the broader `windows/` scenario in three ways:
//   1. Security channel only (Application + System are covered there)
//   2. Drops high-volume audit-noise event IDs that bury real signal
//   3. Promotes security-specific fields (subject_user_name,
//      target_user_name, logon_type) to labels for SOC-style queries
//
// Run target: a Windows host with Alloy installed natively. The
// docker-compose.yml in this directory only runs Loki + Grafana;
// Alloy itself is a Windows service.

// Turn on live debugging in the Alloy UI. Declaring the block is not
// enough on its own: `enabled` defaults to false, so it is set explicitly.
livedebugging {
	enabled = true
}

// Ingest the Security channel. `use_incoming_timestamp = true` keeps
// the original event time rather than the time Alloy received it,
// which matters when replaying historical logs after an Alloy restart.
//
// The static `eventlog_name` label gives every entry a stable stream
// label to select on, e.g. `{eventlog_name="Security"}`.
loki.source.windowsevent "security" {
	eventlog_name          = "Security"
	use_incoming_timestamp = true
	labels                 = {eventlog_name = "Security"}
	forward_to             = [loki.process.security.receiver]
}

// Security-event pipeline: extract fields from each entry, promote the
// query-relevant ones to labels, drop noisy event IDs, and forward the
// rest to Loki. Stage order matters — see the notes on steps 4 and 5.
loki.process "security" {
	// Step 1: parse the windowsevent JSON wrapper. `event_id` has to be
	// extracted here; without it the label promotion and the drop
	// filter further down have no value to act on. `event_data` is
	// consumed by step 3.
	stage.json {
		expressions = {
			message       = "",
			event_id      = "",
			event_data    = "",
			eventRecordID = "",
			channel       = "",
			computer      = "",
		}
	}

	// Step 2: parse the rendered event message into extracted
	// key/value pairs. The exact keys depend on the event type.
	stage.eventlogmessage {
		source             = "message"
		overwrite_existing = true
	}

	// Step 3: extract the security-specific fields that become labels.
	// NOTE(review): assumes `event_data` carries the event's
	// `<Data Name='…'>value</Data>` elements — confirm against a real
	// entry. A pattern that finds no match leaves the entry unchanged,
	// so events without a given field simply do not get that label.
	stage.regex {
		source     = "event_data"
		expression = `<Data Name=['"]SubjectUserName['"]>(?P<subject_user_name>[^<]+)</Data>`
	}

	stage.regex {
		source     = "event_data"
		expression = `<Data Name=['"]TargetUserName['"]>(?P<target_user_name>[^<]+)</Data>`
	}

	stage.regex {
		source     = "event_data"
		expression = `<Data Name=['"]LogonType['"]>(?P<logon_type>[^<]+)</Data>`
	}

	// Step 4: promote useful fields to labels. Indexed labels make
	// "show me all failed logons by username" queries cheap. This must
	// come before step 5, which can only see labels.
	stage.labels {
		values = {
			event_id          = "",
			subject_user_name = "",
			target_user_name  = "",
			logon_type        = "",
		}
	}

	// Step 5: drop high-noise event IDs that are rarely useful in a
	// SOC dashboard but consume most of the Security log volume:
	//   4658 — handle to an object closed
	//   4690 — attempt to duplicate a handle to an object
	//   4674 — operation attempted on a privileged object
	//   5379 — Credential Manager credentials read
	// `stage.match` selects on labels, so it only works once `event_id`
	// has been promoted by step 4.
	stage.match {
		selector = `{event_id=~"4658|4690|4674|5379"}`
		action   = "drop"
	}

	// Step 6: keep heavyweight fields out of labels but searchable
	// via structured metadata.
	stage.structured_metadata {
		values = {
			eventRecordID = "",
			channel       = "",
			computer      = "",
		}
	}

	forward_to = [loki.write.endpoint.receiver]
}

// Push processed Security events to the Loki push API. The URL points at
// localhost:3100; change it when the backend runs on another machine.
loki.write "endpoint" {
	endpoint {
		url = "http://localhost:3100/loki/api/v1/push"
	}
}


================================================
FILE: windows-events/docker-compose.yml
================================================
services:

  # Loki, started with the config file mounted from this directory and
  # published on host port 3100.
  loki:
    image: grafana/loki:${GRAFANA_LOKI_VERSION:-3.6.10}
    command:
      - -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    ports:
      - "3100:3100/tcp"

  # Grafana with anonymous Admin access. The entrypoint script writes the
  # datasource provisioning file (Loki as the default datasource) and then
  # runs the image's /run.sh.
  grafana:
    image: grafana/grafana:${GRAFANA_VERSION:-13.0.1}
    ports:
      - "3000:3000/tcp"
    environment:
      GF_AUTH_ANONYMOUS_ORG_ROLE: Admin
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_BASIC_ENABLED: "false"
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          orgId: 1
          url: http://loki:3100
          basicAuth: false
          isDefault: true
          version: 1
          editable: false
        EOF
        /run.sh


================================================
FILE: windows-events/loki-config.yaml
================================================
# Loki configuration for the windows-events scenario; all state is kept on the
# local filesystem under /tmp/loki.

# Authentication is switched off.
auth_enabled: false

server:
  http_listen_port: 3100

common:
  path_prefix: /tmp/loki
  replication_factor: 1
  ring:
    instance_addr: 0.0.0.0
    kvstore:
      store: inmemory

# One schema period: TSDB index on the filesystem, schema v13, 24h index tables.
schema_config:
  configs:
    - from: 2020-05-15
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

storage_config:
  filesystem:
    directory: /tmp/loki/chunks
  tsdb_shipper:
    active_index_directory: /tmp/loki/index
    cache_location: /tmp/loki/index_cache

# Structured metadata and volume are both enabled.
limits_config:
  allow_structured_metadata: true
  volume_enabled: true

ingester:
  max_chunk_age: 5m

pattern_ingester:
  enabled: true