Repository: aws-deepracer-community/deepracer-for-cloud
Branch: master
Commit: 6f4d1c768a4f
Files: 101
Total size: 363.0 KB

Directory structure:
gitextract_rez3pf4j/

├── .github/
│   └── workflows/
│       └── syntax-check.yml
├── .gitignore
├── LICENSE
├── README.md
├── bin/
│   ├── activate.sh
│   ├── autorun.sh
│   ├── detect.sh
│   ├── init.sh
│   ├── module/
│   │   ├── droa.sh
│   │   └── summary.sh
│   ├── prepare-mac.sh
│   ├── prepare.sh
│   ├── runonce.sh
│   └── scripts_wrapper.sh
├── defaults/
│   ├── debug-reward_function.py
│   ├── dependencies.json
│   ├── docker-daemon.json
│   ├── hyperparameters.json
│   ├── model_metadata.json
│   ├── model_metadata_cont.json
│   ├── model_metadata_sac.json
│   ├── reward_function.py
│   ├── template-run.env
│   ├── template-system.env
│   └── template-worker.env
├── docker/
│   ├── docker-compose-aws.yml
│   ├── docker-compose-cwlog.yml
│   ├── docker-compose-endpoint.yml
│   ├── docker-compose-eval-swarm.yml
│   ├── docker-compose-eval.yml
│   ├── docker-compose-keys.yml
│   ├── docker-compose-local-xorg-wsl.yml
│   ├── docker-compose-local-xorg.yml
│   ├── docker-compose-local.yml
│   ├── docker-compose-metrics.yml
│   ├── docker-compose-mount.yml
│   ├── docker-compose-robomaker-multi.yml
│   ├── docker-compose-robomaker-scripts.yml
│   ├── docker-compose-simapp.yml
│   ├── docker-compose-training-swarm.yml
│   ├── docker-compose-training.yml
│   ├── docker-compose-webviewer-swarm.yml
│   ├── docker-compose-webviewer.yml
│   └── metrics/
│       ├── configuration.env
│       ├── grafana/
│       │   └── provisioning/
│       │       ├── dashboards/
│       │       │   ├── dashboard.yml
│       │       │   └── deepracer-training-template.json
│       │       └── datasources/
│       │           └── influxdb.yml
│       └── telegraf/
│           └── etc/
│               └── telegraf.conf
├── docs/
│   ├── _config.yml
│   ├── docker.md
│   ├── droa.md
│   ├── head-to-head.md
│   ├── index.md
│   ├── installation.md
│   ├── mac.md
│   ├── metrics.md
│   ├── multi_gpu.md
│   ├── multi_run.md
│   ├── multi_worker.md
│   ├── opengl.md
│   ├── reference.md
│   ├── video.md
│   └── windows.md
├── requirements.txt
├── scripts/
│   ├── droa/
│   │   ├── __init__.py
│   │   ├── auth.py
│   │   ├── delete_model.py
│   │   ├── download_logs.py
│   │   ├── get_model.py
│   │   ├── import_model.py
│   │   └── list_models.py
│   ├── evaluation/
│   │   ├── prepare-config.py
│   │   ├── start.sh
│   │   └── stop.sh
│   ├── log-analysis/
│   │   ├── start.sh
│   │   └── stop.sh
│   ├── metrics/
│   │   ├── start.sh
│   │   └── stop.sh
│   ├── training/
│   │   ├── increment.sh
│   │   ├── prepare-config.py
│   │   ├── start.sh
│   │   └── stop.sh
│   ├── upload/
│   │   ├── download-model.sh
│   │   ├── increment.sh
│   │   ├── prepare-config.py
│   │   ├── upload-car.sh
│   │   └── upload-model.sh
│   └── viewer/
│       ├── index.template.html
│       ├── start.sh
│       └── stop.sh
└── utils/
    ├── Dockerfile.gpu-detect
    ├── cuda-check-tf.py
    ├── cuda-check.sh
    ├── download-car-model.py
    ├── evaluate.sh
    ├── sample-createspot.sh
    ├── setup-xorg.sh
    ├── start-local-browser.sh
    ├── start-xorg.sh
    ├── timed-stop.sh
    └── upload-rotate.sh

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/syntax-check.yml
================================================
name: Syntax Check

on:
  pull_request:
    branches:
      - master
      - dev

jobs:
  bash-syntax:
    name: Bash Syntax Check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Check bash scripts in bin/
        run: |
          find bin/ -name '*.sh' | sort | while read -r f; do
            bash -n "$f" && echo "OK: $f"
          done

      - name: Check bash scripts in scripts/
        run: |
          find scripts/ -name '*.sh' | sort | while read -r f; do
            bash -n "$f" && echo "OK: $f"
          done

  python-syntax:
    name: Python Syntax Check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Check Python scripts in scripts/
        run: |
          find scripts/ -name '*.py' | sort | while read -r f; do
            python3 -m py_compile "$f" && echo "OK: $f"
          done

  docker-compose-syntax:
    name: Docker Compose Syntax Check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Check docker-compose YAML files
        run: |
          find docker/ -name 'docker-compose*.yml' | sort | while read -r f; do
            python3 -c 'import sys, yaml; yaml.safe_load(open(sys.argv[1])) or True; print("OK: " + sys.argv[1])' "$f" \
              || { echo "FAIL: $f"; exit 1; }
          done


================================================
FILE: .gitignore
================================================
.vscode/
.venv/
custom_files/
logs/
docker/volumes/
recording/
recording
/*.env
/*.bak
/*.tar
/*.json
DONE
data/
tmp/
autorun.s3url
nohup.out
/*.sh
_
experiments/


# Python
__pycache__/
*.py[cod]
*.pyo
*.pyd


================================================
FILE: LICENSE
================================================
Copyright 2019-2023 AWS DeepRacer Community. All Rights Reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use, copy, modify,
merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: README.md
================================================
# DeepRacer-For-Cloud
Provides a quick and easy way to get up and running with a DeepRacer training environment using a cloud virtual machine or a local compter, such [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing) or the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu).

DRfC runs on Ubuntu 22.04 and 24.04. GPU acceleration requires a NVIDIA GPU, preferrably with more than 8GB of VRAM. ARM64/Graviton instances (AWS Graviton, Apple Silicon) are also supported for CPU-only training.

**Experimental:** macOS is supported in CPU-only mode via [Colima](https://github.com/abiosoft/colima), on both AWS Mac EC2 instances and local Mac hardware (Intel and Apple Silicon). See [docs/mac.md](docs/mac.md) for setup instructions.

## Introduction

DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities are in the [Deepracer Simapp](https://github.com/aws-deepracer-community/deepracer-simapp) repository.

As if December 2025 the original DeepRacer service in the AWS console is no longer available, and is replaced by [DeepRacer-on-AWS](https://aws.amazon.com/solutions/implementations/deepracer-on-aws/) which you can install in your own AWS environment. DeepRacer-For-Cloud is independent of any AWS service, so it is not directly impacted by this change.

## Main Features

DRfC supports a wide set of features to ensure that you can focus on creating the best model:
* User-friendly
	* Based on the continously updated community [Robomaker](https://github.com/aws-deepracer-community/deepracer-simapp) container, supporting a wide range of CPU and GPU setups.
	* Wide set of scripts (`dr-*`) enables effortless training.
	* Detection of your AWS DeepRacer Console models; allows upload of a locally trained model to any of them.
* Modes
	* Time Trial
	* Object Avoidance
	* Head-to-Bot
* Training
	* Multiple Robomaker instances per Sagemaker (N:1) to improve training progress.
	* Multiple training sessions in parallel - each being (N:1) if hardware supports it - to test out things in parallel.
	* Connect multiple nodes together (Swarm-mode only) to combine the powers of multiple computers/instances.
* Evaluation
	* Evaluate independently from training.
	* Save evaluation run to MP4 file in S3.
* Logging
	* Training metrics and trace files are stored to S3.
	* Optional integration with AWS CloudWatch.
	* Optional exposure of Robomaker internal log-files.
* Technology
	* Supports both Docker Swarm (used for connecting multiple nodes together) and Docker Compose

## Tech Stack

DRfC is built on top of the [AWS DeepRacer Simapp](https://github.com/aws-deepracer-community/deepracer-simapp) — a single Docker image used for three purposes:

* **Robomaker** — one or more containers providing robotics simulation via ROS and Gazebo
* **Sagemaker** — container running the model training job
* **RL Coach** — container that bootstraps the Sagemaker container using the Sagemaker SDK and Sagemaker Local

### Core Technologies

| Component | Version |
|-----------|---------|
| Ubuntu | 24.04 |
| Python | 3.12 |
| TensorFlow | 2.20 |
| CUDA | 12.6 (GPU only) |
| Redis | 8.6.1 |
| ROS | 2 Jazzy |
| Gazebo | Harmonic |

## Recommended AWS Instance Types

| Use case | Instance type | Notes |
|----------|--------------|-------|
| GPU | `g4dn.2xlarge` | NVIDIA T4, fastest training |
| Intel CPU | `c7i.2xlarge` | Latest Intel CPU generation, cost-effective CPU training |
| ARM CPU (Graviton) | `c8g.2xlarge` | AWS Graviton4, best price/performance for CPU |

### Images

Pre-built images are available on [Docker Hub](https://hub.docker.com/repository/docker/awsdeepracercommunity/deepracer-simapp) as `awsdeepracercommunity/deepracer-simapp:<VERSION>-cpu` (CPU) and `awsdeepracercommunity/deepracer-simapp:<VERSION>-gpu` (CUDA GPU). Both support OpenGL acceleration.

During installation DRfC will automatically pull the latest image based on whether you have a GPU or CPU installation.

## Documentation

Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://aws-deepracer-community.github.io/deepracer-for-cloud).

For importing and managing models via the community [DeepRacer on AWS (DRoA)](https://aws.amazon.com/solutions/implementations/deepracer-on-aws/) console, see the [DRoA integration guide](docs/droa.md).

## Support

* For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-training-local where the community provides active support.
* Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required.


================================================
FILE: bin/activate.sh
================================================
#!/usr/bin/env bash

# Portable readlink -f: BSD readlink (macOS) does not support -f.
_realpath() {
  if command -v realpath >/dev/null 2>&1; then
    realpath "$1"
  elif command -v grealpath >/dev/null 2>&1; then
    grealpath "$1"
  else
    readlink -f "$1"
  fi
}

# Portable version comparison: sort -V is GNU-only; macOS ships BSD sort.
verlte() {
  local v1 v2
  v1="$1" v2="$2"
  # Split into numeric fields and compare segment by segment.
  IFS='.' read -r -a a1 <<< "$v1"
  IFS='.' read -r -a a2 <<< "$v2"
  local i
  for (( i=0; i<${#a1[@]} || i<${#a2[@]}; i++ )); do
    local n1=${a1[$i]:-0} n2=${a2[$i]:-0}
    if (( n1 < n2 )); then return 0; fi
    if (( n1 > n2 )); then return 1; fi
  done
  return 0
}

# Find a free /24 subnet in 192.168.200-254/24 that doesn't conflict with
# existing Docker networks or host routes.
function find_free_subnet() {
  local USED NW_IDS
  NW_IDS=$(docker network ls -q 2>/dev/null)
  USED=$(
    { [[ -n "$NW_IDS" ]] && docker network inspect $NW_IDS \
          --format '{{range .IPAM.Config}}{{.Subnet}}{{end}}' 2>/dev/null; \
      if ip route show 2>/dev/null | grep -q .; then
          ip route show 2>/dev/null | awk '{print $1}' | grep -E '^[0-9]+\.'
      else
          # macOS: parse inet routes from netstat
          netstat -rn -f inet 2>/dev/null | awk 'NR>4 && $1~/^[0-9]/{print $1}'
      fi; } | sort -u
  )
  for j in $(seq 200 254); do
    local CANDIDATE="192.168.${j}.0/24"
    if ! echo "$USED" | grep -qF "$CANDIDATE"; then
      echo "$CANDIDATE"
      return 0
    fi
  done
  return 1
}

# Create the sagemaker-local Docker network with the required compose labels.
function _create_sagemaker_network() {
  local NW_SUBNET=$(find_free_subnet)
  local SWARM_FLAGS
  [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]] && SWARM_FLAGS="-d overlay --attachable --scope swarm"
  docker network create "$SAGEMAKER_NW" $SWARM_FLAGS \
    ${NW_SUBNET:+--subnet=$NW_SUBNET} \
    --label com.docker.compose.network=sagemaker-local \
    --label com.docker.compose.project=sagemaker-local >/dev/null 2>&1
}

function dr-update-env {

  local _saved_experiment="${DR_EXPERIMENT_NAME:-}"

  if [[ -f "$DIR/system.env" ]]; then
    LINES=$(grep -v '^#' $DIR/system.env)
    for l in $LINES; do
      env_var=$(echo $l | cut -f1 -d\=)
      env_val=$(echo $l | cut -f2 -d\=)
      eval "export $env_var=$env_val"
    done
  else
    echo "File system.env does not exist."
    return 1
  fi

  # Restore DR_EXPERIMENT_NAME if it was pre-set (e.g. via -e flag) so it takes
  # precedence over any value in system.env.
  if [[ -n "$_saved_experiment" ]]; then
    export DR_EXPERIMENT_NAME="$_saved_experiment"
  fi

  if [[ ! -z $DR_EXPERIMENT_NAME ]]; then
    if [[ ! -d "$DIR/experiments" ]]; then
      echo "Experiments directory $DIR/experiments does not exist."
      return 1
    fi
    if [[ ! -d "$DIR/experiments/$DR_EXPERIMENT_NAME" ]]; then
      echo "Experiment directory $DIR/experiments/$DR_EXPERIMENT_NAME does not exist."
      return 1
    fi
    export DR_CONFIG="$DIR/experiments/$DR_EXPERIMENT_NAME/run.env"
  fi

  if [[ -f "$DR_CONFIG" ]]; then
    LINES=$(grep -v '^#' $DR_CONFIG)
    for l in $LINES; do
      env_var=$(echo $l | cut -f1 -d\=)
      env_val=$(echo $l | cut -f2 -d\=)
      eval "export $env_var=$env_val"
    done
  else
    echo "File ${DR_CONFIG} does not exist."
    return 1
  fi

  if [[ -z "${DR_RUN_ID}" ]]; then
    export DR_RUN_ID=0
  fi

  if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
    export DR_ROBOMAKER_TRAIN_PORT=$(expr 8080 + $DR_RUN_ID)
    export DR_ROBOMAKER_EVAL_PORT=$(expr 8180 + $DR_RUN_ID)
    export DR_ROBOMAKER_GUI_PORT=$(expr 5900 + $DR_RUN_ID)
  else
    export DR_ROBOMAKER_TRAIN_PORT="8080-8089"
    export DR_ROBOMAKER_EVAL_PORT="8080-8089"
    export DR_ROBOMAKER_GUI_PORT="5901-5920"
  fi

  # Setting the default region to ensure that things work also in the
  # non default regions.
  export AWS_DEFAULT_REGION=${DR_AWS_APP_REGION}

}

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-}")" >/dev/null 2>&1 && pwd)"
DIR="$(dirname $SCRIPT_DIR)"
export DR_DIR=$DIR

# Parse arguments: -e <experiment-name> or a positional config file path
_DR_OPT_EXPERIMENT=""
OPTIND=1
while getopts ":e:" _opt; do
  case $_opt in
    e) _DR_OPT_EXPERIMENT="$OPTARG" ;;
    \?) break ;;
  esac
done
shift $(( OPTIND - 1 ))
unset _opt OPTIND

if [[ -n "$_DR_OPT_EXPERIMENT" ]]; then
  export DR_EXPERIMENT_NAME="$_DR_OPT_EXPERIMENT"
fi
unset _DR_OPT_EXPERIMENT

EXPERIMENT_FLAG="$( grep DR_EXPERIMENT_NAME $DIR/system.env | grep -v \#)"

if [[ -f "$1" ]]; then
  export DR_CONFIG=$(_realpath "$1")
  dr-update-env || return 1
elif [[ -n "${DR_EXPERIMENT_NAME:-}" ]] || [[ -n "$EXPERIMENT_FLAG" ]]; then
  dr-update-env || return 1
elif [[ -f "$DIR/run.env" ]]; then
  export DR_CONFIG="$DIR/run.env"
  dr-update-env || return 1
else
  echo "No configuration file."
  return 1
fi

## Activate Python virtual environment
if [[ -f "${DR_DIR}/.venv/bin/activate" ]]; then
  source "${DR_DIR}/.venv/bin/activate"
else
  echo "WARNING: Python venv not found at ${DR_DIR}/.venv. Run bin/prepare.sh to create it."
fi

# Check if Docker runs -- if not, then start it.
if [[ "$(type service 2>/dev/null)" ]]; then
  service docker status >/dev/null || sudo service docker start
fi

## Check if WSL2
if [[ -f /proc/version ]] && grep -qi Microsoft /proc/version && grep -q "WSL2" /proc/version; then
    IS_WSL2="yes"
fi

# Check if we will use Docker Swarm or Docker Compose
# If not defined then use Swarm
if [[ -z "${DR_DOCKER_STYLE}" ]]; then
  export DR_DOCKER_STYLE="swarm"
fi

if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
  export DR_DOCKER_FILE_SEP="-c"
  SWARM_NODE=$(docker node inspect self | jq .[0].ID -r)
  SWARM_NODE_UPDATE=$(docker node update --label-add Sagemaker=true $SWARM_NODE)
else
  export DR_DOCKER_FILE_SEP="-f"
fi

# Check if sagemaker-local network has required compose label; recreate if missing
SAGEMAKER_NW='sagemaker-local'

if ! docker network ls --format '{{.Name}}' | grep -q "^${SAGEMAKER_NW}$"; then
  echo "Network $SAGEMAKER_NW does not exist. Creating."
  _create_sagemaker_network
else
  NW_LABEL_NETWORK=$(docker network inspect "$SAGEMAKER_NW" --format '{{index .Labels "com.docker.compose.network"}}')
  if [[ "$NW_LABEL_NETWORK" != "sagemaker-local" ]]; then
    echo "Network $SAGEMAKER_NW is missing required label."
    NW_CONTAINERS=$(docker network inspect "$SAGEMAKER_NW" --format '{{len .Containers}}')
    if [[ "${NW_CONTAINERS:-0}" -gt 0 ]]; then
      dr-stop-all
    fi
    docker network rm "$SAGEMAKER_NW" >/dev/null 2>&1
    _create_sagemaker_network
    echo "Network $SAGEMAKER_NW recreated with required labels."

  fi
fi

# Check if CUDA_VISIBLE_DEVICES is configured.
if [[ -n "${CUDA_VISIBLE_DEVICES}" ]]; then
  echo "WARNING: You have CUDA_VISIBLE_DEVICES defined. The will no longer work as"
  echo "         expected. To control GPU assignment use DR_ROBOMAKER_CUDA_DEVICES"
  echo "         and DR_SAGEMAKER_CUDA_DEVICES and rlcoach v5.0.1 or later."
fi

# Check if CUDA_VISIBLE_DEVICES is configured.
if [ "${DR_CLOUD,,}" == "local" ] && [ -z "${DR_MINIO_IMAGE}" ]; then
  echo "WARNING: You have not configured DR_MINIO_IMAGE in system.env."
  echo "         System will default to tag RELEASE.2022-10-24T18-35-07Z"
  export DR_MINIO_IMAGE="RELEASE.2022-10-24T18-35-07Z"
fi

# Prepare the docker compose files depending on parameters
if [[ "${DR_CLOUD,,}" == "azure" ]]; then
  export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000"
  export DR_MINIO_URL="http://minio:9000"
  DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL"
  DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml"
  DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml"
  DR_MINIO_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local.yml"
elif [[ "${DR_CLOUD,,}" == "local" ]]; then
  export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000"
  export DR_MINIO_URL="http://minio:9000"
  DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL"
  DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml"
  DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml"
  DR_MINIO_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local.yml"
elif [[ "${DR_CLOUD,,}" == "remote" ]]; then
  export DR_LOCAL_S3_ENDPOINT_URL="$DR_REMOTE_MINIO_URL"
  export DR_MINIO_URL="$DR_REMOTE_MINIO_URL"
  DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL"
  DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml"
  DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml"
  DR_MINIO_COMPOSE_FILE=""
elif [[ "${DR_CLOUD,,}" == "aws" ]]; then
  DR_LOCAL_PROFILE_ENDPOINT_URL=""
  DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-aws.yml"
  DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-aws.yml"
else
  DR_LOCAL_PROFILE_ENDPOINT_URL=""
  DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml"
  DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml"
fi

# Add host X support for Linux and WSL2
if [[ "${DR_HOST_X,,}" == "true" ]]; then
  if [[ "$IS_WSL2" == "yes" ]]; then
  
    # Check if package x11-server-utils is installed
    if ! command -v xset &> /dev/null; then
      echo "WARNING: Package x11-server-utils is not installed. Please install it to enable X11 support."
    fi
  
    if [[ "${DR_DOCKER_STYLE,,}" == "swarm" && "${DR_USE_GUI,,}" == "true" ]]; then
      echo "WARNING: Cannot use GUI in Swarm mode. Please switch to Compose mode."
    fi

    DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg-wsl.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg-wsl.yml"
  else
    DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml"
  fi
fi

# Prevent docker swarms to restart
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
  DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training-swarm.yml"
  DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval-swarm.yml"
fi

# Enable logs in CloudWatch
if [[ "${DR_CLOUD_WATCH_ENABLE,,}" == "true" ]]; then
  DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-cwlog.yml"
  DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-cwlog.yml"
fi

# Enable local simapp mount
if [[ -d "${DR_ROBOMAKER_MOUNT_SIMAPP_DIR,,}" ]]; then
  DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-simapp.yml"
  DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-simapp.yml"
fi

# Enable local scripts mount
if [[ -d "${DR_ROBOMAKER_MOUNT_SCRIPTS_DIR,,}" ]]; then
  DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-robomaker-scripts.yml"
  DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-robomaker-scripts.yml"
fi

## Check if we have an AWS IAM assumed role, or if we need to set specific credentials.
## On macOS/Darwin, IMDS is not reachable from inside the Colima VM, so always use
## explicit keys from the configured AWS profile.
if [[ "$(uname -s)" != "Darwin" ]] && [ "${DR_CLOUD,,}" == "aws" ] && [ $(aws --output json sts get-caller-identity 2>/dev/null | jq '.Arn' | awk /assumed-role/ | wc -l) -gt 0 ]; then
  export DR_LOCAL_S3_AUTH_MODE="role"
else
  export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs)
  export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs)
  if [[ -z "${DR_LOCAL_ACCESS_KEY_ID}" || -z "${DR_LOCAL_SECRET_ACCESS_KEY}" ]]; then
    echo "ERROR: AWS credentials not found in profile '${DR_LOCAL_S3_PROFILE}'."
    echo "       Run: aws configure --profile ${DR_LOCAL_S3_PROFILE}"
    return 1
  fi
  DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml"
  DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml"
  export DR_UPLOAD_PROFILE="--profile $DR_UPLOAD_S3_PROFILE"
  export DR_LOCAL_S3_AUTH_MODE="profile"
fi

export DR_TRAIN_COMPOSE_FILE
export DR_EVAL_COMPOSE_FILE
export DR_LOCAL_PROFILE_ENDPOINT_URL

if [[ -n "${DR_MINIO_COMPOSE_FILE}" ]]; then
  export MINIO_UID=$(id -u)
  export MINIO_USERNAME=$(id -u -n)
  export MINIO_GID=$(id -g)
  export MINIO_GROUPNAME=$(id -g -n)
  if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then

    if [ "$DR_DOCKER_MAJOR_VERSION" -gt 24 ]; then
      DETACH_FLAG="--detach=true"
    fi

    docker stack deploy $DR_MINIO_COMPOSE_FILE $DETACH_FLAG s3
  else
    docker compose $DR_MINIO_COMPOSE_FILE -p s3 up -d
  fi

fi

## Version check
if [[ -z "$DR_SIMAPP_SOURCE" || -z "$DR_SIMAPP_VERSION" ]]; then
  DEFAULT_SIMAPP_VERSION=$(jq -r '.containers.simapp | select (.!=null)' $DIR/defaults/dependencies.json)
  echo "ERROR: Variable DR_SIMAPP_SOURCE or DR_SIMAPP_VERSION not defined."
  echo ""
  echo "As of version 5.3 the variables DR_SIMAPP_SOURCE and DR_SIMAPP_VERSION are required in system.env."
  echo "To continue to use the separate Sagemaker, Robomaker and RL Coach images, run 'git checkout legacy'."
  echo ""
  echo "Please add the following lines to your system.env file:"
  echo "DR_SIMAPP_SOURCE=awsdeepracercommunity/deepracer-simapp"
  echo "DR_SIMAPP_VERSION=${DEFAULT_SIMAPP_VERSION}-gpu"
  return
fi

DEPENDENCY_VERSION=$(jq -r '.master_version  | select (.!=null)' $DIR/defaults/dependencies.json)

SIMAPP_VER=$(docker inspect ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION} 2>/dev/null | jq -r .[].Config.Labels.version)
if [ -z "$SIMAPP_VER" ]; then SIMAPP_VER=$SIMAPP_VERSION; fi
if [ -z "$SIMAPP_VER" ]; then
  # Image not pulled -- fall back to checking the configured version tag
  SIMAPP_VER=$(echo ${DR_SIMAPP_VERSION} | grep -oP '^\d+\.\d+(\.\d+)?')
fi
if [ -n "$SIMAPP_VER" ] && ! verlte $DEPENDENCY_VERSION $SIMAPP_VER; then
  echo "WARNING: Incompatible version of Deepracer Simapp. Expected >$DEPENDENCY_VERSION. Got $SIMAPP_VER."
fi

# Get Docker version
DOCKER_VERSION=$(docker --version | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1)
DR_DOCKER_MAJOR_VERSION=$(echo $DOCKER_VERSION | cut -d. -f1)
export DR_DOCKER_MAJOR_VERSION

## Create a dr-local-aws command
alias dr-local-aws='aws $DR_LOCAL_PROFILE_ENDPOINT_URL'

source $SCRIPT_DIR/scripts_wrapper.sh
source $SCRIPT_DIR/module/summary.sh
source $SCRIPT_DIR/module/droa.sh

function dr-update {
  dr-update-env
}

function dr-reload {
  source $DIR/bin/activate.sh $DR_CONFIG
}

## Show summary after activation if not in quiet mode and if in interactive shell
[[ $- == *i* && "${DR_QUIET_ACTIVATE,,}" != "true" ]] && dr-summary


================================================
FILE: bin/autorun.sh
================================================
#!/usr/bin/env bash

## this is the default autorun script
## file should run automatically after init.sh completes.
## this script downloads your configured run.env, system.env and any custom container requests

INSTALL_DIR_TEMP="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." >/dev/null 2>&1 && pwd)"

## retrieve the s3_location name you sent the instance in user data launch
## assumed to first line of file
S3_LOCATION=$(awk 'NR==1 {print; exit}' $INSTALL_DIR_TEMP/autorun.s3url)

source $INSTALL_DIR_TEMP/bin/activate.sh

## get the updatated run.env and system.env files and any others you stashed in s3
aws s3 sync s3://$S3_LOCATION $INSTALL_DIR_TEMP

## get the right docker containers, if needed
SYSENV="$INSTALL_DIR_TEMP/system.env"
SAGEMAKER_IMAGE=$(cat $SYSENV | grep DR_SAGEMAKER_IMAGE | sed 's/.*=//')
ROBOMAKER_IMAGE=$(cat $SYSENV | grep DR_ROBOMAKER_IMAGE | sed 's/.*=//')

docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_IMAGE
docker pull awsdeepracercommunity/deepracer-robomaker:$ROBOMAKER_IMAGE

dr-reload

date | tee $INSTALL_DIR_TEMP/DONE-AUTORUN

## start training
cd $INSTALL_DIR_TEMP/scripts/training
./start.sh


================================================
FILE: bin/detect.sh
================================================
#!/usr/bin/env bash

## What am I?
if [[ -f /var/run/cloud-init/instance-data.json ]]; then
    # We have a cloud-init environment (Azure or AWS).
    CLOUD_NAME=$(jq -r '.v1."cloud-name"' /var/run/cloud-init/instance-data.json)
    if [[ "${CLOUD_NAME}" == "azure" ]]; then
        export CLOUD_NAME
        export CLOUD_INSTANCETYPE=$(jq -r '.ds."meta_data".imds.compute."vmSize"' /var/run/cloud-init/instance-data.json)
    elif [[ "${CLOUD_NAME}" == "aws" ]]; then
        export CLOUD_NAME
        export CLOUD_INSTANCETYPE=$(jq -r '.ds."meta-data"."instance-type"' /var/run/cloud-init/instance-data.json)
    else
        export CLOUD_NAME=local
    fi
else
    export CLOUD_NAME=local
fi


================================================
FILE: bin/init.sh
================================================
#!/usr/bin/env bash

trap ctrl_c INT

function ctrl_c() {
    echo "Requested to stop."
    exit 1
}

# Portable sed -i: BSD sed (macOS) requires an explicit empty-string backup suffix.
if sed --version 2>/dev/null | grep -q GNU; then
    sedi() { sed -i "$@"; }
else
    sedi() { sed -i '' "$@"; }
fi

# Find a free /24 subnet in 192.168.200-254/24 that doesn't conflict with
# existing Docker networks or host routes.
function find_free_subnet() {
    local USED NW_IDS
    NW_IDS=$(docker network ls -q 2>/dev/null)
    USED=$(
        { [[ -n "$NW_IDS" ]] && docker network inspect $NW_IDS \
              --format '{{range .IPAM.Config}}{{.Subnet}}{{end}}' 2>/dev/null; \
          if ip route show 2>/dev/null | grep -q .; then
              ip route show 2>/dev/null | awk '{print $1}' | grep -E '^[0-9]+\.'
          else
              # macOS: parse inet routes from netstat
              netstat -rn -f inet 2>/dev/null | awk 'NR>4 && $1~/^[0-9]/{print $1}'
          fi; } | sort -u
    )
    for j in $(seq 200 254); do
        local CANDIDATE="192.168.${j}.0/24"
        if ! echo "$USED" | grep -qF "$CANDIDATE"; then
            echo "$CANDIDATE"
            return 0
        fi
    done
    return 1
}

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" >/dev/null 2>&1 && pwd)"
INSTALL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/.." >/dev/null 2>&1 && pwd)"

if [[ "$INSTALL_DIR" == *\ * ]]; then
    echo "Deepracer-for-Cloud cannot be installed in path with spaces. Exiting."
    exit 1
fi

OPT_ARCH="gpu"
OPT_CLOUD=""
OPT_STYLE="swarm"

while getopts ":m:c:a:s:" opt; do
    case $opt in
    a)
        OPT_ARCH="$OPTARG"
        ;;
    m)
        OPT_MOUNT="$OPTARG"
        ;;
    c)
        OPT_CLOUD="$OPTARG"
        ;;
    s)
        OPT_STYLE="$OPTARG"
        ;;
    \?)
        echo "Invalid option -$OPTARG" >&2
        exit 1
        ;;
    esac
done

# Check if cloud type is set, if not try to detect it. If detection fails, default to local.
if [[ -z "$OPT_CLOUD" ]]; then
    source $SCRIPT_DIR/detect.sh
    OPT_CLOUD=$CLOUD_NAME
    echo "Detected cloud type to be $CLOUD_NAME"
fi

# Check GPU
if [ "$OPT_ARCH" = "gpu" ]; then
    if GPUS="$(docker run --rm --gpus all --pull=missing \
        nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 \
        bash -lc 'nvidia-smi -L | wc -l')" ; then

        if [ "${GPUS:-0}" -ge 1 ]; then
            echo "Detected ${GPUS} GPU(s) inside docker."
        else
            echo "No GPU detected in docker. Using CPU"
            OPT_ARCH="cpu"
        fi
    else
        echo "Failed to run GPU test container. Using CPU"
        OPT_ARCH="cpu"
    fi
fi

cd $INSTALL_DIR

# create directory structure for docker volumes
mkdir -p $INSTALL_DIR/data $INSTALL_DIR/data/minio $INSTALL_DIR/data/minio/bucket
mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis $INSTALL_DIR/data/scripts $INSTALL_DIR/tmp
sudo mkdir -p /tmp/sagemaker
sudo chmod -R g+w /tmp/sagemaker

# create symlink to current user's home .aws directory
# NOTE: AWS cli must be installed for this to work
# https://docs.aws.amazon.com/cli/latest/userguide/install-linux-al2017.html
mkdir -p $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/
ln -sf $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/

# copy rewardfunctions
mkdir -p $INSTALL_DIR/custom_files
cp $INSTALL_DIR/defaults/hyperparameters.json $INSTALL_DIR/custom_files/
cp $INSTALL_DIR/defaults/model_metadata.json $INSTALL_DIR/custom_files/
cp $INSTALL_DIR/defaults/reward_function.py $INSTALL_DIR/custom_files/

cp $INSTALL_DIR/defaults/template-system.env $INSTALL_DIR/system.env
cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/run.env
if [[ "${OPT_CLOUD}" == "aws" ]]; then
    IMDS_TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
    AWS_EC2_AVAIL_ZONE=$(curl -s -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" http://169.254.169.254/latest/meta-data/placement/availability-zone)
    AWS_REGION="$(echo $AWS_EC2_AVAIL_ZONE | sed 's/[a-z]$//')"
    sedi "s/<AWS_DR_BUCKET>/not-defined/g" $INSTALL_DIR/system.env
    sedi "s/<LOCAL_PROFILE>/default/g" $INSTALL_DIR/system.env
elif [[ "${OPT_CLOUD}" == "remote" ]]; then
    AWS_REGION="us-east-1"
    sedi "s/<LOCAL_PROFILE>/minio/g" $INSTALL_DIR/system.env
    sedi "s/<AWS_DR_BUCKET>/not-defined/g" $INSTALL_DIR/system.env
    echo "Please run 'aws configure --profile minio' to set the credentials"
    echo "Please define DR_REMOTE_MINIO_URL in system.env to point to remote minio instance."
else
    AWS_REGION="us-east-1"
    MINIO_PROFILE="minio"
    sedi "s/<LOCAL_PROFILE>/$MINIO_PROFILE/g" $INSTALL_DIR/system.env
    sedi "s/<AWS_DR_BUCKET>/not-defined/g" $INSTALL_DIR/system.env

    aws configure --profile $MINIO_PROFILE get aws_access_key_id >/dev/null 2>/dev/null

    if [[ "$?" -ne 0 ]]; then
        echo "Creating default minio credentials in AWS profile '$MINIO_PROFILE'"
        aws configure --profile $MINIO_PROFILE set aws_access_key_id $(openssl rand -base64 12)
        aws configure --profile $MINIO_PROFILE set aws_secret_access_key $(openssl rand -base64 12)
        aws configure --profile $MINIO_PROFILE set region us-east-1
    fi
fi
sedi "s/<AWS_DR_BUCKET_ROLE>/to-be-defined/g" $INSTALL_DIR/system.env
sedi "s/<CLOUD_REPLACE>/$OPT_CLOUD/g" $INSTALL_DIR/system.env
sedi "s/<REGION_REPLACE>/$AWS_REGION/g" $INSTALL_DIR/system.env

if [[ "${OPT_ARCH}" == "gpu" ]]; then
    SAGEMAKER_TAG="gpu"
else
    SAGEMAKER_TAG="cpu"
fi

#set proxys if required
for arg in "$@"; do
    IFS='=' read -ra part <<<"$arg"
    if [ "${part[0]}" == "--http_proxy" ] || [ "${part[0]}" == "--https_proxy" ] || [ "${part[0]}" == "--no_proxy" ]; then
        var=${part[0]:2}=${part[1]}
        args="${args} --build-arg ${var}"
    fi
done

# Download docker images. Change to build statements if locally built images are desired.
SIMAPP_VERSION=$(jq -r '.containers.simapp | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
sedi "s/<SIMAPP_VERSION_TAG>/$SIMAPP_VERSION-$SAGEMAKER_TAG/g" $INSTALL_DIR/system.env
docker pull awsdeepracercommunity/deepracer-simapp:$SIMAPP_VERSION-$SAGEMAKER_TAG

# create the network sagemaker-local if it doesn't exit
SAGEMAKER_NW='sagemaker-local'

if [[ "${OPT_STYLE}" == "swarm" ]]; then

    docker node ls >/dev/null 2>/dev/null
    if [ $? -eq 0 ]; then
        echo "Swarm exists. Exiting."
        exit 1
    fi

    docker swarm init
    if [ $? -ne 0 ]; then

        if ip route 2>/dev/null | grep -q default; then
            DEFAULT_IFACE=$(ip route | grep default | awk '{print $5}')
            DEFAULT_IP=$(ip addr show $DEFAULT_IFACE | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)
        else
            # macOS fallback
            DEFAULT_IFACE=$(route -n get default 2>/dev/null | awk '/interface:/{print $2}')
            DEFAULT_IP=$(ipconfig getifaddr "$DEFAULT_IFACE" 2>/dev/null)
        fi

        if [ -z "$DEFAULT_IP" ]; then
            echo "Could not determine default IP address. Exiting."
            exit 1
        fi

        echo "Error when creating swarm, trying again with advertise address $DEFAULT_IP."
        docker swarm init --advertise-addr $DEFAULT_IP
        if [ $? -ne 0 ]; then
            echo "Cound not create swarm. Exiting."
            exit 1
        fi
    fi

    SWARM_NODE=$(docker node inspect self | jq .[0].ID -r)
    docker node update --label-add Sagemaker=true $SWARM_NODE >/dev/null 2>/dev/null
    docker node update --label-add Robomaker=true $SWARM_NODE >/dev/null 2>/dev/null
    NW_SUBNET=$(find_free_subnet)
    docker network ls | grep -q $SAGEMAKER_NW && docker network rm $SAGEMAKER_NW >/dev/null 2>&1
    docker network create $SAGEMAKER_NW -d overlay --attachable --scope swarm \
        ${NW_SUBNET:+--subnet=$NW_SUBNET} \
        --label com.docker.compose.network=sagemaker-local \
        --label com.docker.compose.project=sagemaker-local

elif [[ "${OPT_STYLE}" == "compose" ]]; then

    NW_SUBNET=$(find_free_subnet)
    docker network ls | grep -q $SAGEMAKER_NW || \
        docker network create $SAGEMAKER_NW ${NW_SUBNET:+--subnet=$NW_SUBNET}

else
    echo "Unknown docker style ${OPT_STYLE}. Exiting."
    exit 1
fi
sedi "s/<DOCKER_STYLE>/${OPT_STYLE}/g" $INSTALL_DIR/system.env

# ensure our variables are set on startup - not for local setup.
if [[ "${OPT_CLOUD}" != "local" ]]; then
    touch "$HOME/.profile"
    NUM_IN_PROFILE=$(grep -c "$INSTALL_DIR/bin/activate.sh" "$HOME/.profile" || true)
    if [ "$NUM_IN_PROFILE" -eq 0 ]; then
        echo "source $INSTALL_DIR/bin/activate.sh" >>$HOME/.profile
    fi
fi

# mark as done
date | tee $INSTALL_DIR/DONE

## Optional auturun feature
# if using automation scripts to auto configure and run
# you must pass s3_training_location.txt to this instance in order for this to work
if [[ -f "$INSTALL_DIR/autorun.s3url" ]]; then
    ## read in first line.  first line always assumed to be training location regardless what else is in file
    TRAINING_LOC=$(awk 'NR==1 {print; exit}' $INSTALL_DIR/autorun.s3url)

    #get bucket name
    TRAINING_BUCKET=${TRAINING_LOC%%/*}
    #get prefix. minor exception handling in case there is no prefix and a root bucket is passed
    if [[ "$TRAINING_LOC" == *"/"* ]]; then
        TRAINING_PREFIX=${TRAINING_LOC#*/}
    else
        TRAINING_PREFIX=""
    fi

    ##check if custom autorun script exists in s3 training bucket.  If not, use default in this repo
    aws s3api head-object --bucket $TRAINING_BUCKET --key $TRAINING_PREFIX/autorun.sh || not_exist=true
    if [ $not_exist ]; then
        echo "custom file does not exist, using local copy"
    else
        echo "custom script does exist, use it"
        aws s3 cp s3://$TRAINING_LOC/autorun.sh $INSTALL_DIR/bin/autorun.sh
    fi
    chmod +x $INSTALL_DIR/bin/autorun.sh
    bash -c "source $INSTALL_DIR/bin/autorun.sh"
fi


================================================
FILE: bin/module/droa.sh
================================================
#!/usr/bin/env bash
# DRoA (DeepRacer on AWS) shell functions.
# Sourced by bin/activate.sh alongside scripts_wrapper.sh and summary.sh.

function droa-list-models {
  dr-update-env && python3 "${DR_DIR}/scripts/droa/list_models.py" "$@"
}

function droa-get-model {
  dr-update-env && python3 "${DR_DIR}/scripts/droa/get_model.py" "$@"
}

function droa-download-logs {
  dr-update-env && python3 "${DR_DIR}/scripts/droa/download_logs.py" "$@"
}

function droa-delete-model {
  dr-update-env && python3 "${DR_DIR}/scripts/droa/delete_model.py" "$@"
}

function droa-import-model {
  dr-update-env && python3 "${DR_DIR}/scripts/droa/import_model.py" "$@"
}


================================================
FILE: bin/module/summary.sh
================================================
function dr-summary {
  # ANSI colour palette
  local RST='\033[0m'
  local BOLD='\033[1m'
  local DIM='\033[2m'

  local C_BORDER='\033[38;5;33m'      # blue
  local C_HEADER='\033[38;5;39m'      # bright blue
  local C_KEY='\033[38;5;250m'        # light grey
  local C_VAL='\033[38;5;222m'        # amber
  local C_OK='\033[38;5;82m'          # green
  local C_WARN='\033[38;5;220m'       # yellow
  local C_ERR='\033[38;5;196m'        # red
  local C_SECTION='\033[38;5;75m'     # sky blue

  # ── dynamic width / height ──────────────────────────────────────────────
  local TERM_W WIDE=false W
  TERM_W=$(tput cols 2>/dev/null || echo 80)
  TERM_H=$(tput lines 2>/dev/null || echo 24)
  _dr_lines=0   # running line counter (non-local so helpers can increment)
  if [[ $TERM_W -ge 120 ]]; then
    W=118   # total box width = W+2 = 120
    WIDE=true
  else
    W=$(( TERM_W - 2 ))
    [[ $W -lt 78 ]] && W=78
  fi
  # Two-column content widths: │ space WL space │ space WR space │ = WL+WR+7 = W+2
  local WL=$(( (W - 5) / 2 ))
  local WR=$(( W - 5 - WL ))

  # ── helpers ───────────────────────────────────────────────────────────────
  _dr_hline() {
    local L="$1" M="$2" R="$3"
    printf "${C_BORDER}${L}"; printf "${M}%.0s" $(seq 1 $W); printf "${R}${RST}\n"
    (( ++_dr_lines ))
  }
  _dr_row() {
    local text="$1"
    local plain; plain=$(echo -e "$text" | sed 's/\x1b\[[0-9;]*m//g')
    local pad=$(( W - ${#plain} - 2 ))
    [[ $pad -lt 0 ]] && pad=0
    printf "${C_BORDER}│${RST} %b%-*s ${C_BORDER}│${RST}\n" "$text" "$pad" ""
    (( ++_dr_lines ))
  }
  _dr_blank() { _dr_row ""; }
  _dr_section() {
    _dr_hline "├" "─" "┤"
    local label=" ${BOLD}${C_SECTION}$1${RST}"
    [[ -n "${2:-}" ]] && label+="${DIM}  $2${RST}"
    _dr_row "$label"
    _dr_hline "├" "─" "┤"
  }
  _dr_kv() {
    local k="$1" v="$2" s="${3:-}"
    local vc="$C_VAL"
    [[ "$s" == "ok"   ]] && vc="$C_OK"
    [[ "$s" == "warn" ]] && vc="$C_WARN"
    [[ "$s" == "err"  ]] && vc="$C_ERR"
    _dr_row " ${C_KEY}$(printf '%-22s' "$k")${RST} ${vc}${v}${RST}"
  }
  _dr_hline_2col() {  # L M1 SEP M2 R
    local L="$1" M1="$2" SEP="$3" M2="$4" R="$5"
    local LD=$(( WL + 2 )) RD=$(( WR + 2 ))
    printf "${C_BORDER}${L}"
    printf "${M1}%.0s" $(seq 1 $LD)
    printf "${SEP}"
    printf "${M2}%.0s" $(seq 1 $RD)
    printf "${R}${RST}\n"
    (( ++_dr_lines ))
  }
  _dr_row_2col() {
    local lt="$1" rt="${2:-}"
    local lp; lp=$(echo -e "$lt" | sed 's/\x1b\[[0-9;]*m//g')
    local rp; rp=$(echo -e "$rt" | sed 's/\x1b\[[0-9;]*m//g')
    local lpad=$(( WL - ${#lp} )) rpad=$(( WR - ${#rp} ))
    [[ $lpad -lt 0 ]] && lpad=0
    [[ $rpad -lt 0 ]] && rpad=0
    printf "${C_BORDER}│${RST} %b%-*s ${C_BORDER}│${RST} %b%-*s ${C_BORDER}│${RST}\n" \
      "$lt" "$lpad" "" "$rt" "$rpad" ""
    (( ++_dr_lines ))
  }

  # ── spinner (shown while pre-compute phase runs) ─────────────────────────
  local _dr_spinner_pid=""
  if [[ -t 1 ]]; then
    { (
      local frames=('⠋' '⠙' '⠹' '⠸' '⠼' '⠴' '⠦' '⠧' '⠇' '⠏') i=0
      while true; do
        printf '\r  \033[38;5;33m%s\033[0m  \033[2mLoading DeepRacer-for-Cloud...\033[0m' \
          "${frames[i]}" >/dev/tty 2>/dev/null
        (( i = (i + 1) % 4 ))
        sleep 0.12
      done
    ) & } 2>/dev/null
    _dr_spinner_pid=$!
  fi

  # ── pre-compute git branch / update status ───────────────────────────────
  local _git_branch _git_update_available=false
  _git_branch=$(git -C "$DR_DIR" rev-parse --abbrev-ref HEAD 2>/dev/null || true)
  timeout 5 git -C "$DR_DIR" fetch --quiet origin 2>/dev/null || true
  local _local_hash _remote_hash
  _local_hash=$(git -C "$DR_DIR" rev-parse HEAD 2>/dev/null || true)
  _remote_hash=$(git -C "$DR_DIR" rev-parse '@{u}' 2>/dev/null || true)
  if [[ -n "$_local_hash" && -n "$_remote_hash" && "$_local_hash" != "$_remote_hash" ]]; then
    _git_update_available=true
  fi

  # ── pre-compute dynamic values ────────────────────────────────────────────
  local cloud_val="${DR_CLOUD:-n/a}"
  [[ "${DR_CLOUD,,}" == "aws" ]] && cloud_val="aws"
  [[ "${DR_CLOUD,,}" == "remote" ]] && cloud_val="remote"

  local s3_color
  if aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3api head-bucket \
      --bucket "${DR_LOCAL_S3_BUCKET}" >/dev/null 2>&1; then
    s3_color="${C_OK}"
  else
    s3_color="${C_ERR}"
  fi

  local nvidia_runtime
  if docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q '"nvidia"'; then
    nvidia_runtime="${C_OK}available${RST}"
  else
    nvidia_runtime="${C_WARN}not found${RST}"
  fi

  # ── stop spinner and clear its line before rendering ─────────────────────
  if [[ -n "${_dr_spinner_pid:-}" ]]; then
    kill "$_dr_spinner_pid" 2>/dev/null
    disown "$_dr_spinner_pid" 2>/dev/null
    wait "$_dr_spinner_pid" 2>/dev/null
    printf '\r\033[K' >/dev/tty 2>/dev/null || true
  fi

  # ── header ────────────────────────────────────────────────────────────────
  echo; (( ++_dr_lines ))
  _dr_hline "╭" "─" "╮"
  _dr_row " ${BOLD}${C_HEADER}DeepRacer for Cloud  —  Environment Summary${RST}"
  local _meta_row
  if [[ -n "${DR_EXPERIMENT_NAME:-}" ]]; then
    _meta_row=" ${DIM}Experiment: ${RST}${C_VAL}${DR_EXPERIMENT_NAME}${RST}"
  else
    local _rel_config
    _rel_config=$(realpath --relative-to="${PWD}" "${DR_CONFIG}" 2>/dev/null || basename "${DR_CONFIG}")
    _meta_row=" ${DIM}Config: ${RST}${C_VAL}${_rel_config}${RST}"
  fi
  local _branch_row="${DIM}  Branch: ${RST}${C_VAL}${_git_branch:-unknown}${RST}"
  if [[ "$_git_update_available" == true ]]; then
    _branch_row+="  ${C_WARN}⬆ update available — run 'git pull'${RST}"
  fi
  _dr_row "${_meta_row}${_branch_row}"

  # ── system config + run config ────────────────────────────────────────────
  if [[ "$WIDE" == true ]]; then
    local CKW=18  # key column width in 2-col mode
    _dr_hline_2col "├" "─" "┬" "─" "┤"
    _dr_row_2col \
      " ${BOLD}${C_SECTION}System Configuration${RST}" \
      " ${BOLD}${C_SECTION}Run Configuration${RST}${DIM}  ID: ${DR_RUN_ID:-0}${RST}"
    _dr_hline_2col "├" "─" "┼" "─" "┤"

    local lrows=() rrows=()
    lrows+=(" ${C_KEY}$(printf '%-*s' $CKW 'Docker style')${RST} ${C_VAL}${DR_DOCKER_STYLE:-swarm}${RST}")
    lrows+=(" ${C_KEY}$(printf '%-*s' $CKW 'Cloud / Bucket')${RST} ${DIM}${cloud_val}${RST}  ${s3_color}${DR_LOCAL_S3_BUCKET:-n/a}${RST}")
    lrows+=(" ${C_KEY}$(printf '%-*s' $CKW 'Workers')${RST} ${C_VAL}${DR_WORKERS:-1}${RST}")
    lrows+=(" ${C_KEY}$(printf '%-*s' $CKW 'NVIDIA runtime')${RST} ${nvidia_runtime}")

    rrows+=(" ${C_KEY}$(printf '%-*s' $CKW 'Model prefix')${RST} ${C_VAL}${DR_LOCAL_S3_MODEL_PREFIX:-n/a}${RST}")
    rrows+=(" ${C_KEY}$(printf '%-*s' $CKW 'Race type')${RST} ${C_VAL}${DR_RACE_TYPE:-n/a}${RST}")
    rrows+=(" ${C_KEY}$(printf '%-*s' $CKW 'World / track')${RST} ${C_VAL}${DR_WORLD_NAME:-n/a}${RST}")
    rrows+=(" ${C_KEY}$(printf '%-*s' $CKW 'Car name')${RST} ${C_VAL}${DR_CAR_NAME:-n/a}${RST}")

    local max_r=$(( ${#lrows[@]} > ${#rrows[@]} ? ${#lrows[@]} : ${#rrows[@]} ))
    for (( i=0; i<max_r; i++ )); do
      _dr_row_2col "${lrows[$i]:-}" "${rrows[$i]:-}"
    done
    _dr_hline_2col "├" "─" "┴" "─" "┤"
  else
    _dr_section "System Configuration"
    _dr_row " ${C_KEY}$(printf '%-22s' 'Docker style')${RST} ${C_VAL}${DR_DOCKER_STYLE:-swarm}${RST}"
    _dr_row " ${C_KEY}$(printf '%-22s' 'Cloud / Bucket')${RST} ${DIM}${cloud_val}${RST}  ${s3_color}${DR_LOCAL_S3_BUCKET:-n/a}${RST}"
    _dr_kv "Workers"        "${DR_WORKERS:-1}"
    _dr_row " ${C_KEY}$(printf '%-22s' 'NVIDIA runtime')${RST} ${nvidia_runtime}"
    _dr_section "Run Configuration" "ID: ${DR_RUN_ID:-0}"
    _dr_kv "Model prefix"   "${DR_LOCAL_S3_MODEL_PREFIX:-n/a}"
    _dr_kv "Race type"      "${DR_RACE_TYPE:-n/a}"
    _dr_kv "World / track"  "${DR_WORLD_NAME:-n/a}"
    _dr_kv "Car name"       "${DR_CAR_NAME:-n/a}"
  fi

  # ── simapp version check (used inline in docker images section) ───────────
  local simapp_update_available=false _required_simapp_ver=""
  _required_simapp_ver=$(jq -r '.containers.simapp | select (.!=null)' "$DR_DIR/defaults/dependencies.json" 2>/dev/null || true)
  if [[ -n "$_required_simapp_ver" && -n "${DR_SIMAPP_VERSION:-}" ]]; then
    local _configured_simapp_ver
    _configured_simapp_ver=$(echo "${DR_SIMAPP_VERSION}" | grep -oP '^\d+\.\d+(\.\d+)?')
    if [[ -n "$_configured_simapp_ver" ]] && ! verlte "$_required_simapp_ver" "$_configured_simapp_ver"; then
      simapp_update_available=true
    fi
  fi

  # ── docker images ─────────────────────────────────────────────────────────
  if [[ "$WIDE" == true ]]; then
    # 2-col closing line already drawn; just add section label row
    local label=" ${BOLD}${C_SECTION}Configured Docker Images${RST}"
    _dr_row "$label"
    _dr_hline "├" "─" "┤"
  else
    _dr_section "Configured Docker Images"
  fi

  local simapp_img="${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}"
  local simapp_disp="${simapp_img/awsdeepracercommunity/[a-d-c]}"
  local simapp_id; simapp_id=$(docker image inspect "$simapp_img" --format '{{slice .Id 7 19}}' 2>/dev/null)

  local analysis_img="awsdeepracercommunity/deepracer-analysis:${DR_ANALYSIS_IMAGE:-cpu}"
  local analysis_disp="${analysis_img/awsdeepracercommunity/[a-d-c]}"
  local analysis_id; analysis_id=$(docker image inspect "$analysis_img" --format '{{slice .Id 7 19}}' 2>/dev/null)

  local minio_img="" minio_disp="" minio_id=""
  if [[ "${DR_CLOUD,,}" == "local" || "${DR_CLOUD,,}" == "azure" ]]; then
    minio_img="minio/minio:${DR_MINIO_IMAGE:-latest}"
    minio_disp="$minio_img"
    minio_id=$(docker image inspect "$minio_img" --format '{{slice .Id 7 19}}' 2>/dev/null)
    if [[ -z "$minio_id" ]]; then
      minio_id=$(docker images minio/minio --format '{{slice .ID 0 12}}' 2>/dev/null | head -1)
    fi
  fi

  local _simapp_upd_note=""
  [[ "$simapp_update_available" == true ]] && _simapp_upd_note="  ${C_WARN}⬆ update available (→ ${_required_simapp_ver})${RST}"

  if [[ "$WIDE" == true ]]; then
    local IKW=14
    if [[ -n "$simapp_id" ]]; then
      _dr_row " ${C_KEY}$(printf '%-*s' $IKW 'SimApp')${RST} ${C_OK}${simapp_disp}${RST}  ${DIM}ID: ${simapp_id}  ✓ local${RST}${_simapp_upd_note}"
    else
      _dr_row " ${C_KEY}$(printf '%-*s' $IKW 'SimApp')${RST} ${C_WARN}${simapp_disp}  (not pulled)${RST}${_simapp_upd_note}"
    fi
    if [[ -n "$analysis_id" ]]; then
      _dr_row " ${C_KEY}$(printf '%-*s' $IKW 'Analysis')${RST} ${C_OK}${analysis_disp}${RST}  ${DIM}ID: ${analysis_id}  ✓ local${RST}"
    else
      _dr_row " ${C_KEY}$(printf '%-*s' $IKW 'Analysis')${RST} ${C_WARN}${analysis_disp}  (not pulled)${RST}"
    fi
    if [[ -n "$minio_img" ]]; then
      if [[ -n "$minio_id" ]]; then
        _dr_row " ${C_KEY}$(printf '%-*s' $IKW 'MinIO')${RST} ${C_OK}${minio_disp}${RST}  ${DIM}ID: ${minio_id}  ✓ local${RST}"
      else
        _dr_row " ${C_KEY}$(printf '%-*s' $IKW 'MinIO')${RST} ${C_WARN}${minio_disp}  (not pulled)${RST}"
      fi
    fi
  else
    if [[ -n "$simapp_id" ]]; then
      _dr_kv "SimApp" "${simapp_disp}" "ok"
      _dr_row " ${DIM}$(printf '%22s' '') ID: ${simapp_id}  ✓ local${RST}${_simapp_upd_note}"
    else
      _dr_kv "SimApp" "${simapp_disp}  (not pulled)${_simapp_upd_note}" "warn"
    fi
    if [[ -n "$analysis_id" ]]; then
      _dr_kv "Analysis" "${analysis_disp}" "ok"
      _dr_row " ${DIM}$(printf '%22s' '') ID: ${analysis_id}  ✓ local${RST}"
    else
      _dr_kv "Analysis" "${analysis_disp}  (not pulled)" "warn"
    fi
    if [[ -n "$minio_img" ]]; then
      if [[ -n "$minio_id" ]]; then
        _dr_kv "MinIO" "${minio_disp}" "ok"
        _dr_row " ${DIM}$(printf '%22s' '') ID: ${minio_id}  ✓ local${RST}"
      else
        _dr_kv "MinIO" "${minio_disp}  (not pulled)" "warn"
      fi
    fi
  fi

  # ── services and containers ───────────────────────────────────────────────
  _dr_section "DeepRacer Services And Containers"
  local found_any=false

  if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
    local stack_lines
    stack_lines=$(docker stack ls --format '{{.Name}}\t{{.Services}}' 2>/dev/null || true)
    if [[ -n "$stack_lines" ]]; then
      found_any=true
      _dr_row " ${DIM}Swarm stacks:${RST}"
      while IFS=$'\t' read -r stname stsvcs; do
        _dr_row " ${C_KEY}$(printf '%-30s' "$stname")${RST} ${C_VAL}${stsvcs} service(s)${RST}"
      done <<< "$stack_lines"
    fi

    local svc_lines
    svc_lines=$(docker service ls --format '{{.Name}}\t{{.Replicas}}\t{{.Image}}' 2>/dev/null \
      | grep -i '^deepracer' || true)
    if [[ -n "$svc_lines" ]]; then
      found_any=true
      _dr_row " ${DIM}Swarm services:${RST}"
      while IFS=$'\t' read -r sname sreplicas simage; do
        local desired actual
        desired=$(echo "$sreplicas" | cut -d'/' -f2)
        actual=$(echo "$sreplicas" | cut -d'/' -f1)
        local rep_color="$C_OK"
        [[ "$actual" != "$desired" ]] && rep_color="$C_WARN"
        local simage_disp="${simage/awsdeepracercommunity/[a-d-c]}"
        _dr_row " ${C_KEY}$(printf '%-30s' "$sname")${RST} ${rep_color}$(printf '%-8s' "$sreplicas")${RST} ${DIM}${simage_disp}${RST}"
      done <<< "$svc_lines"
    fi

    local container_lines
    container_lines=$(docker ps --format '{{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null \
      | while IFS=$'\t' read -r cn cs ci; do
          if echo "$cn" | grep -qiE '^deepracer|robomaker|sagemaker|minio|rl_coach|analysis' \
             || [[ "$ci" == "$simapp_img"* ]]; then
            printf '%s\t%s\n' "$cn" "$cs"
          fi
        done)
    if [[ -n "$container_lines" ]]; then
      found_any=true
      local n_ctrs; n_ctrs=$(echo "$container_lines" | wc -l)
      _dr_row " ${DIM}Containers:${RST}"
      # 3 lines reserved for footer (blank row + closing hline + trailing newline)
      if (( _dr_lines + n_ctrs + 3 > TERM_H )); then
        _dr_row "   ${DIM}${n_ctrs} container(s) running  ${C_WARN}(terminal too short to list)${RST}"
      else
        while IFS=$'\t' read -r cname cstatus; do
          local status_color="$C_OK"
          [[ "$cstatus" != Up* ]] && status_color="$C_WARN"
          _dr_row " ${C_KEY}$(printf '%-30s' "$cname")${RST} ${status_color}${cstatus}${RST}"
        done <<< "$container_lines"
      fi
    fi
  else
    local proj_lines
    proj_lines=$(docker compose ls --format json 2>/dev/null \
      | jq -r '.[] | select(.Name | test("deepracer|s3"; "i")) | "\(.Name)\t\(.Status)"' 2>/dev/null || true)
    if [[ -n "$proj_lines" ]]; then
      found_any=true
      _dr_row " ${DIM}Compose projects:${RST}"
      while IFS=$'\t' read -r pname pstatus; do
        local pstatus_color="$C_OK"
        [[ "$pstatus" != *running* ]] && pstatus_color="$C_WARN"
        _dr_row " ${C_KEY}$(printf '%-30s' "$pname")${RST} ${pstatus_color}${pstatus}${RST}"
      done <<< "$proj_lines"
    fi

    local container_lines
    container_lines=$(docker ps --format '{{.Names}}\t{{.Status}}\t{{.Image}}' 2>/dev/null \
      | while IFS=$'\t' read -r cn cs ci; do
          if echo "$cn" | grep -qiE '^deepracer|robomaker|sagemaker|minio|rl_coach|analysis' \
             || [[ "$ci" == "$simapp_img"* ]]; then
            printf '%s\t%s\n' "$cn" "$cs"
          fi
        done)
    if [[ -n "$container_lines" ]]; then
      found_any=true
      local n_ctrs; n_ctrs=$(echo "$container_lines" | wc -l)
      _dr_row " ${DIM}Compose services:${RST}"
      if (( _dr_lines + n_ctrs + 3 > TERM_H )); then
        _dr_row "   ${DIM}${n_ctrs} container(s) running  ${C_WARN}(terminal too short to list)${RST}"
      else
        while IFS=$'\t' read -r cname cstatus; do
          local status_color="$C_OK"
          [[ "$cstatus" != Up* ]] && status_color="$C_WARN"
          _dr_row " ${C_KEY}$(printf '%-30s' "$cname")${RST} ${status_color}${cstatus}${RST}"
        done <<< "$container_lines"
      fi
    fi
  fi

  if [[ "$found_any" == false ]]; then
    _dr_row "  ${C_WARN}No DeepRacer-related services or containers running.${RST}"
  fi

  # ── footer ────────────────────────────────────────────────────────────────
  _dr_blank
  _dr_hline "╰" "─" "╯"
  echo
}


================================================
FILE: bin/prepare-mac.sh
================================================
#!/usr/bin/env bash

set -euo pipefail
trap ctrl_c INT

function ctrl_c() {
    echo "Requested to stop."
    exit 1
}

DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

## Only allow macOS
if [[ "$(uname -s)" != "Darwin" ]]; then
    echo "ERROR: This script is for macOS only. Use prepare.sh for Linux."
    exit 1
fi

MACOS_VERSION=$(sw_vers -productVersion)
MACOS_MAJOR=$(echo "$MACOS_VERSION" | cut -d. -f1)

# Supported: Monterey (12), Ventura (13), Sonoma (14), Sequoia (15)
SUPPORTED_MACOS_MAJOR=(12 13 14 15)
VERSION_OK=false
for V in "${SUPPORTED_MACOS_MAJOR[@]}"; do
    if [[ "${MACOS_MAJOR}" -eq "$V" ]]; then
        VERSION_OK=true
        break
    fi
done

if [[ "$VERSION_OK" != true ]]; then
    echo "WARNING: macOS ${MACOS_VERSION} is not a tested version."
    echo "         Supported: Monterey (12), Ventura (13), Sonoma (14), Sequoia (15)"
fi

echo "Detected macOS ${MACOS_VERSION}"

## macOS does not support NVIDIA GPUs -- always CPU
ARCH="cpu"
echo "macOS does not support NVIDIA GPUs. Using CPU mode."

## Detect Apple Silicon vs Intel
CPU_ARCH=$(uname -m)
if [[ "${CPU_ARCH}" == "arm64" ]]; then
    echo "Apple Silicon (arm64) detected."
    BREW_PREFIX="/opt/homebrew"
else
    echo "Intel (x86_64) detected."
    BREW_PREFIX="/usr/local"
fi

## Install Homebrew if not present
if ! command -v brew >/dev/null 2>&1; then
    echo "Installing Homebrew..."
    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
    eval "$("${BREW_PREFIX}/bin/brew" shellenv)"
fi

## Update Homebrew
brew update

## Install required packages (no awscli here -- installed separately below)
brew install jq python3 git screen bash

## Set Homebrew bash as the default shell if not already
BREW_BASH="${BREW_PREFIX}/bin/bash"
if ! grep -qF "${BREW_BASH}" /etc/shells; then
    echo "Adding ${BREW_BASH} to /etc/shells..."
    echo "${BREW_BASH}" | sudo tee -a /etc/shells
fi
if [[ "$(dscl . -read /Users/"$(id -un)" UserShell | awk '{print $2}')" != "${BREW_BASH}" ]]; then
    echo "Setting default shell to ${BREW_BASH}..."
    sudo chsh -s "${BREW_BASH}" "$(id -un)"
fi

## Ensure bash 5 + Homebrew PATH are set up for all SSH sessions.
## Sets PATH first so --login re-entry is safe (BASH_VERSINFO guard prevents looping).
BASH_PROFILE="${HOME}/.bash_profile"
BOOTSTRAP_MARKER="# drfc-bash5-bootstrap"
if ! grep -qF "${BOOTSTRAP_MARKER}" "${BASH_PROFILE}" 2>/dev/null; then
    cat >> "${BASH_PROFILE}" <<EOF

${BOOTSTRAP_MARKER}
eval "\$(${BREW_PREFIX}/bin/brew shellenv)"
export PATH="/usr/local/bin:\$PATH"  # AWS CLI v2
if [ -x "${BREW_BASH}" ] && [ "\${BASH_VERSINFO[0]:-0}" -lt 5 ]; then
    exec "${BREW_BASH}" --login
fi
EOF
    echo "Added bash 5 + PATH bootstrap to ${BASH_PROFILE}."
fi

## Install boto3 and pyyaml
if pip3 install boto3 pyyaml --break-system-packages 2>/dev/null; then
    echo "boto3 and pyyaml installed."
else
    pip3 install boto3 pyyaml
fi

## Install AWS CLI v2 via official pkg installer (avoids Homebrew Python conflicts)
if command -v aws >/dev/null 2>&1; then
    echo "AWS CLI already installed: $(aws --version 2>&1)"
else
    echo "Installing AWS CLI v2 via official installer..."
    TMP_PKG=$(mktemp /tmp/AWSCLIV2.XXXXXX.pkg)
    curl -fsSL "https://awscli.amazonaws.com/AWSCLIV2.pkg" -o "${TMP_PKG}"
    sudo installer -pkg "${TMP_PKG}" -target /
    rm -f "${TMP_PKG}"
    echo "AWS CLI installed: $(aws --version 2>&1)"
fi

## Detect cloud
# detect.sh relies on cloud-init which is typically absent on macOS.
# Fall back to probing the AWS Instance Metadata Service (IMDSv2).
CLOUD_NAME="local"
if [[ -f /var/run/cloud-init/instance-data.json ]]; then
    source "$DIR/detect.sh"
else
    if IMDS_TOKEN=$(curl -s --connect-timeout 2 \
            -X PUT "http://169.254.169.254/latest/api/token" \
            -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null) \
        && [[ -n "${IMDS_TOKEN}" ]]; then
        CLOUD_NAME="aws"
        CLOUD_INSTANCETYPE=$(curl -s --connect-timeout 2 \
            -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" \
            "http://169.254.169.254/latest/meta-data/instance-type" 2>/dev/null || echo "unknown")
        export CLOUD_NAME
        export CLOUD_INSTANCETYPE
    else
        export CLOUD_NAME
    fi
fi
echo "Detected cloud type ${CLOUD_NAME}"

## Install Docker CLI and Colima (headless Docker runtime for macOS)
## Colima is preferred over Docker Desktop for headless/EC2 use.

if brew list --formula colima &>/dev/null; then
    echo "Colima already installed."
else
    brew install colima
fi

if command -v docker >/dev/null 2>&1; then
    echo "Docker CLI already installed."
else
    brew install docker
fi

## Install docker-compose v2 (as CLI plugin)
if brew list --formula docker-compose &>/dev/null; then
    echo "docker-compose already installed."
else
    brew install docker-compose
fi

## Register docker-compose as a Docker CLI plugin
mkdir -p "${HOME}/.docker/cli-plugins"
ln -sfn "$(brew --prefix)/opt/docker-compose/bin/docker-compose" \
    "${HOME}/.docker/cli-plugins/docker-compose"

## Start Colima if not already running
if colima status 2>/dev/null | grep -q "Running"; then
    echo "Colima is already running."
else
    echo "Starting Colima..."
    if [[ "${CPU_ARCH}" == "arm64" ]] && [[ "${MACOS_MAJOR}" -ge 13 ]]; then
        # Apple Silicon + macOS 13+: use Virtualization.framework (vz) for much
        # lower hypervisor overhead vs QEMU. virtiofs gives better I/O than sshfs.
        colima start --cpu 8 --memory 12 --disk 60 \
            --vm-type vz --mount-type virtiofs
    elif [[ "${CPU_ARCH}" == "arm64" ]]; then
        colima start --cpu 8 --memory 12 --disk 60 --mount-type virtiofs
    else
        # Intel Mac
        colima start --cpu 4 --memory 8 --disk 60 --mount-type virtiofs
    fi
fi

## Ensure docker socket is reachable
if ! docker info >/dev/null 2>&1; then
    echo "ERROR: Docker is not reachable. Check that Colima is running: colima status"
    exit 1
fi
echo "Docker is available via Colima."

## Create /tmp/sagemaker inside the Colima VM.
## On macOS, Docker runs inside Colima's Linux VM so bind-mounts must exist there,
## not on the macOS host. /tmp persists across colima stop/start but not colima delete.
colima ssh -- sudo mkdir -p /tmp/sagemaker
colima ssh -- sudo chmod -R ug+w /tmp/sagemaker
echo "/tmp/sagemaker created inside Colima VM."

## Ensure Colima auto-starts on login (launchd)
if ! launchctl list 2>/dev/null | grep -q "com.abiosoft.colima.default"; then
    brew services start colima || true
fi

## Completion message
echo ""
echo "First stage done. Log out and back in, then run init.sh -c ${CLOUD_NAME} -a ${ARCH}"
echo ""
echo "Notes:"
echo "  - Log out and back in for the new default shell (bash 5) to take effect."
echo "  - Colima must be running before using DeepRacer-for-Cloud."
echo "    Start it manually with: colima start"
echo "  - On Apple Silicon (arm64), amd64/x86_64 container images require"
echo "    Rosetta 2. Install it with: softwareupdate --install-rosetta"
echo "    Then restart Colima with: colima start --arch x86_64"
echo "  - No reboot is required."


================================================
FILE: bin/prepare.sh
================================================
#!/usr/bin/env bash

set -euo pipefail
trap ctrl_c INT

function ctrl_c() {
    echo "Requested to stop."
    exit 1
}

export DEBIAN_FRONTEND=noninteractive
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

# Only allow supported Ubuntu versions
. /etc/os-release
SUPPORTED_VERSIONS=("22.04" "24.04" "24.10" "25.04" "25.10")
DISTRIBUTION=${ID}${VERSION_ID//./}
UBUNTU_MAJOR_VERSION=$(echo $VERSION_ID | cut -d. -f1)
UBUNTU_MINOR_VERSION=$(echo $VERSION_ID | cut -d. -f2)
if [[ "$ID" == "ubuntu" ]]; then
    VERSION_OK=false
    for V in "${SUPPORTED_VERSIONS[@]}"; do
        if [[ "$VERSION_ID" == "$V" ]]; then
            VERSION_OK=true
            break
        fi
    done
    if [[ "$VERSION_OK" != true ]]; then
        echo "ERROR: Ubuntu $VERSION_ID is not a supported version. Supported versions: ${SUPPORTED_VERSIONS[*]}"
        exit 1
    fi
fi

## Check if WSL2
IS_WSL2=""
if grep -qi Microsoft /proc/version && grep -q "WSL2" /proc/version; then
    IS_WSL2="yes"
fi

# Remove needrestart in all Ubuntu 2x.04/2x.10+ (future-proof)
if [[ "${ID}" == "ubuntu" && ${UBUNTU_MAJOR_VERSION} -ge 22 && -z "${IS_WSL2}" ]]; then
    sudo apt remove -y needrestart || true
fi

## Patch system
sudo apt update && sudo apt-mark hold grub-pc && sudo apt -y -o \
    DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq upgrade

## Install required packages
sudo apt install --no-install-recommends -y jq python3-boto3 python3-venv screen git curl

## Install AWS CLI
if [[ "${ID}" == "ubuntu" && ( ${UBUNTU_MAJOR_VERSION} -eq 22 ) ]]; then
    sudo apt install -y awscli
else
    if command -v snap >/dev/null 2>&1; then
        sudo snap install aws-cli --classic
    else
        echo "WARNING: snap not available, AWS CLI not installed"
    fi
fi

## Create Python virtual environment
VENV_DIR="${DIR}/../.venv"
if [[ ! -d "${VENV_DIR}" ]]; then
    echo "Creating Python virtual environment at ${VENV_DIR}"
    python3 -m venv --prompt drfc "${VENV_DIR}"
fi
echo "Installing Python requirements into virtual environment"
"${VENV_DIR}/bin/pip" install --quiet -r "${DIR}/../requirements.txt"

## Detect cloud
source $DIR/detect.sh
echo "Detected cloud type ${CLOUD_NAME}"

## Do I have a GPU
GPUS=0
if [[ -z "${IS_WSL2}" ]]; then
    GPUS=$(lspci | awk '/NVIDIA/ && ( /VGA/ || /3D controller/ ) ' | wc -l)
else
    if [[ -f /usr/lib/wsl/lib/nvidia-smi ]]; then
        GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
    fi
fi
if [ $? -ne 0 ] || [ $GPUS -eq 0 ]; then
    ARCH="cpu"
    echo "No NVIDIA GPU detected. Will not install drivers."
else
    ARCH="gpu"
fi

## Adding Nvidia Drivers
if [[ "${ARCH}" == "gpu" && -z "${IS_WSL2}" ]]; then
    DRIVER_OK=false
    # Find all installed nvidia-driver-XXX packages (status 'ii'), extract version, and check if >= 525
    for PKG in $(dpkg -l | awk '$1 == "ii" && /nvidia-driver-[0-9]+/ {print $2}'); do
        DRIVER_VER=$(echo "${PKG}" | sed -E 's/nvidia-driver-([0-9]+).*/\1/')
        if [[ ${DRIVER_VER} -ge 560 ]]; then
            echo "NVIDIA driver ${DRIVER_VER} already installed."
            DRIVER_OK=true
            break
        fi
    done
    if [[ "${DRIVER_OK}" != true ]]; then
        # Try to install the highest available driver >= 560
        HIGHEST_DRIVER=$(apt-cache search --names-only '^nvidia-driver-[0-9]+$' | awk '{print $1}' | grep -oE '[0-9]+$' | awk '$1 >= 560' | sort -nr | head -n1)
        if [[ -n "${HIGHEST_DRIVER}" ]]; then
            sudo apt install -y "nvidia-driver-${HIGHEST_DRIVER}" --no-install-recommends -o Dpkg::Options::="--force-overwrite"
        elif apt-cache show nvidia-driver-560-server &>/dev/null; then
            sudo apt install -y nvidia-driver-560-server --no-install-recommends -o Dpkg::Options::="--force-overwrite"
        else
            echo "No supported NVIDIA driver >= 560 found for this Ubuntu version."
            exit 1
        fi
    fi
fi

## Installing Docker
sudo apt install -y --no-install-recommends docker.io docker-buildx docker-compose-v2

## Install Nvidia Docker Container
if [[ "${ARCH}" == "gpu" ]]; then
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg &&
        curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
            sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

    sudo apt update && sudo apt install -y --no-install-recommends nvidia-docker2 nvidia-container-runtime
    if [ -f "/etc/docker/daemon.json" ]; then
        echo "Altering /etc/docker/daemon.json with default-runtime nvidia."
        cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json
    else
        echo "Creating /etc/docker/daemon.json with default-runtime nvidia."
        sudo cp "${DIR}/../defaults/docker-daemon.json" /etc/docker/daemon.json
    fi
fi

## Enable and start docker
if [[ -n "${IS_WSL2}" ]]; then
    sudo service docker restart
else
    sudo systemctl enable docker
    sudo systemctl restart docker
fi

## Ensure user can run docker
sudo usermod -a -G docker "$(id -un)"

## Reboot to load driver -- continue install if in cloud-init
CLOUD_INIT=$(pstree -s $BASHPID | awk /cloud-init/ | wc -l)

if [[ "${CLOUD_INIT}" -ne 0 ]]; then
    echo "Rebooting in 5 seconds. Will continue with install."
    cd "${DIR}"
    ./runonce.sh "./init.sh -c ${CLOUD_NAME} -a ${ARCH}"
    sleep 5s
    sudo shutdown -r +1
elif [[ -n "${IS_WSL2}" || "${ARCH}" == "cpu" ]]; then
    echo "First stage done. Log out, then log back in and run init.sh -c ${CLOUD_NAME} -a ${ARCH}"
    echo "Note: You may need to log out and back in for docker group membership to take effect."
else
    echo "First stage done. Please reboot and run init.sh -c ${CLOUD_NAME} -a ${ARCH}"
    echo "Note: Reboot is required for NVIDIA drivers and docker group membership to take effect."
fi


================================================
FILE: bin/runonce.sh
================================================
#!/usr/bin/env bash

if [[ $# -eq 0 ]]; then
    echo "Schedules a command to be run after the next reboot."
    echo "Usage: $(basename $0) <command>"
    echo "       $(basename $0) -p <path> <command>"
    echo "       $(basename $0) -r <command>"
else
    REMOVE=0
    COMMAND=${!#}
    SCRIPTPATH=$PATH

    while getopts ":r:p:" optionName; do
        case "$optionName" in
        r)
            REMOVE=1
            COMMAND=$OPTARG
            ;;
        p) SCRIPTPATH=$OPTARG ;;
        esac
    done

    SCRIPT="${HOME}/.$(basename $0)_$(echo $COMMAND | sed 's/[^a-zA-Z0-9_]/_/g')"

    if [[ ! -f $SCRIPT ]]; then
        echo "PATH=$SCRIPTPATH" >>$SCRIPT
        echo "cd $(pwd)" >>$SCRIPT
        echo "logger -t $(basename $0) -p local3.info \"COMMAND=$COMMAND ; USER=\$(whoami) ($(logname)) ; PWD=$(pwd) ; PATH=\$PATH\"" >>$SCRIPT
        echo "$COMMAND | logger -t $(basename $0) -p local3.info" >>$SCRIPT
        echo "$0 -r \"$(echo $COMMAND | sed 's/\"/\\\"/g')\"" >>$SCRIPT
        chmod +x $SCRIPT
    fi

    CRONTAB="${HOME}/.$(basename $0)_temp_crontab_$RANDOM"
    ENTRY="@reboot $SCRIPT"

    echo "$(crontab -l 2>/dev/null)" | grep -v "$ENTRY" | grep -v "^# DO NOT EDIT THIS FILE - edit the master and reinstall.$" | grep -v "^# ([^ ]* installed on [^)]*)$" | grep -v "^# (Cron version [^$]*\$[^$]*\$)$" >$CRONTAB

    if [[ $REMOVE -eq 0 ]]; then
        echo "$ENTRY" >>$CRONTAB
    fi

    crontab $CRONTAB
    rm $CRONTAB

    if [[ $REMOVE -ne 0 ]]; then
        rm $SCRIPT
    fi
fi


================================================
FILE: bin/scripts_wrapper.sh
================================================
#!/usr/bin/env bash

function _dr_is_macos {
  [[ "$(uname -s)" == "Darwin" ]]
}

if ! declare -F _realpath >/dev/null 2>&1; then
  function _realpath {
    if command -v realpath >/dev/null 2>&1; then
      realpath "$1"
    elif command -v grealpath >/dev/null 2>&1; then
      grealpath "$1"
    elif ! _dr_is_macos && readlink -f / >/dev/null 2>&1; then
      readlink -f "$1"
    else
      python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1"
    fi
  }
fi
export -f _realpath

function _dr_require_colima {
  if ! _dr_is_macos; then
    return 0
  fi

  if ! command -v colima >/dev/null 2>&1; then
    echo "ERROR: Colima is required on macOS. Run bin/prepare-mac.sh or install and start Colima." >&2
    return 1
  fi

  if ! colima status 2>/dev/null | grep -qi "running"; then
    echo "ERROR: Colima is not running. Start it with: colima start" >&2
    return 1
  fi
}

function _dr_ensure_sagemaker_dir {
  if _dr_is_macos; then
    _dr_require_colima || return 1
    colima ssh -- sudo mkdir -p /tmp/sagemaker
    colima ssh -- sudo chmod -R ug+w /tmp/sagemaker
  elif [ ! -d /tmp/sagemaker ]; then
    sudo mkdir -p /tmp/sagemaker
    sudo chmod -R g+w /tmp/sagemaker
  fi
}

function _dr_runtime_cat {
  if _dr_is_macos; then
    _dr_require_colima || return 1
    colima ssh -- sudo cat "$1"
  else
    sudo cat "$1"
  fi
}

function _dr_find_sagemaker_compose_files {
  local compose_service_name="$1"

  if _dr_is_macos; then
    _dr_require_colima || return 1
    colima ssh -- sudo env COMPOSE_SERVICE_NAME="$compose_service_name" sh -lc 'find /tmp/sagemaker -name docker-compose.yaml -exec grep -l -- "$COMPOSE_SERVICE_NAME" {} +'
  else
    sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l -- "$compose_service_name" {} +
  fi
}

function _dr_compose_file_matches_run {
  local compose_file="$1"
  local compose_content

  compose_content=$(_dr_runtime_cat "$compose_file" 2>/dev/null) || return 1
  grep -Fq "RUN_ID=${DR_RUN_ID}" <<<"$compose_content" && grep -Fq "${DR_LOCAL_S3_MODEL_PREFIX}" <<<"$compose_content"
}

function dr-upload-custom-files {
  eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/)
  echo "Uploading files to $CUSTOM_TARGET"
  if [[ -z $DR_EXPERIMENT_NAME ]]; then
    aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET
  else
    aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/experiments/$DR_EXPERIMENT_NAME/custom_files/ $CUSTOM_TARGET
  fi
}

function dr-upload-model {
  dr-update-env && ${DR_DIR}/scripts/upload/upload-model.sh "$@"
}

function dr-download-model {
  dr-update-env && ${DR_DIR}/scripts/upload/download-model.sh "$@"
}

function dr-upload-car-zip {
  dr-update-env && ${DR_DIR}/scripts/upload/upload-car.sh "$@"
}

function dr-list-aws-models {
  echo "Due to changes in AWS DeepRacer Console this command is no longer available."
}

function dr-set-upload-model {
  echo "Due to changes in AWS DeepRacer Console this command is no longer available."
}

function dr-increment-upload-model {
  dr-update-env && ${DR_DIR}/scripts/upload/increment.sh "$@" && dr-update-env
}

function dr-download-custom-files {
  eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/)
  echo "Downloading files from $CUSTOM_TARGET"
  if [[ -z $DR_EXPERIMENT_NAME ]]; then
    aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DR_DIR/custom_files/
  else
    aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DR_DIR/experiments/$DR_EXPERIMENT_NAME/custom_files/
  fi
}

function dr-start-training {
  dr-update-env
  $DR_DIR/scripts/training/start.sh "$@"
}

function dr-increment-training {
  dr-update-env && ${DR_DIR}/scripts/training/increment.sh "$@" && dr-update-env
}

function dr-stop-training {
  bash -c "cd $DR_DIR/scripts/training && ./stop.sh"
}

function dr-start-evaluation {
  dr-update-env
  $DR_DIR/scripts/evaluation/start.sh "$@"
}

function dr-stop-evaluation {
  bash -c "cd $DR_DIR/scripts/evaluation && ./stop.sh"
}

function dr-stop-all {
  # Step 1: Stop all stacks (swarm) or all compose projects (compose)
  if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
    docker stack ls --format '{{.Name}}' | while read -r STACK; do
      echo "Removing stack: $STACK"
      docker stack rm "$STACK"
    done
  else
    while IFS=$'\t' read -r NAME CONFIGS; do
      echo "Stopping compose project: $NAME"
      local CONFIG_FLAGS
      CONFIG_FLAGS=$(echo "$CONFIGS" | tr ',' '\n' | sed 's/^/-f /' | tr '\n' ' ')
      docker compose $CONFIG_FLAGS -p "$NAME" down
    done < <(docker compose ls --format json 2>/dev/null \
      | jq -r '.[] | [.Name, .ConfigFiles] | @tsv')
  fi

  # Step 2: Stop the s3/minio stack if still running
  if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
    if docker stack ls --format '{{.Name}}' | grep -qx 's3'; then
      echo "Removing stack: s3"
      docker stack rm s3
    fi
  else
    if docker compose ls --format json 2>/dev/null | jq -e '.[] | select(.Name == "s3")' >/dev/null 2>&1; then
      echo "Stopping compose project: s3"
      docker compose -p s3 down
    fi
  fi
  echo "Waiting 10 seconds for stacks and services to stop..."
  sleep 10
  # Step 3: Stop any remaining containers still attached to sagemaker-local
  local REMAINING
  REMAINING=$(docker network inspect sagemaker-local --format '{{json .Containers}}' 2>/dev/null \
    | jq -r 'keys[] | select(test("^[0-9a-f]{64}$"))' 2>/dev/null)
  if [[ -n "$REMAINING" ]]; then
    echo "Stopping remaining containers on sagemaker-local:"
    echo "$REMAINING" | while read -r CONTAINER_ID; do
      local CONTAINER_NAME
      CONTAINER_NAME=$(docker inspect --format '{{.Name}}' "$CONTAINER_ID" | sed 's|^/||')
      echo "  Stopping: $CONTAINER_NAME"
      docker stop "$CONTAINER_ID"
    done
  fi
}

function dr-start-tournament {
  echo "Tournaments are no longer supported. Use Head-to-Model evaluation instead."
}

function dr-start-loganalysis {
  bash -c "cd $DR_DIR/scripts/log-analysis && ./start.sh"
}

function dr-stop-loganalysis {
  eval LOG_ANALYSIS_ID=$(docker ps | awk ' /deepracer-analysis/ { print $1 }')
  if [ -n "$LOG_ANALYSIS_ID" ]; then
    bash -c "cd $DR_DIR/scripts/log-analysis && ./stop.sh"
  else
    echo "Log-analysis is not running."
  fi

}

function dr-logs-sagemaker {

  local OPTIND
  OPT_TIME="--since 5m"

  while getopts ":w:a" opt; do
    case $opt in
    w)
      OPT_WAIT=$OPTARG
      ;;
    a)
      OPT_TIME=""
      ;;
    \?)
      echo "Invalid option -$OPTARG" >&2
      ;;
    esac
  done

  SAGEMAKER_CONTAINER=$(dr-find-sagemaker)

  if [[ -z "$SAGEMAKER_CONTAINER" ]]; then
    if [[ -n "$OPT_WAIT" ]]; then
      WAIT_TIME=$OPT_WAIT
      echo "Waiting up to $WAIT_TIME seconds for Sagemaker to start up..."
      until [ -n "$SAGEMAKER_CONTAINER" ]; do
        sleep 1
        ((WAIT_TIME--))
        if [ "$WAIT_TIME" -lt 1 ]; then
          echo "Sagemaker is not running."
          return 1
        fi
        SAGEMAKER_CONTAINER=$(dr-find-sagemaker)
      done
    else
      echo "Sagemaker is not running."
      return 1
    fi
  fi

  if [[ "$TERM_PROGRAM" == "vscode" ]]; then
    echo "VS Code terminal detected. Displaying Sagemaker logs inline."
    docker logs $OPT_TIME -f $SAGEMAKER_CONTAINER
  elif [[ "${DR_HOST_X,,}" == "true" && -n "$DISPLAY" ]]; then
    if [ -x "$(command -v gnome-terminal)" ]; then
      gnome-terminal --tab --title "DR-${DR_RUN_ID}: Sagemaker - ${SAGEMAKER_CONTAINER}" -- /usr/usr/bin/env bash -c "docker logs $OPT_TIME -f ${SAGEMAKER_CONTAINER}" 2>/dev/null
      echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate gnome-terminal. "
    elif [ -x "$(command -v x-terminal-emulator)" ]; then
      x-terminal-emulator -e /bin/sh -c "docker logs $OPT_TIME -f ${SAGEMAKER_CONTAINER}" 2>/dev/null
      echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate terminal. "
    else
      echo 'Could not find a terminal emulator. Displaying inline.'
      docker logs $OPT_TIME -f $SAGEMAKER_CONTAINER
    fi
  else
    docker logs $OPT_TIME -f $SAGEMAKER_CONTAINER
  fi

}

function dr-find-sagemaker {

  STACK_NAME="deepracer-$DR_RUN_ID"
  SAGEMAKER_CONTAINERS=$(docker ps | awk ' /simapp/ { print $1 } ' | xargs)

  if [[ -n "$SAGEMAKER_CONTAINERS" ]]; then
      for CONTAINER in $SAGEMAKER_CONTAINERS; do
          CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER)
          CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)-(algo-(.)-(.*))/; print $1')
          COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)-(algo-(.)-(.*))/; print $2')

          if [[ -n "$COMPOSE_SERVICE_NAME" ]]; then
                COMPOSE_FILES=$(_dr_find_sagemaker_compose_files "$COMPOSE_SERVICE_NAME")
              for COMPOSE_FILE in $COMPOSE_FILES; do
                  if _dr_compose_file_matches_run "$COMPOSE_FILE"; then
                      echo $CONTAINER
                  fi
              done
          fi
      done
  fi

}

function dr-logs-robomaker {

  OPT_REPLICA=1
  OPT_EVAL=""
  local OPTIND
  OPT_TIME="--since 5m"

  while getopts ":w:n:ea" opt; do
    case $opt in
    w)
      OPT_WAIT=$OPTARG
      ;;
    n)
      OPT_REPLICA=$OPTARG
      ;;
    e)
      OPT_EVAL="-e"
      ;;
    a)
      OPT_TIME=""
      ;;
    \?)
      echo "Invalid option -$OPTARG" >&2
      ;;
    esac
  done

  ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA} ${OPT_EVAL})

  if [[ -z "$ROBOMAKER_CONTAINER" ]]; then
    if [[ -n "$OPT_WAIT" ]]; then
      WAIT_TIME=$OPT_WAIT
      echo "Waiting up to $WAIT_TIME seconds for Robomaker #${OPT_REPLICA} to start up..."
      until [ -n "$ROBOMAKER_CONTAINER" ]; do
        sleep 1
        ((WAIT_TIME--))
        if [ "$WAIT_TIME" -lt 1 ]; then
          echo "Robomaker #${OPT_REPLICA} is not running."
          return 1
        fi
        ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA} ${OPT_EVAL})
      done
    else
      echo "Robomaker #${OPT_REPLICA} is not running."
      return 1
    fi
  fi

  if [[ "$TERM_PROGRAM" == "vscode" ]]; then
    echo "VS Code terminal detected. Displaying Robomaker #${OPT_REPLICA} logs inline."
    docker logs $OPT_TIME -f $ROBOMAKER_CONTAINER
  elif [[ "${DR_HOST_X,,}" == "true" && -n "$DISPLAY" ]]; then
    if [ -x "$(command -v gnome-terminal)" ]; then
      gnome-terminal --tab --title "DR-${DR_RUN_ID}: Robomaker #${OPT_REPLICA} - ${ROBOMAKER_CONTAINER}" -- /usr/usr/bin/env bash -c "docker logs $OPT_TIME -f ${ROBOMAKER_CONTAINER}" 2>/dev/null
      echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate gnome-terminal. "
    elif [ -x "$(command -v x-terminal-emulator)" ]; then
      x-terminal-emulator -e /bin/sh -c "docker logs $OPT_TIME -f ${ROBOMAKER_CONTAINER}" 2>/dev/null
      echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate terminal. "
    else
      echo 'Could not find a terminal emulator. Displaying inline.'
      docker logs $OPT_TIME -f $ROBOMAKER_CONTAINER
    fi
  else
    docker logs $OPT_TIME -f $ROBOMAKER_CONTAINER
  fi

}

function dr-find-robomaker {

  local OPTIND

  OPT_PREFIX="deepracer"

  while getopts ":n:e" opt; do
    case $opt in
    n)
      OPT_REPLICA=$OPTARG
      ;;
    e)
      OPT_PREFIX="-eval"
      ;;
    \?)
      echo "Invalid option -$OPTARG" >&2
      ;;
    esac
  done

  if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
    eval ROBOMAKER_ID=$(docker ps | grep "${OPT_PREFIX}-${DR_RUN_ID}_robomaker.${OPT_REPLICA}" | cut -f1 -d\  | head -1)
  else
    eval ROBOMAKER_ID=$(docker ps | grep "${OPT_PREFIX}-${DR_RUN_ID}-robomaker-${OPT_REPLICA}" | cut -f1 -d\  | head -1)
  fi

  if [ -n "$ROBOMAKER_ID" ]; then
    echo $ROBOMAKER_ID
  fi
}

function dr-get-robomaker-stats {

  local OPTIND
  OPT_REPLICA=1

  while getopts ":n:" opt; do
    case $opt in
    n)
      OPT_REPLICA=$OPTARG
      ;;
    \?)
      echo "Invalid option -$OPTARG" >&2
      ;;
    esac
  done

  eval ROBOMAKER_ID=$(dr-find-robomaker -n $OPT_REPLICA)
  if [ -n "$ROBOMAKER_ID" ]; then
    echo "Showing statistics for Robomaker #$OPT_REPLICA - container $ROBOMAKER_ID"
    docker exec -ti $ROBOMAKER_ID bash -c "gz stats"
  else
    echo "Robomaker #$OPT_REPLICA is not running."
  fi
}

function dr-logs-loganalysis {
  eval LOG_ANALYSIS_ID=$(docker ps | awk ' /deepracer-analysis/ { print $1 }')
  if [ -n "$LOG_ANALYSIS_ID" ]; then
    docker logs -f $LOG_ANALYSIS_ID
  else
    echo "Log-analysis is not running."
  fi

}

function dr-url-loganalysis {
  LOG_ANALYSIS_ID=$(docker ps --filter "name=deepracer-analysis" --format "{{.ID}}" | head -1)
  if [ -n "$LOG_ANALYSIS_ID" ]; then
    URL=$(docker logs "$LOG_ANALYSIS_ID" 2>&1 | grep -oE 'http://127\.0\.0\.1:[0-9]+[^ ]*token=[a-f0-9]+' | tail -1)
    if [ -n "$URL" ]; then
      echo "${URL/127.0.0.1/localhost}"
    else
      echo "Jupyter URL not found yet. Try again in a moment."
    fi
  else
    echo "Log-analysis is not running."
  fi
}

function dr-view-stream {
  ${DR_DIR}/utils/start-local-browser.sh "$@"
}

function dr-start-viewer {
  $DR_DIR/scripts/viewer/start.sh "$@"
}

function dr-stop-viewer {
  $DR_DIR/scripts/viewer/stop.sh "$@"
}

function dr-update-viewer {
  $DR_DIR/scripts/viewer/stop.sh "$@"
  $DR_DIR/scripts/viewer/start.sh "$@"
}

function dr-start-metrics {
  $DR_DIR/scripts/metrics/start.sh "$@"
}

function dr-stop-metrics {
  $DR_DIR/scripts/metrics/stop.sh "$@"
}

================================================
FILE: defaults/debug-reward_function.py
================================================
import math
import numpy
import time

class Reward:

    '''
    Debugging reward function to be used to track performance of local training.
    Will print out the Real-Time-Factor (RTF), as well as how many 
    steps-per-second (sim-time) that the system is able to deliver.
    '''

    def __init__(self, verbose=False, track_time=False):
        self.verbose = verbose
        self.track_time = track_time

        if track_time:
            TIME_WINDOW=10
            self.time = numpy.zeros([TIME_WINDOW, 2])

        if verbose:
            print("Initializing Reward Class")

    def get_time(self):

        wall_time_incr = numpy.max(self.time[:,0]) - numpy.min(self.time[:,0])
        sim_time_incr = numpy.max(self.time[:,1]) - numpy.min(self.time[:,1])
        
        rtf = sim_time_incr / wall_time_incr
        fps = (self.time.shape[0] - 1) / sim_time_incr

        return rtf, fps
    
    def record_time(self, steps, sim_time=0.0):

        index = int(steps) % self.time.shape[0]
        self.time[index,0] = time.time()
        self.time[index,1] = sim_time

    def reward_function(self, params):

        # Read input parameters
        steps = params["steps"]

        if self.track_time:
            self.record_time(steps, sim_time=params.get("sim_time", 0.0))

        if self.track_time:
            if steps >= self.time.shape[0]:
                rtf, fps = self.get_time()
                print("TIME: s: {}, rtf: {}, fps:{}".format(int(steps), round(rtf, 2), round(fps, 2) ))

        return 1.0


reward_object = Reward(verbose=False, track_time=True)

def reward_function(params):
    return reward_object.reward_function(params)


================================================
FILE: defaults/dependencies.json
================================================
{
    "master_version": "6.0",
    "containers": {
        "simapp": "6.0.4"
    }
}


================================================
FILE: defaults/docker-daemon.json
================================================
{
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    },
    "default-runtime": "nvidia"
}

================================================
FILE: defaults/hyperparameters.json
================================================
{
    "batch_size": 64,
    "beta_entropy": 0.01,
    "discount_factor": 0.99,
    "e_greedy_value": 0.05,
    "epsilon_steps": 10000,
    "exploration_type": "categorical",
    "loss_type": "huber",
    "lr": 0.0003,
    "num_episodes_between_training": 20,
    "num_epochs": 5,
    "stack_size": 1,
    "term_cond_avg_score": 350.0,
    "term_cond_max_episodes": 1000,
    "sac_alpha": 0.2
  }

================================================
FILE: defaults/model_metadata.json
================================================
{
    "action_space": [
        {
            "steering_angle": -30,
            "speed": 0.6
        },
        {
            "steering_angle": -15,
            "speed": 0.6
        },
        {
            "steering_angle": 0,
            "speed": 0.6
        },
        {
            "steering_angle": 15,
            "speed": 0.6
        },
        {
            "steering_angle": 30,
            "speed": 0.6
        }
    ],
    "sensor": ["FRONT_FACING_CAMERA"],
    "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW",
    "training_algorithm": "clipped_ppo", 
    "action_space_type": "discrete",
    "version": "5"
}


================================================
FILE: defaults/model_metadata_cont.json
================================================
{
    "action_space": {
        "speed": {
            "high": 2,
            "low": 1
        },
        "steering_angle": {
            "high": 30,
            "low": -30
        }
    },
    "sensor": [
        "FRONT_FACING_CAMERA"
    ],
    "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW",
    "training_algorithm": "clipped_ppo",
    "action_space_type": "continuous",
    "version": "5"
}

================================================
FILE: defaults/model_metadata_sac.json
================================================
{
    "action_space": {"speed": {"high": 2, "low": 1}, "steering_angle": {"high": 30, "low": -30}},
    "sensor": ["FRONT_FACING_CAMERA"],
    "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW",
    "training_algorithm": "sac", 
    "action_space_type": "continuous",
    "version": "4"
}


================================================
FILE: defaults/reward_function.py
================================================
def reward_function(params):
    '''
    Example of penalize steering, which helps mitigate zig-zag behaviors
    '''
    
    # Read input parameters
    distance_from_center = params['distance_from_center']
    track_width = params['track_width']
    steering = abs(params['steering_angle']) # Only need the absolute steering angle

    # Calculate 3 marks that are farther and father away from the center line
    marker_1 = 0.1 * track_width
    marker_2 = 0.25 * track_width
    marker_3 = 0.5 * track_width

    # Give higher reward if the car is closer to center line and vice versa
    if distance_from_center <= marker_1:
        reward = 1
    elif distance_from_center <= marker_2:
        reward = 0.5
    elif distance_from_center <= marker_3:
        reward = 0.1
    else:
        reward = 1e-3  # likely crashed/ close to off track

    # Steering penality threshold, change the number based on your action space setting
    ABS_STEERING_THRESHOLD = 15

    # Penalize reward if the car is steering too much
    if steering > ABS_STEERING_THRESHOLD:
        reward *= 0.8

    return float(reward)


================================================
FILE: defaults/template-run.env
================================================
DR_RUN_ID=0
DR_WORLD_NAME=reinvent_base
DR_RACE_TYPE=TIME_TRIAL
DR_CAR_NAME=FastCar
DR_CAR_BODY_SHELL_TYPE=deepracer
DR_CAR_COLOR=Red
DR_DISPLAY_NAME=$DR_CAR_NAME
DR_RACER_NAME=$DR_CAR_NAME
DR_ENABLE_DOMAIN_RANDOMIZATION=False
DR_EVAL_NUMBER_OF_TRIALS=3
DR_EVAL_IS_CONTINUOUS=True
DR_EVAL_MAX_RESETS=100
DR_EVAL_OFF_TRACK_PENALTY=5.0
DR_EVAL_COLLISION_PENALTY=5.0
DR_EVAL_SAVE_MP4=False
DR_EVAL_CHECKPOINT=last
DR_EVAL_OPP_S3_MODEL_PREFIX=rl-deepracer-sagemaker
DR_EVAL_OPP_CAR_BODY_SHELL_TYPE=deepracer
DR_EVAL_OPP_CAR_NAME=FasterCar
DR_EVAL_OPP_DISPLAY_NAME=$DR_EVAL_OPP_CAR_NAME
DR_EVAL_OPP_RACER_NAME=$DR_EVAL_OPP_CAR_NAME
DR_EVAL_DEBUG_REWARD=False
DR_EVAL_RESET_BEHIND_DIST=1.0
DR_EVAL_REVERSE_DIRECTION=False
#DR_EVAL_RTF=1.0
DR_TRAIN_CHANGE_START_POSITION=True
DR_TRAIN_REVERSE_DIRECTION=False
DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False
DR_TRAIN_START_POSITION_OFFSET=0.0
DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05
DR_TRAIN_MULTI_CONFIG=False
DR_TRAIN_MIN_EVAL_TRIALS=5
DR_TRAIN_BEST_MODEL_METRIC=progress
#DR_TRAIN_RTF=1.0
#DR_TRAIN_MAX_STEPS_PER_ITERATION=10000
DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker
DR_LOCAL_S3_PRETRAINED=False
DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained
DR_LOCAL_S3_PRETRAINED_CHECKPOINT=last
DR_LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files
DR_LOCAL_S3_TRAINING_PARAMS_FILE=training_params.yaml
DR_LOCAL_S3_EVAL_PARAMS_FILE=evaluation_params.yaml
DR_LOCAL_S3_MODEL_METADATA_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/model_metadata.json
DR_LOCAL_S3_HYPERPARAMETERS_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/hyperparameters.json
DR_LOCAL_S3_REWARD_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/reward_function.py
DR_LOCAL_S3_METRICS_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX/metrics
DR_UPLOAD_S3_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX-1
DR_OA_NUMBER_OF_OBSTACLES=6
DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0
DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False
DR_OA_IS_OBSTACLE_BOT_CAR=False
DR_OA_OBSTACLE_TYPE=box_obstacle
DR_OA_OBJECT_POSITIONS=
DR_H2B_IS_LANE_CHANGE=False
DR_H2B_LOWER_LANE_CHANGE_TIME=3.0
DR_H2B_UPPER_LANE_CHANGE_TIME=5.0
DR_H2B_LANE_CHANGE_DISTANCE=1.0
DR_H2B_NUMBER_OF_BOT_CARS=3
DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0
DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False
DR_H2B_BOT_CAR_SPEED=0.2
DR_H2B_BOT_CAR_PENALTY=5.0

================================================
FILE: defaults/template-system.env
================================================
DR_CLOUD=<CLOUD_REPLACE>
DR_AWS_APP_REGION=<REGION_REPLACE>
DR_UPLOAD_S3_PROFILE=default
DR_UPLOAD_S3_BUCKET=<AWS_DR_BUCKET>
DR_LOCAL_S3_BUCKET=bucket
DR_LOCAL_S3_PROFILE=<LOCAL_PROFILE>
DR_GUI_ENABLE=False
DR_KINESIS_STREAM_NAME=
DR_CAMERA_MAIN_ENABLE=True
DR_CAMERA_SUB_ENABLE=False
DR_CAMERA_KVS_ENABLE=True
DR_ENABLE_EXTRA_KVS_OVERLAY=False
DR_SIMAPP_SOURCE=awsdeepracercommunity/deepracer-simapp
DR_SIMAPP_VERSION=<SIMAPP_VERSION_TAG>
DR_MINIO_IMAGE=latest
DR_ANALYSIS_IMAGE=cpu
DR_WORKERS=1
DR_ROBOMAKER_MOUNT_LOGS=False
# DR_ROBOMAKER_MOUNT_SIMAPP_DIR=
# DR_ROBOMAKER_MOUNT_SCRIPTS_DIR=${DR_DIR}/data/scripts
DR_CLOUD_WATCH_ENABLE=False
DR_CLOUD_WATCH_LOG_STREAM_PREFIX=
DR_DOCKER_STYLE=<DOCKER_STYLE>
DR_HOST_X=False
DR_WEBVIEWER_PORT=8100
DR_QUIET_ACTIVATE=False
# DR_DISPLAY=:99
# DR_REMOTE_MINIO_URL=http://mynas:9000
# DR_ROBOMAKER_CUDA_DEVICES=0
# DR_SAGEMAKER_CUDA_DEVICES=0
# DR_EXPERIMENT_NAME=
# DR_TELEGRAF_HOST=telegraf
# DR_TELEGRAF_PORT=8092
## DRoA Integration
# DR_DROA_URL=https://xxxx.cloudfront.net
# DR_DROA_USERNAME=user@example.com


================================================
FILE: defaults/template-worker.env
================================================
DR_WORLD_NAME=reInvent2019_track
DR_RACE_TYPE=TIME_TRIAL
DR_CAR_COLOR=Blue
DR_ENABLE_DOMAIN_RANDOMIZATION=False
DR_TRAIN_CHANGE_START_POSITION=True
DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False
DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05
DR_TRAIN_START_POSITION_OFFSET=0.0
DR_OA_NUMBER_OF_OBSTACLES=6
DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0
DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False
DR_OA_IS_OBSTACLE_BOT_CAR=False
DR_OA_OBSTACLE_TYPE=box_obstacle
DR_OA_OBJECT_POSITIONS=
DR_H2B_IS_LANE_CHANGE=False
DR_H2B_LOWER_LANE_CHANGE_TIME=3.0
DR_H2B_UPPER_LANE_CHANGE_TIME=5.0
DR_H2B_LANE_CHANGE_DISTANCE=1.0
DR_H2B_NUMBER_OF_BOT_CARS=3
DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0
DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False
DR_H2B_BOT_CAR_SPEED=0.2


================================================
FILE: docker/docker-compose-aws.yml
================================================
version: '3.7'

services:
  rl_coach:
    environment:
      - AWS_METADATA_SERVICE_TIMEOUT=3
      - AWS_METADATA_SERVICE_NUM_ATTEMPTS=5
  robomaker:
    environment:
      - AWS_METADATA_SERVICE_TIMEOUT=3
      - AWS_METADATA_SERVICE_NUM_ATTEMPTS=5


================================================
FILE: docker/docker-compose-cwlog.yml
================================================
version: '3.7'

services:
  rl_coach:
    logging:
      driver: awslogs
      options:
        awslogs-group: '/deepracer-for-cloud'
        awslogs-create-group: 'true'
        awslogs-region: ${DR_AWS_APP_REGION}
        tag: "${DR_CLOUD_WATCH_LOG_STREAM_PREFIX}{{.Name}}"
  robomaker:
    logging:
      driver: awslogs
      options:
        awslogs-group: '/deepracer-for-cloud'
        awslogs-create-group: 'true' 
        awslogs-region: ${DR_AWS_APP_REGION}
        tag: "${DR_CLOUD_WATCH_LOG_STREAM_PREFIX}{{.Name}}"

================================================
FILE: docker/docker-compose-endpoint.yml
================================================
version: '3.7'

services:
  rl_coach:
    environment:
      - S3_ENDPOINT_URL=${DR_MINIO_URL}
  robomaker:
    environment:
      - S3_ENDPOINT_URL=${DR_MINIO_URL}


================================================
FILE: docker/docker-compose-eval-swarm.yml
================================================
version: '3.7'

services:
  rl_coach:
    deploy:
      restart_policy:
        condition: none
      placement:
        constraints: [node.labels.Sagemaker == true ]
  robomaker:
    deploy:
      restart_policy:
        condition: none
      replicas: 1
      placement:
        constraints: [node.labels.Robomaker == true ]
    environment:
        - DOCKER_REPLICA_SLOT={{.Task.Slot}}

================================================
FILE: docker/docker-compose-eval.yml
================================================
version: '3.7'

networks:
  default:
    external: true
    name: sagemaker-local

services:
  rl_coach:
    image: ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
    command: ["/bin/bash", "-c", "echo No work for coach in Evaluation Mode"]
  robomaker:
    image: ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
    command: ["${ROBOMAKER_COMMAND:-}"]
    ports:
      - "${DR_ROBOMAKER_EVAL_PORT}:8080"
    environment:
      - CUDA_VISIBLE_DEVICES=${DR_ROBOMAKER_CUDA_DEVICES:-}
      - DEBUG_REWARD=${DR_EVAL_DEBUG_REWARD}
      - WORLD_NAME=${DR_WORLD_NAME}
      - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX}
      - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET}      
      - APP_REGION=${DR_AWS_APP_REGION}
      - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE}
      - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME}
      - ENABLE_KINESIS=${DR_CAMERA_KVS_ENABLE}
      - ENABLE_GUI=${DR_GUI_ENABLE}
      - ROLLOUT_IDX=0
      - RTF_OVERRIDE=${DR_EVAL_RTF:-}
      - ROS_MASTER_URI=http://localhost:11311/
      - ROS_IP=127.0.0.1
      - GAZEBO_ARGS=${DR_GAZEBO_ARGS:-}
      - GAZEBO_RENDER_ENGINE=${DR_GAZEBO_RENDER_ENGINE:-ogre2}
      - TELEGRAF_HOST=${DR_TELEGRAF_HOST:-}
      - TELEGRAF_PORT=${DR_TELEGRAF_PORT:-}
    init: true

================================================
FILE: docker/docker-compose-keys.yml
================================================
version: '3.7'

services:
  rl_coach:
    environment:
      - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY}
  robomaker:
    environment:
      - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY}


================================================
FILE: docker/docker-compose-local-xorg-wsl.yml
================================================
version: '3.7'

services:
  robomaker:
    environment:
      - DISPLAY
      - USE_EXTERNAL_X=${DR_HOST_X}
      - QT_X11_NO_MITSHM=1
      - LD_LIBRARY_PATH=/usr/lib/wsl/lib
    volumes:
      - '/tmp/.X11-unix/:/tmp/.X11-unix'
      - '/mnt/wslg:/mnt/wslg'
      - '/usr/lib/wsl:/usr/lib/wsl'
    devices:
      - /dev/dxg


================================================
FILE: docker/docker-compose-local-xorg.yml
================================================
version: '3.7'

services:
  robomaker:
    environment:
      - DISPLAY
      - USE_EXTERNAL_X=${DR_HOST_X}
      - XAUTHORITY=/root/.Xauthority
      - QT_X11_NO_MITSHM=1
      - NVIDIA_DRIVER_CAPABILITIES=all
    volumes:
      - '/tmp/.X11-unix/:/tmp/.X11-unix'
      - '${XAUTHORITY}:/root/.Xauthority'

================================================
FILE: docker/docker-compose-local.yml
================================================

version: '3.7'

networks:
  default:
    external: true
    name: sagemaker-local

services:
  minio:
    image: minio/minio:${DR_MINIO_IMAGE}
    ports:
      - "9000:9000"
      - "9001:9001"
    command: server /data --console-address ":9001"
    environment:
      - MINIO_ROOT_USER=${DR_LOCAL_ACCESS_KEY_ID}
      - MINIO_ROOT_PASSWORD=${DR_LOCAL_SECRET_ACCESS_KEY}
      - MINIO_UID
      - MINIO_GID
      - MINIO_USERNAME
      - MINIO_GROUPNAME
    volumes:
      - ${DR_DIR}/data/minio:/data


================================================
FILE: docker/docker-compose-metrics.yml
================================================

version: '3.7'

networks:
  default:
    external: true
    name: sagemaker-local

services:
  telegraf:
    image: telegraf:1.18-alpine
    volumes:
    - ./metrics/telegraf/etc/telegraf.conf:/etc/telegraf/telegraf.conf:ro
    depends_on:
      - influxdb
    links:
      - influxdb
    ports:
    - '127.0.0.1:8125:8125/udp'
    - '127.0.0.1:8092:8092/udp'

  influxdb:
    image: influxdb:1.8-alpine
    env_file: ./metrics/configuration.env
    ports:
      - '127.0.0.1:8886:8086'
    volumes:
      - influxdb_data:/var/lib/influxdb

  grafana:
    image: grafana/grafana:10.4.2
    depends_on:
      - influxdb
    env_file: ./metrics/configuration.env
    links:
      - influxdb
    ports:
      - '3000:3000'
    volumes:
      - grafana_data:/var/lib/grafana
      - ./metrics/grafana/provisioning/:/etc/grafana/provisioning/

volumes:
  grafana_data: {}
  influxdb_data: {}


================================================
FILE: docker/docker-compose-mount.yml
================================================
version: '3.7'

services:
  robomaker:
    volumes:
      - "${DR_MOUNT_DIR}:/root/.ros/log"


================================================
FILE: docker/docker-compose-robomaker-multi.yml
================================================
version: '3.7'

services:
  robomaker:
    volumes:
      - "${DR_DIR}/tmp/comms.${DR_RUN_ID}:/mnt/comms"


================================================
FILE: docker/docker-compose-robomaker-scripts.yml
================================================
version: '3.7'

services:
  robomaker:
    volumes:
      - '${DR_ROBOMAKER_MOUNT_SCRIPTS_DIR}:/scripts'

================================================
FILE: docker/docker-compose-simapp.yml
================================================
version: '3.7'

services:
  robomaker:
    volumes:
      - '${DR_ROBOMAKER_MOUNT_SIMAPP_DIR}:/opt/simapp'

================================================
FILE: docker/docker-compose-training-swarm.yml
================================================
version: '3.7'

services:
  rl_coach:
    deploy:
      restart_policy:
        condition: none
      placement:
        constraints: [node.labels.Sagemaker == true ]
  robomaker:
    deploy:
      restart_policy:
        condition: none
      replicas: ${DR_WORKERS}
      placement:
        constraints: [node.labels.Robomaker == true ]
    environment:
        - DOCKER_REPLICA_SLOT={{.Task.Slot}}

================================================
FILE: docker/docker-compose-training.yml
================================================
version: "3.7"

networks:
  default:
    external: true
    name: sagemaker-local

services:
  rl_coach:
    image: ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
    command: ["source /root/sagemaker-venv/bin/activate && python3 /opt/ml/code/rl_coach/start.py"]
    working_dir: "/opt/ml/code/"
    environment:
      - RUN_ID=${DR_RUN_ID}
      - AWS_REGION=${DR_AWS_APP_REGION}
      - SAGEMAKER_IMAGE=${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
      - PRETRAINED=${DR_LOCAL_S3_PRETRAINED}
      - PRETRAINED_S3_PREFIX=${DR_LOCAL_S3_PRETRAINED_PREFIX}
      - PRETRAINED_S3_BUCKET=${DR_LOCAL_S3_BUCKET}
      - PRETRAINED_CHECKPOINT=${DR_LOCAL_S3_PRETRAINED_CHECKPOINT}
      - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX}
      - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET}
      - HYPERPARAMETER_FILE_S3_KEY=${DR_LOCAL_S3_HYPERPARAMETERS_KEY}
      - MODELMETADATA_FILE_S3_KEY=${DR_LOCAL_S3_MODEL_METADATA_KEY}
      - CUDA_VISIBLE_DEVICES=${DR_SAGEMAKER_CUDA_DEVICES:-}
      - MAX_MEMORY_STEPS=${DR_TRAIN_MAX_STEPS_PER_ITERATION:-}
      - TELEGRAF_HOST=${DR_TELEGRAF_HOST:-}
      - TELEGRAF_PORT=${DR_TELEGRAF_PORT:-}

    volumes:
      - "/var/run/docker.sock:/var/run/docker.sock"
      - "/tmp/sagemaker:/tmp/sagemaker"
  robomaker:
    image: ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}
    command: ["${ROBOMAKER_COMMAND:-}"]
    ports:
      - "${DR_ROBOMAKER_TRAIN_PORT}:8080"
      - "${DR_ROBOMAKER_GUI_PORT}:5900"
    environment:
      - WORLD_NAME=${DR_WORLD_NAME}
      - SAGEMAKER_SHARED_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX}
      - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET}
      - APP_REGION=${DR_AWS_APP_REGION}
      - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE}
      - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME}
      - ENABLE_KINESIS=${DR_CAMERA_KVS_ENABLE}
      - ENABLE_GUI=${DR_GUI_ENABLE}
      - CUDA_VISIBLE_DEVICES=${DR_ROBOMAKER_CUDA_DEVICES:-}
      - MULTI_CONFIG
      - RTF_OVERRIDE=${DR_TRAIN_RTF:-}      
      - ROS_MASTER_URI=http://localhost:11311/
      - ROS_IP=127.0.0.1
      - GAZEBO_ARGS=${DR_GAZEBO_ARGS:-}
      - GAZEBO_RENDER_ENGINE=${DR_GAZEBO_RENDER_ENGINE:-ogre2}
      - TELEGRAF_HOST=${DR_TELEGRAF_HOST:-}
      - TELEGRAF_PORT=${DR_TELEGRAF_PORT:-}
    init: true


================================================
FILE: docker/docker-compose-webviewer-swarm.yml
================================================
version: '3.7'

networks:
  default:
    external: true
    name: sagemaker-local

services:
  proxy:
    deploy:
      restart_policy:
        condition: none
      replicas: 1
      placement:
        constraints: [node.labels.Sagemaker == true ]


================================================
FILE: docker/docker-compose-webviewer.yml
================================================
version: '3.7'

networks:
  default:
    external: true
    name: sagemaker-local

services:
  proxy:
    image: nginx
    ports:
      - "${DR_WEBVIEWER_PORT}:80"
    volumes:
      - ${DR_VIEWER_HTML}:/usr/share/nginx/html/index.html
      - ${DR_NGINX_CONF}:/etc/nginx/conf.d/default.conf


================================================
FILE: docker/metrics/configuration.env
================================================
# Grafana options
GF_SECURITY_ADMIN_USER=admin
GF_SECURITY_ADMIN_PASSWORD=admin
GF_INSTALL_PLUGINS=

# InfluxDB options
INFLUXDB_DB=influx
INFLUXDB_ADMIN_USER=admin
INFLUXDB_ADMIN_PASSWORD=admin


================================================
FILE: docker/metrics/grafana/provisioning/dashboards/dashboard.yml
================================================
apiVersion: 1

providers:
- name: 'Default'
  folder: ''
  options:
    path: /etc/grafana/provisioning/dashboards


================================================
FILE: docker/metrics/grafana/provisioning/dashboards/deepracer-training-template.json
================================================
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "datasource",
          "uid": "grafana"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "limit": 100,
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 2,
  "id": 1,
  "links": [],
  "panels": [
    {
      "datasource": {},
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 9,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "reward"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 11,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 6,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "mean",
            "max",
            "lastNotNull"
          ],
          "displayMode": "table",
          "placement": "bottom",
          "showLegend": true,
          "sortBy": "Max",
          "sortDesc": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "alias": "$tag_model training reward",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "A",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "reward"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "mean"
              }
            ]
          ],
          "tags": [
            {
              "key": "phase",
              "operator": "=",
              "value": "training"
            }
          ]
        },
        {
          "alias": "$tag_model complete lap reward",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "B",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "reward"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "mean"
              }
            ]
          ],
          "tags": [
            {
              "key": "status",
              "operator": "=",
              "value": "Lap complete"
            }
          ]
        },
        {
          "alias": "$tag_model eval reward",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "C",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "reward"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "mean"
              }
            ]
          ],
          "tags": [
            {
              "key": "phase",
              "operator": "=",
              "value": "evaluation"
            }
          ]
        }
      ],
      "title": "Reward",
      "type": "timeseries"
    },
    {
      "datasource": {},
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "points",
            "fillOpacity": 3,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 4,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": [
          {
            "matcher": {
              "id": "byRegexp",
              "options": ".*eval progress moving average"
            },
            "properties": [
              {
                "id": "custom.drawStyle",
                "value": "line"
              },
              {
                "id": "custom.showPoints",
                "value": "never"
              },
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byRegexp",
              "options": ".*training progress moving average"
            },
            "properties": [
              {
                "id": "custom.drawStyle",
                "value": "line"
              },
              {
                "id": "custom.showPoints",
                "value": "never"
              },
              {
                "id": "color",
                "value": {
                  "fixedColor": "blue",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 10,
        "w": 24,
        "x": 0,
        "y": 11
      },
      "id": 4,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "mean",
            "max"
          ],
          "displayMode": "table",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "alias": "$tag_model training progress",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "A",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "progress"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "mean"
              }
            ]
          ],
          "tags": [
            {
              "key": "phase",
              "operator": "=",
              "value": "training"
            }
          ]
        },
        {
          "alias": "$tag_model eval progress",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "B",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "progress"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "mean"
              }
            ]
          ],
          "tags": [
            {
              "key": "phase",
              "operator": "=",
              "value": "evaluation"
            }
          ]
        },
        {
          "alias": "$tag_model eval progress moving average",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "C",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "progress"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "mean"
              },
              {
                "params": [
                  30
                ],
                "type": "moving_average"
              }
            ]
          ],
          "tags": [
            {
              "key": "phase",
              "operator": "=",
              "value": "evaluation"
            }
          ]
        },
        {
          "alias": "$tag_model training progress moving average",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "D",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "progress"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "mean"
              },
              {
                "params": [
                  30
                ],
                "type": "moving_average"
              }
            ]
          ],
          "tags": [
            {
              "key": "phase",
              "operator": "=",
              "value": "training"
            }
          ]
        }
      ],
      "title": "Progress",
      "type": "timeseries"
    },
    {
      "datasource": {},
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "points",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 4,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "decimals": 3,
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "ms"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byRegexp",
              "options": ".*eval lap moving average"
            },
            "properties": [
              {
                "id": "custom.drawStyle",
                "value": "line"
              },
              {
                "id": "custom.showPoints",
                "value": "never"
              },
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              },
              {
                "id": "custom.lineWidth",
                "value": 2
              }
            ]
          },
          {
            "matcher": {
              "id": "byRegexp",
              "options": ".*training lap moving average"
            },
            "properties": [
              {
                "id": "custom.drawStyle",
                "value": "line"
              },
              {
                "id": "custom.showPoints",
                "value": "never"
              },
              {
                "id": "color",
                "value": {
                  "fixedColor": "blue",
                  "mode": "fixed"
                }
              },
              {
                "id": "custom.lineWidth",
                "value": 2
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 10,
        "w": 24,
        "x": 0,
        "y": 21
      },
      "id": 2,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "mean",
            "max",
            "lastNotNull"
          ],
          "displayMode": "table",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "alias": "$tag_model training lap ",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "A",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "elapsed_time"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "min"
              }
            ]
          ],
          "tags": [
            {
              "key": "status",
              "operator": "=",
              "value": "Lap complete"
            },
            {
              "condition": "AND",
              "key": "phase",
              "operator": "=",
              "value": "training"
            }
          ]
        },
        {
          "alias": "$tag_model eval lap ",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "B",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "elapsed_time"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "min"
              }
            ]
          ],
          "tags": [
            {
              "key": "status",
              "operator": "=",
              "value": "Lap complete"
            },
            {
              "condition": "AND",
              "key": "phase",
              "operator": "=",
              "value": "evaluation"
            }
          ]
        },
        {
          "alias": "$tag_model eval lap moving average",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "C",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "elapsed_time"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "min"
              },
              {
                "params": [
                  30
                ],
                "type": "moving_average"
              }
            ]
          ],
          "tags": [
            {
              "key": "status",
              "operator": "=",
              "value": "Lap complete"
            },
            {
              "condition": "AND",
              "key": "phase",
              "operator": "=",
              "value": "evaluation"
            }
          ]
        },
        {
          "alias": "$tag_model training lap moving average",
          "datasource": {
            "type": "influxdb",
            "uid": "${DS_INFLUXDB}"
          },
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "hide": false,
          "measurement": "dr_training_episodes",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "D",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "elapsed_time"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "min"
              },
              {
                "params": [
                  30
                ],
                "type": "moving_average"
              }
            ]
          ],
          "tags": [
            {
              "key": "status",
              "operator": "=",
              "value": "Lap complete"
            },
            {
              "condition": "AND",
              "key": "phase",
              "operator": "=",
              "value": "training"
            }
          ]
        }
      ],
      "title": "Training Complete Lap times",
      "type": "timeseries"
    },
    {
      "datasource": {},
      "description": "",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "points",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": [
          {
            "matcher": {
              "id": "byRegexp",
              "options": ".*entropy moving average"
            },
            "properties": [
              {
                "id": "custom.drawStyle",
                "value": "line"
              },
              {
                "id": "custom.showPoints",
                "value": "never"
              },
              {
                "id": "color",
                "value": {
                  "fixedColor": "blue",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 11,
        "w": 24,
        "x": 0,
        "y": 31
      },
      "id": 7,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "mean",
            "max",
            "lastNotNull"
          ],
          "displayMode": "table",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "alias": "$tag_model entropy",
          "datasource": {},
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            },
            {
              "params": [
                "none"
              ],
              "type": "fill"
            }
          ],
          "measurement": "dr_sagemaker_epochs",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "A",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "entropy"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "mean"
              }
            ]
          ],
          "tags": []
        },
        {
          "alias": "$tag_model entropy moving average",
          "datasource": {},
          "groupBy": [
            {
              "params": [
                "$__interval"
              ],
              "type": "time"
            },
            {
              "params": [
                "model"
              ],
              "type": "tag"
            }
          ],
          "hide": false,
          "measurement": "dr_sagemaker_epochs",
          "orderByTime": "ASC",
          "policy": "default",
          "refId": "B",
          "resultFormat": "time_series",
          "select": [
            [
              {
                "params": [
                  "entropy"
                ],
                "type": "field"
              },
              {
                "params": [],
                "type": "mean"
              },
              {
                "params": [
                  10
                ],
                "type": "moving_average"
              }
            ]
          ],
          "tags": []
        }
      ],
      "title": "Epoch",
      "type": "timeseries"
    }
  ],
  "refresh": "10s",
  "schemaVersion": 39,
  "tags": [],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "",
  "title": "DeepRacer Training template",
  "uid": "adke0lwv5zwg0e",
  "version": 1,
  "weekStart": ""
}

================================================
FILE: docker/metrics/grafana/provisioning/datasources/influxdb.yml
================================================
# config file version
apiVersion: 1

# list of datasources that should be deleted from the database
deleteDatasources:
  - name: Influxdb
    orgId: 1

# list of datasources to insert/update depending
# whats available in the database
datasources:
  # <string, required> name of the datasource. Required
- name: InfluxDB
  # <string, required> datasource type. Required
  type: influxdb
  # <string, required> access mode. direct or proxy. Required
  access: proxy
  # <int> org id. will default to orgId 1 if not specified
  orgId: 1
  # <string> url
  url: http://influxdb:8086
  # <string> database password, if used
  password: "admin"
  # <string> database user, if used
  user: "admin"
  # <string> database name, if used
  database: "influx"
  # <bool> enable/disable basic auth
  basicAuth: false
#  withCredentials:
  # <bool> mark as default datasource. Max one per org
  isDefault: true
  # <map> fields that will be converted to json and stored in json_data
  jsonData:
    timeInterval: "5s"
#     graphiteVersion: "1.1"
#     tlsAuth: false
#     tlsAuthWithCACert: false
#  # <string> json object of data that will be encrypted.
#  secureJsonData:
#    tlsCACert: "..."
#    tlsClientCert: "..."
#    tlsClientKey: "..."
  version: 1
  # <bool> allow users to edit datasources from the UI.
  editable: false


================================================
FILE: docker/metrics/telegraf/etc/telegraf.conf
================================================
# Telegraf configuration

# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.

# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.

# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.

# Global tags can be specified here in key="value" format.
[global_tags]
  # dc = "us-east-1" # will tag all metrics with dc=us-east-1
  # rack = "1a"

# Configuration for telegraf agent
[agent]
  ## Default data collection interval for all inputs
  interval = "5s"
  ## Rounds collection interval to 'interval'
  ## ie, if interval="10s" then always collect on :00, :10, :20, etc.
  round_interval = true

  ## Telegraf will cache metric_buffer_limit metrics for each output, and will
  ## flush this buffer on a successful write.
  metric_buffer_limit = 10000
  ## Flush the buffer whenever full, regardless of flush_interval.
  flush_buffer_when_full = true

  ## Collection jitter is used to jitter the collection by a random amount.
  ## Each plugin will sleep for a random time within jitter before collecting.
  ## This can be used to avoid many plugins querying things like sysfs at the
  ## same time, which can have a measurable effect on the system.
  collection_jitter = "0s"

  ## Default flushing interval for all outputs. You shouldn't set this below
  ## interval. Maximum flush_interval will be flush_interval + flush_jitter
  flush_interval = "1s"
  ## Jitter the flush interval by a random amount. This is primarily to avoid
  ## large write spikes for users running a large number of telegraf instances.
  ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
  flush_jitter = "0s"

  ## Run telegraf in debug mode
  debug = false
  ## Run telegraf in quiet mode
  quiet = false
  ## Override default hostname, if empty use os.Hostname()
  hostname = ""


###############################################################################
#                                  OUTPUTS                                    #
###############################################################################

# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]
  # The full HTTP or UDP endpoint URL for your InfluxDB instance.
  # Multiple urls can be specified but it is assumed that they are part of the same
  # cluster, this means that only ONE of the urls will be written to each interval.
  # urls = ["udp://localhost:8089"] # UDP endpoint example
  urls = ["http://influxdb:8086"] # required
  # The target database for metrics (telegraf will create it if not exists)
  database = "influx" # required
  # Precision of writes, valid values are "ns", "us" (or "µs"), "ms", "s", "m", "h".
  # note: using second precision greatly helps InfluxDB compression
  precision = "s"

  ## Write timeout (for the InfluxDB client), formatted as a string.
  ## If not provided, will default to 5s. 0s means no timeout (not recommended).
  timeout = "5s"
  # username = "telegraf"
  # password = "metricsmetricsmetricsmetrics"
  # Set the user agent for HTTP POSTs (can be useful for log differentiation)
  # user_agent = "telegraf"
  # Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
  # udp_payload = 512


###############################################################################
#                                  INPUTS                                     #
###############################################################################
# Statsd Server
[[inputs.statsd]]
  ## Protocol, must be "tcp", "udp4", "udp6" or "udp" (default=udp)
  protocol = "udp"

  ## MaxTCPConnection - applicable when protocol is set to tcp (default=250)
  max_tcp_connections = 250

  ## Enable TCP keep alive probes (default=false)
  tcp_keep_alive = false

  ## Specifies the keep-alive period for an active network connection.
  ## Only applies to TCP sockets and will be ignored if tcp_keep_alive is false.
  ## Defaults to the OS configuration.
  # tcp_keep_alive_period = "2h"

  ## Address and port to host UDP listener on
  service_address = ":8125"

  ## The following configuration options control when telegraf clears it's cache
  ## of previous values. If set to false, then telegraf will only clear it's
  ## cache when the daemon is restarted.
  ## Reset gauges every interval (default=true)
  delete_gauges = true
  ## Reset counters every interval (default=true)
  delete_counters = true
  ## Reset sets every interval (default=true)
  delete_sets = true
  ## Reset timings & histograms every interval (default=true)
  delete_timings = true

  ## Percentiles to calculate for timing & histogram stats
  percentiles = [90]

  ## separator to use between elements of a statsd metric
  metric_separator = "_"

  ## Parses tags in the datadog statsd format
  ## http://docs.datadoghq.com/guides/dogstatsd/
  parse_data_dog_tags = false

  ## Statsd data translation templates, more info can be read here:
  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#graphite
  # templates = [
  #     "cpu.* measurement*"
  # ]

  ## Number of UDP messages allowed to queue up, once filled,
  ## the statsd server will start dropping packets
  allowed_pending_messages = 10000

  ## Number of timing/histogram values to track per-measurement in the
  ## calculation of percentiles. Raising this limit increases the accuracy
  ## of percentiles but also increases the memory usage and cpu time.
  percentile_limit = 1000

  ## Maximum socket buffer size in bytes, once the buffer fills up, metrics
  ## will start dropping.  Defaults to the OS default.
  # read_buffer_size = 65535

# Read metrics about cpu usage
[[inputs.cpu]]
  ## Whether to report per-cpu stats or not
  percpu = true
  ## Whether to report total system cpu stats or not
  totalcpu = true
  ## Comment this line if you want the raw CPU time metrics
  fielddrop = ["time_*"]


# Read metrics about disk usage by mount point
[[inputs.disk]]
  ## By default, telegraf gather stats for all mountpoints.
  ## Setting mountpoints will restrict the stats to the specified mountpoints.
  # mount_points = ["/"]

  ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
  ## present on /run, /var/run, /dev/shm or /dev).
  ignore_fs = ["tmpfs", "devtmpfs"]


# Read metrics about disk IO by device
[[inputs.diskio]]
  ## By default, telegraf will gather stats for all devices including
  ## disk partitions.
  ## Setting devices will restrict the stats to the specified devices.
  # devices = ["sda", "sdb"]
  ## Uncomment the following line if you need disk serial numbers.
  # skip_serial_number = false


# Get kernel statistics from /proc/stat
[[inputs.kernel]]
  # no configuration


# Read metrics about memory usage
[[inputs.mem]]
  # no configuration


# Get the number of processes and group them by status
[[inputs.processes]]
  # no configuration


# Read metrics about swap memory usage
[[inputs.swap]]
  # no configuration


# Read metrics about system load & uptime
[[inputs.system]]
  # no configuration

# Read metrics about network interface usage
[[inputs.net]]
  # collect data only about specific interfaces
  # interfaces = ["eth0"]


[[inputs.netstat]]
  # no configuration

[[inputs.interrupts]]
  # no configuration

[[inputs.linux_sysctl_fs]]
  # no configuration

[[inputs.socket_listener]]
  service_address = "udp://:8092"

================================================
FILE: docs/_config.yml
================================================
---
theme: jekyll-theme-slate
markdown: GFM
name: Deepracer-for-Cloud
plugins:
  - jekyll-relative-links
relative_links:
  enabled:     true
  collections: false

================================================
FILE: docs/docker.md
================================================
# About the Docker setup

DRfC supports running Docker in to modes `swarm` and `compose` - this behaviour is configured in `system.env` through `DR_DOCKER_STYLE`.

## Swarm Mode

Docker Swarm mode is the default. Docker Swarm makes it possible to connect multiple hosts together to spread the load -- esp. useful if one wants to run multiple Robomaker workers, but can also be useful locally if one has two computers that each are not powerful enough to run DeepRacer.

In Swarm mode DRfC creates Stacks, using `docker stack`. During operations one can check running stacks through `docker stack ls`, and running services through `docker stack <id> ls`.

DRfC is installed only on the manager. (The first installed host.) Swarm workers are 'dumb' and do not need to have DRfC installed.

### Key features

* Allows user to connect multiple computers on the same network. (In AWS the instances must be connected on same VPC, and instances must be allowed to communicate.)
* Supports [multiple Robomaker workers](multi_worker.md)
* Supports [running multiple parallel experiments](multi_run.md)

### Limitations

* The Sagemaker container can only be run on the manager.
* Docker images are downloaded from Docker Hub. Locally built images are allowed only if they have a unique tag, not in Docker Hub. If you have multiple Docker nodes ensure that they all have the image available.

### Connecting Workers

* On the manager run `docker swarm join-token manager`.
* On the worker run the command that was displayed on the manager `docker swarm join --token <token> <ip>:<port>`.

### Ports

Docker Swarm will automatically put a load-balancer in front of all replicas in a service. This means that the ROS Web View, which provides a video stream of the DeepRacer during training, will be load balanced - sharing one port (`8080`). If you have multiple workers (even across multiple hosts) then press F5 to cycle through them. 

## Compose Mode

In Compose mode DRfC creates Services, using `docker compose`. During operations one can check running stacks through `docker service ls`, and running services through `docker service ps`.

### Key features

* Supports [multiple Robomaker workers](multi_worker.md)
* Supports [running multiple parallel experiments](multi_run.md)
* Supports [GPU Accelerated OpenGL for Robomaker](opengl.md)

### Limitations

* Workload cannot be spread across multiple hosts.

### Ports

In the case of using Docker Compose the different Robomaker worker will require unique ports for ROS Web Vew and VNC. Docker will assign these dynamically. Use `docker ps` to see which container has been assigned which ports.


================================================
FILE: docs/droa.md
================================================
# DeepRacer on AWS (DRoA) Integration

[DeepRacer on AWS](https://aws.amazon.com/solutions/implementations/deepracer-on-aws/) is the community-hosted replacement for the original AWS DeepRacer console. DRfC includes a set of `droa-*` commands that let you manage models in your DRoA installation directly from the command line.

## Prerequisites

### Install DRoA

Follow the [DeepRacer on AWS installation guide](https://github.com/aws-deepracer-community/deepracer-on-aws) to deploy DRoA into your own AWS account.

### Configure DRfC

In `system.env` set:

```bash
DR_DROA_URL=https://<your-droa-domain>   # e.g. https://deepracer.aws.example.com
DR_DROA_USERNAME=<your-droa-email>
```

`DR_DROA_URL` is the base URL of your DRoA deployment. At runtime, DRfC fetches `<DR_DROA_URL>/env.js` to discover the region, Cognito pools, API endpoint, and upload bucket automatically — no additional AWS config required.

> **Security**: never store your DRoA password in `system.env`. All commands prompt for it interactively (or accept `--password` on the CLI). Credentials are cached in `~/.droa-cache/` for the duration of the session token.

### Python environment

Run `bin/prepare.sh` to create the `.venv` virtual environment and install the required Python packages (`boto3`, `pyyaml`, `requests`, `deepracer-utils`). After `source bin/activate.sh` the venv is active and all `droa-*` commands are available.

---

## Commands

### `droa-list-models`

List all models in your DRoA installation, sorted newest-first.

```
droa-list-models [--json]
```

Output columns: `modelId`, `name`, `status`, `trainingStatus`, `createdAt`.

| Status | Meaning |
|--------|---------|
| `IMPORTING` | Import in progress |
| `READY` | Available for evaluation |
| `TRAINING` | Training job running |
| `ERROR` | Import or training failed |
| `DELETING` | Deletion in progress |

---

### `droa-get-model`

Show details of a single model.

```
droa-get-model <modelId> [--verbose] [--summary] [--json]
```

| Flag | Description |
|------|-------------|
| *(none)* | Identity, car config, training config, metadata |
| `--verbose` | Adds action space and reward function source |
| `--summary` | Adds mean training metrics (reward, progress) via DeepRacer Utils |
| `--json` | Raw JSON output |

---

### `droa-download-logs`

Download training or evaluation logs for a model.

```
droa-download-logs <modelId> [--asset-type TRAINING_LOGS|EVALUATION_LOGS|PHYSICAL_CAR_MODEL|VIRTUAL_MODEL|VIDEOS]
                              [--evaluation-id <id>]
                              [--output <file>]
                              [--summary]
```

| Flag | Description |
|------|-------------|
| `--asset-type` | Asset type (default: `TRAINING_LOGS`) |
| `--evaluation-id` | Required when `--asset-type EVALUATION_LOGS` |
| `--output` / `-o` | Output file path (default: derived from the presigned URL filename) |
| `--summary` | Print DeepRacer Utils stability summary after download (TRAINING_LOGS only) |

The command polls until the asset is ready (up to 5 minutes for `VIRTUAL_MODEL`).

---

### `droa-delete-model`

Delete a model. Only models with status `READY` or `ERROR` can be deleted.

```
droa-delete-model <modelId> [-y/--yes]
```

Without `--yes`, you are shown the model name and status and must type the model name to confirm. Deletion is asynchronous — the model transitions to `DELETING` status.

---

### `droa-import-model`

Import a locally trained DRFC model into DRoA.

```
droa-import-model (--model-prefix <prefix> | --model-dir <dir>)
                  [--model-name <name>]
                  [--model-description <text>]
                  [--best | --checkpoint <step>]
```

#### Source options

| Option | Description |
|--------|-------------|
| `--model-prefix` | Pull directly from local MinIO S3 (`DR_LOCAL_S3_BUCKET`). Defaults `--model-name` to the prefix. |
| `--model-dir` | Use a pre-assembled local directory containing all required model files. |

#### Checkpoint selection (`--model-prefix` only)

| Flag | Behaviour |
|------|-----------|
| *(none)* | Use the last checkpoint |
| `--best` | Use the best checkpoint |
| `--checkpoint STEP` | Use the checkpoint at the given training step |

#### What happens

1. Model files are pulled from MinIO (path-style S3, using `DR_MINIO_URL` and `DR_LOCAL_S3_PROFILE`).
2. `training_params.yaml` is copied from the bucket (`training_params_1.yaml` preferred for multi-worker runs). If missing, it is generated from `DR_*` environment variables.
3. `WORLD_NAME` direction suffixes (`_cw`, `_ccw`) are stripped and `TRACK_DIRECTION_CLOCKWISE` is added — required by DRoA's track validation.
4. Files are uploaded to the DRoA S3 transit bucket and the import API is called.

#### Required files (when using `--model-dir`)

- `model_metadata.json`
- `reward_function.py`
- `training_params.yaml`
- `hyperparameters.json`

---

## Environment variables reference

| Variable | Location | Description |
|----------|----------|-------------|
| `DR_DROA_URL` | `system.env` | Base URL of your DRoA deployment |
| `DR_DROA_USERNAME` | `system.env` | DRoA login email |
| `DR_MINIO_URL` | `system.env` | MinIO endpoint URL (e.g. `http://minio:9000`) |
| `DR_LOCAL_S3_PROFILE` | `system.env` | boto3 AWS profile name for MinIO access |
| `DR_LOCAL_S3_BUCKET` | `run.env` | Local S3 bucket name |
| `DR_LOCAL_S3_MODEL_PREFIX` | `run.env` | Default model prefix for `--model-prefix` |

All `droa-*` commands also accept `--url`, `--username`, and `--password` flags to override the environment variables.


================================================
FILE: docs/head-to-head.md
================================================
# Head-to-Head Race (Beta)

It is possible to run a head-to-head race, similar to the races in the brackets 
run by AWS in the Virtual Circuits to  determine the winner of the head-to-bot races.

This replaces the "Tournament Mode".

## Introduction

The concept is that you have two models racing each other, one Purple and one Orange Car. One car
is powered by our primary configured model, and the second car is powered by the model in `DR_EVAL_OPP_S3_MODEL_PREFIX`

## Configuration

### run.env

Configure `run.env` with the following parameters:
* `DR_RACE_TYPE` should be `HEAD_TO_MODEL`.
* `DR_EVAL_OPP_S3_MODEL_PREFIX` will be the S3 prefix for the secondary model.
* `DR_EVAL_OPP_CAR_NAME` is the display name of this model.

Metrics, Traces and Videos will be stored in each models' prefix.

## Run

Run the race with `dr-start-evaluation`; one race will be run. 

================================================
FILE: docs/index.md
================================================
# Introduction

Provides a quick and easy way to get up and running with a DeepRacer training environment in AWS or Azure, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing), or locally on your own desktop or server.

DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://github.com/aws-deepracer-community/deepracer) repository.

# Main Features

DRfC supports a wide set of features to ensure that you can focus on creating the best model:
* User-friendly
	* Based on the continously updated community [Robomaker](https://github.com/aws-deepracer-community/deepracer-simapp) and [Sagemaker](https://github.com/aws-deepracer-community/deepracer-sagemaker-container) containers, supporting a wide range of CPU and GPU setups.
	* Wide set of scripts (`dr-*`) enables effortless training.
	* Detection of your AWS DeepRacer Console models; allows upload of a locally trained model to any of them.
* Modes
	* Time Trial
	* Object Avoidance
	* Head-to-Bot
* Training
	* Multiple Robomaker instances per Sagemaker (N:1) to improve training progress.
	* Multiple training sessions in parallel - each being (N:1) if hardware supports it - to test out things in parallel.
	* Connect multiple nodes together (Swarm-mode only) to combine the powers of multiple computers/instances.
* Evaluation
	* Evaluate independently from training.
	* Save evaluation run to MP4 file in S3.
* Logging
	* Training metrics and trace files are stored to S3.
	* Optional integration with AWS CloudWatch.
	* Optional exposure of Robomaker internal log-files.
* Technology
	* Supports both Docker Swarm (used for connecting multiple nodes together) and Docker Compose (used to support OpenGL)

# Documentation

* [Initial Installation](installation.md)
* [DeepRacer on AWS (DRoA) Integration](droa.md)
* [Reference](reference.md)
* [Using multiple Robomaker workers](multi_worker.md)
* [Managing experiments and running multiple parallel experiments](multi_run.md)
* [GPU Accelerated OpenGL for Robomaker](opengl.md)
* [Having multiple GPUs in one Computer](multi_gpu.md)
* [Installing on Windows](windows.md)
* [Run a Head-to-Head Race](head-to-head.md)
* [Watching the car](video.md)

# Support

* For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-training-local where the community provides active support.
* Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required.


================================================
FILE: docs/installation.md
================================================
# Installing Deepracer-for-Cloud

## Requirements

Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. Both CPU-only as well as GPU systems are supported.

**AWS**:

* EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge - for GPU enabled training. C5 or M6 types - recommendation is c5.2xlarge - for CPU training.
  * Ubuntu 20.04
  * Minimum 30 GB, preferred 40 GB of OS disk.
  * Ephemeral Drive connected
  * Minimum of 8 GB GPU-RAM if running with GPU.
  * Recommended at least 6 VCPUs
* S3 bucket. Preferrably in same region as EC2 instance.
* The internal `sagemaker-local` docker network runs by default on `192.168.2.0/24`. Ensure that your AWS IPC does not overlap with this subnet.

**Azure**:

* N-Series VM that comes with NVIDIA Graphics Adapter - recommendation is NC6_Standard
  * Ubuntu 20.04
  * Standard 30 GB OS drive is sufficient to get started.
  * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container.
  * Minimum 8 GB GPU-RAM
  * Recommended at least 6 VCPUs
* Storage Account with one Blob container configured for Access Key authentication.

**Local**:

* A modern, comparatively powerful, Intel based system.
  * Ubuntu 20.04, other Linux-dristros likely to work.
  * 4 core-CPU, equivalent to 8 vCPUs; the more the better.
  * NVIDIA Graphics adapter with minimum 8 GB RAM for Sagemaker to run GPU. Robomaker enabled GPU instances need ~1 GB each.
  * System RAM + GPU RAM should be at least 32 GB.
* Running DRfC Ubuntu 20.04 on Windows using Windows Subsystem for Linux 2 is possible. See [Installing on Windows](windows.md)

## Installation

The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine.

```shell
git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git
```

**For cloud setup** execute:

```shell
cd deepracer-for-cloud && ./bin/prepare.sh
```

This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. Depending on your environment this may take up to 30 minutes. The scripts will create a file `DONE` once completed.

The installation script will adapt `.profile` to ensure that all settings are applied on login. Otherwise run the activation with `source bin/activate.sh`.

**For local install** it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly.

See also the [following article](https://awstip.com/deepracer-for-cloud-drfc-local-setup-3c6418b2c75a) for guidance.

The Init Script takes a few parameters:

| Variable | Description |
|----------|-------------|
| `-c <cloud>` | Sets the cloud version to be configured, automatically updates the `DR_CLOUD` parameter in `system.env`. Options are `azure`, `aws` or `local`. Default is `local` |
| `-a <arch>` | Sets the architecture to be configured. Either `cpu` or `gpu`. Default is `gpu`. |

## Environment Setup

The initialization script will attempt to auto-detect your environment (`Azure`, `AWS` or `Local`), and store the outcome in the `DR_CLOUD` parameter in `system.env`. You can also pass in a `-c <cloud>` parameter to override it, e.g. if you want to run the minio-based `local` mode in the cloud.

The main difference between the mode is based on authentication mechanisms and type of storage being configured. The next chapters will review each type of environment on its own.

### AWS

In AWS it is possible to set up authentication to S3 in two ways: Integrated sign-on using [IAM Roles](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html) or using access keys.

#### IAM Role

To use IAM Roles:

* An empty S3 bucket in the same region as the EC2 instance.
* An IAM Role that has permissions to:
  * Access both the *new* S3 bucket as well as the DeepRacer bucket.
  * AmazonVPCReadOnlyAccess
  * AmazonKinesisVideoStreamsFullAccess if you want to stream to Kinesis
  * CloudWatch
* An EC2 instance with the defined IAM Role assigned.
* Configure `system.env` as follows:
  * `DR_LOCAL_S3_PROFILE=default`
  * `DR_LOCAL_S3_BUCKET=<bucketname>`
  * `DR_UPLOAD_S3_PROFILE=default`
  * `DR_UPLOAD_S3_BUCKET=<your-aws-deepracer-bucket>`
* Run `dr-update` for configuration to take effect.

#### Manual setup

For access with IAM user:

* An empty S3 bucket in the same region as the EC2 instance.
* A real AWS IAM user set up with access keys:
  * User should have permissions to access the *new* bucket as well as the dedicated DeepRacer S3 bucket.
  * Use `aws configure` to configure this into the default profile.
* Configure `system.env` as follows:
  * `DR_LOCAL_S3_PROFILE=default`
  * `DR_LOCAL_S3_BUCKET=<bucketname>`
  * `DR_UPLOAD_S3_PROFILE=default`
  * `DR_UPLOAD_S3_BUCKET=<your-aws-deepracer-bucket>`
* Run `dr-update` for configuration to take effect.

### Azure

Minio has deprecated the gateway feature that exposed an Azure Blob Storage as an S3 bucket. Azure mode now sets up minio in the same way as in local mode.

If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ...`, as this will set both `--profile` and `--endpoint-url` parameters to match your configuration.

### Local

Local mode runs a minio server that hosts the data in the `docker/volumes` directory. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3.

In Local mode the script-set requires the following:

* Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. You can choose any username or password, but username needs to be at least length 3, and password at least length 8.
* A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer.
* Configure `system.env` as follows:
  * `DR_LOCAL_S3_PROFILE=default`
  * `DR_LOCAL_S3_BUCKET=<bucketname>`
  * `DR_UPLOAD_S3_PROFILE=default`
  * `DR_UPLOAD_S3_BUCKET=<your-aws-deepracer-bucket>`
* Run `dr-update` for configuration to take effect.

## First Run

For the first run the following final steps are needed. This creates a training run with all default values in

* Define your custom files in `custom_files/` - samples can be found in `defaults` which you must copy over:
  * `hyperparameters.json` - definining the training hyperparameters
  * `model_metadata.json` - defining the action space and sensors
  * `reward_function.py` - defining the reward function
* Upload the files into the bucket with `dr-upload-custom-files`. This will also start minio if required.
* Start training with `dr-start-training`

After a while you will see the sagemaker logs on the screen.

## Troubleshooting

Here are some hints for troubleshooting specific issues you may encounter

### Local training troubleshooting

| Issue        | Troubleshooting hint |
|------------- | ---------------------|
Get messages like "Sagemaker is not running" | Run `docker -ps a` to see if the containers are running or if they stopped due to some errors. If running after a fresh install, try restarting the system.
Check docker errors for specific container | Run `docker logs -f <containerid>`
Get message "Error response from daemon: could not choose an IP address to advertise since this system has multiple addresses on interface <your_interface> ..." when running `./bin/init.sh -c local -a cpu` | It means you have multiple IP addresses and you need to specify one within `./bin/init.sh`.<br> If you don't care which one to use, you can get the first one by running ```ifconfig \| grep $(route \| awk '/^default/ {print $8}') -a1 \| grep -o -P '(?<=inet ).*(?= netmask)```.<br> Edit   `./bin/init.sh` and locate line `docker swarm init` and change it to `docker swarm init --advertise-addr <your_IP>`.<br> Rerun  `./bin/init.sh -c local -a cpu`
I don't have any of the `dr-*` commands | Run `source bin/activate.sh`.


================================================
FILE: docs/mac.md
================================================
# Running DeepRacer-for-Cloud on macOS

DRfC can be run on macOS, both on AWS Mac EC2 instances (mac1/mac2 family) and on local Mac hardware (Intel or Apple Silicon). Because macOS does not support NVIDIA GPUs, training always runs in **CPU mode**.

---

## Architecture overview

On macOS, Docker containers run inside a lightweight Linux VM managed by [Colima](https://github.com/abiosoft/colima) rather than directly on the host. This has a few implications you should be aware of:

| Concern | Impact |
|---|---|
| **No NVIDIA GPU** | Always `cpu` architecture; training is slower than a GPU instance |
| **Colima VM filesystem** | Bind-mount paths (e.g. `/tmp/sagemaker`) must exist inside the VM, not on the macOS host |
| **IMDS not reachable from VM** | IAM role credentials are not automatically available inside containers; explicit AWS keys must be configured |
| **BSD userland** | `sed`, `grep`, `sort`, `readlink` differ from GNU; key shell entrypoints use portable path handling, but custom scripts should still avoid assuming GNU-only flags |
| **bash 3.2 ships with macOS** | A modern bash 5 must be installed via Homebrew and set as the login shell |

---

## Option 1: AWS Mac EC2 instance

AWS offers bare-metal Mac instances (`mac1.metal` for Intel, `mac2.metal` / `mac2-m2.metal` for Apple Silicon). These run macOS natively and support EC2 features like IAM roles, S3, and instance metadata — with the IMDS caveat noted above.

### Prerequisites

* A Mac EC2 instance running macOS Monterey (12) or later
* An IAM role or IAM user with permissions for S3 (and optionally STS, CloudWatch)
* An S3 bucket in the same region as the instance

### Step 1 — Clone the repository

```bash
git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git
cd deepracer-for-cloud
```

### Step 2 — Run prepare-mac.sh

```bash
bash bin/prepare-mac.sh
```

This script will:

1. Verify macOS version compatibility
2. Install [Homebrew](https://brew.sh) if not present
3. Install required packages: `jq`, `python3`, `git`, `screen`, `bash`
4. Install bash 5 and set it as the default login shell
5. Add a `~/.bash_profile` bootstrap so bash 5 is used even when SSH starts `/bin/bash` (3.2)
6. Install the AWS CLI v2 via the official `.pkg` installer (avoids Homebrew Python conflicts)
7. Install [Colima](https://github.com/abiosoft/colima) and the Docker CLI
8. Start Colima (4 vCPUs, 8 GB RAM, 60 GB disk — adjust as needed)
9. Create `/tmp/sagemaker` inside the Colima VM
10. Install a launchd agent so Colima auto-starts on login

After the script completes, **log out and back in** so the new default shell takes effect.

### Step 3 — Configure AWS credentials

Because containers run inside Colima's Linux VM, they cannot reach the EC2 Instance Metadata Service at `169.254.169.254`. You must provide explicit AWS credentials:

```bash
aws configure --profile default
```

Enter an Access Key ID and Secret Access Key for an IAM user (or long-term credentials). The profile name must match `DR_LOCAL_S3_PROFILE` in `system.env` (default: `default` for AWS cloud setups).

> **Tip:** Create a dedicated IAM user with a policy scoped to your S3 bucket rather than using root or overly broad credentials.

### Step 4 — Run init.sh

```bash
bin/init.sh -c aws -a cpu
```

This sets up the directory structure, configures `system.env` and `run.env`, and pulls the Docker images. Image pulls may take a while depending on bandwidth.

### Step 5 — Activate and train

```bash
source bin/activate.sh
dr-upload-custom-files
dr-start-training -q
```

---

## Option 2: Local Mac (desktop/laptop)

Running DRfC locally on a Mac works for development and small-scale training. Performance is limited by CPU speed and memory.

### Differences from EC2

* No IAM role — configure an IAM user with `aws configure`
* `DR_CLOUD` should be set to `local` in `system.env`, which uses a local MinIO container as the S3 backend
* Colima memory and CPU limits should be tuned to your machine (leave headroom for the macOS host)

### Recommended Colima sizing

| Mac | Recommended Colima config |
|---|---|
| M1/M2/M3 with 16 GB RAM | `--cpu 6 --memory 10 --disk 60` |
| M1/M2/M3 with 32 GB RAM | `--cpu 10 --memory 20 --disk 60` |
| Intel with 16 GB RAM | `--cpu 4 --memory 8 --disk 60` |

To change the sizing after initial setup:

```bash
colima stop
colima start --cpu 6 --memory 10 --disk 60
```

### Apple Silicon (arm64) and container image architecture

The DRfC SimApp images are built for `amd64` (x86_64). On Apple Silicon, Colima runs them via emulation. This works but is slower. To enable it:

```bash
# Install Rosetta 2 if not already present
softwareupdate --install-rosetta

# Start Colima with x86_64 architecture
colima stop
colima start --arch x86_64 --cpu 4 --memory 8 --disk 60
```

> Note: Once Colima is started with `--arch x86_64`, it stays in that mode until deleted. You cannot mix architectures in the same Colima instance.

### Installation steps

```bash
git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git
cd deepracer-for-cloud
bash bin/prepare-mac.sh
# Log out and back in
bin/init.sh -c local -a cpu
source bin/activate.sh
dr-upload-custom-files
dr-start-training -q
```

---

## Known limitations

| Limitation | Notes |
|---|---|
| CPU-only training | No NVIDIA GPU support on macOS |
| IMDS not reachable from containers | Must use explicit AWS keys; IAM role auto-rotation does not work inside containers |
| `/tmp/sagemaker` must exist in Colima VM | Created automatically by `prepare-mac.sh` and `dr-start-training`; recreate manually after `colima delete` with `colima ssh -- sudo mkdir -p /tmp/sagemaker && colima ssh -- sudo chmod -R a+w /tmp/sagemaker` |
| Colima iptables rules reset on restart | Not relevant with the explicit-keys approach |
| `brew services` fails headlessly | Colima is started via a launchd plist instead |

---

## Troubleshooting

**`bash: ${VAR,,}: bad substitution`**  
You are running bash 3.2 (macOS built-in). Run `prepare-mac.sh` to install bash 5, then log out and back in.

**`No configuration file.`** when sourcing `activate.sh`  
`init.sh` has not been run yet, or `run.env` does not exist. Run `bin/init.sh` first.

**`docker: command not found`**  
Homebrew PATH is not set. Ensure `~/.bash_profile` contains `eval "$(brew shellenv)"` and re-source it.

**`NoCredentialsError: Unable to locate credentials`**  
Containers cannot reach IMDS. Run `aws configure --profile default` (or the profile matching `DR_LOCAL_S3_PROFILE`) on the host.

**Colima fails to start**  
Check `colima status` and `colima start` output. On freshly allocated Mac EC2 instances the full macOS desktop session may still be initialising — wait a minute and retry.


================================================
FILE: docs/metrics.md
================================================
# Realtime Metrics

It is possible to collect and visualise real-time metrics using the optional telegraf/influxdb/grafana stack.

```mermaid
flowchart TD
    A(Robomaker) --> B(Telegraf)
    B --> C(InfluxDB)
    C --> D(Grafana)
```

When enabled the Robomaker containers will send UDP metrics to Telegraf, which enriches and stores the metrics in the InfluxDB timeseries database container.

Grafana provides a presentation layer for interactive dashboards.

## Initial config and start-up

To enable the feature simply uncomment the lines in system.env for `DR_TELEGRAF_HOST` and `DR_TELEGRAF_PORT`. In most cases the default values should work without modification.

Start the metrics docker stack using `dr-start-metrics`. 

Once running Grafana should be accessible via a web browser on port 3000, e.g http://localhost:3000
The default username is `admin`, password `admin`. You will be prompted to set your own password on first login.

*Note: Grafana can take 60-90 seconds to perform initial internal setup the first time it is started. The web UI will not be available until this is complete. You can check the status by viewing the grafana container logs if necessary.*

The metrics stack will remain running until stopped (`dr-stop-metrics`) or the machine is rebooted. It does not need to be restarted in between training runs and should automatically pick up metrics from new models. 

## Using the dashboards

A template dashboard is provided to show how to access basic deepracer metrics. You can use this dashboard as a base to build your own more customised dashboards.

After connecting to the Grafana Web UI with a browser use the menu to browse to the Dashboards section. 

The template dashboard called `DeepRacer Training template` should be visible, showing graphs of reward, progress, and completed lap times. 

As this is an automatically provisioned dashboard you are not able to save changes to it, however you can copy it by clicking on the small cog icon to enter the dashboard settings page, and then clicking `Save as` to make an editable copy. 

A full user guide on how to work the dashboards is available on the [Grafana website](https://grafana.com/docs/grafana/latest/dashboards/use-dashboards/).


================================================
FILE: docs/multi_gpu.md
================================================
# Training on a Computer with more than one GPU

In some cases you might end up with having a computer with more than one GPU. This may be common on a workstation
which may have one GPU for general graphics (e.g. GTX 10-series, RTX 20-series), as well as a data center GPU 
like a Tesla K40, K80 or M40.

In this setting it can get a bit chaotic as DeepRacer will 'greedily' put any workload on any GPU - which will 
lead to Out-of-Memory somewhere down the road.

## Checking available GPUs

You can use Tensorflow to give you an overview of available devices running `utils/cuda-check.sh`.

It will say something like:
```
2020-07-04 12:25:55.179580: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-07-04 12:25:55.547206: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1411] Found device 0 with properties: 
name: GeForce GTX 1650 major: 7 minor: 5 memoryClockRate(GHz): 1.68
pciBusID: 0000:04:00.0
totalMemory: 3.82GiB freeMemory: 3.30GiB
2020-07-04 12:25:55.732066: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1411] Found device 1 with properties: 
name: Tesla M40 24GB major: 5 minor: 2 memoryClockRate(GHz): 1.112
pciBusID: 0000:81:00.0
totalMemory: 22.41GiB freeMemory: 22.30GiB
2020-07-04 12:25:55.732141: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1490] Adding visible gpu devices: 0, 1
2020-07-04 12:25:56.745647: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-07-04 12:25:56.745719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977]      0 1 
2020-07-04 12:25:56.745732: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0:   N N 
2020-07-04 12:25:56.745743: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 1:   N N 
2020-07-04 12:25:56.745973: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 195 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650, pci bus id: 0000:04:00.0, compute capability: 7.5)
2020-07-04 12:25:56.750352: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 1147 MB memory) -> physical GPU (device: 1, name: Tesla M40 24GB, pci bus id: 0000:81:00.0, compute capability: 5.2)
2020-07-04 12:25:56.774305: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1490] Adding visible gpu devices: 0, 1
2020-07-04 12:25:56.774408: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-07-04 12:25:56.774425: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977]      0 1 
2020-07-04 12:25:56.774436: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0:   N N 
2020-07-04 12:25:56.774446: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 1:   N N 
2020-07-04 12:25:56.774551: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/device:GPU:0 with 195 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650, pci bus id: 0000:04:00.0, compute capability: 7.5)
2020-07-04 12:25:56.774829: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/device:GPU:1 with 1147 MB memory) -> physical GPU (device: 1, name: Tesla M40 24GB, pci bus id: 0000:81:00.0, compute capability: 5.2)
['/device:GPU:0', '/device:GPU:1']
```
In this case the CUDA device #0 is the GTX 1650 and the CUDA device #1 is the Tesla M40.

### Selecting Device

To control the CUDA assignment for Sagemaker abd Robomaker then the following to variables in `system.env`:

```
DR_ROBOMAKER_CUDA_DEVICES=0
DR_SAGEMAKER_CUDA_DEVICES=1
``` 

The number is the CUDA number of the GPU you want the containers to use.


================================================
FILE: docs/multi_run.md
================================================
# Managing Experiments

## Experiment sub-directories

When iterating on a model you typically need different reward functions, action spaces, hyperparameters, and track settings across runs. By default DRfC stores all of this in `run.env` and `custom_files/` at the root of the installation, which can become difficult to manage over time.

The **experiment sub-directory** feature lets you keep every config and custom file for a training run in its own folder under `experiments/`. DRfC then picks up those files automatically when you activate with the experiment name.

### Directory structure

```
deepracer-for-cloud/
├── experiments/
│   ├── sprint-v1/
│   │   ├── run.env
│   │   ├── worker-2.env          # optional – multi-worker only
│   │   └── custom_files/
│   │       ├── reward_function.py
│   │       ├── model_metadata.json
│   │       └── hyperparameters.json
│   └── sprint-v2/
│       ├── run.env
│       └── custom_files/
│           └── ...
├── system.env
└── ...
```

The `experiments/` directory is excluded from git (via `.gitignore`) to avoid committing sensitive configuration and credentials.

### Setting up your first experiment

1. Create the directory structure (run from the DRfC root):

    ```bash
    mkdir -p experiments/sprint-v1/custom_files
    ```

2. Copy your current run configuration into the experiment:

    ```bash
    cp run.env experiments/sprint-v1/
    cp custom_files/* experiments/sprint-v1/custom_files/
    ```

    If you are using multiple workers, copy the worker env files too:

    ```bash
    cp worker-*.env experiments/sprint-v1/
    ```

3. Activate with the experiment name using the `-e` flag:

    ```bash
    source bin/activate.sh -e sprint-v1
    ```

### Activating an experiment

There are two ways to select an experiment:

**Option A — `-e` flag (recommended)**

Pass the experiment name when sourcing the activation script. This takes precedence over anything in `system.env`:

```bash
source bin/activate.sh -e sprint-v1
```

**Option B — `DR_EXPERIMENT_NAME` in `system.env`**

Uncomment and set the variable in `system.env`:

```
DR_EXPERIMENT_NAME=sprint-v1
```

Then run `dr-update` or re-source `bin/activate.sh`. Use this option if you want the experiment to persist across shell sessions automatically.

When `DR_EXPERIMENT_NAME` is set (by either method), DRfC will:
- Load `run.env` from `experiments/<name>/run.env`
- Load `worker-N.env` from `experiments/<name>/worker-N.env` (multi-worker)
- Sync `custom_files` to/from `experiments/<name>/custom_files/`
- Show `Experiment: <name>` in `dr-summary`

If the experiment directory does not exist, activation will abort with an error.

### Iterating to a new experiment

Copy the entire experiment folder to a new name and update the model prefix in `run.env`:

```bash
cp -av experiments/sprint-v1 experiments/sprint-v2
```

Edit `experiments/sprint-v2/run.env` to update `DR_LOCAL_S3_MODEL_PREFIX` (and `DR_LOCAL_S3_PRETRAINED_PREFIX` if you want to continue training from the previous experiment's model), then activate the new experiment:

```bash
source bin/activate.sh -e sprint-v2
```

### Custom files upload and download

`dr-upload-custom-files` and `dr-download-custom-files` are experiment-aware. When an experiment is active they sync against `experiments/<name>/custom_files/` instead of the root `custom_files/` directory.

---

# Running Multiple Parallel Experiments

It is possible to run multiple experiments on one computer in parallel. This is possible both in `swarm` and `compose` mode, and is controlled by `DR_RUN_ID` in `run.env`.

The feature works by creating unique prefixes to the container names:
* In Swarm mode this is done through defining a stack name (default: deepracer-0)
* In Compose mode this is done through adding a project name.

## Suggested way to use the feature

By default `run.env` is loaded when DRfC is activated - but it is possible to load a separate configuration through `source bin/activate.sh <filename>`, or through `source bin/activate.sh -e <experiment-name>` when using experiment sub-directories.

The best way to use this feature is to have a bash-shell per experiment, and to load a separate configuration per shell.

After activating one can control each experiment independently through using the `dr-*` commands.

If using local or Azure the S3 / Minio instance will be shared, and is running only once.


================================================
FILE: docs/multi_worker.md
================================================
# Using multiple Robomaker workers

One way to accelerate training is to launch multiple Robomaker workers that feed into one Sagemaker instance.

The number of workers is configured through setting `system.env` `DR_WORKERS` to the desired number of workers. The result is that the number of episodes (hyperparameter `num_episodes_between_training`) will be divivided over the number of workers. The theoretical maximum number of workers equals `num_episodes_between_training`.

The training can be started as normal.

## How many workers do I need?

One Robomaker worker requires 2-4 vCPUs. Tests show that a `c5.4xlarge` instance can run 3 workers and the Sagemaker without a drop in performance. Using OpenGL images reduces the number of vCPUs required per worker.

To avoid issues with the position from which evaluations are run ensure that `( num_episodes_between_training / DR_WORKERS) * DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST = 1.0`. 

Example: With 3 workers set `num_episodes_between_training: 30` and `DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.1`.

Note; Sagemaker will stop collecting experiences once you have reached 10.000 steps (3-layer CNN) in an iteration. For longer tracks with 600-1000 steps per completed episodes this will define the upper bound for the number of workers and episodes per iteration.

## Training with different parameters for each worker

It is also possible to use different configurations between workers, such as different tracks (WORLD_NAME).  To enable, set DR_TRAIN_MULTI_CONFIG=True inside run.env, then make copies of defaults/template-worker.env in the main deepracer-for-cloud directory with format worker-2.env, worker-3.env, etc.  (So alongside run.env, you should have woker-2.env, worker-3.env, etc.  run.env is still used for worker 1)  Modify the worker env files with your desired changes, which can be more than just the world_name.  These additional worker env files are only used if you are training with multiple workers.

## Watching the streams

If you want to watch the streams -- and are in `compose` mode you can use the script `utils/start-local-browser.sh` to dynamically create a HTML that streams the KVS stream from ALL workers at a time.


================================================
FILE: docs/opengl.md
================================================
# GPU Accelerated OpenGL for Robomaker

One way to improve performance, especially of Robomaker, is to enable GPU-accelerated OpenGL. OpenGL can significantly improve Gazebo performance, even where the GPU does not have enough GPU RAM, or is too old, to support Tensorflow.

## Desktop 

On a Ubuntu desktop running Unity there are hardly any additional steps required.

* Ensure that a recent Nvidia driver is installed and is running.
* Ensure that nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script.
* Configure DRfC using the following settings in `system.env`:
    * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container.
    * `DR_DISPLAY`; set to the value of your running X server, if not set then `DISPLAY` will be used.

Before running `dr-start-training`/`dr-start-evaluation` ensure that `DR_DISPLAY`/`DISPLAY` and `XAUTHORITY` are defined.

Check that OpenGL is working by looking for `gzserver` in `nvidia-smi`.

If `DR_GUI_ENABLE=True` then the Gazebo UI, rviz and rqt will open up in separate windows. (With multiple workers it can get crowded...)

### Remote connection to Desktop 

If you want to start training or evaluation via SSH (e.g. to increment the training whilst you are on the go) there are a few steps to do:
* Ensure that you are actually logged in to the local machine (desktop session is running).
* In the SSH terminal:
    * Ensure `DR_DISPLAY` is configured in `system.env`. Otherwise run `export DISPLAY=:1`. [*]
    * Run `export XAUTHORITY=/run/user/$(id -u)/gdm/Xauthority` to let X know where the X magic cookie is.
    * Run `source bin/activate.sh` as normal.
    * Run your `dr-start-training` or `dr-start-evaluation` command. 

*Remark*: Setting `DISPLAY` will lead to certain commands (e.g. `dr-logs-sagemaker`) starting in a terminal window on the desktop, rather than the output being showhn in the SSH terminal.
Use of `DR_DISPLAY` is recommended to avoid this.

## Headless Server

Also a headless server with a GPU, e.g. an EC2 instance, or a local computer with a displayless GPU (e.g. Tesla K40, K80, M40).

This also applies for a desktop computer where you are not logged in. In this case also disconnect any monitor cables to avoid conflict.

* Ensure that a Nvidia driver and nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script.
* Setup an X-server on the host. `utils/setup-xorg.sh` is a basic installation script.
* Configure DRfC using the following settings in `system.env`:
    * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container.
    * `DR_DISPLAY`; the X display that the headless X server will start on. (Default is `:99`, avoid using `:0` or `:1` as it may conflict with other X servers.)

Start up the X server with `utils/start-xorg.sh`. 

If `DR_GUI_ENABLE=True` then a VNC server will be started on port 5900 so that you can connect and interact with the Gazebo UI.

Check that OpenGL is working by looking for `gzserver` in `nvidia-smi`.

## WSL2 on Windows 11

OpenGL is also supported in WSL2 on Windows 11. By default an Xwayland server is started in Ubuntu 22.04.

To enable OpenGL acceleration perform the following steps:
* Install x11-server-utils with `sudo apt install x11-xserver-utils`.
* Configure DRfC using the following settings in `system.env`:
    * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container.
    * `DR_DISPLAY=:0`; the Xwayland starts on :0 by default.

If you want to interact with the Gazebo UI, set `DR_DOCKER_STYLE=compose` and `DR_GUI_ENABLE=True` in `system.env`.


================================================
FILE: docs/reference.md
================================================
# Deepracer-for-Cloud Reference

## Environment Variables

The scripts assume that two files `system.env` containing constant configuration values and  `run.env` with run specific values is populated with the required values. Which values go into which file is not really important.

| Variable | Description |
|----------|-------------|
| `DR_RUN_ID` | Used if you have multiple independent training jobs only a single DRfC instance. This is an advanced configuration and generally you should just leave this as the default `0`.|
| `DR_WORLD_NAME` | Defines the track to be used.|
| `DR_RACE_TYPE` | Valid options are `TIME_TRIAL`, `OBJECT_AVOIDANCE`, and `HEAD_TO_BOT`.|
| `DR_CAR_COLOR` | Valid options are `Black`, `Grey`, `Blue`, `Red`, `Orange`, `White`, and `Purple`.|
| `DR_CAR_NAME` | Display name of car; shows in Deepracer Console when uploading.|
| `DR_ENABLE_DOMAIN_RANDOMIZATION` | If `True`, this cycles through different environment colors and lighting each episode.  This is typically used to make your model more robust and generalized instead of tightly aligned with the simulator|
| `DR_UPLOAD_S3_PREFIX` | Prefix of the target location. (Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`|
| `DR_EVAL_NUMBER_OF_TRIALS` | How many laps to complete for evaluation simulations.|
| `DR_EVAL_IS_CONTINUOUS` | If False, your evaluation trial will end if you car goes off track or is in a collision. If True, your car will take the penalty times as configured in those parameters, but continue evaluating the trial.|
| `DR_EVAL_OFF_TRACK_PENALTY` | Number of seconds penalty time added for an off track during evaluation.  Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.|
| `DR_EVAL_COLLISION_PENALTY` | Number of seconds penalty time added for a collision during evaluation.  Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.|
| `DR_EVAL_SAVE_MP4` | Set to `True` to save MP4 of an evaluation run. |
| `DR_EVAL_REVERSE_DIRECTION` | Set to `True` to reverse the direction in which the car traverses the track.|
| `DR_TRAIN_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. (Recommended to be `True` for initial training.)|
| `DR_TRAIN_ALTERNATE_DRIVING_DIRECTION` | `True` or `False`.  If `True`, the car will alternate driving between clockwise and counter-clockwise each episode.|
| `DR_TRAIN_START_POSITION_OFFSET` | Used to control where to start the training from on first episode.|
| `DR_TRAIN_ROUND_ROBIN_ADVANCE_DISTANCE` | How far to progress each episode in round robin.  0.05 is 5% of the track.  Generally best to try and keep this to even numbers that match with your total number of episodes to allow for even distribution around the track.  For example, if 20 episodes per iternation, .05 or .10 or .20  would be good.|
| `DR_TRAIN_MULTI_CONFIG` | `True` or `False`.  This is used if you want to use different run.env configurations for each worker in a multi worker training run.  See multi config documentation for more details on how to set this up.|
| `DR_TRAIN_MIN_EVAL_TRIALS` | The minimum number of evaluation trials run between each training iteration.  Evaluations will continue as long as policy training is occuring and may be more than this number.  This establishes the minimum, and is generally useful if you want to speed up training especially when using gpu sagemaker containers.|
| `DR_TRAIN_REVERSE_DIRECTION` | Set to `True` to reverse the direction in which the car traverses the track. |
| `DR_TRAIN_BEST_MODEL_METRIC` | Can be used to control which model is kept as the "best" model. Set to `progress` to select the model with the highest evaluation completion percentage, set to `reward` to select the model with the highest evaluation reward.|
| `DR_TRAIN_MAX_STEPS_PER_ITERATION` | Can be used to control the max number of steps per iteration to use for learning, the excess steps will be discarded to avoid out-of-memory situations, default is 10000. |
| `DR_LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{DR_LOCAL_S3_PROFILE}`.|
| `DR_LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.|
| `DR_LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.|
| `DR_LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.|
| `DR_LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.|
| `DR_LOCAL_S3_TRAINING_PARAMS_FILE` | Name of YAML file that holds parameters sent to robomaker container for configuration during training. Filename is relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.|
| `DR_LOCAL_S3_EVAL_PARAMS_FILE` | Name of YAML file that holds parameters sent to robomaker container for configuration during evaluations.  Filename is relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.|
| `DR_LOCAL_S3_MODEL_METADATA_KEY` | Location where the `model_metadata.json` file is stored.|
| `DR_LOCAL_S3_HYPERPARAMETERS_KEY` | Location where the `hyperparameters.json` file is stored.|
| `DR_LOCAL_S3_REWARD_KEY` | Location where the `reward_function.py` file is stored.|
| `DR_LOCAL_S3_METRICS_PREFIX` | Location where the metrics will be stored.|
| `DR_OA_NUMBER_OF_OBSTACLES` | For Object Avoidance, the number of obstacles on the track.|
| `DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES` | Minimum distance in meters between obstacles.|
| `DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS` | If True, obstacle locations will randomly change after each episode.|
| `DR_OA_IS_OBSTACLE_BOT_CAR` | If True, obstacles will appear as a stationary car instead of a box.|
| `DR_OA_OBJECT_POSITIONS` | Positions of boxes on the track. Tuples consisting of progress (fraction [0..1]) and inside or outside lane (-1 or 1). Example: `"0.23,-1;0.46,1"`|
| `DR_H2B_IS_LANE_CHANGE` | If True, bot cars will change lanes based on configuration.|
| `DR_H2B_LOWER_LANE_CHANGE_TIME` | Minimum time in seconds before car will change lanes.|
| `DR_H2B_UPPER_LANE_CHANGE_TIME` | Maximum time in seconds before car will change langes.|
| `DR_H2B_LANE_CHANGE_DISTANCE` | Distance in meters how long it will take the car to change lanes.|
| `DR_H2B_NUMBER_OF_BOT_CARS` | Number of bot cars on the track.|
| `DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS` | Minimum distance between bot cars.|
| `DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS` | If True, bot car locations will randomly change after each episode.|
| `DR_H2B_BOT_CAR_SPEED` | How fast the bot cars go in meters per second.|
| `DR_CLOUD` | Can be `azure`, `aws`, `local` or `remote`; determines how the storage will be configured.|
| `DR_AWS_APP_REGION` | (AWS only) Region for other AWS resources (e.g. Kinesis) |
| `DR_UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.|
| `DR_UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. (Typically starts with `aws-deepracer-`.)|
| `DR_LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. Stored in `~/.aws/credentials` unless AWS IAM Roles are used.|
| `DR_GUI_ENABLE` | Enable or disable the Gazebo GUI in Robomaker |
| `DR_KINESIS_STREAM_NAME` | Kinesis stream name. Used if you actually publish to the AWS KVS service. Leave blank if you do not want this. |
| `DR_KINESIS_STREAM_ENABLE` | Enable or disable 'Kinesis Stream', True both publishes to a AWS KVS stream (if name not None), and to the topic `/racecar/deepracer/kvs_stream`. Leave True if you want to watch the car racing. |
| `DR_SAGEMAKER_IMAGE` | Determines which sagemaker image will be used for training.|
| `DR_ROBOMAKER_IMAGE` | Determines which robomaker image will be used for training or evaluation.|
| `DR_MINIO_IMAGE` | Determines which Minio image will be used. |
| `DR_COACH_IMAGE` | Determines which coach image will be used for training.|
| `DR_WORKERS` | Number of Robomaker workers to be used for training.  See additional documentation for more information about this feature.|
| `DR_ROBOMAKER_MOUNT_LOGS` | True to get logs mounted to `$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX`|
| `DR_ROBOMAKER_MOUNT_SIMAPP_DIR` | Path to the altered Robomaker bundle, e.g. `/home/ubuntu/deepracer-simapp/bundle`.|
| `DR_CLOUD_WATCH_ENABLE` | Send log files to AWS CloudWatch.|
| `DR_CLOUD_WATCH_LOG_STREAM_PREFIX` | Add a prefix to the CloudWatch log stream name.|
| `DR_DOCKER_STYLE` | Valid Options are `Swarm` and `Compose`.  Use Compose for openGL optimized containers.|
| `DR_HOST_X` | Uses the host X-windows server, rather than starting one inside of Robomaker. Required for OpenGL images.|
| `DR_WEBVIEWER_PORT` | Port for the web-viewer proxy which enables the streaming of all robomaker workers at once.|
| `CUDA_VISIBLE_DEVICES` | Used in multi-GPU configurations. See additional documentation for more information about this feature.|
| `DR_TELEGRAF_HOST` | The hostname to send real-time metrics to. Uncommenting this will enable real-time metrics collection using Telegraf. The telegraf/influxdb/grafana compose stack must already be running (use `dr-start-metrics`) for this to work, and it should usually be set to `telegraf` to send metrics to the telegraf container.
| `DR_TELEGRAF_PORT` | Defines the UDP port to send real-time metrics to. Should usually remain set as 8092.  
| `DR_QUIET_ACTIVATE` | Set to `True` to suppress the environment summary dashboard that is displayed when sourcing `bin/activate.sh` in an interactive shell. Defaults to `False`.|
| `DR_EXPERIMENT_NAME` | Optional. When set, DRfC loads `run.env`, `worker-N.env`, and `custom_files/` from `experiments/<name>/` instead of the repository root. Can be set here or passed via `source bin/activate.sh -e <name>`. See [Managing Experiments](multi_run.md).|

## Commands

| Command | Description |
|---------|-------------|
| `dr-update` | Loads in all scripts and environment variables again.|
| `dr-reload` | Re-sources `bin/activate.sh` with the current configuration file.|
| `dr-summary` | Displays the environment summary dashboard (cloud config, Docker images, running services and containers). Runs automatically on interactive shell activation unless `DR_QUIET_ACTIVATE=True`.|
| `dr-update-env` | Loads in all environment variables from `system.env` and `run.env`.|
| `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{DR_LOCAL_S3_BUCKET}/custom_files`.|
| `dr-download-custom-files` | Downloads changed configuration files from `s3://{DR_LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.|
| `dr-start-training` | Starts a training session in the local VM based on current configuration.|
| `dr-increment-training` | Updates configuration, setting the current model prefix to pretrained, and incrementing a serial.|
| `dr-stop-training` | Stops the current local training session. Uploads log files.|
| `dr-start-evaluation` | Starts a evaluation session in the local VM based on current configuration.|
| `dr-stop-evaluation` | Stops the current local evaluation session. Uploads log files.|
| `dr-start-loganalysis` | Starts a Jupyter log-analysis container, available on port 8888.|
| `dr-stop-loganalysis` | Stops the Jupyter log-analysis container.|
| `dr-start-viewer` | Starts an NGINX proxy to stream all the robomaker streams; accessible remotly.|
| `dr-stop-viewer` | Stops the NGINX proxy.|
| `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.|
| `dr-logs-robomaker` | Displays the logs from the running Robomaker container.|
| `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. |
| `dr-set-upload-model` | Updates the `run.env` with the prefix and name of your selected model. |
| `dr-upload-model` | Uploads the model defined in `DR_LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `DR_UPLOAD_S3_PREFIX` |
| `dr-download-model` | Downloads a file from a 'real' S3 location into a local prefix of choice. |


================================================
FILE: docs/video.md
================================================
# Watching the car

There are multiple ways to watch the car during training and evaluation. The ports and 'features' depend on the docker mode (swarm vs. compose) as well as between training and evaluation.

## Training using Viewer

DRfC has a built in viewer that supports showing the video stream from up to 6 workers on one webpage.

The view can be started with `dr-start-viewer` and is available on `http://localhost:8100` or `http://127.0.0.1:8100`. The viewer must be updated if training is restarted using `dr-update-viewer`, as it needs to connect to the new containers.

It is also possible to automatically start/update the viewer using the `-v` flag to `dr-start-training`.

## ROS Stream Viewer

The ROS Stream Viewer is a built in ROS feature that will stream any topic in ROS that publishing ROSImg messages. The viewer starts automatically.

### Ports

| Docker Mode  | Training         | Evaluation      | Comment
| -------- | -------- | -------- | -------- | 
| swarm      | 8080 + `DR_RUN_ID` |  8180 + `DR_RUN_ID` | Default 8080/8180. Multiple workers share one port, press F5 to cycle between them.
| compose | 8080-8089 | 8080-8089 | Each worker gets a unique port.

### Topics

| Topic  | Description         | 
| -------- | -------- | 
| `/racecar/camera/zed/rgb/image_rect_color`      | In-car video stream. This is used for inference. | 
| `/racecar/main_camera/zed/rgb/image_rect_color`      | Camera following the car. Stream without overlay | 
| `/sub_camera/zed/rgb/image_rect_color`      | Top-view of the track | 
| `/racecar/deepracer/kvs_stream`      | Camera following the car. Stream with overlay. Different overlay in Training and Evaluation | 
| `/racecar/deepracer/main_camera_stream`      | Same as `kvs_stream`, topic used for MP4 production. Only active in Evaluation if `DR_EVAL_SAVE_MP4=True` | 

## Saving Evaluation to File

During evaluation (`dr-start-evaluation`), if `DR_EVAL_SAVE_MP4=True` then three MP4 files are created in the S3 bucket's MP4 folder. They contain the in-car camera, top-camera and the camera following the car.

================================================
FILE: docs/windows.md
================================================
# Installing on Windows

## Prerequisites

The basic installation steps to get a NVIDIA GPU / CUDA enabled Ubuntu subsystem on Windows can be found in the [Cuda on WSL User Guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html).  Ensure your windows has an updated [nvidia cuda enabled driver](https://developer.nvidia.com/cuda/wsl/download) that will work with WSL.

The further instructions assume that you have a basic working WSL using the default Ubuntu distribution.


## Additional steps

The typical `bin/prepare.sh` script will not work for a Ubuntu WSL installation, hence alternate steps will be required.

### Adding required packages

Install additional packages with the following command:

```
sudo apt-get install jq awscli python3-boto3 docker-compose
```

### Install and configure docker and nvidia-docker
```
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo add-apt-repository    "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
sudo apt-get update && sudo apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io

distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list

cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json
sudo usermod -a -G docker $(id -un)
```


### Install DRfC

You can now run `bin/init.sh -a gpu -c local` to setup DRfC, and follow the typical DRfC startup instructions

## Known Issues

* `init.sh` is not able to detect the GPU given differences in the Nvidia drivers, and the WSL2 Linux Kernel. You need to manually set the GPU image in `system.env`.
* Docker does not start automatically when you launch Ubuntu. Start it manually with `sudo service docker start` 

     You can also configure the service to start automatically using the Windows Task Scheduler
     
     *1)* Create a new file at /etc/init-wsl  (sudo vi /etc/init-wsl) with the following contents.
     
     ```
     #!/bin/sh
     service start docker
     ```
 
     *2)* Make the script executable `sudo chmod +x /etc/init-wsl`
       
     *3)* Open Task Scheduler in Windows 10
       
     - On the left, click **Task Scheduler Library** option, and then on the right, click **Create Task**
          
     - In **General** Tab, Enter Name **WSL Startup**, and select **Run whether user is logged on or not** and **Run with highest privileges** options.
         
     - In **Trigger** tab, click New ... > Begin the task: **At startup** > OK
        
     - In **Actions** tab, click New ... > Action: **Start a program**
                            
       program/script:  **wsl**
                   
       add arguments:  **-u root /etc/init-wsl**
                   
     - Click OK to exit
          
     *4)* You can run the task manually to confirm, or after Windows reboot docker should now automatically start.

* Video streams may not load using the localhost address.  To access the html video streams from your windows browser, you may need to use the IP address of the WSL VM.  From a WSL terminal, determine your IP address by the command 'ip addr' and look for **eth0** then **inet** (e.g. ip = 172.29.38.21).  Then from your windows browser (edge, chrome, etc) navigate to **ip:8080** (e.g. 172.29.38.21:8080)
     

================================================
FILE: requirements.txt
================================================
boto3
pyyaml
requests
deepracer-utils

================================================
FILE: scripts/droa/__init__.py
================================================


================================================
FILE: scripts/droa/auth.py
================================================
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Shared authentication and configuration utilities for DRoA (DeepRacer on AWS) scripts.

Provides:
  fetch_env_config(site_url)           — fetch and parse <site_url>/env.js
  authenticate(...)                    — Cognito User Pool sign-in → ID token
  get_aws_credentials(...)             — Identity Pool → temporary AWS credentials
  load_droa_config(args)               — resolve config from env vars + CLI args
  build_auth(url, credentials, region) — prepare botocore SigV4 request signing using AWSRequest/SigV4Auth
  add_common_args(parser)              — add shared CLI flags to an argparse parser
"""

import datetime
import json
import hashlib
import os
import re
import sys
import uuid

import boto3
import requests
import requests.auth
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest


# ---------------------------------------------------------------------------
# Config discovery
# ---------------------------------------------------------------------------

def fetch_env_config(site_url: str) -> dict:
    """Fetch <site_url>/env.js and parse the window.EnvironmentConfig object."""
    env_js_url = site_url.rstrip("/") + "/env.js"
    response = requests.get(env_js_url, timeout=10)
    if not response.ok:
        raise RuntimeError(
            f"Could not fetch env.js from {env_js_url}: "
            f"{response.status_code} {response.reason}"
        )
    match = re.search(
        r"window\.EnvironmentConfig\s*=\s*(\{.+\})\s*;", response.text, re.DOTALL)
    if not match:
        raise RuntimeError(f"Could not find EnvironmentConfig in {env_js_url}")
    raw = match.group(1)
    try:
        config = json.loads(raw)
    except json.JSONDecodeError:
        # Convert JS object literal to strict JSON
        js = raw
        js = re.sub(r'([{,]\s*)([A-Za-z_]\w*)\s*:', r'\1"\2":', js)
        js = re.sub(r"'([^']*)'", r'"\1"', js)
        js = re.sub(r',(\s*})', r'\1', js)
        try:
            config = json.loads(js)
        except json.JSONDecodeError as exc:
            raise RuntimeError(
                f"Could not parse EnvironmentConfig from {env_js_url}.\n"
                f"Parse error: {exc}\nRaw content:\n{raw}"
            ) from exc
    return config


# ---------------------------------------------------------------------------
# Cognito authentication
# ---------------------------------------------------------------------------

def authenticate(region: str, client_id: str, username: str, password: str) -> str:
    """Sign in to Cognito User Pool and return an ID token."""
    client = boto3.client("cognito-idp", region_name=region)
    response = client.initiate_auth(
        AuthFlow="USER_PASSWORD_AUTH",
        AuthParameters={"USERNAME": username, "PASSWORD": password},
        ClientId=client_id,
    )
    result = response.get("AuthenticationResult") or {}
    id_token = result.get("IdToken")
    if not id_token:
        raise RuntimeError("Authentication failed – no ID token in response.")
    return id_token


def get_aws_credentials(
    region: str,
    user_pool_id: str,
    identity_pool_id: str,
    id_token: str,
) -> dict:
    """Exchange a Cognito ID token for temporary STS credentials via Identity Pool."""
    cognito_identity = boto3.client("cognito-identity", region_name=region)
    login_key = f"cognito-idp.{region}.amazonaws.com/{user_pool_id}"

    identity_response = cognito_identity.get_id(
        IdentityPoolId=identity_pool_id,
        Logins={login_key: id_token},
    )
    identity_id = identity_response["IdentityId"]

    creds_response = cognito_identity.get_credentials_for_identity(
        IdentityId=identity_id,
        Logins={login_key: id_token},
    )
    creds = creds_response["Credentials"]
    if os.environ.get("DR_DROA_DEBUG"):
        sts = boto3.client(
            "sts",
            region_name=region,
            aws_access_key_id=creds_response["Credentials"]["AccessKeyId"],
            aws_secret_access_key=creds_response["Credentials"]["SecretKey"],
            aws_session_token=creds_response["Credentials"]["SessionToken"],
        )
        identity = sts.get_caller_identity()
        print(
            f"  STS identity: Account={identity['Account']} Arn={identity['Arn']}", file=sys.stderr)

    return {
        "access_key": creds["AccessKeyId"],
        "secret_key": creds["SecretKey"],
        "session_token": creds["SessionToken"],
        "expiry": creds["Expiration"],
    }


# ---------------------------------------------------------------------------
# Credential cache
# ---------------------------------------------------------------------------

def _credential_cache_path(identity_pool_id: str, username: str) -> str:
    key = hashlib.sha256(
        f"{identity_pool_id}:{username}".encode()).hexdigest()[:16]
    cache_dir = os.path.expanduser("~/.droa-cache")
    os.makedirs(cache_dir, mode=0o700, exist_ok=True)
    return os.path.join(cache_dir, f"{key}.json")


def load_cached_credentials(identity_pool_id: str, username: str) -> dict | None:
    """Return cached AWS credentials if they have more than 60 seconds of validity left."""
    path = _credential_cache_path(identity_pool_id, username)
    if not os.path.exists(path):
        return None
    try:
        with open(path) as f:
            data = json.load(f)
        expiry = datetime.datetime.fromisoformat(data["expiry"])
        if expiry.tzinfo is None:
            expiry = expiry.replace(tzinfo=datetime.timezone.utc)
        if expiry <= datetime.datetime.now(tz=datetime.timezone.utc) + datetime.timedelta(seconds=60):
            return None
        return {k: v for k, v in data.items() if k != "expiry"}
    except (KeyError, ValueError, json.JSONDecodeError, OSError):
        return None


def save_credentials_to_cache(identity_pool_id: str, username: str, credentials: dict) -> None:
    """Save AWS credentials (including 'expiry') to a 0600 cache file."""
    expiry = credentials.get("expiry")
    if expiry is None:
        return
    path = _credential_cache_path(identity_pool_id, username)
    try:
        data = {
            k: v for k, v in credentials.items() if k != "expiry"
        }
        data["expiry"] = expiry.isoformat() if hasattr(
            expiry, "isoformat") else str(expiry)
        with open(path, "w") as f:
            json.dump(data, f)
        os.chmod(path, 0o600)
    except OSError:
        pass  # Non-fatal


# ---------------------------------------------------------------------------
# Config resolution
# ---------------------------------------------------------------------------

class DRoAConfig:
    """Resolved DRoA endpoint configuration."""

    def __init__(
        self,
        region: str,
        user_pool_id: str,
        client_id: str,
        identity_pool_id: str,
        api_endpoint: str,
        upload_bucket: str,
        site_url: str | None = None,
    ) -> None:
        self.region = region
        self.user_pool_id = user_pool_id
        self.client_id = client_id
        self.identity_pool_id = identity_pool_id
        self.api_endpoint = api_endpoint.rstrip("/")
        self.upload_bucket = upload_bucket
        self.site_url = site_url


def load_droa_config(args) -> "DRoAConfig":
    """
    Resolve DRoA configuration in priority order:
      1. Explicit CLI override flags on ``args``
      2. env.js fetched from --url / DR_DROA_URL environment variable

    Exits with a descriptive error if any required value is missing.
    """
    site_url = getattr(args, "url", None) or os.environ.get("DR_DROA_URL")
    env: dict = {}
    if site_url:
        env = fetch_env_config(site_url)
        print(f"Loaded configuration from {site_url}/env.js", file=sys.stderr)

    region = getattr(args, "region", None) or env.get("region")
    user_pool_id = getattr(args, "user_pool_id", None) or env.get("userPoolId")
    client_id = getattr(args, "user_pool_client_id",
                        None) or env.get("userPoolClientId")
    identity_pool_id = getattr(
        args, "identity_pool_id", None) or env.get("identityPoolId")
    api_endpoint = getattr(args, "api_endpoint",
                           None) or env.get("apiEndpointUrl")
    upload_bucket = getattr(args, "upload_bucket",
                            None) or env.get("uploadBucketName")

    missing = [
        name for name, val in [
            ("region", region),
            ("user-pool-id", user_pool_id),
            ("user-pool-client-id", client_id),
            ("identity-pool-id", identity_pool_id),
            ("api-endpoint", api_endpoint),
            ("upload-bucket", upload_bucket),
        ]
        if not val
    ]
    if missing:
        print(
            f"Error: could not resolve: {', '.join(missing)}.\n"
            "Set DR_DROA_URL in system.env or pass --url.",
            file=sys.stderr,
        )
        sys.exit(1)

    return DRoAConfig(
        region=region,
        user_pool_id=user_pool_id,
        client_id=client_id,
        identity_pool_id=identity_pool_id,
        api_endpoint=api_endpoint,
        upload_bucket=upload_bucket,
        site_url=site_url,
    )


# ---------------------------------------------------------------------------
# SigV4 auth helper
# ---------------------------------------------------------------------------

def build_auth(url: str, credentials: dict, region: str, site_url: str | None = None) -> requests.auth.AuthBase:
    """Create a requests AuthBase that SigV4-signs each request via botocore."""
    origin = site_url.rstrip("/") if site_url else None
    session = boto3.Session(
        aws_access_key_id=credentials["access_key"],
        aws_secret_access_key=credentials["secret_key"],
        aws_session_token=credentials["session_token"],
        region_name=region,
    )
    frozen_creds = session.get_credentials().get_frozen_credentials()

    class _Auth(requests.auth.AuthBase):
        def __call__(self, r: requests.PreparedRequest) -> requests.PreparedRequest:
            body = r.body or b""
            if isinstance(body, str):
                body = body.encode("utf-8")

            sign_headers = {
                "accept": "*/*",
                "accept-encoding": "gzip, deflate",
                "accept-language": "en-US,en;q=0.9,de-DE;q=0.8,de;q=0.7",
                "amz-sdk-invocation-id": str(uuid.uuid4()),
                "amz-sdk-request": "attempt=1; max=3",
                "cache-control": "no-cache",
                "pragma": "no-cache",
                "sec-ch-ua": '"Microsoft Edge";v="147", "Not.A/Brand";v="8", "Chromium";v="147"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Windows"',
                "sec-fetch-dest": "empty",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "cross-site",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36 Edg/147.0.0.0",
                "x-amz-content-sha256": hashlib.sha256(body).hexdigest(),
                "x-amz-user-agent": "aws-sdk-js/1.0.0 ua/2.0 os/Windows#NT-10.0 lang/js md/browser#Microsoft-Edge_147.0.0.0",
            }
            if origin:
                sign_headers["origin"] = origin
                sign_headers["referer"] = origin + "/"
            aws_request = AWSRequest(
                method=r.method, url=r.url, data=body, headers=sign_headers)
            SigV4Auth(frozen_creds, "execute-api",
                      region).add_auth(aws_request)
            r.headers.update(dict(aws_request.headers))

            if os.environ.get("DR_DROA_DEBUG"):
                print("\n--- DEBUG: signed request ---", file=sys.stderr)
                print(f"  {r.method} {r.url}", file=sys.stderr)
                print(
                    f"  (access key: ***{credentials['access_key'][-4:]})", file=sys.stderr)
                for k, v in sorted(r.headers.items()):
                    display = v[:40] + \
                        "..." if k.lower() == "x-amz-security-token" and len(v) > 40 else v
                    print(f"  {k}: {display}", file=sys.stderr)
                print("-----------------------------\n", file=sys.stderr)
            return r

    return _Auth()


# ---------------------------------------------------------------------------
# Shared argparse helpers
# ---------------------------------------------------------------------------

def add_common_args(parser) -> None:
    """Add shared DRoA connection/auth arguments to an argparse parser."""
    parser.add_argument(
        "--url",
        help="DeepRacer on AWS site URL (defaults to DR_DROA_URL env var). "
             "All AWS config is read automatically from <url>/env.js.",
    )
    parser.add_argument("--region", help="Override: AWS region")
    parser.add_argument(
        "--user-pool-id", help="Override: Cognito User Pool ID")
    parser.add_argument("--user-pool-client-id",
                        help="Override: Cognito App Client ID")
    parser.add_argument("--identity-pool-id",
                        help="Override: Cognito Identity Pool ID")
    parser.add_argument(
        "--api-endpoint", help="Override: API Gateway base URL")
    parser.add_argument("--upload-bucket",
                        help="Override: S3 upload bucket name")
    parser.add_argument(
        "--username",
        help="Cognito username / email (defaults to DR_DROA_USERNAME env var)",
    )
    parser.add_argument(
        "--password", help="Cognito password (prompted if omitted)")


================================================
FILE: scripts/droa/delete_model.py
================================================
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Delete a model from DeepRacer on AWS (DRoA).

Sends DELETE /models/{modelId}.  Before issuing the request the script fetches
the model record and verifies its status — the API only permits deletion when
the model is in READY or ERROR state.  Any other status results in a clear
error message without making the DELETE call.

Deletion is asynchronous on the server: the model status transitions to
DELETING immediately, then S3 artifacts, training records and evaluation
records are removed in the background.  If any step fails the server reverts
the status to ERROR for manual cleanup.

Usage examples
--------------
  # Interactive confirmation (shows model name and current status):
  python delete_model.py 2w7R6h2PNexQ9kC

  # Skip confirmation prompt (use in scripts):
  python delete_model.py 2w7R6h2PNexQ9kC --yes

  # Override site URL and username on the command line:
  python delete_model.py 2w7R6h2PNexQ9kC --url https://my.droa.example.com --username alice

Authentication
--------------
Credentials are obtained via the Cognito Identity Pool embedded in the DRoA
site's /env.js.  A password prompt is shown on the first call; subsequent
calls within the credential lifetime (~1 h) reuse a cache stored in
~/.droa-cache/.

The site URL is read from DR_DROA_URL and the username from DR_DROA_USERNAME
(both set in system.env), or supplied via --url / --username.

Deletable status values
-----------------------
  READY   Model trained successfully and ready for use
  ERROR   Model encountered an error during a previous operation

All other statuses (TRAINING, EVALUATING, IMPORTING, QUEUED, STOPPING,
SUBMITTING, DELETING) are rejected by the API with HTTP 400.
"""

import argparse
import getpass
import os
import sys

import requests

from auth import (
    add_common_args, authenticate, build_auth, get_aws_credentials, load_droa_config,
    load_cached_credentials, save_credentials_to_cache,
)

_DELETABLE_STATUSES = {"READY", "ERROR"}


def fetch_model(cfg, credentials: dict, model_id: str) -> dict:
    url = f"{cfg.api_endpoint}/models/{model_id}"
    response = requests.get(
        url, auth=build_auth(url, credentials, cfg.region, cfg.site_url), timeout=30
    )
    if not response.ok:
        raise RuntimeError(
            f"API error fetching model: {response.status_code} {response.reason}\n{response.text}"
        )
    data = response.json()
    return data.get("model", data)


def delete_model(cfg, credentials: dict, model_id: str) -> None:
    url = f"{cfg.api_endpoint}/models/{model_id}"
    response = requests.delete(
        url, auth=build_auth(url, credentials, cfg.region, cfg.site_url), timeout=30
    )
    if not response.ok:
        raise RuntimeError(
            f"API error: {response.status_code} {response.reason}\n{response.text}"
        )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Delete a model from DeepRacer on AWS.",
        epilog=(
            "examples:\n"
            "  %(prog)s 2w7R6h2PNexQ9kC\n"
            "  %(prog)s 2w7R6h2PNexQ9kC --yes"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add_common_args(parser)
    parser.add_argument("model_id", help="Model ID to delete")
    parser.add_argument(
        "-y", "--yes", action="store_true", help="Skip confirmation prompt"
    )
    return parser.parse_args()


def main() -> None:
    args = parse_args()

    username = args.username or os.environ.get("DR_DROA_USERNAME")
    if not username:
        print("Error: --username or DR_DROA_USERNAME required.", file=sys.stderr)
        sys.exit(1)

    cfg = load_droa_config(args)

    credentials = load_cached_credentials(cfg.identity_pool_id, username)
    if credentials:
        print("Using cached credentials.", file=sys.stderr)
    else:
        password = args.password or getpass.getpass(
            f"Password for {username}: ")
        id_token = authenticate(cfg.region, cfg.client_id, username, password)
        credentials = get_aws_credentials(
            cfg.region, cfg.user_pool_id, cfg.identity_pool_id, id_token)
        save_credentials_to_cache(cfg.identity_pool_id, username, credentials)

    model = fetch_model(cfg, credentials, args.model_id)
    name = model.get("name", args.model_id)
    status = model.get("status", "UNKNOWN")

    if status not in _DELETABLE_STATUSES:
        print(
            f"Error: model '{name}' has status {status} and cannot be deleted.\n"
            f"Only models with status READY or ERROR may be deleted.",
            file=sys.stderr,
        )
        sys.exit(1)

    if not args.yes:
        print(f"Model name : {name}")
        print(f"Model ID   : {args.model_id}")
        print(f"Status     : {status}")
        print()
        confirm = input(
            f"Type the model name to confirm deletion: "
        ).strip()
        if confirm != name:
            print("Aborted.")
            sys.exit(0)

    print(f"Deleting model '{name}' ({args.model_id})...")
    delete_model(cfg, credentials, args.model_id)
    print("Delete request accepted. The model will be removed shortly (status → DELETING).")


if __name__ == "__main__":
    main()


================================================
FILE: scripts/droa/download_logs.py
================================================
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Download logs or assets for a model from DeepRacer on AWS (DRoA).

Calls GET /models/{modelId}/getasset to obtain a presigned S3 URL, then
downloads the file.  For VIRTUAL_MODEL the server packages the artifact
asynchronously — the script polls until the URL is ready (up to
POLL_TIMEOUT seconds).  All other asset types return a URL immediately
or a 400 error if the underlying job has not yet completed.

Usage examples
--------------
  # Download training logs (default):
  python download_logs.py 2w7R6h2PNexQ9kC

  # Download and immediately print a training stability summary:
  python download_logs.py 2w7R6h2PNexQ9kC --summary

  # Download training logs to a specific file:
  python download_logs.py 2w7R6h2PNexQ9kC -o training.tar.gz

  # Download evaluation logs (evaluation ID required):
  python download_logs.py 2w7R6h2PNexQ9kC --asset-type EVALUATION_LOGS --evaluation-id <evalId>

  # Download virtual model artifact (polls until packaging completes):
  python download_logs.py 2w7R6h2PNexQ9kC --asset-type VIRTUAL_MODEL

  # Override site URL and username on the command line:
  python download_logs.py 2w7R6h2PNexQ9kC --url https://my.droa.example.com --username alice

Asset types
-----------
  TRAINING_LOGS       Logs from the training job (job must be COMPLETED or FAILED)
  EVALUATION_LOGS     Logs from an evaluation run (requires --evaluation-id;
                      job must be COMPLETED or FAILED)
  PHYSICAL_CAR_MODEL  Physical car model artifact
  VIRTUAL_MODEL       Virtual model package (packaged asynchronously; script polls)
  VIDEOS              Evaluation video recordings

Authentication
--------------
Credentials are obtained via the Cognito Identity Pool embedded in the DRoA
site's /env.js.  A password prompt is shown on the first call; subsequent
calls within the credential lifetime (~1 h) reuse a cache stored in
~/.droa-cache/.

The site URL is read from DR_DROA_URL and the username from DR_DROA_USERNAME
(both set in system.env), or supplied via --url / --username.
"""

import argparse
import getpass
import os
import sys
import time
from urllib.parse import urlparse

import requests

from auth import (
    add_common_args, authenticate, build_auth, get_aws_credentials, load_droa_config,
    load_cached_credentials, save_credentials_to_cache,
)

ASSET_TYPES = ["TRAINING_LOGS", "EVALUATION_LOGS",
               "PHYSICAL_CAR_MODEL", "VIRTUAL_MODEL", "VIDEOS"]
POLL_INTERVAL = 5    # seconds between status checks
POLL_TIMEOUT = 300  # seconds before giving up (packaging can take a while)


def get_asset_url(cfg, credentials, model_id, asset_type, evaluation_id=None):
    """Call GET /models/{modelId}/getasset, polling while status is QUEUED."""
    url = f"{cfg.api_endpoint}/models/{model_id}/getasset"
    params = {"assetType": asset_type}
    if evaluation_id:
        params["evaluationId"] = evaluation_id

    deadline = time.monotonic() + POLL_TIMEOUT
    while True:
        response = requests.get(
            url, params=params,
            auth=build_auth(url, credentials, cfg.region, cfg.site_url),
            timeout=30,
        )
        if not response.ok:
            raise RuntimeError(
                f"API error: {response.status_code} {response.reason}\n{response.text}"
            )
        data = response.json()
        if data.get("url"):
            return data["url"]
        # Only VIRTUAL_MODEL returns status:QUEUED while packaging
        status = data.get("status", "UNKNOWN")
        if status != "QUEUED":
            raise RuntimeError(
                f"No URL returned and unexpected status '{status}'. "
                f"The asset may not be available yet."
            )
        if time.monotonic() > deadline:
            raise RuntimeError(
                f"Timed out waiting for asset after {POLL_TIMEOUT}s.")
        print(
            f"  Packaging in progress (status: {status}) — retrying in {POLL_INTERVAL}s...", file=sys.stderr)
        time.sleep(POLL_INTERVAL)


def parse_args():
    parser = argparse.ArgumentParser(
        description="Download logs/assets from a DeepRacer on AWS model.",
        epilog=(
            "examples:\n"
            "  %(prog)s 2w7R6h2PNexQ9kC\n"
            "  %(prog)s 2w7R6h2PNexQ9kC --asset-type EVALUATION_LOGS --evaluation-id <evalId>\n"
            "  %(prog)s 2w7R6h2PNexQ9kC --asset-type VIRTUAL_MODEL -o model.tar.gz"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add_common_args(parser)
    parser.add_argument("model_id", help="Model ID")
    parser.add_argument(
        "--asset-type", default="TRAINING_LOGS", choices=ASSET_TYPES,
        help="Asset type to download (default: TRAINING_LOGS)",
    )
    parser.add_argument(
        "--evaluation-id", default=None,
        help="Evaluation ID — required when --asset-type is EVALUATION_LOGS",
    )
    parser.add_argument(
        "--output", "-o", default=None,
        help="Output file path (default: derived from the presigned URL filename)",
    )
    parser.add_argument(
        "--summary", action="store_true",
        help="After downloading, load the archive with DeepRacer Utils and print a training stability summary (TRAINING_LOGS only)",
    )
    return parser.parse_args()


def main():
    args = parse_args()

    username = args.username or os.environ.get("DR_DROA_USERNAME")
    if not username:
        print("Error: --username or DR_DROA_USERNAME required.", file=sys.stderr)
        sys.exit(1)

    if args.asset_type == "EVALUATION_LOGS" and not args.evaluation_id:
        print("Error: --evaluation-id is required for EVALUATION_LOGS.",
              file=sys.stderr)
        sys.exit(1)

    cfg = load_droa_config(args)

    credentials = load_cached_credentials(cfg.identity_pool_id, username)
    if credentials:
        print("Using cached credentials.", file=sys.stderr)
    else:
        password = args.password or getpass.getpass(
            f"Password for {username}: ")
        id_token = authenticate(cfg.region, cfg.client_id, username, password)
        credentials = get_aws_credentials(
            cfg.region, cfg.user_pool_id, cfg.identity_pool_id, id_token)
        save_credentials_to_cache(cfg.identity_pool_id, username, credentials)

    print(
        f"Requesting {args.asset_type} for model {args.model_id}...", file=sys.stderr)
    presigned_url = get_asset_url(
        cfg, credentials, args.model_id, args.asset_type, args.evaluation_id
    )

    dl_response = requests.get(presigned_url, timeout=120, stream=True)
    if not dl_response.ok:
        raise RuntimeError(
            f"Download failed: {dl_response.status_code} {dl_response.reason}")

    out_path = args.output
    if not out_path:
        url_filename = os.path.basename(urlparse(presigned_url).path)
        out_path = url_filename or f"{args.model_id}_{args.asset_type.lower()}.bin"

    with open(out_path, "wb") as f:
        for chunk in dl_response.iter_content(chunk_size=65536):
            f.write(chunk)

    print(f"Downloaded to: {out_path}", file=sys.stderr)

    if args.summary:
        if args.asset_type != "TRAINING_LOGS":
            print(
                f"Warning: --summary is only supported for TRAINING_LOGS, skipping.",
                file=sys.stderr,
            )
        else:
            try:
                from deepracer.logs import DeepRacerLog, TarFileHandler
            except ImportError:
                print(
                    "Error: deepracer-utils is not installed. "
                    "Run: pip install deepracer-utils",
                    file=sys.stderr,
                )
                sys.exit(1)
            print(file=sys.stderr)
            fh = TarFileHandler(archive_path=out_path)
            log = DeepRacerLog(filehandler=fh, verbose=True)
            log.load_training_trace()
            log.stability.print_summary()


if __name__ == "__main__":
    main()


================================================
FILE: scripts/droa/get_model.py
================================================
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Get details of a specific model from DeepRacer on AWS (DRoA).

Retrieves the full model record from GET /models/{modelId} and prints it in
a human-readable key-value format.  Use --json for machine-readable output.

Usage examples
--------------
  # Basic summary (status, training config, sensors, hyperparameters):
  python get_model.py 2w7R6h2PNexQ9kC

  # Include reward function, action space and Metrics URL:
  python get_model.py 2w7R6h2PNexQ9kC --verbose

  # Print DeepRacer Utils training metrics summary:
  python get_model.py 2w7R6h2PNexQ9kC --summary

  # Raw JSON (suitable for piping to jq):
  python get_model.py 2w7R6h2PNexQ9kC --json | jq .status

  # Override site URL and username on the command line:
  python get_model.py 2w7R6h2PNexQ9kC --url https://my.droa.example.com --username alice

Authentication
--------------
Credentials are obtained via the Cognito Identity Pool embedded in the DRoA
site's /env.js.  A password prompt is shown on the first call; subsequent
calls within the credential lifetime (~1 h) reuse a cache stored in
~/.droa-cache/.

The site URL is read from DR_DROA_URL and the username from DR_DROA_USERNAME
(both set in system.env), or supplied via --url / --username.

Model status values
-------------------
  DELETING  ERROR  EVALUATING  IMPORTING  QUEUED  READY
  STOPPING  SUBMITTING  TRAINING

Training status values
----------------------
  CANCELED  COMPLETED  FAILED  IN_PROGRESS  INITIALIZING  QUEUED  STOPPING
"""

import argparse
import getpass
import json
import os
import sys

import requests

from auth import (
    add_common_args, authenticate, build_auth, get_aws_credentials, load_droa_config,
    load_cached_credentials, save_credentials_to_cache,
)


def get_model(cfg, credentials: dict, model_id: str) -> dict:
    url = f"{cfg.api_endpoint}/models/{model_id}"
    response = requests.get(
        url, auth=build_auth(url, credentials, cfg.region, cfg.site_url), timeout=30
    )
    if not response.ok:
        raise RuntimeError(
            f"API error: {response.status_code} {response.reason}\n{response.text}"
        )
    data = response.json()
    return data.get("model", data)


def _fmt_bytes(n) -> str:
    if n is None:
        return ""
    for unit, threshold in (("GB", 1024**3), ("MB", 1024**2), ("KB", 1024)):
        if n >= threshold:
            return f"{n / threshold:.1f} {unit}"
    return f"{n} B"


def _kv(key: str, value, indent: int = 0) -> None:
    if value is None or value == "":
        return
    pad = "  " * indent
    print(f"{pad}{key:<22}: {value}")


def print_model(model: dict, verbose: bool = False) -> None:
    _kv("Model ID", model.get("modelId"))
    _kv("Name", model.get("name"))
    _kv("Description", model.get("description"))
    _kv("Status", model.get("status"))
    _kv("Training Status", model.get("trainingStatus"))
    created = (model.get("createdAt") or "")[:19].replace("T", " ")
    _kv("Created At", created)
    _kv("File Size", _fmt_bytes(model.get("fileSizeInBytes")))
    _kv("Packaging Status", model.get("packagingStatus"))
    if model.get("importErrorMessage"):
        _kv("Import Error", model["importErrorMessage"])

    car = model.get("carCustomization") or {}
    if car:
        print()
        print("Car Customization")
        _kv("Color", car.get("carColor"), indent=1)
        _kv("Shell", car.get("carShell"), indent=1)

    tc = model.get("trainingConfig") or {}
    if tc:
        print()
        print("Training Config")
        track = tc.get("trackConfig") or {}
        _kv("Track", track.get("trackId"), indent=1)
        _kv("Direction", track.get("trackDirection"), indent=1)
        _kv("Race Type", tc.get("raceType"), indent=1)
        _kv("Max Time (min)", tc.get("maxTimeInMinutes"), indent=1)

    meta = model.get("metadata") or {}
    if meta:
        print()
        print("Metadata")
        _kv("Algorithm", meta.get("agentAlgorithm"), indent=1)
        sensors = meta.get("sensors") or {}
        _kv("Camera", sensors.get("camera"), indent=1)
        _kv("Lidar", sensors.get("lidar"), indent=1)
        hp = meta.get("hyperparameters") or {}
        if hp:
            print("  Hyperparameters")
            for k, v in hp.items():
                _kv(k, v, indent=2)
        if verbose:
            action_space = meta.get("actionSpace") or {}
            if action_space:
                print()
                print("Action Space")
                cont = action_space.get("continous") or {}
                disc = action_space.get("discrete") or []
                if cont:
                    _kv("Type", "continuous", indent=1)
                    _kv("Speed range",
                        f"{cont.get('lowSpeed')} – {cont.get('highSpeed')} m/s", indent=1)
                    _kv("Steering range",
                        f"{cont.get('lowSteeringAngle')}° – {cont.get('highSteeringAngle')}°", indent=1)
                elif disc:
                    _kv("Type", f"discrete ({len(disc)} actions)", indent=1)
                    for i, a in enumerate(disc):
                        _kv(f"Action {i}", f"speed={a.get('speed')} m/s, steering={a.get('steeringAngle')}°", indent=2)
            rf = meta.get("rewardFunction")
            if rf:
                print()
                print("Reward Function")
                print(rf)

    if verbose:
        metrics_url = model.get("trainingMetricsUrl")
        if metrics_url:
            print()
            _kv("Metrics URL", metrics_url)
    video_url = model.get("trainingVideoStreamUrl")
    if video_url:
        print()
        _kv("Video Stream URL", video_url)


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Get details of a model in DeepRacer on AWS.",
        epilog=(
            "examples:\n"
            "  %(prog)s 2w7R6h2PNexQ9kC\n"
            "  %(prog)s 2w7R6h2PNexQ9kC --verbose\n"
            "  %(prog)s 2w7R6h2PNexQ9kC --summary\n"
            "  %(prog)s 2w7R6h2PNexQ9kC --json | jq .status"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add_common_args(parser)
    parser.add_argument("model_id", help="Model ID to retrieve")
    parser.add_argument(
        "--json", dest="output_json", action="store_true",
        help="Output raw JSON instead of formatted view",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true",
        help="Also print reward function, action space, and Metrics URL",
    )
    parser.add_argument(
        "--summary", action="store_true",
        help="Load training metrics via DeepRacer Utils and print a mean summary",
    )
    return parser.parse_args()


def main() -> None:
    args = parse_args()

    username = args.username or os.environ.get("DR_DROA_USERNAME")
    if not username:
        print("Error: --username or DR_DROA_USERNAME required.", file=sys.stderr)
        sys.exit(1)

    cfg = load_droa_config(args)

    credentials = load_cached_credentials(cfg.identity_pool_id, username)
    if credentials:
        print("Using cached credentials.", file=sys.stderr)
    else:
        password = args.password or getpass.getpass(
            f"Password for {username}: ")
        id_token = authenticate(cfg.region, cfg.client_id, username, password)
        credentials = get_aws_credentials(
            cfg.region, cfg.user_pool_id, cfg.identity_pool_id, id_token)
        save_credentials_to_cache(cfg.identity_pool_id, username, credentials)
    model = get_model(cfg, credentials, args.model_id)
    if args.output_json:
        print(json.dumps(model, indent=2, default=str))
    else:
        print_model(model, verbose=args.verbose)

    if args.summary:
        metrics_url = model.get("trainingMetricsUrl")
        if not metrics_url:
            print("Error: no trainingMetricsUrl available for this model.",
                  file=sys.stderr)
            sys.exit(1)
        try:
            from deepracer.logs import TrainingMetrics
        except ImportError:
            print(
                "Error: deepracer-utils is not installed. "
                "Run: pip install deepracer-utils",
                file=sys.stderr,
            )
            sys.exit(1)
        print()
        tm = TrainingMetrics(None, url=metrics_url)
        print(tm.getSummary(method="mean",
              summary_index=["r-i", "master_iteration"]))


if __name__ == "__main__":
    main()


================================================
FILE: scripts/droa/import_model.py
================================================
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Import a locally trained DRFC model into DeepRacer on AWS (DRoA).

Two source modes
----------------
--model-dir DIR
    Upload from a pre-assembled local directory.  The directory must contain
    at minimum: model_metadata.json, reward_function.py, training_params.yaml,
    hyperparameters.json.  All files are uploaded as-is preserving relative
    paths.

--model-prefix PREFIX
    Pull the model from the DRFC local S3 bucket (MinIO), assemble the correct
    upload structure, and generate training_params.yaml from DR_* environment
    variables — replicating what scripts/upload/upload-model.sh and
    scripts/upload/prepare-config.py do.  If omitted, DR_LOCAL_S3_MODEL_PREFIX
    is used as the prefix.

Checkpoint selection (--model-prefix mode only)
-----------------------------------------------
  Default          last tested checkpoint  (last_checkpoint in deepracer_checkpoints.json)
  --best           best checkpoint         (best_checkpoint)
  --checkpoint N   specific checkpoint step number

Flow
----
  1. Authenticate with Cognito User Pool  →  ID token
  2. Exchange ID token via Identity Pool  →  temporary AWS credentials
  3. (--model-prefix) Download from local S3 into a temp dir;
     generate training_params.yaml from DR_* env vars
  4. Upload assembled directory to the DRoA upload S3 bucket
  5. POST /importmodel  →  modelId

Usage examples
--------------
  # Upload from a pre-assembled local directory:
  python import_model.py --model-dir /tmp/my-model --model-name my-model

  # Pull current model from local MinIO (uses DR_LOCAL_S3_MODEL_PREFIX):
  python import_model.py --model-prefix rl-deepracer-sagemaker --model-name my-model

  # Pull with best checkpoint:
  python import_model.py --model-prefix rl-deepracer-sagemaker --model-name my-model --best

Authentication
--------------
  DR_DROA_URL and DR_DROA_USERNAME (system.env) or --url / --username.
  Credential cache: ~/.droa-cache/

Environment variables (--model-prefix mode)
-------------------------------------------
  DR_LOCAL_S3_BUCKET       Local S3 bucket name
  DR_LOCAL_S3_MODEL_PREFIX Default model prefix (overridden by --model-prefix)
  DR_MINIO_URL             MinIO endpoint URL (e.g. http://minio:9000)
  DR_LOCAL_S3_PROFILE      AWS profile name for local S3 access (default: "default")
  DR_*                     Training config variables used to build training_params.yaml
"""

import argparse
import getpass
import json
import os
import re
import sys
import tempfile
import uuid
from pathlib import Path

import boto3
import requests
import yaml

from auth import (
    add_common_args, authenticate, build_auth, get_aws_credentials, load_droa_config,
    load_cached_credentials, save_credentials_to_cache,
)


EXCLUDED_FILES = {".DS_Store", "Thumbs.db", "desktop.ini", "._.DS_Store"}

# Required files when validating a user-supplied --model-dir
REQUIRED_FILES_DIR = {
    "model_metadata.json",
    "reward_function.py",
    "training_params.yaml",
    "hyperparameters.json",
}


# ---------------------------------------------------------------------------
# Content-type helper
# ---------------------------------------------------------------------------

def _content_type(file_path):
    name = file_path.name
    ext = file_path.suffix.lower()
    if name == "done":
        return "text/plain"
    mapping = {
        ".meta": "application/octet-stream",
        ".ckpt": "application/octet-stream",
        ".pb": "application/octet-stream",
        ".ready": "text/plain",
        ".json": "application/json",
        ".yaml": "application/x-yaml",
        ".yml": "application/x-yaml",
        ".py": "text/x-python",
        ".data": "application/octet-stream",
        ".index": "application/octet-stream",
    }
    return mapping.get(ext, "application/octet-stream")


# ---------------------------------------------------------------------------
# Local S3 client (MinIO via DR_MINIO_URL + DR_LOCAL_S3_PROFILE)
# ---------------------------------------------------------------------------

def _local_s3_client():
    """Return a boto3 S3 client pointed at the local MinIO instance."""
    profile = os.environ.get("DR_LOCAL_S3_PROFILE", "default")
    endpoint = os.environ.get("DR_MINIO_URL")  # e.g. http://minio:9000
    session = boto3.Session(profile_name=profile)
    kwargs = {"endpoint_url": endpoint} if endpoint else {}
    return session.client("s3", **kwargs)


def _s3_cp_down(s3, bucket, key, local_path):
    """Download a single S3 object to local_path."""
    print(f"    s3 cp  s3://{bucket}/{key}  →  {local_path}")
    Path(local_path).parent.mkdir(parents=True, exist_ok=True)
    s3.download_file(bucket, key, str(local_path))


def _s3_sync_down(s3, bucket, prefix, local_dir, include_pattern=None):
    """Download all objects under prefix into local_dir, optionally filtered."""
    print(f"    s3 sync  s3://{bucket}/{prefix}  →  {local_dir}")
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            name = key[len(prefix):].lstrip("/")
            if not name:
                continue
            if include_pattern and not any(name.startswith(p) for p in include_pattern):
                continue
            dest = Path(local_dir) / name
            dest.parent.mkdir(parents=True, exist_ok=True)
            s3.download_file(bucket, key, str(dest))


# ---------------------------------------------------------------------------
# training_params.yaml generation  (replicates prepare-config.py)
# ---------------------------------------------------------------------------

def _build_training_params(work_dir, target_bucket, target_prefix):
    """Generate training_params.yaml from DR_* env vars into work_dir."""
    e = os.environ.get
    cfg = {
        "AWS_REGION":                  e("DR_AWS_APP_REGION", "us-east-1"),
        "JOB_TYPE":                    "TRAINING",
        "METRICS_S3_BUCKET":           target_bucket,
        "METRICS_S3_OBJECT_KEY":       f"{target_prefix}/TrainingMetrics.json",
        "MODEL_METADATA_FILE_S3_KEY":  f"{target_prefix}/model/model_metadata.json",
        "REWARD_FILE_S3_KEY":          f"{target_prefix}/reward_function.py",
        "SAGEMAKER_SHARED_S3_BUCKET":  target_bucket,
        "SAGEMAKER_SHARED_S3_PREFIX":  target_prefix,
        "BODY_SHELL_TYPE":             e("DR_CAR_BODY_SHELL_TYPE", "deepracer"),
        "CAR_NAME":                    e("DR_CAR_NAME", "MyCar"),
        "RACE_TYPE":                   e("DR_RACE_TYPE", "TIME_TRIAL"),
        # DRoA TrackId has no direction suffix; strip _cw/_ccw that DRFC appends
        "WORLD_NAME":                  re.sub(r'_(cw|ccw)$', '', e("DR_WORLD_NAME", "LGSWide")),
        "DISPLAY_NAME":                e("DR_DISPLAY_NAME", "racer1"),
        "RACER_NAME":                  e("DR_RACER_NAME", "racer1"),
        "ALTERNATE_DRIVING_DIRECTION": e("DR_TRAIN_ALTERNATE_DRIVING_DIRECTION",
                                         e("DR_ALTERNATE_DRIVING_DIRECTION", "false")),
        "CHANGE_START_POSITION":       e("DR_TRAIN_CHANGE_START_POSITION",
                                         e("DR_CHANGE_START_POSITION", "true")),
        "ROUND_ROBIN_ADVANCE_DIST":    e("DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST", "0.05"),
        "START_POSITION_OFFSET":       e("DR_TRAIN_START_POSITION_OFFSET", "0.00"),
        "ENABLE_DOMAIN_RANDOMIZATION": e("DR_ENABLE_DOMAIN_RANDOMIZATION", "false"),
        "MIN_EVAL_TRIALS":             e("DR_TRAIN_MIN_EVAL_TRIALS", "5"),
    }

    if cfg["BODY_SHELL_TYPE"] == "deepracer":
        cfg["CAR_COLOR"] = e("DR_CAR_COLOR", "Red")

    race_type = cfg["RACE_TYPE"]
    if race_type == "OBJECT_AVOIDANCE":
        cfg["NUMBER_OF_OBSTACLES"] = e("DR_OA_NUMBER_OF_OBSTACLES", "6")
        cfg["MIN_DISTANCE_BETWEEN_OBSTACLES"] = e(
            "DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES", "2.0")
        cfg["RANDOMIZE_OBSTACLE_LOCATIONS"] = e(
            "DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS", "True")
        cfg["IS_OBSTACLE_BOT_CAR"] = e("DR_OA_IS_OBSTACLE_BOT_CAR", "false")
        positions_str = e("DR_OA_OBJECT_POSITIONS", "")
        if positions_str:
            positions = positions_str.split(";")
            cfg["OBJECT_POSITIONS"] = positions
            cfg["NUMBER_OF_OBSTACLES"] = str(len(positions))

    if race_type == "HEAD_TO_BOT":
        cfg["IS_LANE_CHANGE"] = e("DR_H2B_IS_LANE_CHANGE", "False")
        cfg["LOWER_LANE_CHANGE_TIME"] = e(
            "DR_H2B_LOWER_LANE_CHANGE_TIME", "3.0")
        cfg["UPPER_LANE_CHANGE_TIME"] = e(
            "DR_H2B_UPPER_LANE_CHANGE_TIME", "5.0")
        cfg["LANE_CHANGE_DISTANCE"] = e("DR_H2B_LANE_CHANGE_DISTANCE", "1.0")
        cfg["NUMBER_OF_BOT_CARS"] = e("DR_H2B_NUMBER_OF_BOT_CARS", "0")
        cfg["MIN_DISTANCE_BETWEEN_BOT_CARS"] = e(
            "DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS", "2.0")
        cfg["RANDOMIZE_BOT_CAR_LOCATIONS"] = e(
            "DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS", "False")
        cfg["BOT_CAR_SPEED"] = e("DR_H2B_BOT_CAR_SPEED", "0.2")

    # TRACK_DIRECTION_CLOCKWISE: infer from the raw DR_WORLD_NAME (which still carries
    # the _cw/_ccw suffix) before that suffix is stripped for WORLD_NAME above.
    raw_world = e("DR_WORLD_NAME", "LGSWide")
    if raw_world.endswith("_cw"):
        cfg["TRACK_DIRECTION_CLOCKWISE"] = True
    elif raw_world.endswith("_ccw"):
        cfg["TRACK_DIRECTION_CLOCKWISE"] = False
    else:
        reverse = e("DR_TRAIN_REVERSE_DIRECTION",
                    "False").lower() in ("true", "1", "yes")
        cfg["TRACK_DIRECTION_CLOCKWISE"] = not reverse

    out = Path(work_dir) / "training_params.yaml"
    with open(out, "w") as fh:
        yaml.dump(cfg, fh, default_flow_style=False,
                  default_style="'", explicit_start=True)
    return out


# ---------------------------------------------------------------------------
# Pull from local S3 and assemble upload structure
# ---------------------------------------------------------------------------

def _build_from_s3_prefix(model_prefix, checkpoint_mode, checkpoint_num,
                          target_bucket, target_prefix):
    """
    Download model files from local DRFC S3 into a temp directory and return
    its path.  The caller is responsible for cleanup.

    checkpoint_mode: 'last' | 'best' | 'number'
    checkpoint_num:  step number (int) when mode == 'number'
    target_bucket / target_prefix: DRoA upload destination — needed to bake
    correct paths into training_params.yaml.
    """
    local_bucket = os.environ.get("DR_LOCAL_S3_BUCKET", "bucket")

    work = Path(tempfile.mkdtemp(prefix="droa-import-"))
    model_dir = work / "model"
    model_dir.mkdir()
    ip_dir = work / "ip"
    ip_dir.mkdir()
    metrics_dir = work / "metrics"
    metrics_dir.mkdir()

    print(f"Pulling model from s3://{local_bucket}/{model_prefix}")
    s3 = _local_s3_client()

    # --- metadata files ---
    # model_metadata.json must be at the root of the upload prefix (API reads it there)
    # also keep a copy inside model/ so sagemaker-artifacts structure is preserved
    _s3_cp_down(s3, local_bucket, f"{model_prefix}/model/model_metadata.json",
                work / "model_metadata.json")
    (model_dir / "model_metadata.json").write_bytes((work /
                                                     "model_metadata.json").read_bytes())
    _s3_cp_down(s3, local_bucket, f"{model_prefix}/ip/hyperparameters.json",
                ip_dir / "hyperparameters.json")

    # reward_function.py: try model root first, then DR_LOCAL_S3_REWARD_KEY
    local_reward_key = os.environ.get("DR_LOCAL_S3_REWARD_KEY",
                                      f"{model_prefix}/reward_function.py")
    try:
        _s3_cp_down(s3, local_bucket, f"{model_prefix}/reward_function.py",
                    work / "reward_function.py")
    except Exception:
        _s3_cp_down(s3, local_bucket, local_reward_key,
                    work / "reward_function.py")

    # metrics
    metrics_prefix = os.environ.get("DR_LOCAL_S3_METRICS_PREFIX",
                                    f"{model_prefix}/metrics")
    _s3_sync_down(s3, local_bucket, metrics_prefix, metrics_dir)

    # --- checkpoint index ---
    _s3_cp_down(s3, local_bucket, f"{model_prefix}/model/deepracer_checkpoints.json",
                model_dir / "deepracer_checkpoints.json")
    with open(model_dir / "deepracer_checkpoints.json") as fh:
        ckpt_index = json.load(fh)

    if checkpoint_mode == "best":
        ckpt_entry = ckpt_index.get(
            "best_checkpoint", ckpt_index.get("last_checkpoint"))
        print("Using best checkpoint.")
    elif checkpoint_mode == "number":
        # List model/ prefix and find the matching .ckpt.index key
        paginator = s3.get_paginator("list_objects_v2")
        match = None
        for page in paginator.paginate(Bucket=local_bucket,
                                       Prefix=f"{model_prefix}/model/"):
            for obj in page.get("Contents", []):
                fname = obj["Key"].split("/")[-1]
                if fname.startswith(f"{checkpoint_num}_Step-") and fname.endswith(".ckpt.index"):
                    match = fname[:-len(".index")]  # strip .index → .ckpt
                    break
            if match:
                break
        if not match:
            raise RuntimeError(
                f"No checkpoint found for step {checkpoint_num} "
                f"in s3://{local_bucket}/{model_prefix}/model/"
            )
        ckpt_entry = {"name": match}
        print(f"Using checkpoint {match}.")
    else:
        ckpt_entry = ckpt_index.get("last_checkpoint")
        print("Using last checkpoint.")

    if not ckpt_entry:
        raise RuntimeError(
            "Could not determine checkpoint from deepracer_checkpoints.json")

    ckpt_file = ckpt_entry["name"]       # e.g. "500_Step-500.ckpt"
    ckpt_step = ckpt_file.split("_")[0]  # e.g. "500"
    print(f"Checkpoint: {ckpt_file}")

    # Download checkpoint model files (prefix-filtered sync)
    _s3_sync_down(
        s3, local_bucket, f"{model_prefix}/model/", model_dir,
        include_pattern=[f"{ckpt_step}_Step-", f"model_{ckpt_step}.pb"],
    )

    # Write .coach_checkpoint
    (model_dir / ".coach_checkpoint").write_text(ckpt_file)

    # Rewrite deepracer_checkpoints.json to reference only chosen checkpoint
    new_ckpt_json = {"last_checkpoint": ckpt_entry,
                     "best_checkpoint": ckpt_entry}
    with open(model_dir / "deepracer_checkpoints.json", "w") as fh:
        json.dump(new_ckpt_json, fh)

    # --- training_params.yaml: copy from bucket, generate only if missing ---
    # Multi-worker training produces training_params_1.yaml, training_params_2.yaml, …
    # We prefer _1 (worker 1 is canonical), then the plain name, then generate.
    tp_dst = work / "training_params.yaml"
    tp_candidates = [
        f"{model_prefix}/training_params_1.yaml",
        f"{model_prefix}/training_params.yaml",
    ]
    tp_found = False
    for tp_key in tp_candidates:
        try:
            _s3_cp_down(s3, local_bucket, tp_key, tp_dst)
            print(f"Using {tp_key.split('/')[-1]} from bucket.")
            tp_found = True
            break
        except Exception:
            pass
    if not tp_found:
        print("training_params.yaml not found in bucket — generating from DR_* env vars.")
        _build_training_params(work, target_bucket, target_prefix)

    # Normalise training_params.yaml for DRoA:
    # 1. Patch all S3 bucket/prefix fields to the DRoA upload destination
    #    (the file from the bucket still references the original DRFC paths).
    # 2. WORLD_NAME must not have a _cw/_ccw suffix (DRoA TrackId has none)
    # 3. TRACK_DIRECTION_CLOCKWISE must be present (DRFC never wrote it)
    with open(tp_dst) as fh:
        tp_data = yaml.safe_load(fh) or {}
    changed = False
    # Always overwrite the S3 destination fields regardless of where the file came from
    tp_data["METRICS_S3_BUCKET"] = target_bucket
    tp_data["METRICS_S3_OBJECT_KEY"] = f"{target_prefix}/TrainingMetrics.json"
    tp_data["MODEL_METADATA_FILE_S3_KEY"] = f"{target_prefix}/model/model_metadata.json"
    tp_data["REWARD_FILE_S3_KEY"] = f"{target_prefix}/reward_function.py"
    tp_data["SAGEMAKER_SHARED_S3_BUCKET"] = target_bucket
    tp_data["SAGEMAKER_SHARED_S3_PREFIX"] = target_prefix
    changed = True
    # Strip direction suffix from WORLD_NAME if present
    world_raw = tp_data.get("WORLD_NAME", "")
    world_clean = re.sub(r'_(cw|ccw)$', '', world_raw)
    if world_clean != world_raw:
        tp_data["WORLD_NAME"] = world_clean
        changed = True
        print(
            f"    Stripped direction suffix from WORLD_NAME: {world_raw} → {world_clean}")
    # Infer TRACK_DIRECTION_CLOCKWISE if missing
    if "TRACK_DIRECTION_CLOCKWISE" not in tp_data:
        # Prefer DR_WORLD_NAME env var which still carries the suffix
        dr_world = os.environ.get("DR_WORLD_NAME", world_raw)
        if dr_world.endswith("_cw") or world_raw.endswith("_cw"):
            tp_data["TRACK_DIRECTION_CLOCKWISE"] = True
        elif dr_world.endswith("_ccw") or world_raw.endswith("_ccw"):
            tp_data["TRACK_DIRECTION_CLOCKWISE"] = False
        else:
            reverse = os.environ.get(
                "DR_TRAIN_REVERSE_DIRECTION", "False").lower() in ("true", "1", "yes")
            tp_data["TRACK_DIRECTION_CLOCKWISE"] = not reverse
        changed = True
        print(
            f"    Set TRACK_DIRECTION_CLOCKWISE={tp_data['TRACK_DIRECTION_CLOCKWISE']}")
    if changed:
        with open(tp_dst, "w") as fh:
            yaml.dump(tp_data, fh, default_flow_style=False)

    return work


# ---------------------------------------------------------------------------
# Upload to DRoA S3
# ---------------------------------------------------------------------------

def upload_model_folder(cfg, model_dir, credentials, validate_required=True, s3_prefix=None):
    """Upload all eligible files from model_dir to the DRoA S3 bucket.

    If ``s3_prefix`` is provided the files are uploaded under that exact prefix
    (important when training_params.yaml already references that prefix).
    Otherwise a new UUID-based prefix is generated.
    """
    if validate_required:
        root = Path(model_dir)
        missing = {f for f in REQUIRED_FILES_DIR if not (root / f).is_file()}
        if missing:
            raise ValueError(
                f"Missing required model files at root of model dir: {', '.join(sorted(missing))}")

    if s3_prefix is None:
        s3_prefix = f"uploads/models/{uuid.uuid4()}"
    s3 = boto3.client(
        "s3",
        region_name=cfg.region,
        aws_access_key_id=credentials["access_key"],
        aws_secret_access_key=credentials["secret_key"],
        aws_session_token=credentials["session_token"],
    )

    for file_path in Path(model_dir).rglob("*"):
        if not file_path.is_file():
            continue
        if file_path.name in EXCLUDED_FILES:
            continue
        if file_path.suffix.lower() in {".gz", ".zip"}:
            continue
        relative = file_path.relative_to(model_dir)
        s3_key = f"{s3_prefix}/{relative}"
        print(f"    Uploading: {relative}")
        s3.upload_file(
            Filename=str(file_path),
            Bucket=cfg.upload_bucket,
            Key=s3_key,
            ExtraArgs={"ContentType": _content_type(file_path)},
        )

    print(
        f"[3/4] Uploaded model files to s3://{cfg.upload_bucket}/{s3_prefix}")
    return s3_prefix


# ---------------------------------------------------------------------------
# DRoA API call
# ---------------------------------------------------------------------------

def call_import_model_api(cfg, s3_path, model_name, model_description, credentials):
    """POST /importmodel and return the created modelId."""
    url = f"{cfg.api_endpoint}/importmodel"
    payload = {
        "s3Bucket": cfg.upload_bucket,
        "s3Path": s3_path,
        "modelName": model_name,
    }
    if model_description:
        payload["modelDescription"] = model_description

    response = requests.post(
        url,
        json=payload,
        auth=build_auth(url, credentials, cfg.region, cfg.site_url),
        headers={"Content-Type": "application/json"},
        timeout=30,
    )
    if not response.ok:
        raise RuntimeError(
            f"API call failed: {response.status_code} {response.reason}\n{response.text}"
        )
    model_id = response.json().get("modelId")
    if not model_id:
        raise RuntimeError(f"Unexpected API response: {response.text}")
    print(f"[4/4] Import job created. modelId: {model_id}")
    return model_id


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def parse_args():
    parser = argparse.ArgumentParser(
        description="Import a locally trained DRFC model into DeepRacer on AWS.",
        epilog=(
            "examples:\n"
            "  %(prog)s --model-dir /tmp/my-model --model-name my-model\n"
            "  %(prog)s --model-prefix rl-deepracer-sagemaker\n"
            "  %(prog)s --model-prefix rl-deepracer-sagemaker --model-name my-model --best"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add_common_args(parser)

    src = parser.add_mutually_exclusive_group(required=True)
    src.add_argument(
        "--model-dir", type=Path,
        help="Pre-assembled local directory containing model files",
    )
    src.add_argument(
        "--model-prefix",
        help="DRFC local S3 model prefix to pull from (default: DR_LOCAL_S3_MODEL_PREFIX)",
    )

    parser.add_argument(
        "--model-name", default=None,
        help="Name for the imported model (default: --model-prefix or directory name)",
    )
    parser.add_argument("--model-description", default=None,
                        help="Optional model description")

    ckpt = parser.add_mutually_exclusive_group()
    ckpt.add_argument(
        "--best", action="store_true",
        help="(--model-prefix) Use best checkpoint instead of last",
    )
    ckpt.add_argument(
        "--checkpoint", type=int, metavar="STEP",
        help="(--model-prefix) Use specific checkpoint step number",
    )

    return parser.parse_args()


def main():
    args = parse_args()

    username = args.username or os.environ.get("DR_DROA_USERNAME")
    if not username:
        print("Error: --username or DR_DROA_USERNAME required.", file=sys.stderr)
        sys.exit(1)

    if args.model_dir and not args.model_dir.is_dir():
        print(
            f"Error: --model-dir '{args.model_dir}' is not a directory.", file=sys.stderr)
        sys.exit(1)

    # Derive model name from source if not given explicitly
    if not args.model_name:
        if args.model_prefix:
            args.model_name = args.model_prefix
        elif args.model_dir:
            args.model_name = args.model_dir.name
        else:
            print(
                "Error: --model-name is required when source cannot be inferred.", file=sys.stderr)
            sys.exit(1)

    cfg = load_droa_config(args)

    credentials = load_cached_credentials(cfg.identity_pool_id, username)
    if credentials:
        print("[1/4] Using cached credentials.")
    else:
        password = args.password or getpass.getpass(
            f"Password for {username}: ")
        print("[1/4] Authenticating with Cognito User Pool...")
        id_token = authenticate(cfg.region, cfg.client_id, username, password)
        print("[2/4] Obtaining temporary AWS credentials...")
        credentials = get_aws_credentials(
            cfg.region, cfg.user_pool_id, cfg.identity_pool_id, id_token)
        save_credentials_to_cache(cfg.identity_pool_id, username, credentials)
        print("[2/4] Credentials obtained.")

    temp_dir = None
    upload_prefix = None
    try:
        if args.model_dir:
            source_dir = args.model_dir
            validate = True
        else:
            model_prefix = args.model_prefix or os.environ.get(
                "DR_LOCAL_S3_MODEL_PREFIX")
            if not model_prefix:
                print(
                    "Error: --model-prefix or DR_LOCAL_S3_MODEL_PREFIX required.", file=sys.stderr)
                sys.exit(1)
            # Generate the upload prefix now so training_params.yaml can reference it
            upload_prefix = f"uploads/models/{uuid.uuid4()}"
            checkpoint_mode = "best" if args.best else (
                "number" if args.checkpoint else "last")
            print("[3/4] Pulling model from local S3...")
            temp_dir = _build_from_s3_prefix(
                model_prefix, checkpoint_mode, args.checkpoint,
                cfg.upload_bucket, upload_prefix,
            )
            source_dir = temp_dir
            validate = False

        print("[3/4] Uploading to DRoA S3...")
        s3_path = upload_model_folder(
            cfg, source_dir, credentials, validate_required=validate,
            s3_prefix=upload_prefix if not args.model_dir else None)
        model_id = call_import_model_api(
            cfg, s3_path, args.model_name, args.model_description, credentials
        )
    finally:
        if temp_dir:
            import shutil
            shutil.rmtree(temp_dir, ignore_errors=True)

    print(
        f"\nDone. Model '{args.model_name}' is being imported (id: {model_id})")
    print("Check the DeepRacer on AWS console or use: droa-get-model " + model_id)


if __name__ == "__main__":
    main()


================================================
FILE: scripts/droa/list_models.py
================================================
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
List all models in DeepRacer on AWS (DRoA).

Fetches the full paginated model list from GET /models and prints it as a
table.  Use --json for machine-readable output.

Usage examples
--------------
  # Print table of all models:
  python list_models.py

  # Raw JSON (suitable for piping to jq):
  python list_models.py --json | jq '[.[] | {id: .modelId, name: .name}]'

  # Override site URL and username on the command line:
  python list_models.py --url https://my.droa.example.com --username alice

Table columns
-------------
  Model ID        15-char alphanumeric model identifier
  Name            Model name (up to 64 chars)
  Status          Current model lifecycle status
  Training        Training job status
  Created At      Creation timestamp (UTC, second precision)

Authentication
--------------
Credentials are obtained via the Cognito Identity Pool embedded in the DRoA
site's /env.js.  A password prompt is shown on the first call; subsequent
calls within the credential lifetime (~1 h) reuse a cache stored in
~/.droa-cache/.

The site URL is read from DR_DROA_URL and the username from DR_DROA_USERNAME
(both set in system.env), or supplied via --url / --username.

Model status values
-------------------
  DELETING  ERROR  EVALUATING  IMPORTING  QUEUED  READY
  STOPPING  SUBMITTING  TRAINING

Training status values
----------------------
  CANCELED  COMPLETED  FAILED  IN_PROGRESS  INITIALIZING  QUEUED  STOPPING
"""

import argparse
import getpass
import json
import os
import sys

import requests

from auth import (
    add_common_args, authenticate, build_auth, get_aws_credentials, load_droa_config,
    load_cached_credentials, save_credentials_to_cache,
)


def list_models(cfg, credentials: dict) -> list:
    """Fetch all models, auto-paginating via token."""
    url = f"{cfg.api_endpoint}/models"
    models = []
    token = None
    while True:
        params = {"token": token} if token else {}
        response = requests.get(
            url, params=params, auth=build_auth(url, credentials, cfg.region, cfg.site_url), timeout=30
        )
        if not response.ok:
            raise RuntimeError(
                f"API error: {response.status_code} {response.reason}\n"
                f"Headers: {dict(response.headers)}\n"
                f"Body: {response.text}"
            )
        data = response.json()
        models.extend(data.get("models", []))
        token = data.get("token")
        if not token:
            break
    return models


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="List models in DeepRacer on AWS.",
        epilog=(
            "examples:\n"
            "  %(prog)s\n"
            "  %(prog)s --json | jq '[.[] | {id: .modelId, name: .name}]'"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add_common_args(parser)
    parser.add_argument(
        "--json", dest="output_json", action="store_true",
        help="Output raw JSON instead of a table",
    )
    return parser.parse_args()


def main() -> None:
    args = parse_args()

    username = args.username or os.environ.get("DR_DROA_USERNAME")
    if not username:
        print("Error: --username or DR_DROA_USERNAME required.", file=sys.stderr)
        sys.exit(1)

    cfg = load_droa_config(args)

    credentials = load_cached_credentials(cfg.identity_pool_id, username)
    if credentials:
        print("Using cached credentials.", file=sys.stderr)
    else:
        password = args.password or getpass.getpass(
            f"Password for {username}: ")
        id_token = authenticate(cfg.region, cfg.client_id, username, password)
        credentials = get_aws_credentials(
            cfg.region, cfg.user_pool_id, cfg.identity_pool_id, id_token)
        save_credentials_to_cache(cfg.identity_pool_id, username, credentials)
    models = sorted(
        list_models(cfg, credentials),
        key=lambda m: m.get("createdAt") or "",
        reverse=True,
    )

    if args.output_json:
        print(json.dumps(models, indent=2, default=str))
        return

    if not models:
        print("No models found.")
        return

    id_w, name_w, status_w, tstatus_w = 15, 40, 16, 16
    header = (
        f"{'Model ID':<{id_w}}  {'Name':<{name_w}}  "
        f"{'Status':<{status_w}}  {'Training':<{tstatus_w}}  Created At"
    )
    print(header)
    print("-" * (id_w + name_w + status_w + tstatus_w + 30))
    for m in models:
        created = (m.get("createdAt") or "")[:19].replace("T", " ")
        print(
            f"{m.get('modelId', ''):<{id_w}}  "
            f"{m.get('name', ''):<{name_w}}  "
            f"{m.get('status', ''):<{status_w}}  "
            f"{m.get('trainingStatus', ''):<{tstatus_w}}  "
            f"{created}"
        )


if __name__ == "__main__":
    main()


================================================
FILE: scripts/evaluation/prepare-config.py
================================================
#!/usr/bin/python3

import boto3
from datetime import datetime
import sys
import os 
import time
import json
import io
import yaml

def str2bool(v):
  return v.lower() in ("yes", "true", "t", "1")

eval_time = datetime.now().strftime('%Y%m%d%H%M%S')

config = {}
config['CAR_COLOR'] = []
config['BODY_SHELL_TYPE'] = []
config['RACER_NAME'] = []
config['DISPLAY_NAME'] = []
config['MODEL_S3_PREFIX'] = []
config['MODEL_S3_BUCKET'] = []
config['SIMTRACE_S3_PREFIX'] = []
config['SIMTRACE_S3_BUCKET'] = []
config['KINESIS_VIDEO_STREAM_NAME'] = []
config['METRICS_S3_BUCKET'] = []
config['METRICS_S3_OBJECT_KEY'] = []
config['MP4_S3_BUCKET'] = []
config['MP4_S3_OBJECT_PREFIX'] = []

# Basic configuration; including all buckets etc.
config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1')
config['JOB_TYPE'] = 'EVALUATION'
config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', '')
config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy')

s3_container_endpoint_url = os.environ.get('DR_MINIO_URL', None)
if s3_container_endpoint_url is not None:
    config['S3_ENDPOINT_URL'] = s3_container_endpoint_url

config['MODEL_S3_PREFIX'].append(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'))
config['MODEL_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
config['SIMTRACE_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
config['SIMTRACE_S3_PREFIX'].append(
    '{}/evaluation-{}'.format(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'), eval_time)
)

# Metrics
config['METRICS_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None)
if metrics_prefix is not None:
    config['METRICS_S3_OBJECT_KEY'].append('{}/evaluation/evaluation-{}.json'.format(metrics_prefix, eval_time))
else:
    config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(eval_time))
    
# MP4 configuration / sav
save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False"))
if save_mp4:
    config['MP4_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
    config['MP4_S3_OBJECT_PREFIX'].append('{}/{}'.format(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'bucket'),'mp4'))

# Checkpoint
config['EVAL_CHECKPOINT'] = os.environ.get('DR_EVAL_CHECKPOINT', 'last')

# Car and training 
body_shell_type = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer')
config['BODY_SHELL_TYPE'].append(body_shell_type)
config['CAR_COLOR'].append(os.environ.get('DR_CAR_COLOR', 'Red'))
config['DISPLAY_NAME'].append(os.environ.get('DR_DISPLAY_NAME', 'racer1'))
config['RACER_NAME'].append(os.environ.get('DR_RACER_NAME', 'racer1'))

config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL')
config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide')
config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5')
config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false')
config['RESET_BEHIND_DIST'] = os.environ.get('DR_EVAL_RESET_BEHIND_DIST', '1.0')

config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True')
config['NUMBER_OF_RESETS'] = os.environ.get('DR_EVAL_MAX_RESETS', '0')

config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0')
config['COLLISION_PENALTY'] = os.environ.get('DR_COLLISION_PENALTY', '5.0')

config['CAMERA_MAIN_ENABLE'] = os.environ.get('DR_CAMERA_MAIN_ENABLE', 'True')
config['CAMERA_SUB_ENABLE'] = os.environ.get('DR_CAMERA_SUB_ENABLE', 'True')
config['REVERSE_DIR'] = os.environ.get('DR_EVAL_REVERSE_DIRECTION', False)
config['ENABLE_EXTRA_KVS_OVERLAY'] = os.environ.get('DR_ENABLE_EXTRA_KVS_OVERLAY', 'False')

# Object Avoidance
if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE':
    config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6')
    config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0')
    config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True')
    config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false')
    config['OBSTACLE_TYPE'] = os.environ.get('DR_OA_OBSTACLE_TYPE', 'box_obstacle')

    object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "")
    if object_position_str != "":
        object_positions = []
        for o in object_position_str.split(";"):
            object_positions.append(o)
        config['OBJECT_POSITIONS'] = object_positions
        config['NUMBER_OF_OBSTACLES'] = str(len(object_positions))

# Head to Bot
if config['RACE_TYPE'] == 'HEAD_TO_BOT':
    config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False')
    config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0')
    config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0')
    config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0')
    config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0')
    config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0')
    config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False')
    config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2')
    config['PENALTY_SECONDS'] = os.environ.get('DR_H2B_BOT_CAR_PENALTY', '2.0')

# Head to Model
if config['RACE_TYPE'] == 'HEAD_TO_MODEL':
    config['MODEL_S3_PREFIX'].append(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'))
    config['MODEL_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
    config['SIMTRACE_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
    config['SIMTRACE_S3_PREFIX'].append(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'))

    # Metrics
    config['METRICS_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
    metrics_prefix = os.environ.get('DR_EVAL_OPP_S3_METRICS_PREFIX', '{}/{}'.format(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'),'metrics'))
    if metrics_prefix is not None:
        config['METRICS_S3_OBJECT_KEY'].append('{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time()))))
    else:
        config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time()))))

    # MP4 configuration / sav
    save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False"))
    if save_mp4:
        config['MP4_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
        config['MP4_S3_OBJECT_PREFIX'].append('{}/{}'.format(os.environ.get('DR_EVAL_OPP_MODEL_PREFIX', 'bucket'),'mp4'))

    # Car and training 
    config['DISPLAY_NAME'].append(os.environ.get('DR_EVAL_OPP_DISPLAY_NAME', 'racer1'))
    config['RACER_NAME'].append(os.environ.get('DR_EVAL_OPP_RACER_NAME', 'racer1'))

    body_shell_type = os.environ.get('DR_EVAL_OPP_CAR_BODY_SHELL_TYPE', 'deepracer')
    config['BODY_SHELL_TYPE'].append(body_shell_type)
    config['VIDEO_JOB_TYPE'] = 'EVALUATION'
    config['CAR_COLOR'] = ['Purple', 'Orange']    
    config['MODEL_NAME'] = config['DISPLAY_NAME']

# S3 Setup / write and upload file
s3_local_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None)
s3_region = config['AWS_REGION']
s3_bucket = config['MODEL_S3_BUCKET'][0]
s3_prefix = config['MODEL_S3_PREFIX'][0]
s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile')
if s3_mode == 'profile':
    s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default')
else: # mode is 'role'
    s3_profile = None
s3_yaml_name = os.environ.get('DR_LOCAL_S3_EVAL_PARAMS_FILE', 'eval_params.yaml')
yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name))

session = boto3.session.Session(profile_name=s3_profile)
s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_local_endpoint_url)

yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name))
local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'eval-params-' + str(round(time.time())) + '.yaml'))

with open(local_yaml_path, 'w') as yaml_file:
    yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True)

s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path)


================================================
FILE: scripts/evaluation/start.sh
================================================
#!/usr/bin/env bash

source $DR_DIR/bin/scripts_wrapper.sh

usage() {
  echo "Usage: $0 [-q] [-c]"
  echo "       -q        Quiet - does not start log tracing."
  echo "       -c        Clone - copies model into new prefix before evaluating."
  exit 1
}

trap ctrl_c INT

function ctrl_c() {
  echo "Requested to stop."
  exit 1
}

while getopts ":qc" opt; do
  case $opt in
  q)
    OPT_QUIET="QUIET"
    ;;
  c)
    OPT_CLONE="CLONE"
    ;;
  h)
    usage
    ;;
  \?)
    echo "Invalid option -$OPTARG" >&2
    usage
    ;;
  esac
done

## Check if WSL2
if [[ -f /proc/version ]] && grep -qi Microsoft /proc/version && grep -q "WSL2" /proc/version; then
    IS_WSL2="yes"
fi

# set evaluation specific environment variables
STACK_NAME="deepracer-eval-$DR_RUN_ID"
STACK_CONTAINERS=$(docker stack ps $STACK_NAME 2>/dev/null | wc -l)
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
  if [[ "$STACK_CONTAINERS" -gt 1 ]]; then
    echo "ERROR: Processes running in stack $STACK_NAME. Stop evaluation with dr-stop-evaluation."
    exit 1
  fi
fi

# Ensure Sagemaker's folder is there
_dr_ensure_sagemaker_dir

echo "Evaluation of model s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX starting."
echo "Using image ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}"
echo ""

# clone if required
if [ -n "$OPT_CLONE" ]; then
  echo "Cloning model into s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E"
  aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX/model s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E/model
  aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX/ip s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E/ip
  export DR_LOCAL_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX}-E
fi

# set evaluation specific environment variables
S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX"

export ROBOMAKER_COMMAND="/opt/ml/code/run.sh run evaluation.launch.py"
export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_EVAL_PARAMS_FILE}

if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; then
  COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml"
  export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX"
  mkdir -p $DR_MOUNT_DIR
else
  COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE"
fi

echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE"
python3 $DR_DIR/scripts/evaluation/prepare-config.py

# Check if we are using Host X -- ensure variables are populated
if [[ "${DR_HOST_X,,}" == "true" ]]; then
  if [[ -n "$DR_DISPLAY" ]]; then
    ROBO_DISPLAY=$DR_DISPLAY
  else
    ROBO_DISPLAY=$DISPLAY
  fi

  if ! DISPLAY=$ROBO_DISPLAY timeout 1s xset q &>/dev/null; then
    echo "No X Server running on display $ROBO_DISPLAY. Exiting"
    exit 1
  fi

  if [[ -z "$XAUTHORITY" && "$IS_WSL2" != "yes" ]]; then
    export XAUTHORITY=~/.Xauthority
    if [[ ! -f "$XAUTHORITY" ]]; then
      echo "No XAUTHORITY defined. .Xauthority does not exist. Stopping."
      exit 1
    fi
  fi
fi

# Check if we will use Docker Swarm or Docker Compose
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then

  if [ "$DR_DOCKER_MAJOR_VERSION" -gt 24 ]; then
    DETACH_FLAG="--detach=true"
  fi

  DISPLAY=$ROBO_DISPLAY docker stack deploy $COMPOSE_FILES $DETACH_FLAG $STACK_NAME
else
  DISPLAY=$ROBO_DISPLAY docker compose $COMPOSE_FILES -p $STACK_NAME up -d
fi

# Request to be quiet. Quitting here.
if [ -n "$OPT_QUIET" ]; then
  exit 0
fi

# Trigger requested log-file
dr-logs-robomaker -w 15 -e


================================================
FILE: scripts/evaluation/stop.sh
================================================
#!/usr/bin/env bash

STACK_NAME="deepracer-eval-$DR_RUN_ID"

# Check if we will use Docker Swarm or Docker Compose
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
    docker stack rm $STACK_NAME
else
    COMPOSE_FILES=$(echo ${DR_EVAL_COMPOSE_FILE} | cut -f1-2 -d\ )
    export DR_CURRENT_PARAMS_FILE=""
    docker compose $COMPOSE_FILES -p $STACK_NAME down
fi


================================================
FILE: scripts/log-analysis/start.sh
================================================
#!/usr/bin/env bash

if docker ps --filter "name=deepracer-analysis" --format "{{.Names}}" | grep -q "^deepracer-analysis$"; then
  echo "Log-analysis is already running. Use dr-url-loganalysis to get the URL."
  exit 0
fi

echo "Starting log-analysis container (image: awsdeepracercommunity/deepracer-analysis:${DR_ANALYSIS_IMAGE})..."
docker run --rm -d -p "8888:8888" \
-v $DR_DIR/data/logs:/workspace/logs \
-v $DR_DIR/docker/volumes/.aws:/home/ubuntu/.aws \
-v $DR_DIR/data/analysis:/workspace/analysis \
-v $DR_DIR/data/minio:/workspace/minio \
--name deepracer-analysis \
--network sagemaker-local \
 awsdeepracercommunity/deepracer-analysis:$DR_ANALYSIS_IMAGE > /dev/null

echo "Waiting for Jupyter to start..."
for i in $(seq 1 30); do
  URL=$(docker logs deepracer-analysis 2>&1 | grep -oE 'http://127\.0\.0\.1:[0-9]+[^ ]*token=[a-f0-9]+' | tail -1)
  if [ -n "$URL" ]; then
    echo "Log-analysis is running. Open in browser:"
    echo "  ${URL/127.0.0.1/localhost}"
    exit 0
  fi
  sleep 1
done
echo "Log-analysis started. Use dr-url-loganalysis to get the URL once ready."

================================================
FILE: scripts/log-analysis/stop.sh
================================================
#!/usr/bin/env bash

echo "Stopping log-analysis container..."
if docker stop deepracer-analysis > /dev/null 2>&1; then
  echo "Log-analysis stopped."
else
  echo "Log-analysis is not running."
fi


================================================
FILE: scripts/metrics/start.sh
================================================
#!/usr/bin/env bash

COMPOSE_FILES=./docker/docker-compose-metrics.yml

docker compose -f $COMPOSE_FILES -p deepracer-metrics up -d

================================================
FILE: scripts/metrics/stop.sh
================================================
#!/usr/bin/env bash

COMPOSE_FILES=./docker/docker-compose-metrics.yml

docker compose -f $COMPOSE_FILES -p deepracer-metrics down

================================================
FILE: scripts/training/increment.sh
================================================
#!/usr/bin/env bash

usage() {
    echo "Usage: $0 [-f] [-w] [-p <model-prefix>] [-d <delimiter>]"
    echo ""
    echo "Command will set the current model to be the pre-trained model and increment a numerical suffix."
    echo "-p model  Sets the to-be name to be <model-prefix> rather than auto-incremeneting the previous model."
    echo "-d delim  Delimiter in model-name (e.g. '-' in 'test-model-1')"
    echo "-f        Force. Ask for no confirmations."
    echo "-w        Wipe the S3 prefix to ensure that two models are not mixed."
    exit 1
}

trap ctrl_c INT

function ctrl_c() {
    echo "Requested to stop."
    exit 1
}

OPT_DELIM='-'

while getopts ":fwp:d:" opt; do
    case $opt in

    f)
        OPT_FORCE="True"
        ;;
    p)
        OPT_PREFIX="$OPTARG"
        ;;
    w)
        OPT_WIPE="--delete"
        ;;
    d)
        OPT_DELIM="$OPTARG"
        ;;
    h)
        usage
        ;;
    \?)
        echo "Invalid option -$OPTARG" >&2
        usage
        ;;
    esac
done

CONFIG_FILE=$DR_CONFIG
echo "Configuration file $CONFIG_FILE will be updated."

## Read in data
CURRENT_RUN_MODEL=$(grep -e "^DR_LOCAL_S3_MODEL_PREFIX" ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }')
CURRENT_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL}" |
    awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }')
if [[ -n ${OPT_PREFIX} ]]; then
    NEW_RUN_MODEL="${OPT_PREFIX}"
else
    if [[ -z ${CURRENT_RUN_MODEL_NUM} ]]; then
        NEW_RUN_MODEL="${CURRENT_RUN_MODEL}${OPT_DELIM}1"
    else
        NEW_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL_NUM} + 1" | bc)
        NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/")
    fi
fi

if [[ -n "${NEW_RUN_MODEL}" ]]; then
    echo "Incrementing model from ${CURRENT_RUN_MODEL} to ${NEW_RUN_MODEL}"
    if [[ -z "${OPT_FORCE}" ]]; then
        read -r -p "Are you sure? [y/N] " response
        if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
            echo "Aborting."
            exit 1
        fi
    fi
    sed -i.bak -re "s/(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$/\1$CURRENT_RUN_MODEL/g; s/(DR_LOCAL_S3_PRETRAINED=).*$/\1True/g; ; s/(DR_LOCAL_S3_MODEL_PREFIX=).*$/\1$NEW_RUN_MODEL/g" "$CONFIG_FILE" && echo "Done."
else
    echo "Error in determining new model. Aborting."
    exit 1
fi

if [[ -n "${OPT_WIPE}" ]]; then
    MODEL_DIR_S3=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL})
    if [[ -n "${MODEL_DIR_S3}" ]]; then
        echo "The new model's S3 prefix s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} exists. Will wipe."
    fi
    if [[ -z "${OPT_FORCE}" ]]; then
        read -r -p "Are you sure? [y/N] " response
        if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
            echo "Aborting."
            exit 1
        fi
    fi
    aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} --recursive
fi


================================================
FILE: scripts/training/prepare-config.py
================================================
#!/usr/bin/python3

from datetime import datetime
import boto3
import sys
import os 
import time
import json
import io
import yaml

train_time = datetime.now().strftime('%Y%m%d%H%M%S')

config = {}
config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1')
config['JOB_TYPE'] = 'TRAINING'
config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', '')
config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')

s3_container_endpoint_url = os.environ.get('DR_MINIO_URL', None)
if s3_container_endpoint_url is not None:
    config['S3_ENDPOINT_URL'] = s3_container_endpoint_url

metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None)
if metrics_prefix is not None:
    config['METRICS_S3_OBJECT_KEY'] = '{}/TrainingMetrics.json'.format(metrics_prefix)
else:
    config['METRICS_S3_OBJECT_KEY'] = 'DeepRacer-Metrics/TrainingMetrics-{}.json'.format(train_time)

config['MODEL_METADATA_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_MODEL_METADATA_KEY', 'custom_files/model_metadata.json') 
config['REWARD_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_REWARD_KEY', 'custom_files/reward_function.py')
config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy')
config['NUM_WORKERS'] = os.environ.get('DR_WORKERS', 1)
config['SAGEMAKER_SHARED_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')
config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')
config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')
config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')
config['TRAINING_JOB_ARN'] = 'arn:Dummy'

# Car and training 
config['BODY_SHELL_TYPE'] = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer')
config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red')
config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar')
config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL')
config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide')
config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1')
config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1')

config['REVERSE_DIR'] = os.environ.get('DR_TRAIN_REVERSE_DIRECTION', False)
config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION', os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false'))
config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', os.environ.get('DR_CHANGE_START_POSITION', 'true'))
config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05')
config['START_POSITION_OFFSET'] = os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00')
config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false')
config['MIN_EVAL_TRIALS'] = os.environ.get('DR_TRAIN_MIN_EVAL_TRIALS', '5')
config['CAMERA_MAIN_ENABLE'] = os.environ.get('DR_CAMERA_MAIN_ENABLE', 'True')
config['CAMERA_SUB_ENABLE'] = os.environ.get('DR_CAMERA_SUB_ENABLE', 'True')
config['BEST_MODEL_METRIC'] = os.environ.get('DR_TRAIN_BEST_MODEL_METRIC', 'progress')
config['ENABLE_EXTRA_KVS_OVERLAY'] = os.environ.get('DR_ENABLE_EXTRA_KVS_OVERLAY', 'False')

# Object Avoidance
if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE':
    config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6')
    config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0')
    config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True')
    config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false')
    config['OBSTACLE_TYPE'] = os.environ.get('DR_OA_OBSTACLE_TYPE', 'box_obstacle')

    object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "")
    if object_position_str != "":
        object_positions = []
        for o in object_position_str.split(";"):
            object_positions.append(o)
        config['OBJECT_POSITIONS'] = object_positions
        config['NUMBER_OF_OBSTACLES'] = str(len(object_positions))

# Head to Bot
if config['RACE_TYPE'] == 'HEAD_TO_BOT':
    config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False')
    config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0')
    config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0')
    config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0')
    config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0')
    config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0')
    config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False')
    config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2')
    config['PENALTY_SECONDS'] = os.environ.get('DR_H2B_BOT_CAR_PENALTY', '2.0')

s3_local_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None)
s3_region = config['AWS_REGION']
s3_bucket = config['SAGEMAKER_SHARED_S3_BUCKET']
s3_prefix = config['SAGEMAKER_SHARED_S3_PREFIX']
s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile')
if s3_mode == 'profile':
    s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default')
else: # mode is 'role'
    s3_profile = None
s3_yaml_name = os.environ.get('DR_LOCAL_S3_TRAINING_PARAMS_FILE', 'training_params.yaml')
yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name))

session = boto3.session.Session(profile_name=s3_profile)
s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_local_endpoint_url)

yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name))
local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'training-params-' + train_time + '.yaml'))

with open(local_yaml_path, 'w') as yaml_file:
    yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True)

# Copy the reward function to the s3 prefix bucket for compatability with DeepRacer console.
reward_function_key = os.path.normpath(os.path.join(s3_prefix, "reward_function.py"))
copy_source = {
    'Bucket': s3_bucket,
    'Key': config['REWARD_FILE_S3_KEY']
}
s3_client.copy(copy_source, Bucket=s3_bucket, Key=reward_function_key)

# Training with different configurations on each worker (aka Multi Config training)
config['MULTI_CONFIG'] = os.environ.get('DR_TRAIN_MULTI_CONFIG', 'False')
num_workers = int(config['NUM_WORKERS'])

if config['MULTI_CONFIG'] == "True" and num_workers > 1:
    
    multi_config = {}
    multi_config['multi_config'] = [None] * num_workers

    for i in range(1,num_workers+1,1):
        if i == 1:
            # copy training_params to training_params_1
            s3_yaml_name_list = s3_yaml_name.split('.')
            s3_yaml_name_temp = s3_yaml_name_list[0] + "_%d.yaml" % i

            #upload additional training params files
            yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name_temp))
            s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path)            

            # Store in multi_config array
            multi_config['multi_config'][i - 1] = {'config_file': s3_yaml_name_temp,
                                                             'world_name': config['WORLD_NAME']}

        else:  # i >= 2 
            #read in additional configuration file.  format of file must be worker#-run.env
            if os.environ.get('DR_EXPERIMENT_NAME'):
                location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'experiments', os.environ.get('DR_EXPERIMENT_NAME'),'worker-{}.env'.format(i)))
            else:
                location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i)))
            with open(location, 'r') as fh:
                vars_dict = dict(
                    tuple(line.split('='))
                    for line in fh.read().splitlines() if not line.startswith('#')
                    )

            # Reset parameters for the configuration of this worker number
            os.environ.update(vars_dict)

            # Update car and training parameters
            config.update({'WORLD_NAME': os.environ.get('DR_WORLD_NAME')})
            config.update({'RACE_TYPE': os.environ.get('DR_RACE_TYPE')})
            config.update({'CAR_COLOR': os.environ.get('DR_CAR_COLOR')})
            config.update({'BODY_SHELL_TYPE': os.environ.get('DR_CAR_BODY_SHELL_TYPE')})
            config.update({'ALTERNATE_DRIVING_DIRECTION': os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION')})
            config.update({'CHANGE_START_POSITION': os.environ.get('DR_TRAIN_CHANGE_START_POSITION')})
            config.update({'ROUND_ROBIN_ADVANCE_DIST': os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST')})
            config.update({'ENABLE_DOMAIN_RANDOMIZATION': os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION')})
            config.update({'START_POSITION_OFFSET': os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00')})
            config.update({'REVERSE_DIR': os.environ.get('DR_TRAIN_REVERSE_DIRECTION', False)})
            config.update({'CAMERA_MAIN_ENABLE': os.environ.get('DR_CAMERA_MAIN_ENABLE', 'True')})
            config.update({'CAMERA_SUB_ENABLE': os.environ.get('DR_CAMERA_SUB_ENABLE', 'True')})  
            config.update({'ENABLE_EXTRA_KVS_OVERLAY': os.environ.get('DR_ENABLE_EXTRA_KVS_OVERLAY', 'False')})

            
            # Update Object Avoidance parameters
            if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE':
                config.update({'NUMBER_OF_OBSTACLES': os.environ.get('DR_OA_NUMBER_OF_OBSTACLES')})
                config.update({'MIN_DISTANCE_BETWEEN_OBSTACLES': os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES')})
                config.update({'RANDOMIZE_OBSTACLE_LOCATIONS': os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS')})
                config.update({'IS_OBSTACLE_BOT_CAR': os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR')})
                config.update({'OBSTACLE_TYPE': os.environ.get('DR_OA_OBSTACLE_TYPE', 'box_obstacle')})

                object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "")
                if object_position_str != "":
                    object_positions = []
                    for o in object_position_str.replace('"','').split(";"):
                        object_positions.append(o)
                    config.update({'OBJECT_POSITIONS': object_positions})
                    config.update({'NUMBER_OF_OBSTACLES': str(len(object_positions))})
                else:
                    config.pop('OBJECT_POSITIONS',[])
            else:
                config.pop('NUMBER_OF_OBSTACLES', None)
                config.pop('MIN_DISTANCE_BETWEEN_OBSTACLES', None)
                config.pop('RANDOMIZE_OBSTACLE_LOCATIONS', None)
                config.pop('IS_OBSTACLE_BOT_CAR', None)
                config.pop('OBJECT_POSITIONS',[])

            # Update Head to Bot parameters
            if config['RACE_TYPE'] == 'HEAD_TO_BOT':
                config.update({'IS_LANE_CHANGE': os.environ.get('DR_H2B_IS_LANE_CHANGE')})
                config.update({'LOWER_LANE_CHANGE_TIME': os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME')})
                config.update({'UPPER_LANE_CHANGE_TIME': os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME')})
                config.update({'LANE_CHANGE_DISTANCE': os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE')})
                config.update({'NUMBER_OF_BOT_CARS': os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS')})
                config.update({'MIN_DISTANCE_BETWEEN_BOT_CARS': os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS')})
                config.update({'RANDOMIZE_BOT_CAR_LOCATIONS': os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS')})
                config.update({'BOT_CAR_SPEED': os.environ.get('DR_H2B_BOT_CAR_SPEED')})
                config.update({'PENALTY_SECONDS': os.environ.get('DR_H2B_BOT_CAR_PENALTY')})
            else:
                config.pop('IS_LANE_CHANGE', None)
                config.pop('LOWER_LANE_CHANGE_TIME', None)
                config.pop('UPPER_LANE_CHANGE_TIME', None)
                config.pop('LANE_CHANGE_DISTANCE', None)
                config.pop('NUMBER_OF_BOT_CARS', None)
                config.pop('MIN_DISTANCE_BETWEEN_BOT_CARS', None)
                config.pop('RANDOMIZE_BOT_CAR_LOCATIONS', None)
                config.pop('BOT_CAR_SPEED', None)

            #split string s3_yaml_name, insert the worker number, and add back on the .yaml extension
            s3_yaml_name_list = s3_yaml_name.split('.')
            s3_yaml_name_temp = s3_yaml_name_list[0] + "_%d.yaml" % i

            #upload additional training params files
            yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name_temp))
            local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'training-params-' + train_time + '-' + str(i) + '.yaml'))
            with open(local_yaml_path, 'w') as yaml_file:
                yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True)
            s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path)

            # Store in multi_config array
            multi_config['multi_config'][i - 1] = {'config_file': s3_yaml_name_temp,
                                                             'world_name': config['WORLD_NAME']}

    print(json.dumps(multi_config))

else:
    s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path)


================================================
FILE: scripts/training/start.sh
================================================
#!/usr/bin/env bash

source $DR_DIR/bin/scripts_wrapper.sh

usage() {
  echo "Usage: $0 [-w] [-q | -s | -r [n] | -a ] [-v]"
  echo "       -w        Wipes the target AWS DeepRacer model structure before upload."
  echo "       -q        Do not output / follow a log when starting."
  echo "       -a        Follow all Sagemaker and Robomaker logs."
  echo "       -s        Follow Sagemaker logs (default)."
  echo "       -v        Updates the viewer webpage."
  echo "       -r [n]    Follow Robomaker logs for worker n (default worker 0 / replica 1)."
  exit 1
}

trap ctrl_c INT

function ctrl_c() {
  echo "Requested to stop."
  exit 1
}

OPT_DISPLAY="SAGEMAKER"

while getopts ":whqsavr:" opt; do
  case $opt in
  w)
    OPT_WIPE="WIPE"
    ;;
  q)
    OPT_QUIET="QUIET"
    ;;
  s)
    OPT_DISPLAY="SAGEMAKER"
    ;;
  a)
    OPT_DISPLAY="ALL"
    ;;
  r) # Check if value is in numeric format.
    OPT_DISPLAY="ROBOMAKER"
    if [[ $OPTARG =~ ^[0-9]+$ ]]; then
      OPT_ROBOMAKER=$OPTARG
    else
      OPT_ROBOMAKER=0
      ((OPTIND--))
    fi
    ;;
  v)
    OPT_VIEWER="VIEWER"
    ;;
  h)
    usage
    ;;
  \?)
    echo "Invalid option -$OPTARG" >&2
    usage
    ;;
  esac
done

## Check if WSL2
if [[ -f /proc/version ]] && grep -qi Microsoft /proc/version && grep -q "WSL2" /proc/version; then
    IS_WSL2="yes"
fi

# Ensure Sagemaker's folder is there
_dr_ensure_sagemaker_dir

# set evaluation specific environment variables
STACK_NAME="deepracer-$DR_RUN_ID"
STACK_CONTAINERS=$(docker stack ps $STACK_NAME 2>/dev/null | wc -l)
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
  if [[ "$STACK_CONTAINERS" -gt 1 ]]; then
    echo "ERROR: Processes running in stack $STACK_NAME. Stop training with dr-stop-training."
    exit 1
  fi
fi

# Check if metadata-files are available
WORK_DIR=${DR_DIR}/tmp/start/
mkdir -p ${WORK_DIR}
rm -f ${WORK_DIR}/*

REWARD_FILE=""
METADATA_FILE=""
HYPERPARAM_FILE=""

aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${DR_LOCAL_S3_BUCKET}/${DR_LOCAL_S3_REWARD_KEY} ${WORK_DIR} --no-progress >/dev/null 2>&1
aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${DR_LOCAL_S3_BUCKET}/${DR_LOCAL_S3_MODEL_METADATA_KEY} ${WORK_DIR} --no-progress >/dev/null 2>&1
aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${DR_LOCAL_S3_BUCKET}/${DR_LOCAL_S3_HYPERPARAMETERS_KEY} ${WORK_DIR} --no-progress >/dev/null 2>&1

if [ -f "${WORK_DIR}/$(basename "$DR_LOCAL_S3_REWARD_KEY")" ]; then
  REWARD_FILE=$(_realpath "${WORK_DIR}/$(basename "$DR_LOCAL_S3_REWARD_KEY")")
fi

if [ -f "${WORK_DIR}/$(basename "$DR_LOCAL_S3_MODEL_METADATA_KEY")" ]; then
  METADATA_FILE=$(_realpath "${WORK_DIR}/$(basename "$DR_LOCAL_S3_MODEL_METADATA_KEY")")
fi

if [ -f "${WORK_DIR}/$(basename "$DR_LOCAL_S3_HYPERPARAMETERS_KEY")" ]; then
  HYPERPARAM_FILE=$(_realpath "${WORK_DIR}/$(basename "$DR_LOCAL_S3_HYPERPARAMETERS_KEY")")
fi

if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; then
  echo "Training of model s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX starting."
  echo "Using configuration files:"
  echo "   s3://${DR_LOCAL_S3_BUCKET}/${DR_LOCAL_S3_REWARD_KEY}"
  echo "   s3://${DR_LOCAL_S3_BUCKET}/${DR_LOCAL_S3_MODEL_METADATA_KEY}"
  echo "   s3://${DR_LOCAL_S3_BUCKET}/${DR_LOCAL_S3_HYPERPARAMETERS_KEY}"
  echo "Using image ${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}"
  echo ""
else
  echo "Training aborted. Configuration files were not found."
  echo "Manually check that the following files exist:"
  echo "   s3://${DR_LOCAL_S3_BUCKET}/${DR_LOCAL_S3_REWARD_KEY}"
  echo "   s3://${DR_LOCAL_S3_BUCKET}/${DR_LOCAL_S3_MODEL_METADATA_KEY}"
  echo "   s3://${DR_LOCAL_S3_BUCKET}/${DR_LOCAL_S3_HYPERPARAMETERS_KEY}"
  echo "You might have to run dr-upload-custom files."
  exit 1
fi

# Check if model path exists.
S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX"

S3_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls ${S3_PATH} | wc -l)
if [[ "$S3_FILES" -gt 0 ]]; then
  if [[ -z $OPT_WIPE ]]; then
    echo "Selected path $S3_PATH exists. Delete it, or use -w option. Exiting."
    exit 1
  else
    echo "Wiping path $S3_PATH."
    aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm --recursive ${S3_PATH}
  fi
fi

# Base compose file
if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; then
  COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml"
  export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX"
  mkdir -p $DR_MOUNT_DIR
else
  COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE"
fi

export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE}

WORKER_CONFIG=$(python3 $DR_DIR/scripts/training/prepare-config.py)

if [ "$DR_WORKERS" -gt 1 ]; then
  echo "Starting $DR_WORKERS workers"

  if [[ "${DR_DOCKER_STYLE,,}" != "swarm" ]]; then
    mkdir -p $DR_DIR/tmp/comms.$DR_RUN_ID
    rm -rf $DR_DIR/tmp/comms.$DR_RUN_ID/*
    COMPOSE_FILES="$COMPOSE_FILES $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-robomaker-multi.yml"
  fi

  if [ "$DR_TRAIN_MULTI_CONFIG" == "True" ]; then
    export MULTI_CONFIG=$WORKER_CONFIG
    echo "Multi-config training, creating multiple Robomaker configurations in $S3_PATH"
  else
    echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE"
  fi
  export ROBOMAKER_COMMAND="/opt/ml/code/run.sh multi distributed_training.launch.py"

else
  export ROBOMAKER_COMMAND="/opt/ml/code/run.sh run distributed_training.launch.py"
  echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE"
fi

# Check if we are using Host X -- ensure variables are populated
if [[ "${DR_HOST_X,,}" == "true" ]]; then
  if [[ -n "$DR_DISPLAY" ]]; then
    ROBO_DISPLAY=$DR_DISPLAY
  else
    ROBO_DISPLAY=$DISPLAY
  fi

  if ! DISPLAY=$ROBO_DISPLAY timeout 1s xset q &>/dev/null; then
    echo "No X Server running on display $ROBO_DISPLAY. Exiting"
    exit 1
  fi

  if [[ -z "$XAUTHORITY" && "$IS_WSL2" != "yes" ]]; then
    export XAUTHORITY=~/.Xauthority
    if [[ ! -f "$XAUTHORITY" ]]; then
      echo "No XAUTHORITY defined. .Xauthority does not exist. Stopping."
      exit 1
    fi
  fi

fi

# Check if we will use Docker Swarm or Docker Compose
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
  ROBOMAKER_NODES=$(docker node ls --format '{{.ID}}' | xargs docker inspect | jq '.[] | select (.Spec.Labels.Robomaker == "true") | .ID' | wc -l)
  if [[ "$ROBOMAKER_NODES" -eq 0 ]]; then
    echo "ERROR: No Swarm Nodes labelled for placement of Robomaker. Please add Robomaker node."
    echo "       Example: docker node update --label-add Robomaker=true $(docker node inspect self | jq .[0].ID -r)"
    exit 1
  fi

  SAGEMAKER_NODES=$(docker node ls --format '{{.ID}}' | xargs docker inspect | jq '.[] | select (.Spec.Labels.Sagemaker == "true") | .ID' | wc -l)
  if [[ "$SAGEMAKER_NODES" -eq 0 ]]; then
    echo "ERROR: No Swarm Nodes labelled for placement of Sagemaker. Please add Sagemaker node."
    echo "       Example: docker node update --label-add Sagemaker=true $(docker node inspect self | jq .[0].ID -r)"
    exit 1
  fi

  if [ "$DR_DOCKER_MAJOR_VERSION" -gt 24 ]; then
    DETACH_FLAG="--detach=true"
  fi

  DISPLAY=$ROBO_DISPLAY docker stack deploy $COMPOSE_FILES $DETACH_FLAG $STACK_NAME

else
  DISPLAY=$ROBO_DISPLAY docker compose $COMPOSE_FILES -p $STACK_NAME up -d --scale robomaker=$DR_WORKERS
fi

# Viewer
if [ -n "$OPT_VIEWER" ]; then
  (
    sleep 5
    dr-update-viewer
  )
fi

# Request to be quiet. Quitting here.
if [ -n "$OPT_QUIET" ]; then
  exit 0
fi

# Trigger requested log-file
if [[ "${OPT_DISPLAY,,}" == "all" && -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then
  dr-logs-sagemaker -w 15
  if [ "${DR_WORKERS}" -gt 1 ]; then
    for i in $(seq 1 ${DR_WORKERS}); do
      dr-logs-robomaker -w 15 -n $i
    done
  else
    dr-logs-robomaker -w 15
  fi
elif [[ "${OPT_DISPLAY,,}" == "robomaker" ]]; then
  dr-logs-robomaker -w 15 -n $OPT_ROBOMAKER
elif [[ "${OPT_DISPLAY,,}" == "sagemaker" ]]; then
  dr-logs-sagemaker -w 15
fi


================================================
FILE: scripts/training/stop.sh
================================================
#!/usr/bin/env bash
source $DR_DIR/bin/scripts_wrapper.sh

STACK_NAME="deepracer-$DR_RUN_ID"

SAGEMAKER_CONTAINERS=$(dr-find-sagemaker)

if [[ -n "$SAGEMAKER_CONTAINERS" ]]; then
    for CONTAINER in $SAGEMAKER_CONTAINERS; do
        CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER)
        if [[ -n "$CONTAINER_NAME" ]]; then
            echo Found Sagemaker as $CONTAINER_NAME
            if _dr_is_macos; then
                echo "Stopping container $CONTAINER_NAME"
                docker stop $CONTAINER || true
                docker container rm $CONTAINER -v >/dev/null 2>&1 || true
            else
                COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)-(algo-(.)-(.*))/; print $2')
                if [[ -n "$COMPOSE_SERVICE_NAME" ]]; then
                    COMPOSE_FILES=$(_dr_find_sagemaker_compose_files "$COMPOSE_SERVICE_NAME")
                    for COMPOSE_FILE in $COMPOSE_FILES; do
                        if _dr_compose_file_matches_run "$COMPOSE_FILE"; then
                            if [ "$DR_DOCKER_MAJOR_VERSION" -gt 24 ]; then
                                sudo sed -i '/^version:/d' $COMPOSE_FILE
                            fi

                            echo "Stopping service $COMPOSE_SERVICE_NAME"
                            sudo docker compose -f $COMPOSE_FILE stop $COMPOSE_SERVICE_NAME
                            docker container rm $CONTAINER -v >/dev/null 2>&1 || true
                        fi
                    done
                fi
            fi
        fi
    done
fi

# Check if we will use Docker Swarm or Docker Compose
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
    docker stack rm $STACK_NAME
else
    COMPOSE_FILES=$(echo ${DR_TRAIN_COMPOSE_FILE} | cut -f1-2 -d\ )
    export DR_CURRENT_PARAMS_FILE=""
    docker compose $COMPOSE_FILES -p $STACK_NAME down
fi


================================================
FILE: scripts/upload/download-model.sh
================================================
#!/usr/bin/env bash

usage() {
  echo "Usage: $0 [-f] [-w] [-d] -s <source-prefix> -t <target-prefix>"
  echo "       -f                Force download. No confirmation question."
  echo "       -w                Wipes the target AWS DeepRacer model structure before upload."
  echo "       -d                Dry-Run mode. Does not perform any write or delete operatios on target."
  echo "       -c                Copy config files into custom_files."
  echo "       -s source-url     Downloads model from specified S3 URL (s3://bucket/prefix)."
  echo "       -t target-prefix  Downloads model into specified prefix in local storage."
  exit 1
}

trap ctrl_c INT

function ctrl_c() {
  echo "Requested to stop."
  exit 1
}

while getopts "s:t:fwcdh" opt; do
  case $opt in
  f)
    OPT_FORCE="True"
    ;;
  c)
    OPT_CONFIG="Config"
    ;;
  d)
    OPT_DRYRUN="--dryrun"
    ;;
  w)
    OPT_WIPE="--delete"
    ;;
  t)
    OPT_TARGET="$OPTARG"
    ;;
  s)
    OPT_SOURCE="$OPTARG"
    ;;
  h)
    usage
    ;;
  \?)
    echo "Invalid option -$OPTARG" >&2
    usage
    ;;
  esac
done

if [[ -n "${OPT_DRYRUN}" ]]; then
  echo "*** DRYRUN MODE ***"
fi

SOURCE_S3_URL="${OPT_SOURCE}"

if [[ -z "${SOURCE_S3_URL}" ]]; then
  echo "No source URL to download model from."
  exit 1
fi

TARGET_S3_BUCKET=${DR_LOCAL_S3_BUCKET}
TARGET_S3_PREFIX=${OPT_TARGET}
if [[ -z "${TARGET_S3_PREFIX}" ]]; then
  echo "No target prefix defined. Exiting."
  exit 1
fi

SOURCE_REWARD_FILE_S3_KEY="${SOURCE_S3_URL}/reward_function.py"
SOURCE_HYPERPARAM_FILE_S3_KEY="${SOURCE_S3_URL}/ip/hyperparameters.json"
SOURCE_METADATA_S3_KEY="${SOURCE_S3_URL}/model/model_metadata.json"

WORK_DIR=${DR_DIR}/tmp/download
mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}/config ${WORK_DIR}/full

# Check if metadata-files are available
REWARD_FILE=""
METADATA_FILE=""
HYPERPARAM_FILE=""

aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_REWARD_FILE_S3_KEY}" ${WORK_DIR}/config/ --no-progress >/dev/null
aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_METADATA_S3_KEY}" ${WORK_DIR}/config/ --no-progress >/dev/null
aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_HYPERPARAM_FILE_S3_KEY}" ${WORK_DIR}/config/ --no-progress >/dev/null

if [ -f "${WORK_DIR}/config/$(basename "$SOURCE_REWARD_FILE_S3_KEY")" ]; then
  REWARD_FILE=$(_realpath "${WORK_DIR}/config/$(basename "$SOURCE_REWARD_FILE_S3_KEY")")
fi

if [ -f "${WORK_DIR}/config/$(basename "$SOURCE_METADATA_S3_KEY")" ]; then
  METADATA_FILE=$(_realpath "${WORK_DIR}/config/$(basename "$SOURCE_METADATA_S3_KEY")")
fi

if [ -f "${WORK_DIR}/config/$(basename "$SOURCE_HYPERPARAM_FILE_S3_KEY")" ]; then
  HYPERPARAM_FILE=$(_realpath "${WORK_DIR}/config/$(basename "$SOURCE_HYPERPARAM_FILE_S3_KEY")")
fi

if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; then
  echo "All meta-data files found. Source model ${SOURCE_S3_URL} valid."
else
  echo "Meta-data files are not found. Source model ${SOURCE_S3_URL} not valid. Exiting."
  exit 1
fi

# Upload files
if [[ -z "${OPT_FORCE}" ]]; then
  echo "Ready to download model ${SOURCE_S3_URL} to local ${TARGET_S3_PREFIX}"
  read -r -p "Are you sure? [y/N] " response
  if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
    echo "Aborting."
    exit 1
  fi
fi

cd ${WORK_DIR}
aws ${DR_UPLOAD_PROFILE} s3 sync "${SOURCE_S3_URL}" ${WORK_DIR}/full/ ${OPT_DRYRUN}
aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync ${WORK_DIR}/full/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ ${OPT_DRYRUN} ${OPT_WIPE}

if [[ -n "${OPT_CONFIG}" ]]; then
  echo "Copy configuration to custom_files"
  cp ${WORK_DIR}/config/* ${DR_DIR}/custom_files/
fi

echo "Done."


================================================
FILE: scripts/upload/increment.sh
================================================
#!/usr/bin/env bash

usage() {
    echo "Usage: $0 [-f] [-w] [-p <model-prefix>] [-d <delimiter>]"
    echo ""
    echo "Command will increment a numerical suffix on the current upload model."
    echo "-p model  Sets the to-be name to be <model-prefix> rather than auto-incremeneting the previous model."
    echo "-d delim  Delimiter in model-name (e.g. '-' in 'test-model-1')"
    echo "-f        Force. Ask for no confirmations."
    echo "-w        Wipe the S3 prefix to ensure that two models are not mixed."
    exit 1
}

trap ctrl_c INT

function ctrl_c() {
    echo "Requested to stop."
    exit 1
}

OPT_DELIM='-'

while getopts ":fwp:d:" opt; do
    case $opt in

    f)
        OPT_FORCE="True"
        ;;
    p)
        OPT_PREFIX="$OPTARG"
        ;;
    w)
        OPT_WIPE="--delete"
        ;;
    d)
        OPT_DELIM="$OPTARG"
        ;;
    h)
        usage
        ;;
    \?)
        echo "Invalid option -$OPTARG" >&2
        usage
        ;;
    esac
done

CONFIG_FILE=$DR_CONFIG
echo "Configuration file $CONFIG_FILE will be updated."

## Read in data
CURRENT_UPLOAD_MODEL=$(grep -e "^DR_UPLOAD_S3_PREFIX" ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }')
CURRENT_UPLOAD_MODEL_NUM=$(echo "${CURRENT_UPLOAD_MODEL}" |
    awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }')
if [[ -z ${CURRENT_UPLOAD_MODEL_NUM} ]]; then
    NEW_UPLOAD_MODEL="${CURRENT_UPLOAD_MODEL}${OPT_DELIM}1"
else
    NEW_UPLOAD_MODEL_NUM=$(echo "${CURRENT_UPLOAD_MODEL_NUM} + 1" | bc)
    NEW_UPLOAD_MODEL=$(echo $CURRENT_UPLOAD_MODEL | sed "s/${CURRENT_UPLOAD_MODEL_NUM}\$/${NEW_UPLOAD_MODEL_NUM}/")
fi

if [[ -n "${NEW_UPLOAD_MODEL}" ]]; then
    echo "Incrementing model from ${CURRENT_UPLOAD_MODEL} to ${NEW_UPLOAD_MODEL}"
    if [[ -z "${OPT_FORCE}" ]]; then
        read -r -p "Are you sure? [y/N] " response
        if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
            echo "Aborting."
            exit 1
        fi
    fi
    sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$NEW_UPLOAD_MODEL/g" "$CONFIG_FILE" && echo "Done."
else
    echo "Error in determining new model. Aborting."
    exit 1
fi

export DR_UPLOAD_S3_PREFIX=$(eval echo "${NEW_UPLOAD_MODEL}")

if [[ -n "${OPT_WIPE}" ]]; then
    MODEL_DIR_S3=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL})
    if [[ -n "${MODEL_DIR_S3}" ]]; then
        echo "The new model's S3 prefix s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} exists. Will wipe."
    fi
    if [[ -z "${OPT_FORCE}" ]]; then
        read -r -p "Are you sure? [y/N] " response
        if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
            echo "Aborting."
            exit 1
        fi
    fi
    aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} --recursive
fi


================================================
FILE: scripts/upload/prepare-config.py
================================================
#!/usr/bin/python3

import boto3
import sys
import os 
import time
import json
import io
import yaml

config = {}
config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1')
config['JOB_TYPE'] = 'TRAINING'
config['METRICS_S3_BUCKET'] = os.environ.get('TARGET_S3_BUCKET', 'bucket')
config['METRICS_S3_OBJECT_KEY'] = "{}/TrainingMetrics.json".format(os.environ.get('TARGET_S3_PREFIX', 'bucket'))
config['MODEL_METADATA_FILE_S3_KEY'] = "{}/model/model_metadata.json".format(os.environ.get('TARGET_S3_PREFIX', 'bucket'))
config['REWARD_FILE_S3_KEY'] = "{}/reward_function.py".format(os.environ.get('TARGET_S3_PREFIX', 'bucket'))
config['SAGEMAKER_SHARED_S3_BUCKET'] = os.environ.get('TARGET_S3_BUCKET', 'bucket')
config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('TARGET_S3_PREFIX', 'rl-deepracer-sagemaker')

# Car and training 
config['BODY_SHELL_TYPE'] = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer')
if config['BODY_SHELL_TYPE'] == 'deepracer':
    config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red')
config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar')
config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL')
config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide')
config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1')
config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1')

config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION', os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false'))
config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', os.environ.get('DR_CHANGE_START_POSITION', 'true'))
config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05')
config['START_POSITION_OFFSET'] = os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00')
config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false')
config['MIN_EVAL_TRIALS'] = os.environ.get('DR_TRAIN_MIN_EVAL_TRIALS', '5')

# Object Avoidance
if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE':
    config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6')
    config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0')
    config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True')
    config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false')

    object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "")
    if object_position_str != "":
        object_positions = []
        for o in object_position_str.split(";"):
            object_positions.append(o)
        config['OBJECT_POSITIONS'] = object_positions
        config['NUMBER_OF_OBSTACLES'] = str(len(object_positions))

# Head to Bot
if config['RACE_TYPE'] == 'HEAD_TO_BOT':
    config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False')
    config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0')
    config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0')
    config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0')
    config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0')
    config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0')
    config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False')
    config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2')

local_yaml_path = os.path.abspath(os.path.join(os.environ.get('WORK_DIR'),'training_params.yaml'))
print(local_yaml_path)
with open(local_yaml_path, 'w') as yaml_file:
    yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True)

================================================
FILE: scripts/upload/upload-car.sh
================================================
#!/usr/bin/env bash

usage() {
    echo "Usage: $0 [-L] [-f]"
    echo "       -f        Force. Do not ask for confirmation."
    echo "       -L        Upload model to the local S3 bucket."
    exit 1
}

trap ctrl_c INT

function ctrl_c() {
    echo "Requested to stop."
    exit 1
}

while getopts ":Lf" opt; do
    case $opt in
    L)
        OPT_LOCAL="Local"
        ;;
    f)
        OPT_FORCE="force"
        ;;
    h)
        usage
        ;;
    \?)
        echo "Invalid option -$OPTARG" >&2
        usage
        ;;
    esac
done

# This script creates the tar.gz file necessary to operate inside a deepracer physical car
# The file is created directly from within the sagemaker container, using the most recent checkpoint

# Find name of sagemaker container
SAGEMAKER_CONTAINERS=$(docker ps | awk ' /algo/ { print $1 } ' | xargs)
if [[ -n $SAGEMAKER_CONTAINERS ]]; then
    for CONTAINER in $SAGEMAKER_CONTAINERS; do
        CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER)
        CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1')
        echo "Found Sagemaker container: $CONTAINER_NAME"
    done
fi

#create tmp directory if it doesnt already exit
mkdir -p $DR_DIR/tmp/car_upload
cd $DR_DIR/tmp/car_upload
#ensure directory is empty
rm -r $DR_DIR/tmp/car_upload/*
#The files we want are located inside the sagemaker container at /opt/ml/model.  Copy them to the tmp directory
docker cp $CONTAINER_NAME:/opt/ml/model $DR_DIR/tmp/car_upload
cd $DR_DIR/tmp/car_upload/model
#create a tar.gz file containing all of these files
tar -czvf carfile.tar.gz *

# Upload files
if [[ -z "${OPT_FORCE}" ]]; then
    if [[ -n "${OPT_LOCAL}" ]]; then
        echo "Ready to upload car model to local s3://${DR_LOCAL_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}."
    else
        echo "Ready to upload car model to remote s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}."
    fi
    read -r -p "Are you sure? [y/N] " response
    if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
        echo "Aborting."
        exit 1
    fi
fi

#upload to s3
if [[ -n "${OPT_LOCAL}" ]]; then
    aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 cp carfile.tar.gz s3://${DR_LOCAL_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}/carfile.tar.gz
else
    aws ${DR_UPLOAD_PROFILE} s3 cp carfile.tar.gz s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}/carfile.tar.gz
fi


================================================
FILE: scripts/upload/upload-model.sh
================================================
#!/usr/bin/env bash

usage() {
  echo "Usage: $0 [-f] [-w] [-d] [-b] [-1] [-i] [-I] [-L] [-c <checkpoint>] [-p <model-prefix>]"
  echo "       -f        Force upload. No confirmation question."
  echo "       -w        Wipes the target AWS DeepRacer model structure before upload."
  echo "       -d        Dry-Run mode. Does not perform any write or delete operatios on target."
  echo "       -b        Uploads best checkpoint. Default is last checkpoint."
  echo "       -p model  Uploads model from specified S3 prefix."
  echo "       -1        Increment upload name with 1 (dr-increment-upload-model)"
  echo "       -L        Upload model to the local S3 bucket"
  exit 1
}

trap ctrl_c INT

function ctrl_c() {
  echo "Requested to stop."
  exit 1
}

while getopts ":fwdhbp:c:1L" opt; do
  case $opt in
  b)
    OPT_CHECKPOINT="Best"
    ;;
  c)
    OPT_CHECKPOINT_NUM="$OPTARG"
    ;;
  f)
    OPT_FORCE="-f"
    ;;
  d)
    OPT_DRYRUN="--dryrun"
    ;;
  p)
    OPT_PREFIX="$OPTARG"
    ;;
  w)
    OPT_WIPE="--delete"
    ;;
  L)
    OPT_LOCAL="Local"
    ;;
  1)
    OPT_INCREMENT="Yes"
    ;;
  h)
    usage
    ;;
  \?)
    echo "Invalid option -$OPTARG" >&2
    usage
    ;;
  esac
done

if [[ -n "${OPT_DRYRUN}" ]]; then
  echo "*** DRYRUN MODE ***"
fi

if [[ -n "${OPT_INCREMENT}" ]]; then
  source $DR_DIR/scripts/upload/increment.sh ${OPT_FORCE}
fi

SOURCE_S3_BUCKET=${DR_LOCAL_S3_BUCKET}
if [[ -n "${OPT_PREFIX}" ]]; then
  SOURCE_S3_MODEL_PREFIX=${OPT_PREFIX}
  SOURCE_S3_REWARD=${OPT_PREFIX}/reward_function.py
  SOURCE_S3_METRICS=${OPT_PREFIX}/metrics
  TARGET_S3_PREFIX=${OPT_PREFIX}
else
  SOURCE_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX}
  SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY}
  SOURCE_S3_METRICS=${DR_LOCAL_S3_METRICS_PREFIX}
  TARGET_S3_PREFIX=${DR_UPLOAD_S3_PREFIX}
fi

if [[ -z "${OPT_LOCAL}" ]]; then
  TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET}
  UPLOAD_PROFILE=${DR_UPLOAD_PROFILE}
else
  if [[ "${TARGET_S3_PREFIX}" = "${SOURCE_S3_MODEL_PREFIX}" ]]; then
    echo "Target equals source. Exiting."
    exit 1
  fi

  TARGET_S3_BUCKET=${DR_LOCAL_S3_BUCKET}
  UPLOAD_PROFILE=${DR_LOCAL_PROFILE_ENDPOINT_URL}
fi

if [[ -z "${TARGET_S3_BUCKET}" ]]; then
  echo "No upload bucket defined. Exiting."
  exit 1
fi

if [[ -z "${TARGET_S3_PREFIX}" ]]; then
  echo "No upload prefix defined. Exiting."
  exit 1
fi

export WORK_DIR=${DR_DIR}/tmp/upload/
rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model ${WORK_DIR}ip

# Upload information on model.
TARGET_PARAMS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/training_params.yaml"
TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/reward_function.py"
TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json"
TARGET_METRICS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/metrics/"

# Check if metadata-files are available
REWARD_IN_ROOT=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py 2>/dev/null | wc -l)
if [ "$REWARD_IN_ROOT" -ne 0 ]; then
  SOURCE_REWARD_BASENAME="reward_function.py"
  aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py ${WORK_DIR} --no-progress >/dev/null
else
  echo "Looking for Reward Function in s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD}"
  SOURCE_REWARD_BASENAME=$(basename "$SOURCE_S3_REWARD")
  aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD} ${WORK_DIR} --no-progress >/dev/null
fi

aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/model_metadata.json ${WORK_DIR} --no-progress >/dev/null
aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/ip/hyperparameters.json ${WORK_DIR} --no-progress >/dev/null
aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_METRICS} ${WORK_DIR}/metrics --no-progress >/dev/null

REWARD_FILE=""
METADATA_FILE=""
HYPERPARAM_FILE=""
METRICS_FILE=""

if [ -f "${WORK_DIR}${SOURCE_REWARD_BASENAME}" ]; then
  REWARD_FILE=$(_realpath "${WORK_DIR}${SOURCE_REWARD_BASENAME}")
fi

if [ -f "${WORK_DIR}model_metadata.json" ]; then
  METADATA_FILE=$(_realpath "${WORK_DIR}model_metadata.json")
fi

if [ -f "${WORK_DIR}hyperparameters.json" ]; then
  HYPERPARAM_FILE=$(_realpath "${WORK_DIR}hyperparameters.json")
fi

if find "${WORK_DIR}/metrics" -type f | grep -q .; then
  METRICS_FILE=$(_realpath "${WORK_DIR}/metrics")
fi

if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ] && [ -n "$METRICS_FILE" ]; then
  echo "All meta-data files found. Looking for checkpoint."
else
  echo "Meta-data files are not found. Exiting."
  exit 1
fi

# Download checkpoint file
echo "Looking for model to upload from s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/"
CHECKPOINT_INDEX=""
aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/deepracer_checkpoints.json ${WORK_DIR}model/ --no-progress >/dev/null

if [ -f "${WORK_DIR}model/deepracer_checkpoints.json" ]; then
  CHECKPOINT_INDEX=$(_realpath "${WORK_DIR}model/deepracer_checkpoints.json")
fi

if [ -z "$CHECKPOINT_INDEX" ]; then
  echo "No checkpoint file available at s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model. Exiting."
  exit 1
fi

if [ -n "$OPT_CHECKPOINT_NUM" ]; then
  echo "Checking for checkpoint $OPT_CHECKPOINT_NUM"
  export OPT_CHECKPOINT_NUM
  CHECKPOINT_FILE=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ | perl -ne'print "$1\n" if /.*\s($ENV{OPT_CHECKPOINT_NUM}_Step-[0-9]{1,7}\.ckpt)\.index/')
  CHECKPOINT=$(echo $CHECKPOINT_FILE | cut -f1 -d_)
  TIMESTAMP=$(date +%s)
  CHECKPOINT_JSON_PART=$(jq -n '{ checkpoint: { name: $name, time_stamp: $timestamp | tonumber, avg_comp_pct: 50.0 } }' --arg name $CHECKPOINT_FILE --arg timestamp $TIMESTAMP)
  CHECKPOINT_JSON=$(echo $CHECKPOINT_JSON_PART | jq '. | {last_checkpoint: .checkpoint, best_checkpoint: .checkpoint}')
elif [ -z "$OPT_CHECKPOINT" ]; then
  echo "Checking for latest tested checkpoint"
  CHECKPOINT_FILE=$(jq -r .last_checkpoint.name <$CHECKPOINT_INDEX)
  CHECKPOINT=$(echo $CHECKPOINT_FILE | cut -f1 -d_)
  CHECKPOINT_JSON=$(jq '. | {last_checkpoint: .last_checkpoint, best_checkpoint: .last_checkpoint}' <$CHECKPOINT_INDEX)
  echo "Latest checkpoint = $CHECKPOINT"
else
  echo "Checking for best checkpoint"
  CHECKPOINT_FILE=$(jq -r .best_checkpoint.name <$CHECKPOINT_INDEX)
  CHECKPOINT=$(echo $CHECKPOINT_FILE | cut -f1 -d_)
  CHECKPOINT_JSON=$(jq '. | {last_checkpoint: .best_checkpoint, best_checkpoint: .best_checkpoint}' <$CHECKPOINT_INDEX)
  echo "Best checkpoint: $CHECKPOINT"
fi

# Find checkpoint & model files - download
if [ -n "$CHECKPOINT" ]; then
  aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model/ --exclude "*" --include "${CHECKPOINT}*" --include "model_${CHECKPOINT}.pb" --include "deepracer_checkpoints.json" --no-progress >/dev/null
  CHECKPOINT_MODEL_FILE_COUNT=$(find "${WORK_DIR}model" -maxdepth 1 -type f \( -name "${CHECKPOINT}*" -o -name "model_${CHECKPOINT}.pb" -o -name "deepracer_checkpoints.json" \) | wc -l)
  if [ "$CHECKPOINT_MODEL_FILE_COUNT" -eq 0 ]; then
    echo "No model files found. Files possibly deleted. Try again."
    exit 1
  fi
  cp ${METADATA_FILE} ${WORK_DIR}model/
  #    echo "model_checkpoint_path: \"${CHECKPOINT_FILE}\"" | tee ${WORK_DIR}model/checkpoint
  echo ${CHECKPOINT_FILE} | tee ${WORK_DIR}model/.coach_checkpoint >/dev/null
else
  echo "Checkpoint not found. Exiting."
  exit 1
fi

# Create Training Params Yaml.
PARAMS_FILE=$(python3 $DR_DIR/scripts/upload/prepare-config.py)

# Upload files
if [[ -z "${OPT_FORCE}" ]]; then
  echo "Ready to upload model ${SOURCE_S3_MODEL_PREFIX} to s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/"
  read -r -p "Are you sure? [y/N] " response
  if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
    echo "Aborting."
    exit 1
  fi
fi

# echo "" > ${WORK_DIR}model/.ready
cd ${WORK_DIR}
echo ${CHECKPOINT_JSON} >${WORK_DIR}model/deepracer_checkpoints.json
aws ${UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE}
aws ${UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN}
aws ${UPLOAD_PROFILE} s3 sync ${WORK_DIR}/metrics/ ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN}
aws ${UPLOAD_PROFILE} s3 cp ${PARAMS_FILE} ${TARGET_PARAMS_FILE_S3_KEY} ${OPT_DRYRUN}
aws ${UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN}
aws ${UPLOAD_PROFILE} s3 cp ${METADATA_FILE} s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ ${OPT_DRYRUN}


================================================
FILE: scripts/viewer/index.template.html
================================================
<!DOCTYPE html>
<html lang="en">

<head>
    <title>DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX</title>
    <style>
        :root {
            --primary-color: #500280;
        }


        body {
            display: block;
            margin: 0;
            background: #161e2d;
            color: #ffffff;
            font-family: "Amazon Ember", "Helvetica Neue", Roboto, Arial, sans-serif;
            font-size: 16px;
            font-weight: 400;
        }

        input {
            width: 100px;
        }

        .container {
            display: flex;
            flex-direction: column;
            position: absolute;
            top: 42px;
            bottom: 0;
            left: 0;
            right: 0;
        }

        .navbar {
            position: fixed;
            display: flex;
            justify-content: space-between;
            top: 0;
            left: 0;
            right: 0;
            z-index: 2;
            background: var(--primary-color);
            box-shadow: rgba(0, 0, 0, 0.2) 0px 3px 5px -1px, rgba(0, 0, 0, 0.14) 0px 6px 10px 0px, rgba(0, 0, 0, 0.12) 0px 1px 18px 0px;
        }

        h1.navbar-header {
            font-weight: 750;
            font-size: 1.125rem;
            display: flex;
            flex-wrap: wrap;
            align-items: center;
            padding: 12px 16px;
            margin: 0;
        }

        #main-container {
            justify-content: center;
            align-items: center;
            display: flex;
            flex-direction: row;
            flex-wrap: wrap;
            padding: 16px;
        }

        .card {
            margin: 8px;
            box-shadow: rgba(0, 0, 0, 0.2) 0px 2px 1px -1px, rgba(0, 0, 0, 0.14) 0px 1px 1px 0px, rgba(0, 0, 0, 0.12) 0px 1px 3px 0px;
            transition: box-shadow 280ms cubic-bezier(0.4, 0, 0.2, 1);
            border-radius: 4px;
            display: block;
            position: relative;
        }

        .card-img {
            border-radius: 4px;
        }

        .select {
            display: flex;
            align-items: center;
            margin-right: 1rem;
        }

        .select-options {
            display: flex;
            align-items: center;
        }

        label {
            margin-right: 0.5rem;
        }

        h2 {
            margin-left: 0.3rem;
            margin-top: 0.3rem;
            margin-bottom: 0.1rem;
        }

        h3 {
            margin-left: 0;
            margin-top: 0.3rem;
        }

        .hide {
            display: none;
        }

        .robo-camera-group {
            display: flex;
            flex-wrap: wrap;
        }

        .robo-maker {
            padding: 0.5rem;
            margin: 0.5rem;
            border: medium solid var(--primary-color);
            border-radius: 10px;
        }

        .dismiss-button {
            padding: 0.5rem 1rem;
            margin: 1rem;
        }
    </style>
</head>

<body>
    <div class="container">
        <div class="navbar">
            <h1 class="navbar-header">Run ID:$DR_RUN_ID - Model: $DR_LOCAL_S3_MODEL_PREFIX </h1>
            <div class="select-options">
                <div class="select">
                    <label for="robo-select">Worker:</label>
                    <select name="robo-select" id="robo-select">
                    </select>
                </div>

                <div class="select">
                    <label for="camera-select">Cameras:</label>
                    <select name="camera-select" id="camera-select" value="kvs_stream">
                    </select>
                </div>
                <div class="select">
                    <label for="camera-quality">Quality:</label>
                    <input name="camera-quality" id="camera-quality" type="number" min="1" max="100" value="$QUALITY" />
                </div>
                <div class="select">
                    <label for="width-size">Width:</label>
                    <input name="width-size" id="width-size" type="number" min="160" max="1920" />
                </div>
            </div>
        </div>
        <div id="main-container">
        </div>

    </div>

    <script>

        var robomakerContainers = [
            $ROBOMAKER_CONTAINERS_HTML
        ];


        const maximumCameraAmount = 6

        var cameras = [
            {
                id: "kvs_stream",
                topic: "/racecar/deepracer/kvs_stream",
            },
            {
                id: "camera",
                topic: "/racecar/camera/zed/rgb/image_rect_color",
            },
            {
                id: "main_camera",
                topic: "/racecar/main_camera/zed/rgb/image_rect_color",
            },
            {
                id: "sub_camera",
                topic: "/sub_camera/zed/rgb/image_rect_color",
            },
        ]

        
        let { robo, camera, quality, width } = extractPropertiesFromUrl(location.href)
        // Set defaults: ALL workers, kvs_stream camera, width 480, quality 70
        let needsUrlUpdate = false;
        if (!robo) { robo = 'all'; needsUrlUpdate = true; }
        if (!camera) { camera = 'kvs_stream'; needsUrlUpdate = true; }
        if (!width) { width = 480; needsUrlUpdate = true; }
        if (!quality) { quality = 70; needsUrlUpdate = true; }

        // If any defaults were used, update the URL to reflect them (without reloading)
        if (needsUrlUpdate) {
            const url = createUrl(robo, camera, quality, width);
            window.history.replaceState({}, '', url.href);
        }

        const widthSize = document.getElementById('width-size')

        widthSize.addEventListener('change', () => updatePage());

        const camQuality = document.getElementById('camera-quality')

        camQuality.addEventListener('change', () => updatePage());

        // Add Robomaker select options
        const roboSelect = document.getElementById('robo-select')

        addAllOption(roboSelect)

        robomakerContainers.forEach(robomaker => {
            var roboOption = document.createElement('option')
            roboOption.value = robomaker
            roboOption.innerHTML = robomaker
            roboSelect.appendChild(roboOption)
        })

        roboSelect.addEventListener('change', () => updatePage());

        // Add Camera select options
        const cameraSelect = document.getElementById('camera-select')

        addAllOption(cameraSelect)

        cameras.forEach(camera => {
            var cameraOption = document.createElement('option')
            cameraOption.value = camera.id
            cameraOption.innerHTML = camera.id
            cameraSelect.appendChild(cameraOption)
        })

        cameraSelect.addEventListener('change', () => updatePage());

        setupForm(robo, camera, quality, width)

        buildElements()

        function buildElements() {

            const urlSearchParams = new URLSearchParams(window.location.search)

            const mainContainer = document.getElementById('main-container')
            let roboSelectionValue = document.getElementById('robo-select').value
            const urlRoboSelection = urlSearchParams.get('robo')
            roboSelectionValue = urlRoboSelection || roboSelectionValue

            const cameraSelectionEl = document.getElementById('camera-select')
            let cameraSelectionValue = cameraSelectionEl.value
            cameraSelectionValue = urlSearchParams.get('camera') || cameraSelectionValue
            const cameraAllSelectEl = cameraSelectionEl.querySelector('.all-select')
            const cameraSelectTopEl = cameraSelectionEl.querySelectorAll('option')[1]
            const qualityVal = document.getElementById('camera-quality').value

            const urlWidthSelection = urlSearchParams.get('width')
            let widthSelectionValue = document.getElementById('width-size').value
            widthSelectionValue = urlWidthSelection || widthSelectionValue

            // Cleanup: close all streams by clearing src of all images before removing them
            mainContainer.querySelectorAll('img.card-img').forEach(img => { img.src = ''; });
            mainContainer.innerHTML = ''

            // Prevent All cameras with All cars
            if (roboSelectionValue === 'all' && cameraSelectionValue === 'all') {
                cameraSelectionValue = 'kvs_stream';
                cameraSelectionEl.value = 'kvs_stream';
            }

            if (roboSelectionValue === 'all') {
                cameraAllSelectEl.classList.add('hide')
            } else {
                cameraAllSelectEl.classList.remove('hide')
            }

            let cumulativeCameraAmount = 0

            robomakerContainers
                .filter((robo) => roboSelectionValue === 'all' || roboSelectionValue === robo)
                .forEach((robo) => {

                    if (cumulativeCameraAmount < maximumCameraAmount) {
                        const roboMaker = document.createElement('div')
                        roboMaker.classList.add('robo-maker')
                        roboMaker.dataset.robo = robo
                        const roboMakerTitle = document.createElement('h2')
                        roboMakerTitle.innerHTML = 'Worker: ' + robo
                        roboMaker.appendChild(roboMakerTitle)
                        const roboCameras = document.createElement('div')
                        roboCameras.dataset.robo = robo
                        roboCameras.classList.add('robo-camera-group')

                        const camerasToShow = cameras
                            .filter((cam) => cameraSelectionValue === 'all' || cameraSelectionValue === cam.id)

                        camerasToShow.forEach((camera) => {

                            if (cumulativeCameraAmount < maximumCameraAmount) {
                                // Create div
                                const div = document.createElement('div')
                                div.dataset.camera = camera.id
                                div.dataset.robo = robo
                                div.classList.add('card')
                                const cameraTitle = document.createElement('h3')
                                cameraTitle.innerHTML = camera.id

                                // Create image
                                const image = document.createElement('img')
                                image.dataset.camera = camera.id

                                const numericWidth = Number(widthSelectionValue)
                                const validatedWidth = Math.min(1920, Math.max(160, isNaN(numericWidth) ? 160 : numericWidth))

                                const url = createStreamUrl(robo, camera.topic, qualityVal, validatedWidth)
                                image.classList.add('card-img')
                                image.setAttribute('src', url)
                                image.style.width = validatedWidth + 'px'
                                image.setAttribute('alt', robo + '-' + camera.id)
                                image.onerror = function() {
                                    this.style.border = '2px solid red';
                                    this.alt = 'Failed to load: ' + robo + '-' + camera.id;
                                }

                                div.appendChild(cameraTitle)
                                div.appendChild(image)
                                roboCameras.append(div)
                                cumulativeCameraAmount += 1
                            }
                        })
                        roboMaker.appendChild(roboCameras)
                        mainContainer.appendChild(roboMaker)

                    } else if (cumulativeCameraAmount === maximumCameraAmount) {
                        const div = document.createElement('div')
                        div.innerText = "Maximum amount of " + maximumCameraAmount + " cameras reached"
                        div.classList.add('max-cameras-reached-alert')
                        const dismissButton = document.createElement('button')
                        dismissButton.classList.add('dismiss-button')
                        dismissButton.innerText = "Dismiss"
                        dismissButton.addEventListener('click', () => document.querySelector('.max-cameras-reached-alert').remove())
                        div.appendChild(dismissButton)
                        mainContainer.append(div)
                        cumulativeCameraAmount += 1
                    }
                })
        }

        // Adds an 'all' option to the select options element argument
        function addAllOption(el) {
            const option = document.createElement('option')
            option.value = 'all'
            option.innerHTML = 'All'
            option.classList.add('all-select')
            el.appendChild(option)
        }

        function createStreamUrl(robo, topic, quality, width) {
            // Ensure width is a finite number within an acceptable range
            let widthNum = Number(width)
            if (!Number.isFinite(widthNum)) {
                widthNum = 480
            } else {
                widthNum = Math.max(160, Math.min(1920, widthNum))
            }

            // Ensure quality is a finite number within an acceptable range
            let qualityNum = Number(quality)
            if (!Number.isFinite(qualityNum)) {
                qualityNum = 75
            } else {
                qualityNum = Math.max(1, Math.min(100, qualityNum))
            }

            // Calculate height maintaining 4:3 aspect ratio
            const height = Math.round(widthNum * 3 / 4)
            return "/" + robo + "/stream?topic=" + topic + "&quality=" + qualityNum + "&width=" + widthNum + "&height=" + height
        }

        function createUrl(robo, camera, quality, width) {
            const url = new URL(origin);
            const search_params = url.searchParams;
            search_params.set('robo', robo);
            search_params.set('camera', camera);
            search_params.set('quality', quality);
            search_params.set('width', width);

            // change the search property of the main url
            url.search = search_params.toString();

            return url
        }

        function extractPropertiesFromUrl(url) {
            const urlObj = new URL(url);
            const search_params = urlObj.searchParams;
            return {
                robo: search_params.get('robo'),
                camera: search_params.get('camera'),
                quality: search_params.get('quality'),
                width: search_params.get('width'),
            }
        }

        function setupForm(robo, camera, quality, width) {
            // Always set default to ALL workers, kvs_stream camera, width 480, quality 70 if not present
            document.getElementById('robo-select').value = robo || 'all';
            document.getElementById('camera-select').value = camera || 'kvs_stream';
            document.getElementById('camera-quality').value = quality || 70;
            document.getElementById('width-size').value = width || 480;
        }

        function getFormValue() {
            return {
                robo: document.getElementById('robo-select').value,
                camera: document.getElementById('camera-select').value,
                quality: document.getElementById('camera-quality').value,
                width: document.getElementById('width-size').value
            }
        }

        function updatePage() {
            setTimeout(() => {
                const { robo, camera, quality, width } = getFormValue()

                // Validate and clamp quality
                const qualityNum = Number(quality)
                const clampedQuality = (!Number.isFinite(qualityNum) || qualityNum < 1 || qualityNum > 100)
                    ? (Number.isFinite(qualityNum) ? Math.max(1, Math.min(100, qualityNum)) : 75)
                    : qualityNum
                
                if (clampedQuality !== qualityNum) {
                    document.getElementById('camera-quality').value = clampedQuality
                }

                // Validate and clamp width
                const widthNum = Number(width)
                const clampedWidth = (!Number.isFinite(widthNum) || widthNum < 160 || widthNum > 1920)
                    ? (Number.isFinite(widthNum) ? Math.max(160, Math.min(1920, widthNum)) : 480)
                    : widthNum
                
                if (clampedWidth !== widthNum) {
                    document.getElementById('width-size').value = clampedWidth
                }

                const url = createUrl(robo, camera, clampedQuality, clampedWidth)
                location.href = url.href
            }, 300)
        }

    </script>
</body>

</html>

================================================
FILE: scripts/viewer/start.sh
================================================
#!/usr/bin/env bash

usage() {
  echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command] -p [port]"
  echo "       -w        Width of individual stream."
  echo "       -h        Heigth of individual stream."
  echo "       -q        Quality of the stream image."
  echo "       -t        Topic to follow - default /racecar/deepracer/kvs_stream"
  echo "       -b        Browser command (default: firefox --new-tab)"
  echo "       -p        The port to use "
  exit 1
}

trap ctrl_c INT

function ctrl_c() {
  echo "Requested to stop."
  exit 1
}

# Stream definition
TOPIC="/racecar/deepracer/kvs_stream"
WIDTH=480
HEIGHT=360
QUALITY=75
BROWSER=${BROWSER:-"firefox --new-tab"}
PORT=$DR_WEBVIEWER_PORT

while getopts ":w:h:q:t:b:p:" opt; do
  case $opt in
  w)
    WIDTH="$OPTARG"
    ;;
  h)
    HEIGHT="$OPTARG"
    ;;
  q)
    QUALITY="$OPTARG"
    ;;
  t)
    TOPIC="$OPTARG"
    ;;
  b)
    BROWSER="$OPTARG"
    ;;
  p)
    PORT="$OPTARG"
    ;;
  \?)
    echo "Invalid option -$OPTARG" >&2
    usage
    ;;
  esac
done

DR_WEBVIEWER_PORT=$PORT

export DR_VIEWER_HTML=$DR_DIR/tmp/streams-$DR_RUN_ID.html
export DR_NGINX_CONF=$DR_DIR/tmp/streams-$DR_RUN_ID.conf

cat <<EOF >$DR_NGINX_CONF
server {
  listen 80;
  location / {
    root   /usr/share/nginx/html;
    index  index.html index.htm;
  }
EOF

if [[ "${DR_DOCKER_STYLE,,}" != "swarm" ]]; then
  ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}} {{.Names}}" --filter name="deepracer-${DR_RUN_ID}" | grep robomaker | cut -f1 -d\ )
else
  ROBOMAKER_SERVICE_REPLICAS=$(docker service ps deepracer-${DR_RUN_ID}_robomaker | awk '/robomaker/ { print $1 }')
  for c in $ROBOMAKER_SERVICE_REPLICAS; do
    ROBOMAKER_CONTAINER_IP=$(docker inspect $c | jq -r '.[].NetworksAttachments[] | select (.Network.Spec.Name == "sagemaker-local") | .Addresses[0] ' | cut -f1 -d/)
    ROBOMAKER_CONTAINERS="${ROBOMAKER_CONTAINERS} ${ROBOMAKER_CONTAINER_IP}"
  done
fi

if [ -z "$ROBOMAKER_CONTAINERS" ]; then
  echo "No running robomakers. Exiting."
  exit
fi

# Expose the diamensions to the HTML template
export QUALITY
export WIDTH
export HEIGHT
# Create .js array of robomakers to pass to the HTML template
export ROBOMAKER_CONTAINERS_HTML=""
for c in $ROBOMAKER_CONTAINERS; do
  ROBOMAKER_CONTAINERS_HTML+="'$c',"
done
SCRIPT_PATH="${BASH_SOURCE:-$0}"
ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")"
ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")"
INDEX_HTML_TEMPLATE="${ABS_DIRECTORY}/index.template.html"
# Replace all variables in HTML template and create the viewer html file
envsubst <"${INDEX_HTML_TEMPLATE}" >$DR_VIEWER_HTML

# Add proxy paths in the NGINX file
for c in $ROBOMAKER_CONTAINERS; do
  echo "  location /$c { proxy_pass http://$c:8080; rewrite /$c/(.*) /\$1 break; }" >>$DR_NGINX_CONF
done
echo "}" >>$DR_NGINX_CONF

# Check if we will use Docker Swarm or Docker Compose
STACK_NAME="deepracer-$DR_RUN_ID-viewer"
COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml

if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
  if [ "$DR_DOCKER_MAJOR_VERSION" -gt 24 ]; then
    DETACH_FLAG="--detach=true"
  fi

  COMPOSE_FILES="$COMPOSE_FILES -c $DR_DIR/docker/docker-compose-webviewer-swarm.yml"
  docker stack deploy -c $COMPOSE_FILES $DETACH_FLAG $STACK_NAME
else
  docker compose -f $COMPOSE_FILES -p $STACK_NAME up -d
fi

# Starting browser if using local X and having display defined.
if [[ -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then
  echo "Starting browser '$BROWSER'."
  if [ "${DR_DOCKER_STYLE,,}" == "swarm" ]; then
    sleep 5
  fi
  echo Launching $BROWSER "http://127.0.0.1:${DR_WEBVIEWER_PORT}"
  $BROWSER "http://127.0.0.1:${DR_WEBVIEWER_PORT}" &
fi

CURRENT_CONTAINER_HASH=$(docker ps | grep dr_viewer | head -c 12)

IP_ADDRESSES="$(hostname -I)"
echo "The viewer will avaliable on the following hosts after initialization:"
for ip in $IP_ADDRESSES; do
  echo "http://${ip}:${PORT}"
done


================================================
FILE: scripts/viewer/stop.sh
================================================
#!/usr/bin/env bash

STACK_NAME="deepracer-$DR_RUN_ID-viewer"
COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml

# Check if we will use Docker Swarm or Docker Compose
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
    docker stack rm $STACK_NAME
else
    export DR_VIEWER_HTML=$DR_DIR/tmp/streams-$DR_RUN_ID.html
    export DR_NGINX_CONF=$DR_DIR/tmp/streams-$DR_RUN_ID.conf

    docker compose -f $COMPOSE_FILES -p $STACK_NAME down
fi


================================================
FILE: utils/Dockerfile.gpu-detect
================================================
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04
RUN apt-get update && apt-get install -y --no-install-recommends wget python3
RUN wget https://gist.githubusercontent.com/f0k/63a664160d016a491b2cbea15913d549/raw/f25b6b38932cfa489150966ee899e5cc899bf4a6/cuda_check.py
CMD ["python3","cuda_check.py"]

================================================
FILE: utils/cuda-check-tf.py
================================================
from tensorflow.python.client import device_lib
import tensorflow as tf

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.05)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
print(get_available_gpus())


================================================
FILE: utils/cuda-check.sh
================================================
#!/usr/bin/env bash

CONTAINER_ID=$(docker create --rm -ti -e CUDA_VISIBLE_DEVICES --name cuda-check awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE "python3 cuda-check-tf.py")
docker cp $DR_DIR/utils/cuda-check-tf.py $CONTAINER_ID:/opt/install/
docker start -a $CONTAINER_ID


================================================
FILE: utils/download-car-model.py
================================================
#!/usr/bin/env python3
"""
This script checks for model files in an S3 bucket, downloads, and renames them based on a specified pattern.

Environment Variables:
- DR_LOCAL_S3_BUCKET: Name of the S3 bucket.
- DR_LOCAL_S3_PROFILE: AWS profile name for boto3 session.
- DR_REMOTE_MINIO_URL: (Optional) MinIO server URL.

Usage:
    python download-car-model.py --pattern <prefix_pattern>
"""

import boto3
import os
import fnmatch
import argparse

# Load environment variables
bucket_name = os.getenv('DR_LOCAL_S3_BUCKET')
profile_name = os.getenv('DR_LOCAL_S3_PROFILE')
minio_url = os.getenv('DR_REMOTE_MINIO_URL')

# Set up boto3 session with the specified profile
session = boto3.Session(profile_name=profile_name)
endpoint_url = minio_url if minio_url else None
s3 = session.client('s3', endpoint_url=endpoint_url)

def check_model_file(prefix):
    """
    Check if a model.tar.gz file exists in the specified prefix.

    Args:
        prefix (str): The prefix to check within the S3 bucket.

    Returns:
        bool: True if the model file is found, False otherwise.
    """
    try:
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=f"{prefix}output/")
        for obj in response.get('Contents', []):
            if obj['Key'].endswith('model.tar.gz'):
                print(f"Found model.tar.gz in {prefix}output/")
                return f"{obj['Key']}"

        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
        for obj in response.get('Contents', []):
            if obj['Key'].endswith('carfile.tar.gz'):
                print(f"Found carfile.tar.gz in {prefix}")
                return f"{obj['Key']}"

        print(f"No model.tar.gz found in {prefix}output/ and no carfile.tar.gz found in {prefix}")
        return None
    except Exception as e:
        print(f"Error checking {prefix}: {e}")
        return None

def list_matching_prefixes(bucket_name, prefix_pattern):
    """
    List all prefixes in the specified S3 bucket that match the given pattern.

    Args:
        bucket_name (str): The name of the S3 bucket.
        prefix_pattern (str): The pattern to match prefixes against.

    Returns:
        list: A list of matching prefixes.
    """
    try:
        response = s3.list_objects_v2(Bucket=bucket_name, Delimiter='/')
        prefixes = [prefix['Prefix'] for prefix in response.get('CommonPrefixes', [])]
        matching_prefixes = fnmatch.filter(prefixes, prefix_pattern)
        return matching_prefixes
    except Exception as e:
        print(f"Error listing prefixes: {e}")
        return []

def download_and_rename_model_file(prefix, file_key, output_folder="."):
    """
    Download and rename the model.tar.gz file from the specified file key.

    Args:
        prefix (str): The prefix of the model file.
        file_key (str): The S3 key of the model file to download.
        output_folder (str): The folder where the downloaded file should be placed. Defaults to the current directory.

    Returns:
        bool: True if the model file is downloaded and renamed, False otherwise.
    """
    try:
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        local_filename = os.path.join(output_folder, f"{prefix.rstrip('/')}.tar.gz")
        s3.download_file(bucket_name, file_key, local_filename)
        print(f"Downloaded and renamed {file_key} to {local_filename}")
        return True
    except Exception as e:
        print(f"Error downloading {file_key}: {e}")
        return False

def validate_s3_connection():
    """
    Validate the S3 connection using the provided bucket name and profile name.

    Raises:
        ValueError: If bucket name or profile name is not defined.
        ConnectionError: If unable to connect to the S3 bucket.
    """
    if not bucket_name or not profile_name:
        raise ValueError("Bucket name and profile name must be defined in environment variables.")
    
    try:
        s3.head_bucket(Bucket=bucket_name)
        print(f"Successfully connected to bucket: {bucket_name}")
    except Exception as e:
        raise ConnectionError(f"Unable to connect to the bucket: {e}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Check and download model files from S3.')
    parser.add_argument('--pattern', type=str, required=True, help='Pattern for prefixes to check')
    parser.add_argument('--output_folder', type=str, default='.', help='Folder to store downloaded files')
    args = parser.parse_args()

    validate_s3_connection()

    matching_prefixes = list_matching_prefixes(bucket_name, args.pattern)
    for prefix in matching_prefixes:
        model_file_path = check_model_file(prefix)
        if model_file_path:
            download_and_rename_model_file(prefix, model_file_path, args.output_folder)

================================================
FILE: utils/evaluate.sh
================================================
#!/usr/bin/env bash

# This script evaluates DeepRacer models by managing the evaluation process.
# It requires one argument: the path to the environment configuration file.
# The script sources environment variables from the specified file, then:
# 1. Validates the existence of the environment file.
# 2. Sources the activate.sh script to set up necessary environment variables.
# 3. Prints the evaluation configuration, including Run ID, Model Name, and Track.
# 4. Executes the evaluation process by stopping any ongoing evaluation, and starts a new evaluation.

# To run this script every 3 minutes using crontab, follow these steps:
# 1. Open the crontab editor by executing `crontab -e` in your terminal.
# 2. Add the following line to schedule the script:
#    `*/3 * * * * <DRFC_PATH>/utils/evaluate.sh run.env >> <LOG_PATH>/evaluate.log 2>&1`
# 3. Save and close the editor. The script is now scheduled to run every 3 minutes.

if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <environment file>"
  exit 1
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
DR_DIR="$(dirname $SCRIPT_DIR)"
ENV_FILE="$1"

if [[ -f "$DR_DIR/$ENV_FILE" ]]; then
  source $DR_DIR/bin/activate.sh $DR_DIR/$ENV_FILE
else
  echo "File $ENV_FILE does not exist."
  exit 1
fi

printf "\n##################################################\n"
printf "### %-15s %-15s\n" "Configuration:" "$ENV_FILE"
printf "### %-15s %-15s\n" "Run ID:" "$DR_RUN_ID"
printf "### %-15s %-15s\n" "Model Name:" "$DR_LOCAL_S3_MODEL_PREFIX"
printf "### %-15s %-15s\n" "Track:" "$DR_WORLD_NAME"
printf "### %-15s %-15s\n" "Start:" "$(date)"
printf "##################################################\n\n"

dr-stop-evaluation

# Check if Docker style is set to swarm and wait for all containers to stop
if [ "$DR_DOCKER_STYLE" == "swarm" ]; then
	STACK_NAME="deepracer-eval-$DR_RUN_ID"
	STACK_CONTAINERS=$(docker stack ps $STACK_NAME 2>/dev/null | wc -l)
	while [[ "$STACK_CONTAINERS" -gt 1 ]]; do
		echo "Waiting for all containers in the stack to stop..."
		sleep 5
		STACK_CONTAINERS=$(docker stack ps $STACK_NAME 2>/dev/null | wc -l)
	done
fi

dr-start-evaluation -q


================================================
FILE: utils/sample-createspot.sh
================================================
#!/usr/bin/env bash

##  This is sample code that will generally show you how to launch a spot instance on aws and leverage the 
##  automation built into deepracer-for-cloud to automatically start training
##  Changes required to work:
##     Input location where your training will take place -- S3_LOCATION
##     Input security group, iam role, and key-name

## First you need to tell the script where in s3 your training will take place
## can be either a bucket at the root level, or a bucket/prefix.  don't include the s3://

S3_LOCATION=<#########>

## extract bucket location
BUCKET=${S3_LOCATION%%/*}

## extract prefix location
if [[ "$S3_LOCATION" == *"/"* ]]
then
  PREFIX=${S3_LOCATION#*/}
else
  PREFIX=""
fi

## Fill these out with your custom information if you want to upload and submit to leaderboard.  not required to run
DR_UPLOAD_S3_PREFIX=########

## set the instance type you want to launch
INSTANCE_TYPE=c5.2xlarge

## if you want to modify additional variables from the default, add them here, then add them to section further below called replace static paramamters.  I've only done World name for now
WORLD_NAME=FS_June2020

## modify this if you want additional robomaker workers
DR_WORKERS=1

## select which images you want to use.  these will be used later for a docker pull
DR_SAGEMAKER_IMAGE=cpu-avx-mkl
DR_ROBOMAKER_IMAGE=cpu-avx2

## check the s3 location for existing training folders
## automatically determine the latest training run (highest number), and set model parameters accordingly
## this script assumes the format rl-deepracer-1, rl-deepracer-2, etc.  you will need to modify if your schema differs

LAST_TRAINING=$(aws s3 ls $S3_LOCATION/rl-deepracer | sort -t - -k 3 -g | tail -n 1 | awk '{print $2}')
## drop trailing slash
LAST_TRAINING=$(echo $LAST_TRAINING | sed 's:/*$::')

CONFIG_FILE="./run.env"
OLD_SYSTEMENV="./system.env"

## incorporate logic from increment.sh, slightly modified to use last training 
OPT_DELIM='-'
## Read in data
CURRENT_RUN_MODEL=$(aws s3 ls $S3_LOCATION/rl-deepracer | sort -t - -k 3 -g | tail -n 1 | awk '{print $2}')
## drop trailing slash
CURRENT_RUN_MODEL=$(echo $LAST_TRAINING | sed 's:/*$::')
## get number at the end
CURRENT_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL}" | \
                    awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }')

if [ -z $LAST_TRAINING ]
then
    echo No prior training found
    if [[ $PREFIX == "" ]]
    then
      NEW_RUN_MODEL=rl-deepracer-1
    else
      NEW_RUN_MODEL="$PREFIX/rl-deepracer-1"
    fi
    PRETRAINED=False
    CURRENT_RUN_MODEL=$NEW_RUN_MODEL
else

    NEW_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL_NUM} + 1" | bc )
    PRETRAINED=True

    if [[ $PREFIX == "" ]]
    then
      NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/")
    else
      NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/")     
      NEW_RUN_MODEL="$PREFIX/$NEW_RUN_MODEL"
      CURRENT_RUN_MODEL="$PREFIX/$CURRENT_RUN_MODEL"
    fi
    echo Last training was $CURRENT_RUN_MODEL so next training is $NEW_RUN_MODEL
fi

if [[ $PREFIX == "" ]]
then
    CUSTOM_FILES_PREFIX="custom_files"
else
    CUSTOM_FILES_PREFIX="$PREFIX/custom_files"
fi

## Replace dynamic parameters in run.env (still local to your directory)
sed -i.bak -re "s:(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$:\1$CURRENT_RUN_MODEL:g; s:(DR_LOCAL_S3_PRETRAINED=).*$:\1$PRETRAINED:g; s:(DR_LOCAL_S3_MODEL_PREFIX=).*$:\1$NEW_RUN_MODEL:g; s:(DR_LOCAL_S3_CUSTOM_FILES_PREFIX=).*$:\1$CUSTOM_FILES_PREFIX:g" "$CONFIG_FILE"
sed -i.bak -re "s/(DR_LOCAL_S3_BUCKET=).*$/\1$BUCKET/g" "$CONFIG_FILE"

## Replace static parameters in run.env (still local to your directory)
sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$DR_UPLOAD_S3_PREFIX/g" "$CONFIG_FILE"
sed -i.bak -re "s/(DR_WORLD_NAME=).*$/\1$WORLD_NAME/g" "$CONFIG_FILE"

## Replace static paramaters in system.env file, including sagemaker and robomaker images (still local to your directory) and the number of DR_workers
sed -i.bak -re "s/(DR_UPLOAD_S3_BUCKET=).*$/\1$DR_UPLOAD_S3_BUCKET/g; s/(DR_SAGEMAKER_IMAGE=).*$/\1$DR_SAGEMAKER_IMAGE/g; s/(DR_ROBOMAKER_IMAGE=).*$/\1$DR_ROBOMAKER_IMAGE/g; s/(DR_WORKERS=).*$/\1$DR_WORKERS/g" "$OLD_SYSTEMENV"

## upload the new run.env and system.env files into your S3 bucket (same s3 location identified earlier)
## files are loaded into the node-config folder/prefix.  You can also upload other files to node config, and they
## will sync to the EC2 instance as part of the autorun script later.  If you add other files, make sure they are 
## in node-config in the same directory structure as DRfc;   example:   s3location/node-config/scripts/training/.start.sh
RUNENV_LOCATION=$S3_LOCATION/node-config/run.env
SYSENV_LOCATION=$S3_LOCATION/node-config/system.env

aws s3 cp ./run.env s3://$RUNENV_LOCATION
aws s3 cp ./system.env s3://$SYSENV_LOCATION

## upload a custom autorun script to S3.  there is a default autorun script in the repo that will be used unless a custom one is specified here instead
#aws s3 cp ./autorun.sh s3://$S3_LOCATION/autorun.sh

## upload custom files -- if you dont want this, comment these lines out
aws s3 cp ./model_metadata.json s3://$S3_LOCATION/custom_files/model_metadata.json
aws s3 cp ./reward_function.py s3://$S3_LOCATION/custom_files/reward_function.py
aws s3 cp ./hyperparameters.json s3://$S3_LOCATION/custom_files/hyperparameters.json

## launch an ec2
## update with your own settings, including key-name, security-group, and iam-instance-profile at a minimum
## user data includes a command to create a .txt file which simply contains the name of the s3 location
## this filename will be used as fundamental input to autorun.sh script run later on that instance
## you need to ensure you have proper IAM permissions to launch this instance

aws ec2 run-instances \
    --image-id ami-085925f297f89fce1 \
    --count 1 \
    --instance-type $INSTANCE_TYPE \
    --key-name <####keyname####> \
    --security-group-ids sg-<####sgid####> \
    --block-device-mappings 'DeviceName=/dev/sda1,Ebs={DeleteOnTermination=true,VolumeSize=40}' \
    --iam-instance-profile Arn=arn:aws:iam::<####acct_num####>:instance-profile/<####role_name####> \
    --instance-market-options MarketType=spot \
    --user-data "#!/bin/bash
    su -c 'git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/autorun.s3url && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu"


================================================
FILE: utils/setup-xorg.sh
================================================
#!/usr/bin/env bash

set -e

# Script to install basic X-Windows on a headless instance (e.g. in EC2)

# Script shall run as user, not root. Sudo will be used when needed.
if [[ $EUID == 0 ]]; then
    echo "ERROR: Do not run as root / via sudo."
    exit 1
fi

# Deepracer environment variables must be set.
if [ -z "$DR_DIR" ]; then
    echo "ERROR: DR_DIR not set. Run 'source bin/activate.sh' before setup-xorg.sh."
    exit 1
fi

# Install additional packages
sudo apt-get install xinit xserver-xorg-legacy x11-xserver-utils x11-utils \
        menu mesa-utils xterm mwm x11vnc pkg-config screen -y --no-install-recommends

# Configure
sudo sed -i -e "s/console/anybody/" /etc/X11/Xwrapper.config
BUS_ID=$(nvidia-xconfig --query-gpu-info | grep "PCI BusID" | cut -f2- -d: | sed -e 's/^[[:space:]]*//' | head -1)
sudo nvidia-xconfig --busid=$BUS_ID -o $DR_DIR/tmp/xorg.conf

touch ~/.Xauthority

sudo tee -a $DR_DIR/tmp/xorg.conf <<EOF

Section "DRI"
        Mode 0666
EndSection
EOF


================================================
FILE: utils/start-local-browser.sh
================================================
#!/usr/bin/env bash

source $DR_DIR/bin/scripts_wrapper.sh

usage() {
  echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command]"
  echo "       -w        Width of individual stream."
  echo "       -h        Heigth of individual stream."
  echo "       -q        Quality of the stream image."
  echo "       -t        Topic to follow - default /racecar/deepracer/kvs_stream"
  echo "       -b        Browser command (default: firefox --new-tab)"
  exit 1
}

trap ctrl_c INT

function ctrl_c() {
  echo "Requested to stop."
  exit 1
}

# Stream definition
TOPIC="/racecar/deepracer/kvs_stream"
WIDTH=480
HEIGHT=360
QUALITY=75
BROWSER="firefox --new-tab"

while getopts ":w:h:q:t:b:" opt; do
  case $opt in
  w)
    WIDTH="$OPTARG"
    ;;
  h)
    HEIGHT="$OPTARG"
    ;;
  q)
    QUALITY="$OPTARG"
    ;;
  t)
    TOPIC="$OPTARG"
    ;;
  b)
    BROWSER="$OPTARG"
    ;;
  \?)
    echo "Invalid option -$OPTARG" >&2
    usage
    ;;
  esac
done

FILE=$DR_DIR/tmp/streams-$DR_RUN_ID.html

# Check if we will use Docker Swarm or Docker Compose
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then
  echo "This script does not support swarm mode. Use $(dr-start-viewer)."
  exit
fi

echo "<html><head><title>DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC</title></head><body><h1>DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC</h1>" >$FILE

ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}}" --filter name=deepracer-$DR_RUN_ID --filter "ancestor=${DR_SIMAPP_SOURCE}:${DR_SIMAPP_VERSION}")
if [ -z "$ROBOMAKER_CONTAINERS" ]; then
  echo "No running robomakers. Exiting."
  exit
fi

for c in $ROBOMAKER_CONTAINERS; do
  C_PORT=$(docker inspect $c | jq -r '.[0].NetworkSettings.Ports["8080/tcp"][0].HostPort')
  C_URL="http://localhost:${C_PORT}/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}"
  C_IMG="<img src=\"${C_URL}\"></img>"
  echo $C_IMG >>$FILE
done

echo "</body></html>" >>$FILE
echo "Starting browser '$BROWSER'."
$BROWSER $(_realpath "$FILE") &


================================================
FILE: utils/start-xorg.sh
================================================
#!/usr/bin/env bash

set -e

# Script shall run as user, not root. Sudo will be used when needed.
if [[ $EUID == 0 ]]; then
    echo "ERROR: Do not run as root / via sudo."
    exit 1
fi

# X must not be running when we try to start it.
if timeout 1s xset -display $DR_DISPLAY q &>/dev/null; then
    echo "ERROR: X Server already running on display $DR_DISPLAY."
    exit 1
fi

# Deepracer environment variables must be set.
if [ -z "$DR_DIR" ]; then
    echo "ERROR: DR_DIR not set. Run 'source bin/activate.sh' before start-xorg.sh."
    exit 1
fi

if [ -z "$DR_DISPLAY" ]; then
    echo "ERROR: DR_DISPLAY not set. Ensure the variable is configured in system.env."
    exit 1
fi

# Start inside a sudo-screen to prevent it from stopping when disconnecting terminal.
sudo screen -d -S DeepracerXorg -m bash -c "xinit /usr/bin/mwm -display $DR_DISPLAY -- /usr/lib/xorg/Xorg $DR_DISPLAY -config $DR_DIR/tmp/xorg.conf > $DR_DIR/tmp/xorg.log 2>&1"

# Screen detaches; let it have some time to start X.
sleep 1

if [[ "${DR_GUI_ENABLE,,}" == "true" ]]; then
    x11vnc -bg -forever -no6 -nopw -rfbport 5901 -rfbportv6 -1 -loop -display WAIT$DR_DISPLAY &
    sleep 1
fi

# Create xauth mit-magic-cookie.
xauth generate $DR_DISPLAY

# Check if X started successfully. If not, print error message and exit.
if timeout 1s xset -display $DR_DISPLAY q &>/dev/null; then
    echo "X Server started on display $DR_DISPLAY"
else
    echo "Server failed to start on display $DR_DISPLAY"
fi


================================================
FILE: utils/timed-stop.sh
================================================
#!/usr/bin/env bash

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
DR_DIR="$(dirname $SCRIPT_DIR)"
ENV_FILE="$1"

source $DR_DIR/bin/activate.sh $DR_DIR/$1
dr-stop-training

================================================
FILE: utils/upload-rotate.sh
================================================
#!/usr/bin/env bash
# This script uploads the latest DeepRacer model and activates the necessary environment.
# It processes command line options to customize the environment file path, enable local upload, and specify an evaluation environment file.
# After processing the options, it activates the environment, uploads the model, and updates the evaluation environment file with the new model prefix if specified.
#
# Usage:
# ./upload-rotate.sh [-e <environment file>] [-L] [-E <evaluation environment file>] [-c <counter file>] [-v]
#
# Options:
# -c <counter file>               Specify the path to the counter file. This is optional.
# -e <environment file>           Specify the path to the environment configuration file. Defaults to 'run.env' in the script's directory.
# -L                              Enable local upload. This option does not require a value.
# -v                              Add more verbose logging, capturing iteration and entropy numbers.
# -E <evaluation environment file> Specify the path to the evaluation environment configuration file. This is optional.
# -C                              Upload the car file. This option does not require a value.
#
# Example:
# ./upload-rotate.sh -e custom.env -L -E eval.env
#
# To run this script manually, navigate to its directory and execute it with desired options.
# Ensure you have the necessary permissions to execute the script.

# Navigate to the script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DR_DIR="$(dirname "$SCRIPT_DIR")"

# Default environment file path
ENV_FILE="$DR_DIR/run.env"
LOCAL_UPLOAD=""
EVAL_ENV_FILE=""

# Process command line options
while getopts "e:LE:vc:C" opt; do
  case $opt in
    c) COUNTER_FILE="$OPTARG" ;;
    e) ENV_FILE="$OPTARG" ;;
    L) LOCAL_UPLOAD="-L" ;;
    E) EVAL_ENV_FILE="$OPTARG" ;;
    v) VERBOSE_LOGGING="true" ;;
    C) CAR_FILE="-C" ;;
    *) echo "Invalid option: -$OPTARG" >&2; exit 1 ;;
  esac
done

# If a counter file is specified, increment the counter
if [ -n "$COUNTER_FILE" ]; then
  if [ -f "$COUNTER_FILE" ]; then
    COUNTER=$(cat "$COUNTER_FILE")
    COUNTER=$((COUNTER + 1))
    echo "$COUNTER" > "$COUNTER_FILE"
    export UPLOAD_COUNTER=$COUNTER
  else
    echo "Error: Counter file '$COUNTER_FILE' not found." >&2
    exit 1
  fi
fi

# Activate the environment
if [ -f "$ENV_FILE" ]; then
  source "$DR_DIR/bin/activate.sh" "$ENV_FILE"
else
  if [ -f "$DR_DIR/$ENV_FILE" ]; then
    source "$DR_DIR/bin/activate.sh" "$DR_DIR/$ENV_FILE"
  else
    echo "Error: Environment file '$ENV_FILE' not found." >&2
    exit 1
  fi
fi

# Execute the upload command
if [ -n "$COUNTER_FILE" ]; then
  dr-upload-model $LOCAL_UPLOAD -f
else
  dr-upload-model $LOCAL_UPLOAD -1 -f
fi
dr-update

# If the car file option is specified, upload the car file
if [ -n "$CAR_FILE" ]; then
  dr-upload-car-zip $LOCAL_UPLOAD -f
fi

# If an evaluation environment file is specified then alter the model prefix to enable evaluation
if [ -n "$EVAL_ENV_FILE" ]; then
  if [ ! -f "$EVAL_ENV_FILE" ]; then
    if [ -f "$DR_DIR/$EVAL_ENV_FILE" ]; then
      EVAL_ENV_FILE="$DR_DIR/$EVAL_ENV_FILE"
    else
      echo "Error: Evaluation environment file '$EVAL_ENV_FILE' not found." >&2
      exit 1
    fi
  fi
  MODEL_PREFIX=$(echo $DR_UPLOAD_S3_PREFIX)
  echo "Updating evaluation environment file $EVAL_ENV_FILE to use $MODEL_PREFIX"
  sed -i "s/DR_LOCAL_S3_MODEL_PREFIX=.*/DR_LOCAL_S3_MODEL_PREFIX=$MODEL_PREFIX/" $EVAL_ENV_FILE
fi

printf "\n############################################################\n"
printf "### %-15s %-15s\n" "Configuration:" "$ENV_FILE"
printf "### %-15s %-15s\n" "Model Name:" "$DR_LOCAL_S3_MODEL_PREFIX"
printf "### %-15s %-15s\n" "Uploaded Model:" "$DR_UPLOAD_S3_PREFIX"

# If verbose logging is enabled, retrieve the entropy and iteration numbers.
if [ -n "$VERBOSE_LOGGING" ]; then
  CONTAINER_ID=$(docker ps -f "name=deepracer-${DR_RUN_ID}_rl_coach" --format "{{.ID}}")
  if [ -n "$CONTAINER_ID" ]; then
    LAST_ITERATION=$(docker logs --since 20m "$CONTAINER_ID" 2>/dev/null | awk '{if (match($0, /Best checkpoint number: ([0-9]+), Last checkpoint number: ([0-9]+)/, arr)) {print arr[2]}}' | tail -n 1)
    printf "### %-15s %-15s\n" "Last iteration:" "$LAST_ITERATION"

    ENTROPY=$(docker logs --since 20m "$CONTAINER_ID" 2>/dev/null | awk '{if (match($0, /Entropy=([0-9.]+)/, arr)) {print arr[1]}}' | tail -n 1)
    printf "### %-15s %-15s\n" "Entropy:" "$ENTROPY"
  fi
fi

printf "### %-15s %-15s\n" "Completed at:" "$(date)"
printf "############################################################\n\n"