Repository: kahkhang/kube-linode Branch: master Commit: 812ac407badd Files: 51 Total size: 340.7 KB Directory structure: gitextract_psepesfk/ ├── .gitignore ├── LICENSE ├── README.md ├── display.sh ├── install-coreos.sh ├── kube-linode.sh ├── linode-utilities.sh └── manifests/ ├── alertmanager/ │ ├── alertmanager-config.yaml │ ├── alertmanager-service.yaml │ └── alertmanager.yaml ├── container-linux/ │ ├── master-config.yaml │ └── worker-config.yaml ├── grafana/ │ ├── grafana-dashboards.yaml │ ├── grafana-deployment.yaml │ └── grafana-service.yaml ├── heapster.yaml ├── kube-dashboard.yaml ├── kube-state-metrics/ │ ├── kube-state-metrics-cluster-role-binding.yaml │ ├── kube-state-metrics-cluster-role.yaml │ ├── kube-state-metrics-deployment.yaml │ ├── kube-state-metrics-service-account.yaml │ └── kube-state-metrics-service.yaml ├── node-exporter/ │ ├── node-exporter-daemonset.yaml │ └── node-exporter-service.yaml ├── prometheus/ │ ├── prometheus-k8s-ingress.yaml │ ├── prometheus-k8s-role-bindings.yaml │ ├── prometheus-k8s-roles.yaml │ ├── prometheus-k8s-rules.yaml │ ├── prometheus-k8s-service-account.yaml │ ├── prometheus-k8s-service-monitor-alertmanager.yaml │ ├── prometheus-k8s-service-monitor-apiserver.yaml │ ├── prometheus-k8s-service-monitor-kube-controller-manager.yaml │ ├── prometheus-k8s-service-monitor-kube-scheduler.yaml │ ├── prometheus-k8s-service-monitor-kube-state-metrics.yaml │ ├── prometheus-k8s-service-monitor-kubelet.yaml │ ├── prometheus-k8s-service-monitor-node-exporter.yaml │ ├── prometheus-k8s-service-monitor-prometheus-operator.yaml │ ├── prometheus-k8s-service-monitor-prometheus.yaml │ ├── prometheus-k8s-service-monitor-rook.yaml │ ├── prometheus-k8s-service-monitor-traefik.yaml │ ├── prometheus-k8s-service.yaml │ └── prometheus-k8s.yaml ├── prometheus-operator/ │ ├── prometheus-operator-cluster-role-binding.yaml │ ├── prometheus-operator-cluster-role.yaml │ ├── prometheus-operator-service-account.yaml │ ├── prometheus-operator-service.yaml │ 
└── prometheus-operator.yaml ├── rook/ │ ├── rook-cluster.yaml │ ├── rook-operator.yaml │ └── rook-storageclass.yaml └── traefik.yaml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ certs/ cluster/ acme.json auth settings.env testing.sh package.sh *.zip demo.mov resolv.conf bootkube manifests/grafana/grafana-credentials.yaml install.exp manifests/container-linux/master-config.yaml.bak ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2017 Andrew Low Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ ## :whale: Provision a Kubernetes / CoreOS Cluster on Linode [![Bash](https://img.shields.io/badge/language-Bash-green.svg)](https://github.com/kahkhang/kube-linode) [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/kahkhang/kube-linode/master/LICENSE) [![Gitter](https://img.shields.io/gitter/room/kube-linode/support.svg)](https://gitter.im/kube-linode/support) Automatically provision a scalable CoreOS/Kubernetes cluster on Linode with zero configuration. ![Demo](demo.gif) The cluster will consist of a single Kubernetes master host with a custom number of worker nodes. ### What's included * [Kubernetes 1.11.0](https://kubernetes.io/) with [Bootkube](https://github.com/kubernetes-incubator/bootkube) * Load Balancer and automatic SSL/TLS renewal using [Traefik](https://github.com/containous/traefik) * Distributed block storage with [Rook](https://github.com/rook/rook) * Pre-configured [Grafana](https://github.com/grafana/grafana) dashboard using [Kube-Prometheus](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) with Rook and Traefik monitoring * Basic auth protected subdomains (assuming you are using example.com): * https://kube.example.com ([Kubernetes Dashboard](https://github.com/kubernetes/dashboard)) * https://grafana.example.com ([Grafana](https://github.com/grafana/grafana)) * https://alertmanager.example.com ([Alert Manager](https://github.com/prometheus/alertmanager)) * https://prometheus.example.com ([Prometheus Web UI](https://github.com/prometheus/prometheus)) * https://traefik.example.com ([Traefik Web UI](https://github.com/containous/traefik#web-ui)) ### Usage ```sh git clone https://github.com/kahkhang/kube-linode cd kube-linode chmod +x kube-linode.sh ``` Just run `./kube-linode.sh create` in your console, key in your configuration, then sit back
and have a :coffee:! Settings are stored in `settings.env`, or you can pass them in as key-value flags after the action, like so: ```sh ./kube-linode.sh create --no_of_workers=3 --api_key=12345 ``` To increase the number of workers, modify `NO_OF_WORKERS` in `settings.env` as desired and run `./kube-linode.sh create` again. Use `kubectl` to control the cluster (e.g. `kubectl get nodes`).
If you want to destroy the cluster created by kube-linode, you can run the following command: ```sh ./kube-linode.sh destroy ``` A prompt will be given listing all the nodes which will be destroyed upon confirmation . ### Dependencies You should have a Linode account, which you can get [here](https://www.linode.com/?r=0affaec6ca42ca06f5f2c2d3d8d1ceb354e222c1). You should also have an API Key with a valid domain that uses [Linode's DNS servers](https://www.linode.com/docs/networking/dns/dns-manager-overview#set-domain-names-to-use-linodes-name-servers). OSX: ``` brew install jq openssl curl kubectl ``` Arch Linux: Follow the instructions [here](https://github.com/kahkhang/kube-linode/issues/4#issuecomment-311601422) ### Acknowledgements This script uses [Bootkube](https://github.com/kubernetes-incubator/bootkube) to bootstrap the initial cluster using [Linode's API](https://www.linode.com/api). ================================================ FILE: display.sh ================================================ #!/bin/bash _SPINNER_POS=0 spinner() { IFS=$'\n' local delay=0.05 local list=( $(echo -e '\xe2\xa0\x8b') $(echo -e '\xe2\xa0\x99') $(echo -e '\xe2\xa0\xb9') $(echo -e '\xe2\xa0\xb8') $(echo -e '\xe2\xa0\xbc') $(echo -e '\xe2\xa0\xb4') $(echo -e '\xe2\xa0\xa6') $(echo -e '\xe2\xa0\xa7') $(echo -e '\xe2\xa0\x87') $(echo -e '\xe2\xa0\x8f')) local i=$_SPINNER_POS local tempfile tempfile=$(mktemp) eval $2 >> $tempfile 2>/dev/null & local pid=$! 
tput sc printf "%s %s" "${list[i]}" "$1" tput el tput rc i=$(($i+1)) i=$(($i%10)) while [ "$(ps a | awk '{print $1}' | grep $pid)" ]; do printf "%s" "${list[i]}" i=$(($i+1)) i=$(($i%10)) sleep $delay printf "\b\b\b" done _SPINNER_POS=$i if [ -z $3 ]; then :; else eval $3=\'"$(cat $tempfile)"\' fi rm $tempfile } arrow="$(echo -e '\xe2\x9d\xaf')" checked="$(echo -e '\xe2\x97\x89')" unchecked="$(echo -e '\xe2\x97\xaf')" black="$(tput setaf 0)" red="$(tput setaf 1)" green="$(tput setaf 2)" yellow="$(tput setaf 3)" blue="$(tput setaf 4)" magenta="$(tput setaf 5)" cyan="$(tput setaf 6)" white="$(tput setaf 7)" bold="$(tput bold)" normal="$(tput sgr0)" dim=$'\e[2m' print() { echo "$1" tput el } join() { local IFS=$'\n' local _join_list eval _join_list=( '"${'${1}'[@]}"' ) local first=true for item in ${_join_list[@]}; do if [ "$first" = true ]; then printf "%s" "$item" first=false else printf "${2-, }%s" "$item" fi done } function gen_env_from_options() { local IFS=$'\n' local _indices local _env_names local _checkbox_selected eval _indices=( '"${'${1}'[@]}"' ) eval _env_names=( '"${'${2}'[@]}"' ) for i in $(gen_index ${#_env_names[@]}); do _checkbox_selected[$i]=false done for i in ${_indices[@]}; do _checkbox_selected[$i]=true done for i in $(gen_index ${#_env_names[@]}); do printf "%s=%s\n" "${_env_names[$i]}" "${_checkbox_selected[$i]}" done } on_default() { true; } on_keypress() { local OLD_IFS local IFS local key OLD_IFS=$IFS local on_up=${1:-on_default} local on_down=${2:-on_default} local on_space=${3:-on_default} local on_enter=${4:-on_default} local on_left=${5:-on_default} local on_right=${6:-on_default} local on_ascii=${7:-on_default} local on_backspace=${8:-on_default} _break_keypress=false while IFS="" read -rsn1 key; do case "$key" in $'\x1b') read -rsn1 key if [[ "$key" == "[" ]]; then read -rsn1 key case "$key" in 'A') eval $on_up;; 'B') eval $on_down;; 'D') eval $on_left;; 'C') eval $on_right;; esac fi ;; ' ') eval $on_space ' ';; 
[a-z0-9A-Z\!\#\$\&\+\,\-\.\/\;\=\?\@\[\]\^\_\{\}\~]) eval $on_ascii $key;; $'\x7f') eval $on_backspace $key;; '') eval $on_enter $key;; esac if [ $_break_keypress = true ]; then break fi done IFS=$OLD_IFS } gen_index() { local k=$1 local l=0 if [ $k -gt 0 ]; then for l in $(seq $k) do echo "$l-1" | bc done fi } control_c() { tput cub "$(tput cols)" tput el stty sane tput cnorm stty echo exit $? } select_indices() { local _select_list local _select_indices local _select_selected=() eval _select_list=( '"${'${1}'[@]}"' ) eval _select_indices=( '"${'${2}'[@]}"' ) local _select_var_name=$3 eval $_select_var_name\=\(\) for i in $(gen_index ${#_select_indices[@]}); do eval $_select_var_name\+\=\(\""${_select_list[${_select_indices[$i]}]}"\"\) done } #!/bin/bash set -e on_checkbox_input_up() { remove_checkbox_instructions tput cub "$(tput cols)" if [ "${_checkbox_selected[$_current_index]}" = true ]; then printf " ${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}" else printf " ${unchecked} ${_checkbox_list[$_current_index]} ${normal}" fi tput el if [ $_current_index = 0 ]; then _current_index=$((${#_checkbox_list[@]}-1)) tput cud $((${#_checkbox_list[@]}-1)) tput cub "$(tput cols)" else _current_index=$((_current_index-1)) tput cuu1 tput cub "$(tput cols)" tput el fi if [ "${_checkbox_selected[$_current_index]}" = true ]; then printf "${cyan}${arrow}${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}" else printf "${cyan}${arrow}${normal}${unchecked} ${_checkbox_list[$_current_index]} ${normal}" fi } on_checkbox_input_down() { remove_checkbox_instructions tput cub "$(tput cols)" if [ "${_checkbox_selected[$_current_index]}" = true ]; then printf " ${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}" else printf " ${unchecked} ${_checkbox_list[$_current_index]} ${normal}" fi tput el if [ $_current_index = $((${#_checkbox_list[@]}-1)) ]; then _current_index=0 tput cuu $((${#_checkbox_list[@]}-1)) tput cub 
"$(tput cols)" else _current_index=$((_current_index+1)) tput cud1 tput cub "$(tput cols)" tput el fi if [ "${_checkbox_selected[$_current_index]}" = true ]; then printf "${cyan}${arrow}${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}" else printf "${cyan}${arrow}${normal}${unchecked} ${_checkbox_list[$_current_index]} ${normal}" fi } on_checkbox_input_enter() { local OLD_IFS OLD_IFS=$IFS _checkbox_selected_indices=() _checkbox_selected_options=() IFS=$'\n' for i in $(gen_index ${#_checkbox_list[@]}); do if [ "${_checkbox_selected[$i]}" = true ]; then _checkbox_selected_indices+=($i) _checkbox_selected_options+=("${_checkbox_list[$i]}") fi done tput cud $((${#_checkbox_list[@]}-${_current_index})) tput cub "$(tput cols)" for i in $(seq $((${#_checkbox_list[@]}+1))); do tput el1 tput el tput cuu1 done tput cub "$(tput cols)" tput cuf $((${#prompt}+3)) printf "${cyan}$(join _checkbox_selected_options)${normal}" tput el tput cud1 tput cub "$(tput cols)" tput el _break_keypress=true IFS=$OLD_IFS } on_checkbox_input_space() { remove_checkbox_instructions tput cub "$(tput cols)" tput el if [ "${_checkbox_selected[$_current_index]}" = true ]; then _checkbox_selected[$_current_index]=false else _checkbox_selected[$_current_index]=true fi if [ "${_checkbox_selected[$_current_index]}" = true ]; then printf "${cyan}${arrow}${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}" else printf "${cyan}${arrow}${normal}${unchecked} ${_checkbox_list[$_current_index]} ${normal}" fi } remove_checkbox_instructions() { if [ $_first_keystroke = true ]; then tput cuu $((${_current_index}+1)) tput cub "$(tput cols)" tput cuf $((${#prompt}+3)) tput el tput cud $((${_current_index}+1)) _first_keystroke=false fi } _checkbox_input() { local i local j prompt=$1 eval _checkbox_list=( '"${'${2}'[@]}"' ) _current_index=0 _first_keystroke=true trap control_c SIGINT EXIT stty -echo tput civis print "${normal}${green}?${normal} ${bold}${prompt}${normal} 
${dim}(Press to select, to finalize)${normal}" for i in $(gen_index ${#_checkbox_list[@]}); do _checkbox_selected[$i]=false done if [ -n "$3" ]; then eval _selected_indices=( '"${'${3}'[@]}"' ) for i in ${_selected_indices[@]}; do _checkbox_selected[$i]=true done fi for i in $(gen_index ${#_checkbox_list[@]}); do tput cub "$(tput cols)" if [ $i = 0 ]; then if [ "${_checkbox_selected[$i]}" = true ]; then print "${cyan}${arrow}${green}${checked}${normal} ${_checkbox_list[$i]} ${normal}" else print "${cyan}${arrow}${normal}${unchecked} ${_checkbox_list[$i]} ${normal}" fi else if [ "${_checkbox_selected[$i]}" = true ]; then print " ${green}${checked}${normal} ${_checkbox_list[$i]} ${normal}" else print " ${unchecked} ${_checkbox_list[$i]} ${normal}" fi fi tput el done for j in $(gen_index ${#_checkbox_list[@]}); do tput cuu1 done on_keypress on_checkbox_input_up on_checkbox_input_down on_checkbox_input_space on_checkbox_input_enter } checkbox_input() { _checkbox_input "$1" "$2" _checkbox_input_output_var_name=$3 select_indices _checkbox_list _checkbox_selected_indices $_checkbox_input_output_var_name unset _checkbox_list unset _break_keypress unset _first_keystroke unset _current_index unset _checkbox_input_output_var_name unset _checkbox_selected_indices unset _checkbox_selected_options } checkbox_input_indices() { _checkbox_input "$1" "$2" "$3" _checkbox_input_output_var_name=$3 eval $_checkbox_input_output_var_name\=\(\) for i in $(gen_index ${#_checkbox_selected_indices[@]}); do eval $_checkbox_input_output_var_name\+\=\(${_checkbox_selected_indices[$i]}\) done unset _checkbox_list unset _break_keypress unset _first_keystroke unset _current_index unset _checkbox_input_output_var_name unset _checkbox_selected_indices unset _checkbox_selected_options } #!/bin/bash set -e on_list_input_up() { remove_list_instructions tput cub "$(tput cols)" printf " ${_list_options[$_list_selected_index]}" tput el if [ $_list_selected_index = 0 ]; then 
_list_selected_index=$((${#_list_options[@]}-1)) tput cud $((${#_list_options[@]}-1)) tput cub "$(tput cols)" else _list_selected_index=$((_list_selected_index-1)) tput cuu1 tput cub "$(tput cols)" tput el fi printf "${cyan}${arrow} %s ${normal}" "${_list_options[$_list_selected_index]}" } on_list_input_down() { remove_list_instructions tput cub "$(tput cols)" printf " ${_list_options[$_list_selected_index]}" tput el if [ $_list_selected_index = $((${#_list_options[@]}-1)) ]; then _list_selected_index=0 tput cuu $((${#_list_options[@]}-1)) tput cub "$(tput cols)" else _list_selected_index=$((_list_selected_index+1)) tput cud1 tput cub "$(tput cols)" tput el fi printf "${cyan}${arrow} %s ${normal}" "${_list_options[$_list_selected_index]}" } on_list_input_enter_space() { local OLD_IFS OLD_IFS=$IFS IFS=$'\n' tput cud $((${#_list_options[@]}-${_list_selected_index})) tput cub "$(tput cols)" for i in $(seq $((${#_list_options[@]}+1))); do tput el1 tput el tput cuu1 done tput cub "$(tput cols)" tput cuf $((${#prompt}+3)) printf "${cyan}${_list_options[$_list_selected_index]}${normal}" tput el tput cud1 tput cub "$(tput cols)" tput el _break_keypress=true IFS=$OLD_IFS } remove_list_instructions() { if [ $_first_keystroke = true ]; then tput cuu $((${_list_selected_index}+1)) tput cub "$(tput cols)" tput cuf $((${#prompt}+3)) tput el tput cud $((${_list_selected_index}+1)) _first_keystroke=false fi } _list_input() { local i local j prompt=$1 eval _list_options=( '"${'${2}'[@]}"' ) _list_selected_index=0 _first_keystroke=true trap control_c SIGINT EXIT stty -echo tput civis print "${normal}${green}?${normal} ${bold}${prompt}${normal} ${dim}(Use arrow keys)${normal}" for i in $(gen_index ${#_list_options[@]}); do tput cub "$(tput cols)" if [ $i = 0 ]; then print "${cyan}${arrow} ${_list_options[$i]} ${normal}" else print " ${_list_options[$i]}" fi tput el done for j in $(gen_index ${#_list_options[@]}); do tput cuu1 done on_keypress on_list_input_up on_list_input_down 
on_list_input_enter_space on_list_input_enter_space } list_input() { _list_input "$1" "$2" local var_name=$3 eval $var_name=\'"${_list_options[$_list_selected_index]}"\' unset _list_selected_index unset _list_options unset _break_keypress unset _first_keystroke } list_input_index() { _list_input "$1" "$2" local var_name=$3 eval $var_name=\'"$_list_selected_index"\' unset _list_selected_index unset _list_options unset _break_keypress unset _first_keystroke } #!/bin/bash set -e on_text_input_left() { remove_regex_failed if [ $_current_pos -gt 0 ]; then tput cub1 _current_pos=$(($_current_pos-1)) fi } on_text_input_right() { remove_regex_failed if [ $_current_pos -lt ${#_text_input} ]; then tput cuf1 _current_pos=$(($_current_pos+1)) fi } on_text_input_enter() { remove_regex_failed if [[ "$_text_input" =~ $_text_input_regex && "$(eval $_text_input_validator "$_text_input")" = true ]]; then tput cub "$(tput cols)" tput cuf $((${#_read_prompt}-19)) printf "${cyan}${_text_input}${normal}" tput el tput cud1 tput cub "$(tput cols)" tput el eval $var_name=\'"${_text_input}"\' _break_keypress=true else _text_input_regex_failed=true tput civis tput cud1 tput cub "$(tput cols)" tput el printf "${red}>>${normal} $_text_input_regex_failed_msg" tput cuu1 tput cub "$(tput cols)" tput cuf $((${#_read_prompt}-19)) tput el _text_input="" _current_pos=0 tput cnorm fi } on_text_input_ascii() { remove_regex_failed local c=$1 if [ "$c" = '' ]; then c=' ' fi local rest="${_text_input:$_current_pos}" _text_input="${_text_input:0:$_current_pos}$c$rest" _current_pos=$(($_current_pos+1)) tput civis printf "$c$rest" tput el if [ ${#rest} -gt 0 ]; then tput cub ${#rest} fi tput cnorm } on_text_input_backspace() { remove_regex_failed if [ $_current_pos -gt 0 ]; then local start="${_text_input:0:$(($_current_pos-1))}" local rest="${_text_input:$_current_pos}" _current_pos=$(($_current_pos-1)) tput cub 1 tput el tput sc printf "$rest" tput rc _text_input="$start$rest" fi } remove_regex_failed() { 
if [ $_text_input_regex_failed = true ]; then _text_input_regex_failed=false tput sc tput cud1 tput el1 tput el tput rc fi } text_input_default_validator() { echo true; } text_input() { local prompt=$1 local var_name=$2 local _text_input_regex="${3:-"\.+"}" local _text_input_regex_failed_msg=${4:-"Input validation failed"} local _text_input_validator=${5:-text_input_default_validator} local _read_prompt_start=$'\e[32m?\e[39m\e[1m' local _read_prompt_end=$'\e[22m' local _read_prompt="$( echo "$_read_prompt_start ${prompt} $_read_prompt_end")" local _current_pos=0 local _text_input_regex_failed=false local _text_input="" printf "$_read_prompt" trap control_c SIGINT EXIT stty -echo tput cnorm on_keypress on_default on_default on_text_input_ascii on_text_input_enter on_text_input_left on_text_input_right on_text_input_ascii on_text_input_backspace eval $var_name=\'"${_text_input}"\' } ================================================ FILE: install-coreos.sh ================================================ #!/bin/bash set -euo pipefail [[ -n "$REBOOT_STRATEGY" ]] || die "Need a reboot strategy. Run with eg. 
'\$REBOOT_STRATEGY=off ./install-coreos.sh'" PUBLIC_IP=$(ip addr show eth0 | grep "inet\b" | grep "/24" | awk '{print $2}' | cut -d/ -f1) PRIVATE_IP=$(ip addr show eth0 | grep "inet\b" | grep "/17" | awk '{print $2}' | cut -d/ -f1) wget --quiet --no-check-certificate https://github.com/coreos/container-linux-config-transpiler/releases/download/v0.5.0/ct-v0.5.0-x86_64-unknown-linux-gnu -O ct chmod +x ct apt-get -y install gawk wget --quiet https://raw.githubusercontent.com/coreos/init/master/bin/coreos-install chmod u+x coreos-install cat container-linux-config.yaml \ | sed "s/#SSH_KEY#/$(cat ~/.ssh/authorized_keys | grep '^ssh-rsa' | sed -n 1p | sed 's/\//\\\//g')/g" \ | sed "s/#COREOS_PUBLIC_IPV4#/$PUBLIC_IP/g" \ | sed "s/#COREOS_PRIVATE_IPV4#/$PRIVATE_IP/g" \ | sed "s/#HOSTNAME#/$(echo $PUBLIC_IP | sed "s/\./-/g")/g" \ | sed "s/#GATEWAY#/${PUBLIC_IP%.*}.1/g" \ | sed "s/#DNS#/$(cat /etc/resolv.conf | awk '/^nameserver /{ print $0 }' | sed 's/nameserver //g' | tr '\n' ' ')/g" \ | sed "s/#REBOOT_STRATEGY#/${REBOOT_STRATEGY}/g" \ | ./ct > container-linux-config.json ./coreos-install -d /dev/sda -i container-linux-config.json ================================================ FILE: kube-linode.sh ================================================ #!/bin/bash set +e base64_args="" $(base64 --wrap=0 <(echo "test") >/dev/null 2>&1) if [ $? 
-eq 0 ]; then
  base64_args="--wrap=0"
fi
set -e

source display.sh
source linode-utilities.sh

# Abort early when a required external command is missing.
# (openssl was listed twice; the duplicate check is dropped.)
check_dep jq
check_dep openssl
check_dep curl
check_dep htpasswd
check_dep kubectl
check_dep ssh
check_dep base64
check_dep bc
check_dep ssh-keygen
check_dep awk
check_dep sed
check_dep cat
check_dep tr

# The first argument selects the action and must be "create" or "destroy".
if [[ "$1" != "create" && "$1" != "destroy" ]]; then
  echo "${bold}${red}Not a valid action!${normal}"
  echo "Type ${green}./kube-linode.sh create${normal} to create a cluster"
  echo "Type ${green}./kube-linode.sh destroy${normal} to destroy created cluster"
  exit 1
fi

# Start from a clean slate so only settings.env and the command line decide
# these values.
unset DATACENTER_ID
unset MASTER_PLAN
unset WORKER_PLAN
unset DOMAIN
unset EMAIL
unset MASTER_ID
unset API_KEY
unset USERNAME
unset NO_OF_WORKERS
unset REBOOT_STRATEGY
unset WORKER_IDS

stty -echo
tput civis

# Load saved settings, creating an empty settings.env on first run.
if [ -f settings.env ] ; then
  . settings.env
else
  touch settings.env
fi

# -- command line argument overrides --
# Iterate over "$@" so each argument stays intact (the previous
# `options=$@; for argument in $options` re-split values on whitespace), and
# strip only up to the FIRST "=" with ${argument#*=}. The earlier
# ${argument/*=/""} matched greedily up to the LAST "=", truncating values
# that themselves contain "=" (e.g. base64-style keys).
options=$@  # kept in case other code reads it; no longer used for the loop
for argument in "$@"
do
  case $argument in
    --datacenter_id=*) DATACENTER_ID=${argument#*=} ;;
    --master_plan=*) MASTER_PLAN=${argument#*=} ;;
    --worker_plan=*) WORKER_PLAN=${argument#*=} ;;
    --no_of_workers=*) NO_OF_WORKERS=${argument#*=} ;;
    --domain=*) DOMAIN=${argument#*=} ;;
    --email=*) EMAIL=${argument#*=} ;;
    --master_id=*) MASTER_ID=${argument#*=} ;;
    --api_key=*) API_KEY=${argument#*=} ;;
    --username=*) USERNAME=${argument#*=} ;;
    --install_k8s_dashboard=*) INSTALL_K8S_DASHBOARD=${argument#*=} ;;
    --install_traefik=*) INSTALL_TRAEFIK=${argument#*=} ;;
    --install_rook=*) INSTALL_ROOK=${argument#*=} ;;
    --install_prometheus=*) INSTALL_PROMETHEUS=${argument#*=} ;;
    --reboot_strategy=*) REBOOT_STRATEGY=${argument#*=} ;;
  esac
done

# Configuration steps; these read_* helpers are defined outside this span.
read_api_key
read_datacenter
read_master_plan
read_worker_plan
read_domain
read_email
read_no_of_workers
read_username
read_install_options
read_reboot_strategy

if [[ !
( -f ~/.ssh/id_rsa && -f ~/.ssh/id_rsa.pub ) ]]; then spinner "Generating new SSH key" "ssh-keygen -b 2048 -t rsa -f ~/.ssh/id_rsa -q -N \"\"" else eval `ssh-agent -s` >/dev/null 2>&1 ssh-add -l | grep -q "$(ssh-keygen -lf ~/.ssh/id_rsa | awk '{print $2}')" || ssh-add ~/.ssh/id_rsa >/dev/null 2>&1 fi if [[ -f auth && -f manifests/grafana/grafana-credentials.yaml ]] ; then : ; else read -s -p "${green}?${normal}${bold} Enter your dashboard password: ${normal}" PASSWORD tput cub "$(tput cols)" tput el [ -e auth ] && rm auth htpasswd -b -c auth $USERNAME $PASSWORD >/dev/null 2>&1 [ -e manifests/grafana/grafana-credentials.yaml ] && rm manifests/grafana/grafana-credentials.yaml cat > manifests/grafana/grafana-credentials.yaml <<-EOF apiVersion: v1 kind: Secret metadata: name: grafana-credentials data: user: $( echo -n $USERNAME | base64 $base64_args ) password: $( echo -n $PASSWORD | base64 $base64_args ) EOF fi if [ "$1" == "destroy" ]; then spinner "Retrieving master linode (if any)" get_master_id MASTER_ID if ! [[ $MASTER_ID =~ ^[0-9]+$ ]] 2>/dev/null; then tput el echo "${red}No master node found! Cluster is likely to have been deleted.${normal}" else spinner "Retrieving worker linodes (if any)" list_worker_ids WORKER_IDS tput el echo "${bold}${red}The following nodes will be deleted:${normal}" echo " ${cyan}${arrow}${normal} master_$MASTER_ID [https://manager.linode.com/linodes/dashboard/master_$MASTER_ID]" for WORKER_ID in $WORKER_IDS; do echo " ${cyan}${arrow}${normal} worker_$WORKER_ID [https://manager.linode.com/linodes/dashboard/worker_$WORKER_ID]" done text_input "Are you sure you want to delete the cluster? 
[y/n] " \ response "^[yn]$" "Please enter either 'y' or 'n'" tput civis if [[ "$response" =~ ^y$ ]]; then for WORKER_ID in $WORKER_IDS; do spinner "${CYAN}[$WORKER_ID]${NORMAL} Deleting worker node" "delete_linode $WORKER_ID" done spinner "${CYAN}[$MASTER_ID]${NORMAL} Deleting master node" "delete_linode $MASTER_ID" fi fi spinner "Retrieving DNS record for $DOMAIN" "get_domains \"$DOMAIN\"" DOMAIN_ID if [[ $DOMAIN_ID =~ ^[0-9]+$ ]] 2>/dev/null; then text_input "Do you want to delete the DNS record for $DOMAIN? [y/n] " \ response "^[yn]$" "Please enter either 'y' or 'n'" tput civis if [[ "$response" =~ ^y$ ]]; then spinner "Deleting DNS record for $DOMAIN" delete_domain fi fi text_input "Do you want to delete the current cluster configuration (including ~/.kube/config)? [y/n] " \ response "^[yn]$" "Please enter either 'y' or 'n'" tput civis if [[ "$response" =~ ^y$ ]]; then [ -e manifests/grafana/grafana-credentials.yaml ] && rm manifests/grafana/grafana-credentials.yaml [ -e cluster ] && rm -rf cluster [ -e ~/.kube/config ] && rm ~/.kube/config [ -e auth ] && rm auth [ -e settings.env ] && rm settings.env touch settings.env echo "API_KEY=$API_KEY" >> settings.env fi elif [ "$1" == "create" ]; then spinner "Retrieving master linode (if any)" get_master_id MASTER_ID if ! 
[[ $MASTER_ID =~ ^[0-9]+$ ]] 2>/dev/null; then
    # No numeric master ID was found: delete any existing workers and create
    # a fresh, unprovisioned master.
    spinner "Retrieving list of workers" list_worker_ids WORKER_IDS
    for WORKER_ID in $WORKER_IDS; do
      spinner "${CYAN}[$WORKER_ID]${NORMAL} Deleting worker (since certs are now invalid)"\
              "linode_api linode.delete LinodeID=$WORKER_ID skipChecks=true"
    done

    spinner "Creating master linode" "create_linode $DATACENTER_ID $MASTER_PLAN" MASTER_ID
    spinner "Adding private IP" "add_private_ip $MASTER_ID"

    spinner "${CYAN}[$MASTER_ID]${NORMAL} Initializing labels" \
            "linode_api linode.update LinodeID=$MASTER_ID Label=\"master_${MASTER_ID}\" lpm_displayGroup=\"$DOMAIN (Unprovisioned)\""
  fi

  # Cache the master's addresses in PUBLIC_<id> / PRIVATE_<id>; get_public_ip
  # and get_private_ip read those variables before querying the API.
  spinner "${CYAN}[$MASTER_ID]${NORMAL} Getting public IP" "get_public_ip $MASTER_ID" MASTER_IP
  declare "PUBLIC_$MASTER_ID=$MASTER_IP"

  spinner "${CYAN}[$MASTER_IP]${NORMAL} Getting private IP" "get_private_ip $MASTER_ID" PRIVATE_IP
  declare "PRIVATE_$MASTER_ID=$PRIVATE_IP"

  spinner "${CYAN}[$MASTER_IP]${NORMAL} Retrieving provision status" "is_provisioned $MASTER_ID" IS_PROVISIONED

  # Quoted so an empty result yields a false comparison rather than a
  # `[` syntax error that would abort the script under `set -e`.
  if [ "$IS_PROVISIONED" = false ] ; then
    update_dns $MASTER_ID
    install master $MASTER_ID
  fi

  tput el
  echo "${CYAN}[$MASTER_IP]${NORMAL} Master provisioned"

  # Create as many additional workers as needed to reach NO_OF_WORKERS.
  spinner "${CYAN}[$MASTER_IP]${NORMAL} Retrieving current number of workers" get_no_of_workers CURRENT_NO_OF_WORKERS
  NO_OF_NEW_WORKERS=$( echo "$NO_OF_WORKERS - $CURRENT_NO_OF_WORKERS" | bc )

  if [[ $NO_OF_NEW_WORKERS -gt 0 ]]; then
    for WORKER in $( seq $NO_OF_NEW_WORKERS ); do
      spinner "Creating worker linode" "create_linode $DATACENTER_ID $WORKER_PLAN" WORKER_ID
      spinner "Adding private IP" "add_private_ip $WORKER_ID"
      spinner "Initializing labels" "change_to_unprovisioned $WORKER_ID worker"
    done
  fi

  # Install every worker that is not yet marked as provisioned.
  spinner "Retrieving list of workers" list_worker_ids WORKER_IDS

  for WORKER_ID in $WORKER_IDS; do
    spinner "${CYAN}[$WORKER_ID]${NORMAL} Getting public IP" "get_public_ip $WORKER_ID" PUBLIC_IP
    declare "PUBLIC_$WORKER_ID=$PUBLIC_IP"

    spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Getting private IP" "get_private_ip $WORKER_ID" PRIVATE_IP
    declare "PRIVATE_$WORKER_ID=$PRIVATE_IP"

    if [ "$( is_provisioned $WORKER_ID )" = false ] ; then
      install worker $WORKER_ID
    fi
    tput el
    echo "${CYAN}[$PUBLIC_IP]${NORMAL} Worker provisioned"
  done

fi

# Wait for background jobs, then restore cursor visibility and terminal echo.
wait
tput cnorm
stty echo

================================================
FILE: linode-utilities.sh
================================================
#!/bin/bash

# Default kubectl configuration location when the caller has not set one.
if [ -z "${KUBECONFIG}" ]; then
  export KUBECONFIG=~/.kube/config
fi

# control_c
# Trap handler: reset the cursor column, clear the line, restore the
# terminal and exit. The status current on entry is captured first; the
# previous `exit $?` at the end reported the status of `stty echo`, so when
# this handler ran from an EXIT trap a failing script exited with 0.
control_c() {
  local status=$?
  tput cub "$(tput cols)"
  tput el
  stty sane
  tput cnorm
  stty echo
  exit "$status"
}

trap control_c SIGINT

CYAN=$(tput setaf 6)
NORMAL=$(tput sgr0)
BOLD=$(tput bold)

# check_dep COMMAND
# Exit with status 1 and an install hint on stderr when COMMAND is not found.
# "$1" is quoted so an empty or space-containing name is tested as given.
check_dep() {
  command -v "$1" >/dev/null 2>&1 || { echo "Please install \`${BOLD}$1${NORMAL}\` before running this script." >&2; exit 1; }
}

# linode_api ACTION [KEY=VALUE ...]
# POST a form request to https://api.linode.com/ with api_action=ACTION, the
# global API_KEY and each remaining argument as a form field; the response
# body is written to stdout.
linode_api() {
  args=(-F "api_action=$1") ; shift
  for arg in "$@" ; do
    args+=(-F "$arg")
  done
  curl -s -X POST "https://api.linode.com/" -H 'cache-control: no-cache' \
       -F "api_key=$API_KEY" "${args[@]}"
}

# wait_jobs LINODE_ID
# Poll every 3 seconds until the pending-job list (.DATA) for the linode is
# empty. Sets the global LINODE_ID.
wait_jobs() {
  LINODE_ID=$1
  while true ; do
    if ( linode_api linode.job.list LinodeID=$LINODE_ID pendingOnly=1 | jq -Mje '.DATA == []' >/dev/null ) ; then
      break
    fi
    sleep 3
  done
}

# wait_boot LINODE_ID
# Poll every 3 seconds until the job list contains a job labelled
# "Lassie initiated boot: CoreOS" with HOST_SUCCESS == 1, then sleep a further
# 10 seconds. Sets the global LINODE_ID.
wait_boot() {
  LINODE_ID=$1
  while true ; do
    if [[ $(linode_api linode.job.list LinodeID=$LINODE_ID | jq ".DATA" | \
            jq -c "[ .[] | select(.LABEL == \"Lassie initiated boot: CoreOS\") | select(.HOST_SUCCESS == 1)]" | \
            jq ".[] | .JOBID") =~ ^[0-9]+ ]]; then
      break
    fi
    sleep 3
  done
  sleep 10
}

# get_status LINODE_ID
# Print the STATUS field of the first entry returned for the linode.
get_status() {
  linode_api linode.list LinodeID=$1 | jq ".DATA" | jq -c ".[] | .STATUS" | sed -n 1p
}

# list_worker_ids
# Print the LINODEID of every linode whose display group contains $DOMAIN and
# whose label starts with "worker_".
list_worker_ids() {
  linode_api linode.list | jq ".DATA" | jq -c "[ .[] | select(.LPM_DISPLAYGROUP | contains (\"$DOMAIN\")) ]" | jq -c ".[] | select(.LABEL | startswith(\"worker_\")) | .LINODEID"
}

# get_master_id
# Print the LINODEID of the first linode whose display group contains $DOMAIN
# and whose label starts with "master_".
get_master_id() {
  linode_api linode.list | jq ".DATA" | jq -c "[ .[] | select(.LPM_DISPLAYGROUP | contains (\"$DOMAIN\")) ]" | jq -c ".[] | select(.LABEL | startswith(\"master_\")) | .LINODEID" | sed -n 1p
}

# is_provisioned LINODE_ID
# (The remainder of this function lies beyond this span.)
is_provisioned() {
  local IS_PROVISIONED=false
  if [ $( linode_api linode.list LinodeID=$1 | jq
".DATA" | jq -c ".[] | .LPM_DISPLAYGROUP == \"$DOMAIN\"") = true ] ; then IS_PROVISIONED=true fi echo $IS_PROVISIONED } shutdown() { local LINODE_ID=$1 linode_api linode.shutdown LinodeID=$LINODE_ID >/dev/null wait_jobs $LINODE_ID } get_disk_ids() { local LINODE_ID=$1 linode_api linode.disk.list LinodeID=$LINODE_ID | jq ".DATA" | jq -c ".[] | .DISKID" } get_config_ids() { local LINODE_ID=$1 linode_api linode.config.list LinodeID=$LINODE_ID | jq ".DATA" | jq -c ".[] | .ConfigID" } reset_linode() { local LINODE_ID=$1 local DISK_IDS local CONFIG_IDS local STATUS PUBLIC_IP=$(get_public_ip $LINODE_ID) spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Getting status" "get_status $LINODE_ID" STATUS if [ "$STATUS" = "1" ]; then spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Shutting down linode" "shutdown $LINODE_ID" fi spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Retrieving disk list" "get_disk_ids $LINODE_ID" DISK_IDS for DISK_ID in $DISK_IDS; do spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Deleting disk $DISK_ID" "linode_api linode.disk.delete LinodeID=$LINODE_ID DiskID=$DISK_ID" done spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Retrieving config list" "get_config_ids $LINODE_ID" CONFIG_IDS for CONFIG_ID in $CONFIG_IDS; do spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Deleting config $CONFIG_ID" "linode_api linode.config.delete LinodeID=$LINODE_ID ConfigID=$CONFIG_ID" done spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Waiting for all jobs to complete" "wait_jobs $LINODE_ID" } get_public_ip() { local LINODE_ID=$1 local IP eval IP=\$PUBLIC_$LINODE_ID if ! [[ $IP =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]] 2>/dev/null; then IP="$( linode_api linode.ip.list LinodeID=$LINODE_ID | jq -Mje '.DATA[] | select(.ISPUBLIC==1) | .IPADDRESS' | sed -n 1p )" fi echo $IP } get_private_ip() { local LINODE_ID=$1 local IP eval IP=\$PRIVATE_$LINODE_ID if ! 
[[ $IP =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]] 2>/dev/null; then IP="$( linode_api linode.ip.list LinodeID=$LINODE_ID | jq -Mje '.DATA[] | select(.ISPUBLIC==0) | .IPADDRESS' | sed -n 1p )" fi echo $IP }

# Print the PLANID of linode $1.
get_plan_id() {
  local linode=$1
  linode_api linode.list LinodeID=$linode |
    jq ".DATA[0].PLANID"
}

# Print the DISK value of plan $1 multiplied by 1024 (computed with bc).
get_max_disk_size() {
  local plan=$1
  echo "$( linode_api avail.linodeplans PlanID=$plan | jq ".DATA[0].DISK" )" "*1024" |
    bc
}

# Create a raw disk on a linode and print the new DiskID.
#   $1 linode id   $2 size   $3 label
create_raw_disk() {
  local linode=$1
  local size=$2
  local name=$3
  linode_api linode.disk.create LinodeID=$linode Label="$name" Type=raw Size=$size |
    jq '.DATA.DiskID'
}

# Create an ext4 disk on a linode and print the new DiskID.
#   $1 linode id   $2 size   $3 label
create_ext4_disk() {
  local linode=$1
  local size=$2
  local name=$3
  linode_api linode.disk.create LinodeID=$linode Label="$name" Type=ext4 Size=$size |
    jq '.DATA.DiskID'
}

# Create the "Installer" disk from distribution 140 and print its DiskID.
# Reads LINODE_ID, INSTALL_DISK_SIZE and ROOT_PASSWORD from the calling scope
# and installs the public key found in ~/.ssh/id_rsa.pub.
create_install_disk() {
  linode_api linode.disk.createFromDistribution \
    LinodeID=$LINODE_ID \
    DistributionID=140 \
    Label=Installer \
    Size=$INSTALL_DISK_SIZE \
    rootPass="$ROOT_PASSWORD" \
    rootSSHKey="$( cat ~/.ssh/id_rsa.pub )" |
    jq ".DATA.DiskID"
}

# Create the "Installer" configuration profile and print its ConfigID.
# Reads LINODE_ID, DISK_ID and INSTALL_DISK_ID from the calling scope.
create_boot_configuration() {
  linode_api linode.config.create \
    LinodeID=$LINODE_ID \
    KernelID=138 \
    Label="Installer" \
    DiskList=$DISK_ID,$INSTALL_DISK_ID \
    RootDeviceNum=2 \
    helper_network=true |
    jq ".DATA.ConfigID"
}

# Boot linode $1 with configuration profile $2 and wait for pending jobs to finish.
boot_linode() {
  local LINODE_ID=$1
  local config=$2
  linode_api linode.boot LinodeID=$LINODE_ID ConfigID=$config >/dev/null
  wait_jobs $LINODE_ID
}

# Turn the existing configuration profile into the "CoreOS" profile.
# Reads LINODE_ID, CONFIG_ID, DISK_ID and STORAGE_DISK_ID from the calling scope.
update_coreos_config() {
  linode_api linode.config.update \
    LinodeID=$LINODE_ID \
    ConfigID=$CONFIG_ID \
    Label="CoreOS" \
    DiskList=$DISK_ID,$STORAGE_DISK_ID \
    KernelID=213 \
    RootDeviceNum=1 \
    helper_network=false
}

transfer_acme() { IP=$1 ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -tt
"core@$IP" \ "sudo chmod 600 /etc/traefik/acme/acme.json" 2>/dev/null >/dev/null } change_to_provisioned() { local LINODE_ID=$1 local NODE_TYPE=$2 linode_api linode.update LinodeID=$LINODE_ID Label="${NODE_TYPE}_${LINODE_ID}" lpm_displayGroup="$DOMAIN" } change_to_unprovisioned() { local LINODE_ID=$1 local NODE_TYPE=$2 linode_api linode.update LinodeID=$LINODE_ID Label="${NODE_TYPE}_${LINODE_ID}" lpm_displayGroup="$DOMAIN (Unprovisioned)" } install_coreos() { LINODE_ID=$1 NODE_TYPE=$2 PUBLIC_IP=$(get_public_ip $LINODE_ID) set +e while true; do scp -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no \ -r install-coreos.sh root@${PUBLIC_IP}:~/install-coreos.sh && break || sleep 5; done while true; do scp -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no \ -r manifests/container-linux/${NODE_TYPE}-config.yaml root@${PUBLIC_IP}:~/container-linux-config.yaml && break || sleep 5; done while true; do ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@${PUBLIC_IP} \ "chmod +x ./install-coreos.sh" && break || sleep 5; done while true; do ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no root@${PUBLIC_IP} \ "REBOOT_STRATEGY=${REBOOT_STRATEGY} ./install-coreos.sh" && break || sleep 5; done set -e } install() { local NODE_TYPE local LINODE_ID local PLAN local ROOT_PASSWORD NODE_TYPE=$1 LINODE_ID=$2 PUBLIC_IP=$(get_public_ip $LINODE_ID) reset_linode $LINODE_ID spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Generating root password" "openssl rand -base64 32" ROOT_PASSWORD spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Retrieving current plan" "get_plan_id $LINODE_ID" PLAN spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Retrieving maximum available disk size" "get_max_disk_size $PLAN" TOTAL_DISK_SIZE INSTALL_DISK_SIZE=2000 COREOS_DISK_SIZE=10240 STORAGE_DISK_SIZE=$((${TOTAL_DISK_SIZE}-${COREOS_DISK_SIZE})) spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Creating ${COREOS_DISK_SIZE}mb CoreOS disk" 
"create_raw_disk $LINODE_ID $COREOS_DISK_SIZE CoreOS" DISK_ID # Create the install OS disk from script spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Creating ${INSTALL_DISK_SIZE}mb install disk" create_install_disk INSTALL_DISK_ID # Configure the installer to boot spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Creating boot configuration" create_boot_configuration CONFIG_ID spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Booting installer" "boot_linode $LINODE_ID $CONFIG_ID" spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Installing CoreOS (might take a while)" "install_coreos $LINODE_ID $NODE_TYPE" spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Shutting down CoreOS" "linode_api linode.shutdown LinodeID=$LINODE_ID" spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Deleting install disk $INSTALL_DISK_ID" "linode_api linode.disk.delete LinodeID=$LINODE_ID DiskID=$INSTALL_DISK_ID" spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Waiting for existing jobs to complete" "wait_jobs $LINODE_ID" spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Creating ${STORAGE_DISK_SIZE}mb storage disk" "create_raw_disk $LINODE_ID $STORAGE_DISK_SIZE Storage" STORAGE_DISK_ID spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Updating CoreOS config" update_coreos_config spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Waiting for existing jobs to complete" "wait_jobs $LINODE_ID" spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Booting CoreOS" "linode_api linode.boot LinodeID=$LINODE_ID ConfigID=$CONFIG_ID" spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Waiting for CoreOS to be ready" "wait_jobs $LINODE_ID; sleep 20" if [ "$NODE_TYPE" = "master" ] ; then if [ -e acme.json ] ; then spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Transferring acme.json" "transfer_acme $PUBLIC_IP" fi fi spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Provisioning $NODE_TYPE node (might take a while)" "provision_$NODE_TYPE $PUBLIC_IP" PROVISION_LOGS if [ "$( echo "${PROVISION_LOGS}" | tail -n1 )" = "provisioned $NODE_TYPE" ]; then spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Changing status to provisioned" "change_to_provisioned $LINODE_ID $NODE_TYPE" else 
install $NODE_TYPE $LINODE_ID fi } provision_master() { IP=$1 while true; do ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -tt "core@$IP" \ "sudo systemctl start bootkube" && break || sleep 5; done [ -e cluster ] && rm -rf cluster mkdir cluster ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no core@${IP} "sudo chown -R core:core /opt/bootkube/assets" scp -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -r core@${IP}:/opt/bootkube/assets/* cluster mkdir -p ~/.kube [ -e ~/.kube/config.bak ] && rm ~/.kube/config.bak [ -e ~/.kube/config ] && mv ~/.kube/config ~/.kube/config.bak cp cluster/auth/kubeconfig ~/.kube/config while true; do kubectl --namespace=kube-system create secret generic kubesecret --from-file auth --request-timeout 0 && break || sleep 5; done cat </dev/null >/dev/null ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -tt "core@$IP" "sudo ./bootstrap.sh" 2>/dev/null >/dev/null ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -tt "core@$IP" "rm -rf /home/core/kubeconfig && rm -rf /home/core/bootstrap.sh" 2>/dev/null >/dev/null set +e until kubectl get nodes > /dev/null 2>&1; do sleep 1; done if [ $INSTALL_ROOK = true ]; then if ! kubectl --namespace rook get pods --request-timeout 0 2>/dev/null | grep -q "^rook-api"; then while true; do kubectl apply -f manifests/rook/rook-operator.yaml --request-timeout 0 && break || sleep 5; done while true; do kubectl apply -f manifests/rook/rook-cluster.yaml --request-timeout 0 && break || sleep 5; done while true; do kubectl apply -f manifests/rook/rook-storageclass.yaml --request-timeout 0 && break || sleep 5; done fi fi if [ $INSTALL_PROMETHEUS = true ]; then until kubectl get nodes > /dev/null 2>&1; do sleep 1; done if ! 
kubectl --namespace monitoring get ingress --request-timeout 0 2>/dev/null | grep -q "^prometheus-ingress"; then while true; do kubectl --namespace monitoring apply -f manifests/prometheus-operator --request-timeout 0 && break || sleep 5; done printf "Waiting for Operator to register third party objects..." until kubectl --namespace monitoring get servicemonitor > /dev/null 2>&1; do sleep 1; printf "."; done until kubectl --namespace monitoring get prometheus > /dev/null 2>&1; do sleep 1; printf "."; done until kubectl --namespace monitoring get alertmanager > /dev/null 2>&1; do sleep 1; printf "."; done while true; do kubectl --namespace monitoring apply -f manifests/node-exporter --request-timeout 0 && break || sleep 5; done while true; do kubectl --namespace monitoring apply -f manifests/kube-state-metrics --request-timeout 0 && break || sleep 5; done while true; do kubectl --namespace monitoring apply -f manifests/grafana/grafana-credentials.yaml --request-timeout 0 && break || sleep 5; done while true; do kubectl --namespace monitoring apply -f manifests/grafana --request-timeout 0 && break || sleep 5; done while true; do find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml ! 
-name prometheus-k8s-ingress.yaml -exec kubectl --request-timeout 0 --namespace "monitoring" apply -f {} \; && break || sleep 5; done while true; do kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml --request-timeout 0 && break || sleep 5; done while true; do kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml --request-timeout 0 && break || sleep 5; done while true; do kubectl --namespace monitoring apply -f manifests/alertmanager/ --request-timeout 0 && break || sleep 5; done while true; do cat manifests/prometheus/prometheus-k8s-ingress.yaml | sed "s/\${DOMAIN}/${DOMAIN}/g" | kubectl apply --request-timeout 0 --validate=false -f - && break || sleep 5; done fi fi set -e echo "provisioned worker" }

# Make sure API_KEY holds a key the Linode API accepts, prompting for it when
# necessary, then store it in settings.env (replacing any previous API_KEY line).
read_api_key() {
  local result=false
  local api_key_prompt="Enter Linode API Key (https://manager.linode.com/profile/api) : "

  # Ask until the value at least looks like a key. The same pattern is used
  # for the whole check; previously a looser inner pattern (optional leading
  # '-') let a malformed key skip this prompt.
  while ! [[ $API_KEY =~ ^[0-9a-zA-Z]+$ ]] 2>/dev/null; do
    text_input "$api_key_prompt" API_KEY
    tput civis
  done

  # Verify the key against the API and re-prompt until it is accepted.
  while true; do
    spinner "Verifying API Key" check_api_key result
    # Quoted so that an empty result cannot break the test.
    if [ "$result" = true ]; then
      break
    fi
    text_input "$api_key_prompt" API_KEY
    tput civis
  done

  sed -i.bak '/^API_KEY/d' settings.env
  echo "API_KEY=$API_KEY" >> settings.env
  rm settings.env.bak
}

# Print "true" when a test.echo call with the current API_KEY returns an
# empty ERRORARRAY, "false" otherwise.
check_api_key() {
  if linode_api test.echo | jq -e ".ERRORARRAY == []" >/dev/null; then
    echo true
  else
    echo false
  fi
}

# Print the available Linode plans as a JSON array sorted by PRICE.
get_plans() {
  linode_api avail.linodeplans | jq ".DATA | sort_by(.PRICE)"
}

read_install_options() { if [[ -z $INSTALL_K8S_DASHBOARD || -z $INSTALL_TRAEFIK || -z $INSTALL_ROOK || -z $INSTALL_PROMETHEUS ]]; then options=('K8S Dashboard' 'Traefik (Load Balancer)' 'Rook (Distributed Storage)' 'Prometheus (Monitoring)') env_names=('INSTALL_K8S_DASHBOARD'
'INSTALL_TRAEFIK' 'INSTALL_ROOK' 'INSTALL_PROMETHEUS') selected_indices=(0 1 2 3) checkbox_input_indices "What should be included in your cluster?" options selected_indices eval "$(gen_env_from_options selected_indices env_names)" sed -i.bak '/^INSTALL_K8S_DASHBOARD/d' settings.env sed -i.bak '/^INSTALL_TRAEFIK/d' settings.env sed -i.bak '/^INSTALL_ROOK/d' settings.env sed -i.bak '/^INSTALL_PROMETHEUS/d' settings.env echo "$(gen_env_from_options selected_indices env_names)" >> settings.env rm settings.env.bak fi }

# Shared implementation of read_master_plan / read_worker_plan.
#   $1 name of the setting variable to fill (MASTER_PLAN or WORKER_PLAN)
#   $2 prompt shown above the plan list
# Lists the plans with RAM >= 2048, lets the user pick one, stores its PLANID
# in the named variable and appends "<name>=<id>" to settings.env.
_read_plan() {
  local _plan_var=$1
  local _plan_prompt=$2
  local plan_ids
  local plan_list
  # Split command output on newlines only, so list entries that contain spaces
  # stay intact. Declared local so the change does not leak into the rest of
  # the script once this function returns.
  local IFS=$'\n'

  # Loop until a plain non-negative integer has been selected. The previous
  # inner check also accepted a leading '-', so a value such as "-5" skipped
  # the prompt and was written to settings.env.
  while ! [[ ${!_plan_var} =~ ^[0-9]+$ ]] 2>/dev/null; do
    spinner "Retrieving plans" get_plans plan_data
    tput el
    plan_ids=($(jq -r '.[] | select(.RAM >= 2048) | .PLANID' <<< "$plan_data"))
    plan_list=($(jq -r '.[] | select(.RAM >= 2048) | [.RAM, .PRICE] | @csv' <<< "$plan_data" | \
      awk -v FS="," '{ram=$1/1024; printf "%3sGB (\$%s/mo)%s",ram,$2,ORS}' 2>/dev/null))
    list_input_index "$_plan_prompt" plan_list selected_disk_id
    printf -v "$_plan_var" '%s' "${plan_ids[$selected_disk_id]}"
  done

  echo "$_plan_var=${!_plan_var}" >> settings.env
}

# Ask for the master plan unless MASTER_PLAN already holds a numeric plan id.
read_master_plan() {
  if ! [[ $MASTER_PLAN =~ ^[0-9]+$ ]] 2>/dev/null; then
    _read_plan MASTER_PLAN "Select a master plan (https://www.linode.com/pricing)"
  fi
}

# Ask for the worker plan unless WORKER_PLAN already holds a numeric plan id.
read_worker_plan() {
  if ! [[ $WORKER_PLAN =~ ^[0-9]+$ ]] 2>/dev/null; then
    _read_plan WORKER_PLAN "Select a worker plan (https://www.linode.com/pricing)"
  fi
}

# Print the available datacenters as a JSON array sorted by LOCATION.
get_datacenters() {
  linode_api avail.datacenters | jq ".DATA | sort_by(.LOCATION)"
}

read_datacenter() { if !
[[ $DATACENTER_ID =~ ^[0-9]+$ ]] 2>/dev/null; then while ! [[ $DATACENTER_ID =~ ^-?[0-9]+$ ]] 2>/dev/null; do IFS=$'\n' spinner "Retrieving datacenters" get_datacenters datacenters_data tput el datacenters_ids=($(echo $datacenters_data | jq -r '.[] | .DATACENTERID')) datacenters_list=($(echo $datacenters_data | jq -r '.[] | .LOCATION')) list_input_index "Select a datacenter" datacenters_list selected_data_center_index DATACENTER_ID=${datacenters_ids[$selected_data_center_index]} done echo "DATACENTER_ID=$DATACENTER_ID" >> settings.env fi }

# Ask for the cluster domain unless DOMAIN already holds a valid domain name,
# and append a newly entered value to settings.env. Hides the cursor afterwards.
read_domain() {
  # Each label is 1-63 characters: it starts and ends with an alphanumeric and
  # may contain hyphens in between; the TLD is at least two letters.
  # The previous pattern demanded at least three characters per label and so
  # rejected valid names such as "ab.com" or "t.co".
  domain_regex="^([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}$"
  if ! [[ $DOMAIN =~ $domain_regex ]] 2>/dev/null; then
    while ! [[ $DOMAIN =~ $domain_regex ]] 2>/dev/null; do
      text_input "Enter Domain Name: " DOMAIN "$domain_regex" "Please enter a valid domain name"
    done
    echo "DOMAIN=$DOMAIN" >> settings.env
  fi
  tput civis
}

read_email() { email_regex="^[a-z0-9!#\$%&'*+/=?^_\`{|}~-]+(\.[a-z0-9!#$%&'*+/=?^_\`{|}~-]+)*@([a-z0-9]([a-z0-9-]*[a-z0-9])?\.)+[a-z0-9]([a-z0-9-]*[a-z0-9])?\$" if ! [[ $EMAIL =~ $email_regex ]] 2>/dev/null; then while !
[[ $EMAIL =~ $email_regex ]] 2>/dev/null; do text_input "Enter Email (for ACME registration): " EMAIL "^[a-z0-9!#\$%&'*+/=?^_\`{|}~-]+(\.[a-z0-9!#$%&'*+/=?^_\`{|}~-]+)*@([a-z0-9]([a-z0-9-]*[a-z0-9])?\.)+[a-z0-9]([a-z0-9-]*[a-z0-9])?\$" "Please enter a valid email" done echo "EMAIL=$EMAIL" >> settings.env fi tput civis }

# Ask for the dashboard username when USERNAME is empty. Any existing `auth`
# file and Grafana credentials manifest are removed first; the entered name is
# appended to settings.env. Hides the cursor afterwards.
read_username() {
  local stale
  if [ -z "$USERNAME" ]; then
    for stale in auth manifests/grafana/grafana-credentials.yaml; do
      if [ -e "$stale" ]; then
        rm "$stale"
      fi
    done
    text_input "Enter dashboard username: " USERNAME
    echo "USERNAME=$USERNAME" >> settings.env
  fi
  tput civis
}

# Ask for the update strategy when REBOOT_STRATEGY is empty and append the
# choice to settings.env.
read_reboot_strategy() {
  if [ -n "$REBOOT_STRATEGY" ]; then
    return 0
  fi
  strategies=("off" "etcd-lock" "reboot")
  list_input_index "Select a update strategy (see https://coreos.com/os/docs/latest/update-strategies.html)" strategies strategy
  REBOOT_STRATEGY=${strategies[$strategy]}
  echo "REBOOT_STRATEGY=$REBOOT_STRATEGY" >> settings.env
}

# Print the DOMAINID of every DNS zone whose DOMAIN equals $1.
get_domains() {
  local DOMAIN=$1
  linode_api domain.list |
    jq ".DATA" |
    jq -c ".[] | select(.DOMAIN == \"$DOMAIN\") | .DOMAINID"
}

# Print the resource records of DNS zone $1 as a JSON array.
get_resources() {
  local zone_id=$1
  linode_api domain.resource.list DomainID=$zone_id |
    jq ".DATA"
}

# Create an 'A' record with an empty name pointing at $IP.
# Reads DOMAIN_ID and IP from the calling scope.
create_A_domain() {
  linode_api domain.resource.create \
    DomainID=$DOMAIN_ID \
    TARGET="$IP" \
    TTL_SEC=0 \
    PORT=80 \
    PROTOCOL='' \
    PRIORITY=10 \
    WEIGHT=5 \
    TYPE='A' \
    NAME='' >/dev/null
}

# Create a wildcard ('*') 'CNAME' record pointing at $DOMAIN.
# Reads DOMAIN_ID and DOMAIN from the calling scope.
create_CNAME_domain() {
  linode_api domain.resource.create \
    DomainID=$DOMAIN_ID \
    TARGET="$DOMAIN" \
    TTL_SEC=0 \
    PORT=80 \
    PROTOCOL="" \
    PRIORITY=10 \
    WEIGHT=5 \
    TYPE="CNAME" \
    NAME="*" >/dev/null
}

# Print the IPADDRESSID of the first address equal to $IP (read from the
# calling scope).
get_ip_address_id() {
  linode_api linode.ip.list |
    jq ".DATA" |
    jq -c ".[] | select(.IPADDRESS == \"$IP\") | .IPADDRESSID" |
    sed -n 1p
}

# Update the zone settings (TTL, SOA email, refresh/retry/expire timers) of
# the zone identified by DOMAIN_ID. Reads DOMAIN_ID, DOMAIN and EMAIL from the
# calling scope.
update_domain() {
  linode_api domain.update \
    DomainID=$DOMAIN_ID \
    Domain="$DOMAIN" \
    TTL_sec=300 \
    axfr_ips="none" \
    Expire_sec=604800 \
    SOA_Email="$EMAIL" \
    Retry_sec=300 \
    status=1 \
    Refresh_sec=300 \
    Type=master >/dev/null
}

create_domain() { linode_api domain.create Domain="$DOMAIN" TTL_sec=300 axfr_ips="none" Expire_sec=604800 \ SOA_Email="$EMAIL" Retry_sec=300
status=1 Refresh_sec=300 Type=master >/dev/null } delete_domain() { linode_api domain.delete DomainID="$DOMAIN_ID" Domain="$DOMAIN" >/dev/null } update_dns() { local LINODE_ID=$1 local DOMAIN_ID local IP local RESOURCE_IDS eval IP=\$PUBLIC_$LINODE_ID spinner "${CYAN}[$IP]${NORMAL} Retrieving DNS record for $DOMAIN" "get_domains \"$DOMAIN\"" DOMAIN_ID if ! [[ $DOMAIN_ID =~ ^[0-9]+$ ]] 2>/dev/null; then spinner "${CYAN}[$IP]${NORMAL} Creating DNS record for $DOMAIN" create_domain fi spinner "${CYAN}[$IP]${NORMAL} Retrieving DNS record for $DOMAIN" "get_domains \"$DOMAIN\"" DOMAIN_ID spinner "${CYAN}[$IP]${NORMAL} Updating DNS record for $DOMAIN" update_domain spinner "${CYAN}[$IP]${NORMAL} Retrieving list of resources for $DOMAIN" "get_resources $DOMAIN_ID" RESOURCE_LIST IFS=$'\n' if ! [[ $(echo $RESOURCE_LIST | jq -c ".[] | select(.TYPE == \"A\" and .TARGET == \"$IP\") | .RESOURCEID" | sed -n 1p) =~ ^[0-9]+$ ]] 2>/dev/null; then RESOURCE_IDS=$(echo $RESOURCE_LIST | jq -c ".[] | select(.TYPE == \"A\" and .NAME == \"\") | .RESOURCEID") for RESOURCE_ID in $RESOURCE_IDS; do spinner "${CYAN}[$IP]${NORMAL} Deleting 'A' DNS record $RESOURCE_ID" "linode_api domain.resource.delete DomainID=$DOMAIN_ID ResourceID=$RESOURCE_ID" done spinner "${CYAN}[$IP]${NORMAL} Adding 'A' DNS record to $DOMAIN with target $IP" create_A_domain fi if ! [[ $(echo $RESOURCE_LIST | jq -c ".[] | select(.TYPE == \"CNAME\" and .TARGET == \"$DOMAIN\") | .RESOURCEID") =~ ^[0-9]+$ ]] 2>/dev/null; then spinner "${CYAN}[$IP]${NORMAL} Adding wildcard 'CNAME' record with target $DOMAIN" create_CNAME_domain fi } read_no_of_workers() { if ! [[ $NO_OF_WORKERS =~ ^[0-9]+$ ]] 2>/dev/null; then while ! 
[[ $NO_OF_WORKERS =~ ^[0-9]+$ ]] 2>/dev/null; do text_input "Enter number of workers: " NO_OF_WORKERS "^[0-9]+$" "Please enter a number" done echo "NO_OF_WORKERS=$NO_OF_WORKERS" >> settings.env fi tput civis }

# Create a linode in datacenter $1 with plan $2 and print its LinodeID.
# Sets the (non-local) DATACENTER_ID and PLAN_ID variables.
create_linode() {
  DATACENTER_ID=$1
  PLAN_ID=$2
  linode_api linode.create DatacenterID=$DATACENTER_ID PlanID=$PLAN_ID |
    jq ".DATA.LinodeID"
}

# Delete linode $1, passing skipChecks=true to the API.
delete_linode() {
  local linode="$1"
  linode_api linode.delete LinodeID=$linode skipChecks=true >/dev/null
}

# Request a private IP address for linode $1; prints the raw API response.
add_private_ip() {
  local linode=$1
  linode_api linode.ip.addprivate LinodeID=$linode
}

# Print the number of worker linodes (lines printed by list_worker_ids).
# The count goes through bc ("<count> + 0") so it is printed as a bare number.
get_no_of_workers() {
  bc <<< "$( list_worker_ids | wc -l ) + 0"
}

================================================ FILE: manifests/alertmanager/alertmanager-config.yaml ================================================ apiVersion: v1 kind: Secret metadata: name: alertmanager-main data: alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== ================================================ FILE: manifests/alertmanager/alertmanager-service.yaml ================================================ apiVersion: v1 kind: Service metadata: labels: alertmanager: main name: alertmanager-main spec: ports: - name: web port: 9093 protocol: TCP selector: alertmanager: main ================================================ FILE: manifests/alertmanager/alertmanager.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: Alertmanager metadata: name: main labels: alertmanager: main spec: replicas: 1 version: v0.14.0 ================================================ FILE: manifests/container-linux/master-config.yaml ================================================ passwd: users: - name: core ssh_authorized_keys: -
"#SSH_KEY#" storage: files: - path: /etc/hostname filesystem: root mode: 0420 contents: inline: | #HOSTNAME# - path: /etc/traefik/acme/acme.json filesystem: root contents: inline: | mode: 0600 - path: /etc/environment filesystem: root contents: inline: | COREOS_PUBLIC_IPV4=#COREOS_PUBLIC_IPV4# COREOS_PRIVATE_IPV4=#COREOS_PRIVATE_IPV4# - path: /etc/kubernetes/kubelet.env filesystem: root mode: 0644 contents: inline: | KUBELET_IMAGE_URL=docker://gcr.io/google_containers/hyperkube KUBELET_IMAGE_TAG=v1.11.0 - path: /etc/sysctl.d/max-user-watches.conf filesystem: root contents: inline: | fs.inotify.max_user_watches=16184 - path: /opt/bootkube/bootkube-start filesystem: root mode: 0544 user: id: 500 group: id: 500 contents: inline: | #!/bin/bash # Wrapper for bootkube start set -e # Move experimental manifests [ -n "$(ls /opt/bootkube/assets/manifests-*/* 2>/dev/null)" ] && \ mv /opt/bootkube/assets/manifests-*/* /opt/bootkube/assets/manifests && \ rm -rf /opt/bootkube/assets/manifests-* BOOTKUBE_ACI="${BOOTKUBE_ACI:-quay.io/coreos/bootkube}" BOOTKUBE_VERSION="${BOOTKUBE_VERSION:-v0.13.0}" BOOTKUBE_ASSETS="${BOOTKUBE_ASSETS:-/opt/bootkube/assets}" # ======== START OF RESOURCE RENDERING =========== sudo mkdir -p /opt/bootkube /etc/kubernetes [ -e /opt/bootkube/assets ] && sudo rm -rf /opt/bootkube/assets sudo /usr/bin/rkt run \ --trust-keys-from-https \ --volume assets,kind=host,source=/opt/bootkube \ --mount volume=assets,target=/opt/bootkube \ --volume bootstrap,kind=host,source=/etc/kubernetes \ --mount volume=bootstrap,target=/etc/kubernetes \ ${RKT_OPTS} \ ${BOOTKUBE_ACI}:${BOOTKUBE_VERSION} \ --net=host \ --dns=host \ --exec=/bootkube -- render --asset-dir=/opt/bootkube/assets \ --etcd-servers=https://#COREOS_PRIVATE_IPV4#:2379 \ --network-provider=flannel \ --api-servers=https://#COREOS_PUBLIC_IPV4#:6443,https://#COREOS_PRIVATE_IPV4#:6443 sudo mkdir -p /etc/kubernetes sudo cp ${BOOTKUBE_ASSETS}/auth/kubeconfig /etc/kubernetes/ sudo cp ${BOOTKUBE_ASSETS}/tls/ca.crt 
/etc/kubernetes/ca.crt sudo mkdir -p /etc/etcd/tls sudo cp ${BOOTKUBE_ASSETS}/tls/etcd-* /etc/etcd/tls sudo mkdir -p /etc/etcd/tls/etcd sudo cp ${BOOTKUBE_ASSETS}/tls/etcd/* /etc/etcd/tls/etcd sudo chown -R etcd:etcd /etc/etcd sudo chmod -R u=rX,g=,o= /etc/etcd sudo systemctl enable kubelet; sudo systemctl start kubelet # ======= END OF RESOURCE RENDERING ======== exec /usr/bin/rkt run \ --trust-keys-from-https \ --volume assets,kind=host,source=${BOOTKUBE_ASSETS} \ --mount volume=assets,target=/assets \ --volume bootstrap,kind=host,source=/etc/kubernetes \ --mount volume=bootstrap,target=/etc/kubernetes \ ${RKT_OPTS} \ ${BOOTKUBE_ACI}:${BOOTKUBE_VERSION} \ --net=host \ --dns=host \ --exec=/bootkube -- start --asset-dir=/assets "$@" networkd: units: - name: 00-eth0.network contents: | [Match] Name=eth0 [Network] DHCP=no DNS=#DNS# Domains=members.linode.com IPv6PrivacyExtensions=false Gateway=#GATEWAY# Address=#COREOS_PUBLIC_IPV4#/24 Address=#COREOS_PRIVATE_IPV4#/17 systemd: units: - name: etcd-member.service enable: true dropins: - name: 40-etcd-cluster.conf contents: | [Service] Environment="ETCD_IMAGE_TAG=v3.2.13" Environment="ETCD_NAME=controller" Environment="ETCD_INITIAL_CLUSTER=controller=https://#COREOS_PRIVATE_IPV4#:2380" Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://#COREOS_PRIVATE_IPV4#:2380" Environment="ETCD_ADVERTISE_CLIENT_URLS=https://#COREOS_PRIVATE_IPV4#:2379" Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379" Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380" Environment="ETCD_SSL_DIR=/etc/etcd/tls" Environment="ETCD_TRUSTED_CA_FILE=/etc/ssl/certs/etcd/server-ca.crt" Environment="ETCD_CERT_FILE=/etc/ssl/certs/etcd/server.crt" Environment="ETCD_KEY_FILE=/etc/ssl/certs/etcd/server.key" Environment="ETCD_CLIENT_CERT_AUTH=true" Environment="ETCD_PEER_TRUSTED_CA_FILE=/etc/ssl/certs/etcd/peer-ca.crt" Environment="ETCD_PEER_CERT_FILE=/etc/ssl/certs/etcd/peer.crt" Environment="ETCD_PEER_KEY_FILE=/etc/ssl/certs/etcd/peer.key" - name: 
docker.service enable: true - name: kubelet.path enable: true contents: | [Unit] Description=Watch for kubeconfig [Path] PathExists=/etc/kubernetes/kubeconfig [Install] WantedBy=multi-user.target - name: wait-for-dns.service enable: true contents: | [Unit] Description=Wait for DNS entries Wants=systemd-resolved.service Before=kubelet.service [Service] Type=oneshot RemainAfterExit=true ExecStart=/bin/sh -c 'while ! /usr/bin/grep '^[^#[:space:]]' /etc/resolv.conf > /dev/null; do sleep 1; done' [Install] RequiredBy=kubelet.service - name: bootkube.service contents: | [Unit] Description=Bootstrap a Kubernetes cluster ConditionPathExists=!/opt/bootkube/init_bootkube.done [Service] Type=oneshot RemainAfterExit=true EnvironmentFile=/etc/environment WorkingDirectory=/opt/bootkube ExecStart=/opt/bootkube/bootkube-start ExecStartPost=/bin/touch /opt/bootkube/init_bootkube.done [Install] WantedBy=multi-user.target - name: kubelet.service contents: | [Unit] Description=Kubelet via Hyperkube ACI Wants=rpc-statd.service [Service] EnvironmentFile=/etc/kubernetes/kubelet.env EnvironmentFile=/etc/environment Environment="RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \ --volume=resolv,kind=host,source=/etc/resolv.conf \ --mount volume=resolv,target=/etc/resolv.conf \ --volume var-lib-cni,kind=host,source=/var/lib/cni \ --mount volume=var-lib-cni,target=/var/lib/cni \ --volume opt-cni-bin,kind=host,source=/opt/cni/bin \ --mount volume=opt-cni-bin,target=/opt/cni/bin \ --volume var-log,kind=host,source=/var/log \ --mount volume=var-log,target=/var/log \ --insecure-options=image" ExecStartPre=/bin/mkdir -p /opt/cni/bin ExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests ExecStartPre=/bin/mkdir -p /etc/kubernetes/cni/net.d ExecStartPre=/bin/mkdir -p /etc/kubernetes/checkpoint-secrets ExecStartPre=/bin/mkdir -p /etc/kubernetes/inactive-manifests ExecStartPre=/bin/mkdir -p /var/lib/cni ExecStartPre=/bin/mkdir -p /var/lib/kubelet/volumeplugins ExecStartPre=-/usr/bin/rkt rm 
--uuid-file=/var/cache/kubelet-pod.uuid ExecStart=/usr/lib/coreos/kubelet-wrapper \ --allow-privileged \ --anonymous-auth=false \ --client-ca-file=/etc/kubernetes/ca.crt \ --cloud-provider= \ --cluster_dns=10.3.0.10 \ --cluster_domain=cluster.local \ --cni-conf-dir=/etc/kubernetes/cni/net.d \ --exit-on-lock-contention \ --hostname-override=#COREOS_PUBLIC_IPV4# \ --kubeconfig=/etc/kubernetes/kubeconfig \ --lock-file=/var/run/lock/kubelet.lock \ --network-plugin=cni \ --node-labels=node-role.kubernetes.io/master \ --pod-manifest-path=/etc/kubernetes/manifests \ --register-with-taints=node-role.kubernetes.io/master=:NoSchedule \ --volume-plugin-dir=/var/lib/kubelet/volumeplugins ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid Restart=always RestartSec=10 [Install] WantedBy=multi-user.target locksmith: reboot_strategy: #REBOOT_STRATEGY# ================================================ FILE: manifests/container-linux/worker-config.yaml ================================================ passwd: users: - name: core ssh_authorized_keys: - "#SSH_KEY#" storage: files: - path: /etc/hostname filesystem: root mode: 0420 contents: inline: | #HOSTNAME# - path: /etc/kubernetes/kubelet.env filesystem: root mode: 0644 contents: inline: | KUBELET_IMAGE_URL=docker://gcr.io/google_containers/hyperkube KUBELET_IMAGE_TAG=v1.11.0 - path: /etc/sysctl.d/max-user-watches.conf filesystem: root contents: inline: | fs.inotify.max_user_watches=16184 - path: /etc/kubernetes/delete-node filesystem: root mode: 0744 contents: inline: | #!/bin/bash set -e exec /usr/bin/rkt run \ --trust-keys-from-https \ --volume config,kind=host,source=/etc/kubernetes \ --mount volume=config,target=/etc/kubernetes \ --insecure-options=image \ docker://gcr.io/google_containers/hyperkube:v1.9.3 \ --net=host \ --dns=host \ --exec=/kubectl -- --kubeconfig=/etc/kubernetes/kubeconfig delete node #COREOS_PUBLIC_IPV4# - path: /etc/environment filesystem: root contents: inline: | 
COREOS_PUBLIC_IPV4=#COREOS_PUBLIC_IPV4# COREOS_PRIVATE_IPV4=#COREOS_PRIVATE_IPV4# mode: 0644 - path: /home/core/bootstrap.sh filesystem: root contents: inline: | #!/usr/bin/env bash set -euo pipefail # Setup kubeconfig mkdir -p /etc/kubernetes cp /home/core/kubeconfig /etc/kubernetes/kubeconfig # Pulled out of the kubeconfig. Other installations should place the root # CA here manually. grep 'certificate-authority-data' /home/core/kubeconfig | awk '{print $2}' | base64 -d > /etc/kubernetes/ca.crt # Start services systemctl daemon-reload systemctl stop update-engine systemctl mask update-engine systemctl enable kubelet sudo systemctl start kubelet mode: 0700 networkd: units: - name: 00-eth0.network contents: | [Match] Name=eth0 [Network] DHCP=no DNS=#DNS# Domains=members.linode.com IPv6PrivacyExtensions=false Gateway=#GATEWAY# Address=#COREOS_PUBLIC_IPV4#/24 Address=#COREOS_PRIVATE_IPV4#/17 systemd: units: - name: docker.service enable: true - name: kubelet.path enable: true contents: | [Unit] Description=Watch for kubeconfig [Path] PathExists=/etc/kubernetes/kubeconfig [Install] WantedBy=multi-user.target - name: wait-for-dns.service enable: true contents: | [Unit] Description=Wait for DNS entries Wants=systemd-resolved.service Before=kubelet.service [Service] Type=oneshot RemainAfterExit=true ExecStart=/bin/sh -c 'while ! 
/usr/bin/grep '^[^#[:space:]]' /etc/resolv.conf > /dev/null; do sleep 1; done' [Install] RequiredBy=kubelet.service - name: delete-node.service enable: true contents: | [Unit] Description=Waiting to delete Kubernetes node on shutdown [Service] Type=oneshot RemainAfterExit=true ExecStart=/bin/true ExecStop=/etc/kubernetes/delete-node [Install] WantedBy=multi-user.target - name: kubelet.service contents: | [Unit] Description=Kubelet via Hyperkube ACI Wants=rpc-statd.service [Service] EnvironmentFile=/etc/kubernetes/kubelet.env EnvironmentFile=/etc/environment Environment="RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \ --volume=resolv,kind=host,source=/etc/resolv.conf \ --mount volume=resolv,target=/etc/resolv.conf \ --volume var-lib-cni,kind=host,source=/var/lib/cni \ --mount volume=var-lib-cni,target=/var/lib/cni \ --volume opt-cni-bin,kind=host,source=/opt/cni/bin \ --mount volume=opt-cni-bin,target=/opt/cni/bin \ --volume var-log,kind=host,source=/var/log \ --mount volume=var-log,target=/var/log \ --insecure-options=image" ExecStartPre=/bin/mkdir -p /opt/cni/bin ExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests ExecStartPre=/bin/mkdir -p /etc/kubernetes/cni/net.d ExecStartPre=/bin/mkdir -p /etc/kubernetes/checkpoint-secrets ExecStartPre=/bin/mkdir -p /etc/kubernetes/inactive-manifests ExecStartPre=/bin/mkdir -p /var/lib/cni ExecStartPre=/bin/mkdir -p /var/lib/kubelet/volumeplugins ExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid ExecStart=/usr/lib/coreos/kubelet-wrapper \ --allow-privileged \ --anonymous-auth=false \ --client-ca-file=/etc/kubernetes/ca.crt \ --cloud-provider= \ --cluster_dns=10.3.0.10 \ --cluster_domain=cluster.local \ --cni-conf-dir=/etc/kubernetes/cni/net.d \ --exit-on-lock-contention \ --hostname-override=#COREOS_PUBLIC_IPV4# \ --kubeconfig=/etc/kubernetes/kubeconfig \ --lock-file=/var/run/lock/kubelet.lock \ --network-plugin=cni \ --node-labels=node-role.kubernetes.io/node \ 
--pod-manifest-path=/etc/kubernetes/manifests \ --volume-plugin-dir=/var/lib/kubelet/volumeplugins ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid Restart=always RestartSec=10 [Install] WantedBy=multi-user.target locksmith: reboot_strategy: #REBOOT_STRATEGY# ================================================ FILE: manifests/grafana/grafana-dashboards.yaml ================================================ apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboards data: all-nodes-dashboard.json: |+ { "dashboard": { "__inputs": [ { "description": "", "label": "prometheus", "name": "DS_PROMETHEUS", "pluginId": "prometheus", "pluginName": "Prometheus", "type": "datasource" } ], "__requires": [ { "id": "grafana", "name": "Grafana", "type": "grafana", "version": "4.1.1" }, { "id": "graph", "name": "Graph", "type": "panel", "version": "" }, { "id": "prometheus", "name": "Prometheus", "type": "datasource", "version": "1.0.0" }, { "id": "singlestat", "name": "Singlestat", "type": "panel", "version": "" } ], "annotations": { "list": [] }, "description": "Dashboard to get an overview of one server", "editable": true, "gnetId": 22, "graphTooltip": 0, "hideControls": false, "id": null, "links": [], "refresh": false, "rows": [ { "collapse": false, "height": "250px", "panels": [ { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 3, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", "hide": false, "intervalFactor": 10, "legendFormat": "", "refId": "A", "step": 50 } ], "thresholds": [], "timeFrom": null, 
"timeShift": null, "title": "Idle cpu", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": "cpu usage", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 9, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(node_load1)", "intervalFactor": 4, "legendFormat": "load 1m", "refId": "A", "step": 20, "target": "" }, { "expr": "sum(node_load5)", "intervalFactor": 4, "legendFormat": "load 5m", "refId": "B", "step": 20, "target": "" }, { "expr": "sum(node_load15)", "intervalFactor": 4, "legendFormat": "load 15m", "refId": "C", "step": 20, "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "System load", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "alerting": {}, "aliasColors": {}, "bars": false, 
"datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 4, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", "yaxis": 2 } ], "span": 9, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", "intervalFactor": 2, "legendFormat": "memory usage", "metric": "memo", "refId": "A", "step": 4, "target": "" }, { "expr": "sum(node_memory_Buffers)", "interval": "", "intervalFactor": 2, "legendFormat": "memory buffers", "metric": "memo", "refId": "B", "step": 4, "target": "" }, { "expr": "sum(node_memory_Cached)", "interval": "", "intervalFactor": 2, "legendFormat": "memory cached", "metric": "memo", "refId": "C", "step": 4, "target": "" }, { "expr": "sum(node_memory_MemFree)", "interval": "", "intervalFactor": 2, "legendFormat": "memory free", "metric": "memo", "refId": "D", "step": 4, "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Memory usage", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "percent", 
"gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "id": 5, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [ { "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", "intervalFactor": 2, "metric": "", "refId": "A", "step": 60, "target": "" } ], "thresholds": "80, 90", "title": "Memory usage", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 6, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "read", "yaxis": 1 }, { "alias": "{instance=\"172.17.0.1:9100\"}", "yaxis": 2 }, { "alias": "io time", "yaxis": 2 } ], "span": 9, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(node_disk_bytes_read[5m]))", "hide": false, "intervalFactor": 4, "legendFormat": "read", "refId": "A", "step": 8, 
"target": "" }, { "expr": "sum(rate(node_disk_bytes_written[5m]))", "intervalFactor": 4, "legendFormat": "written", "refId": "B", "step": 8 }, { "expr": "sum(rate(node_disk_io_time_ms[5m]))", "intervalFactor": 4, "legendFormat": "io time", "refId": "C", "step": 8 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Disk I/O", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "ms", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "percentunit", "gauge": { "maxValue": 1, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "id": 7, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [ { "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", "intervalFactor": 2, "refId": "A", "step": 60, "target": "" } ], "thresholds": "0.75, 0.9", "title": "Disk space usage", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], 
"valueName": "current" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 8, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "transmitted ", "yaxis": 2 } ], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", "hide": false, "intervalFactor": 2, "legendFormat": "", "refId": "A", "step": 10, "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Network received", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 10, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "transmitted ", "yaxis": 2 } ], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", "hide": false, "intervalFactor": 2, "legendFormat": "", "refId": "B", "step": 10, "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Network transmitted", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [ "prometheus" ], "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "All Nodes", "version": 1 } , "inputs": [ { "name": "DS_PROMETHEUS", "pluginId": "prometheus", "type": "datasource", "value": "prometheus" } ], "overwrite": true } deployment-dashboard.json: |+ { "dashboard": { "__inputs": [ { "description": "", "label": "prometheus", "name": "DS_PROMETHEUS", "pluginId": "prometheus", "pluginName": "Prometheus", "type": "datasource" } ], "__requires": [ { "id": "singlestat", "name": "Singlestat", "type": "panel", "version": "" }, { "id": "graph", "name": "Graph", "type": "panel", "version": "" }, { "id": "grafana", "name": "Grafana", "type": "grafana", "version": "3.1.1" }, { "id": "prometheus", "name": "Prometheus", "type": "datasource", "version": "1.0.0" } ], "annotations": { "list": [] }, "editable": true, "gnetId": null, "hideControls": false, "id": null, "links": [], "rows": [ { "collapse": false, "editable": true, "height": "200px", 
"panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 8, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "cores", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": true }, "targets": [ { "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", "intervalFactor": 2, "refId": "A", "step": 600 } ], "thresholds": "", "title": "CPU", "type": "singlestat", "valueFontSize": "110%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 9, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "GB", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "80%", "rangeMaps": [ { "from": "null", 
"text": "N/A", "to": "null" } ], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": true }, "targets": [ { "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", "intervalFactor": 2, "refId": "A", "step": 600 } ], "thresholds": "", "title": "Memory", "type": "singlestat", "valueFontSize": "110%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "Bps", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": false }, "id": 7, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": true }, "targets": [ { "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ", "intervalFactor": 2, "refId": "A", "step": 600 } ], "thresholds": "", "title": "Network", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" } ], "showTitle": false, "title": "Row" }, { "collapse": false, "editable": true, "height": 
"100px", "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "decimals": null, "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": false }, "id": 5, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [ { "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "metric": "kube_deployment_spec_replicas", "refId": "A", "step": 600 } ], "thresholds": "", "title": "Desired Replicas", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 6, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": 
"50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [ { "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "refId": "A", "step": 600 } ], "thresholds": "", "title": "Available Replicas", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 3, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [ { "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "", "refId": "A", "step": 600 } ], "thresholds": "", "title": "Observed Generation", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": 
false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 2, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [ { "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "", "refId": "A", "step": 600 } ], "thresholds": "", "title": "Metadata Generation", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" } ], "title": "New row" }, { "collapse": false, "editable": true, "height": "350px", "panels": [ { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 1, "isNew": true, "legend": { "avg": false, "current": false, "hideZero": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": 
false, "targets": [ { "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "current replicas", "refId": "A", "step": 30 }, { "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "available", "refId": "B", "step": 30 }, { "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "unavailable", "refId": "C", "step": 30 }, { "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "updated", "refId": "D", "step": 30 }, { "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", "intervalFactor": 2, "legendFormat": "desired", "refId": "E", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Replicas", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "transparent": false, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "showTitle": false, "title": "New row" } ], "schemaVersion": 12, "sharedCrosshair": true, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": ".*", "current": {}, "datasource": "${DS_PROMETHEUS}", "hide": 0, "includeAll": false, "label": "Namespace", "multi": false, "name": "deployment_namespace", "options": [], "query": 
"label_values(kube_deployment_metadata_generation, namespace)", "refresh": 1, "regex": "", "sort": 0, "tagValuesQuery": null, "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", "hide": 0, "includeAll": false, "label": "Deployment", "multi": false, "name": "deployment_name", "options": [], "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", "refresh": 1, "regex": "", "sort": 0, "tagValuesQuery": "", "tagsQuery": "deployment", "type": "query", "useTags": false } ] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Deployment", "version": 2 } , "inputs": [ { "name": "DS_PROMETHEUS", "pluginId": "prometheus", "type": "datasource", "value": "prometheus" } ], "overwrite": true } kubernetes-pods-dashboard.json: |+ { "dashboard": { "__inputs": [ { "description": "", "label": "prometheus", "name": "DS_PROMETHEUS", "pluginId": "prometheus", "pluginName": "Prometheus", "type": "datasource" } ], "__requires": [ { "id": "graph", "name": "Graph", "type": "panel", "version": "" }, { "id": "grafana", "name": "Grafana", "type": "grafana", "version": "3.1.1" }, { "id": "prometheus", "name": "Prometheus", "type": "datasource", "version": "1.0.0" } ], "annotations": { "list": [] }, "editable": true, "gnetId": null, "hideControls": false, "id": null, "links": [], "rows": [ { "collapse": false, "editable": true, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 1, "isNew": true, "legend": { 
"alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", "interval": "10s", "intervalFactor": 1, "legendFormat": "Current: {{ container_name }}", "metric": "container_memory_usage_bytes", "refId": "A", "step": 10 }, { "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", "interval": "10s", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "metric": "kube_pod_container_resource_requests_memory_bytes", "refId": "B", "step": 20 } ], "timeFrom": null, "timeShift": null, "title": "Memory Usage", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "show": true }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "title": "Row" }, { "collapse": false, "editable": true, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 2, "isNew": true, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, 
"pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )", "intervalFactor": 2, "legendFormat": "{{ container_name }}", "refId": "A", "step": 30 } ], "timeFrom": null, "timeShift": null, "title": "CPU Usage", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "show": true }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "title": "New row" }, { "collapse": false, "editable": true, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 3, "isNew": true, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", "refId": "A", "step": 30 } ], "timeFrom": null, "timeShift": null, "title": "Network I/O", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "show": true }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, 
"max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "title": "New row" } ], "schemaVersion": 12, "sharedCrosshair": true, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": ".*", "current": {}, "datasource": "${DS_PROMETHEUS}", "hide": 0, "includeAll": true, "label": "Namespace", "multi": false, "name": "namespace", "options": [], "query": "label_values(kube_pod_info, namespace)", "refresh": 1, "regex": "", "type": "query" }, { "current": {}, "datasource": "${DS_PROMETHEUS}", "hide": 0, "includeAll": false, "label": "Pod", "multi": false, "name": "pod", "options": [], "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", "refresh": 1, "regex": "", "type": "query" }, { "allValue": ".*", "current": {}, "datasource": "${DS_PROMETHEUS}", "hide": 0, "includeAll": true, "label": "Container", "multi": false, "name": "container", "options": [], "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", "refresh": 1, "regex": "", "type": "query" } ] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Pods", "version": 26 } , "inputs": [ { "name": "DS_PROMETHEUS", "pluginId": "prometheus", "type": "datasource", "value": "prometheus" } ], "overwrite": true } node-dashboard.json: |+ { "dashboard": { "__inputs": [ { "description": "", "label": "prometheus", "name": "DS_PROMETHEUS", "pluginId": "prometheus", "pluginName": "Prometheus", "type": "datasource" } ], "__requires": [ { "id": "grafana", "name": "Grafana", "type": "grafana", "version": "4.1.1" }, { "id": "graph", "name": "Graph", "type": "panel", "version": "" }, { "id": "prometheus", "name": "Prometheus", "type": "datasource", "version": "1.0.0" }, 
{ "id": "singlestat", "name": "Singlestat", "type": "panel", "version": "" } ], "annotations": { "list": [] }, "description": "Dashboard to get an overview of one server", "editable": true, "gnetId": 22, "graphTooltip": 0, "hideControls": false, "id": null, "links": [], "refresh": false, "rows": [ { "collapse": false, "height": "250px", "panels": [ { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 3, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", "hide": false, "intervalFactor": 10, "legendFormat": "{{cpu}}", "refId": "A", "step": 50 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Idle cpu", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": "cpu usage", "logBase": 1, "max": 100, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 9, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, 
"steppedLine": false, "targets": [ { "expr": "node_load1{instance=\"$server\"}", "intervalFactor": 4, "legendFormat": "load 1m", "refId": "A", "step": 20, "target": "" }, { "expr": "node_load5{instance=\"$server\"}", "intervalFactor": 4, "legendFormat": "load 5m", "refId": "B", "step": 20, "target": "" }, { "expr": "node_load15{instance=\"$server\"}", "intervalFactor": 4, "legendFormat": "load 15m", "refId": "C", "step": 20, "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "System load", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 4, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": false, "hideZero": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", "yaxis": 2 } ], "span": 9, "stack": true, "steppedLine": false, "targets": [ { "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", "hide": false, "interval": "", 
"intervalFactor": 2, "legendFormat": "memory used", "metric": "", "refId": "C", "step": 4 }, { "expr": "node_memory_Buffers{instance=\"$server\"}", "interval": "", "intervalFactor": 2, "legendFormat": "memory buffers", "metric": "", "refId": "E", "step": 4 }, { "expr": "node_memory_Cached{instance=\"$server\"}", "intervalFactor": 2, "legendFormat": "memory cached", "metric": "", "refId": "F", "step": 4 }, { "expr": "node_memory_MemFree{instance=\"$server\"}", "intervalFactor": 2, "legendFormat": "memory free", "metric": "", "refId": "D", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Memory usage", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "id": 5, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [ { "expr": "((node_memory_MemTotal{instance=\"$server\"} - 
node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", "intervalFactor": 2, "refId": "A", "step": 60, "target": "" } ], "thresholds": "80, 90", "title": "Memory usage", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 6, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "read", "yaxis": 1 }, { "alias": "{instance=\"172.17.0.1:9100\"}", "yaxis": 2 }, { "alias": "io time", "yaxis": 2 } ], "span": 9, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", "hide": false, "intervalFactor": 4, "legendFormat": "read", "refId": "A", "step": 8, "target": "" }, { "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", "intervalFactor": 4, "legendFormat": "written", "refId": "B", "step": 8 }, { "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", "intervalFactor": 4, "legendFormat": "io time", "refId": "C", "step": 8 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Disk I/O", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, 
"values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "ms", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "percentunit", "gauge": { "maxValue": 1, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "id": 7, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [ { "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", "intervalFactor": 2, "refId": "A", "step": 60, "target": "" } ], "thresholds": "0.75, 0.9", "title": "Disk space usage", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 8, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, 
"total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "transmitted ", "yaxis": 2 } ], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", "hide": false, "intervalFactor": 2, "legendFormat": "{{device}}", "refId": "A", "step": 10, "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Network received", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "alerting": {}, "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 10, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "transmitted ", "yaxis": 2 } ], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", "hide": false, "intervalFactor": 2, "legendFormat": "{{device}}", "refId": "B", "step": 10, "target": "" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Network transmitted", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ 
{ "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [ "prometheus" ], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "server", "options": [], "query": "label_values(node_boot_time, instance)", "refresh": 1, "regex": "", "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Nodes", "version": 1 } , "inputs": [ { "name": "DS_PROMETHEUS", "pluginId": "prometheus", "type": "datasource", "value": "prometheus" } ], "overwrite": true } traefik-dashboard.json: |+ { "dashboard": { "__inputs": [ { "description": "", "label": "prometheus", "name": "DS_PROMETHEUS", "pluginId": "prometheus", "pluginName": "Prometheus", "type": "datasource" } ], "__requires": [ { "id": "grafana", "name": "Grafana", "type": "grafana", "version": "4.1.1" }, { "id": "graph", "name": "Graph", "type": "panel", "version": "" }, { "id": "prometheus", "name": "Prometheus", "type": "datasource", "version": "1.0.0" }, { "id": "singlestat", "name": "Singlestat", "type": "panel", "version": "" } ], "annotations": { "list": [] }, "description": "Visualize Traefik Health Metrics", "editable": true, "gnetId": 2240, "graphTooltip": 0, "hideControls": false, "id": null, "links": [], "refresh": "30s", "rows": [ { "collapse": false, "height": 288, "panels": [ { "cacheTimeout": null, 
"colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "decimals": 0, "format": "s", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 3, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 1, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "time() - process_start_time_seconds{job=\"load-balancer\"}", "intervalFactor": 2, "refId": "A", "step": 1800 } ], "thresholds": "", "title": "Uptime", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "id": 1, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 3, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(traefik_requests_total{service=\"http\"})", "interval": "", "intervalFactor": 2, "legendFormat": "{{http}}", "metric": "", "refId": "A", "step": 240 }, { "expr": "sum(traefik_requests_total{service=\"https\"})", "interval": "", "intervalFactor": 2, "legendFormat": "{{https}}", "refId": "B", "step": 240 } ], "thresholds": [], 
"timeFrom": null, "timeShift": null, "title": "Total requests", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "Count", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "id": 2, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 3, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(traefik_request_duration_seconds_sum) / sum(traefik_requests_total) * 1000", "intervalFactor": 2, "legendFormat": "Average response time (ms)", "refId": "A", "step": 240 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Average response time", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ms", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "id": 6, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], 
"spaceLength": 10, "span": 5, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(traefik_requests_total[5m]))", "interval": "", "intervalFactor": 2, "legendFormat": "{{requests}}", "refId": "A", "step": 240 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Requests in last 5 minutes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "General info", "titleSize": "h6" }, { "collapse": false, "height": 412, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "fill": 1, "id": 5, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(traefik_requests_total{service=~\"http|https\",code=\"200\"}[5m])", "intervalFactor": 2, "legendFormat": "{{service}} {{method}} {{code}}", "refId": "A", "step": 240 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Successful Status Code Count (5min)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, 
"logBase": 1, "max": null, "min": null, "show": false } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "fill": 1, "id": 4, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(traefik_requests_total{service=~\"http|https\",code!=\"200\"}[5m])", "intervalFactor": 2, "legendFormat": "{{service}} {{method}} {{code}}", "refId": "A", "step": 240 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Bad Status Code Count (5m)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 382, "panels": [], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Detailed statuses", "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now/d", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Traefik", "version": 1 }, "inputs": [ { "name": "DS_PROMETHEUS", "pluginId": "prometheus", "type": "datasource", 
"value": "prometheus" } ], "overwrite": true } rook-dashboard.json: |+ { "dashboard": { "__inputs": [ { "description": "", "label": "prometheus", "name": "DS_PROMETHEUS", "pluginId": "prometheus", "pluginName": "Prometheus", "type": "datasource" } ], "__requires": [ { "id": "grafana", "name": "Grafana", "type": "grafana", "version": "4.1.1" }, { "id": "graph", "name": "Graph", "type": "panel", "version": "" }, { "id": "prometheus", "name": "Prometheus", "type": "datasource", "version": "1.0.0" }, { "id": "singlestat", "name": "Singlestat", "type": "panel", "version": "" } ], "annotations": { "list": [] }, "description": "Rook cluster monitoring", "editable": true, "gnetId": 917, "graphTooltip": 0, "hideControls": false, "id": null, "links": [], "refresh": "1m", "rows": [ { "collapse": false, "height": "150px", "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 21, "interval": "1m", "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "count(ceph_health_status)", "interval": "$interval", "intervalFactor": 1, "refId": "A", "step": 60 } ], "thresholds": "0,1", "title": "Status", "transparent": false, "type": "singlestat", "valueFontSize": "100%", "valueMaps": [ { "op": "=", "text": 
"N/A", "value": "null" }, { "op": "=", "text": "WARNING", "value": "0" }, { "op": "=", "text": "HEALTHY", "value": "1" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 14, "interval": "1m", "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "ceph_monitor_quorum_count", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "2,3", "title": "Monitors In Quorum", "transparent": false, "type": "singlestat", "valueFontSize": "100%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 22, "interval": "1m", "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, 
"postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": "count(ceph_pool_available_bytes)", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "", "title": "Pools", "transparent": false, "type": "singlestat", "valueFontSize": "100%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 33, "interval": "1m", "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": "ceph_cluster_capacity_bytes", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "0.025,0.1", "title": "Cluster Capacity", "transparent": false, "type": "singlestat", "valueFontSize": "100%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 
172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 34, "interval": "1m", "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": "ceph_cluster_used_bytes", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "0.025,0.1", "title": "Used Capacity", "transparent": false, "type": "singlestat", "valueFontSize": "100%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "percentunit", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "id": 23, "interval": "1m", "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": 
"rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "ceph_cluster_available_bytes/ceph_cluster_capacity_bytes", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "70,80", "title": "Available Capacity", "transparent": false, "type": "singlestat", "valueFontSize": "100%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" }, { "collapse": false, "height": "100px", "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 26, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 1, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "ceph_osds_in", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "", "title": "OSDs IN", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": true, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 40, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], 
"datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 27, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 1, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "ceph_osds - ceph_osds_in", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "1,1", "title": "OSDs OUT", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 28, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 1, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum(ceph_osd_up)", "interval": 
"$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "", "title": "OSDs UP", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": true, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 40, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 29, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 1, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "ceph_osds_down", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "1,1", "title": "OSDs DOWN", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 30, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], 
"maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": "avg(ceph_osd_pgs)", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "250,300", "title": "Agerage PGs per OSD", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "s", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 31, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": "avg(ceph_osd_perf_apply_latency_seconds)", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "0.01,0.05", "title": "Agerage OSD Apply Latency", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, 
"colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "s", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 32, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": "avg(ceph_osd_perf_commit_latency_seconds)", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "0.01,0.05", "title": "Agerage OSD Commit Latency", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "s", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 24, "interval": "1m", "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "repeat": null, "span": 2, "sparkline": { "fillColor": 
"rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": "avg(ceph_monitor_latency_seconds)", "interval": "$interval", "intervalFactor": 1, "legendFormat": "", "refId": "A", "step": 60 } ], "thresholds": "70,80", "title": "Average Monitor Latency", "transparent": false, "type": "singlestat", "valueFontSize": "100%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": { "Available": "#EAB839", "Total Capacity": "#447EBC", "Used": "#BF1B00", "total_avail": "#6ED0E0", "total_space": "#7EB26D", "total_used": "#890F02" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 4, "grid": {}, "height": "300", "id": 1, "interval": "$interval", "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "minSpan": null, "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "Total Capacity", "fill": 0, "linewidth": 3, "stack": false } ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { "expr": "ceph_cluster_available_bytes", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Available", "refId": "A", "step": 60 }, { "expr": "ceph_cluster_used_bytes", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Used", "refId": "B", "step": 60 }, { "expr": "ceph_cluster_capacity_bytes", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Total Capacity", "refId": "C", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeShift": null, 
"title": "Capacity", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": { "Total Capacity": "#7EB26D", "Used": "#BF1B00", "total_avail": "#6ED0E0", "total_space": "#7EB26D", "total_used": "#890F02" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "editable": true, "error": false, "fill": 1, "grid": {}, "height": "300", "id": 3, "interval": "$interval", "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "minSpan": null, "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { "expr": "ceph_client_io_write_ops", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Write", "refId": "A", "step": 60 }, { "expr": "ceph_client_io_read_ops", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Read", "refId": "B", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "IOPS", "tooltip": { "msResolution": true, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": 0, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": 
"${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "height": "300", "id": 7, "interval": "$interval", "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { "expr": "ceph_client_io_write_bytes", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Write", "refId": "A", "step": 60 }, { "expr": "ceph_client_io_read_bytes", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Read", "refId": "B", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Throughput", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": 0, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, "title": "CLUSTER", "titleSize": "h6" }, { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 18, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "/^Total.*$/", "stack": false } ], 
"spaceLength": 10, "span": 12, "stack": true, "steppedLine": false, "targets": [ { "expr": "ceph_cluster_objects", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Total", "refId": "A", "step": 60 }, { "expr": "ceph_degraded_objects", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Degraded", "refId": "B", "step": 60 }, { "expr": "ceph_misplaced_objects", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Misplaced", "refId": "C", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Objects in the Cluster", "tooltip": { "msResolution": false, "shared": true, "sort": 1, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 19, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "/^Total.*$/", "stack": false } ], "spaceLength": 10, "span": 6, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(ceph_osd_pgs)", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Total", "refId": "A", "step": 60 }, { "expr": "ceph_degraded_pgs", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Degraded", "refId": "B", "step": 60 }, { "expr": "ceph_stale_pgs", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Stale", "refId": "C", "step": 60 }, { 
"expr": "ceph_unclean_pgs", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Unclean", "refId": "D", "step": 60 }, { "expr": "ceph_undersized_pgs", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Undersized", "refId": "E", "step": 60 }, { "expr": "ceph_stuck_degraded_pgs + ceph_stuck_stale_pgs + ceph_stuck_unclean_pgs + ceph_stuck_undersized_pgs", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Stuck", "refId": "F", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "PGs", "tooltip": { "msResolution": false, "shared": true, "sort": 1, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 20, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "/^Total.*$/", "stack": false } ], "spaceLength": 10, "span": 6, "stack": true, "steppedLine": false, "targets": [ { "expr": "ceph_stuck_degraded_pgs", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Degraded", "refId": "F", "step": 60 }, { "expr": "ceph_stuck_stale_pgs", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Stale", "refId": "A", "step": 60 }, { "expr": "ceph_stuck_unclean_pgs", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Unclean", "refId": "B", "step": 60 }, { "expr": 
"ceph_stuck_undersized_pgs", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Undersized", "refId": "C", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Stuck PGs", "tooltip": { "msResolution": false, "shared": true, "sort": 1, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "New row", "titleSize": "h6" }, { "collapse": false, "height": "150px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 15, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { "expr": "ceph_recovery_io_bytes", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Bytes", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Bytes", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": 
false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 16, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "/^.*/", "color": "#E0752D" } ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { "expr": "ceph_recovery_io_keys", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Keys", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Keys", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": {}, "id": 17, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "/^.*$/", "color": "#890F02" } ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { "expr": "ceph_recovery_io_objects", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Objects", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Objects", "tooltip": { "msResolution": false, "shared": true, "sort": 0, 
"value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, "title": "Recovery", "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [ "ceph", "rook" ], "templating": { "list": [ { "auto": true, "auto_count": 10, "auto_min": "1m", "current": { "text": "1m", "value": "1m" }, "datasource": null, "hide": 0, "includeAll": false, "label": "Interval", "multi": false, "name": "interval", "options": [ { "selected": false, "text": "auto", "value": "$__auto_interval" }, { "selected": true, "text": "1m", "value": "1m" }, { "selected": false, "text": "10m", "value": "10m" }, { "selected": false, "text": "30m", "value": "30m" }, { "selected": false, "text": "1h", "value": "1h" }, { "selected": false, "text": "6h", "value": "6h" }, { "selected": false, "text": "12h", "value": "12h" }, { "selected": false, "text": "1d", "value": "1d" }, { "selected": false, "text": "7d", "value": "7d" }, { "selected": false, "text": "14d", "value": "14d" }, { "selected": false, "text": "30d", "value": "30d" } ], "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", "refresh": 2, "type": "interval" } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Rook", "version": 3 }, "inputs": [ { "name": "DS_PROMETHEUS", "pluginId": "prometheus", "type": "datasource", "value": "prometheus" } ], "overwrite": true } resource-requests-dashboard.json: |+ { "dashboard": { "__inputs": [ { "description": "", "label": "prometheus", "name": 
"DS_PROMETHEUS", "pluginId": "prometheus", "pluginName": "Prometheus", "type": "datasource" } ], "__requires": [ { "id": "grafana", "name": "Grafana", "type": "grafana", "version": "4.1.1" }, { "id": "graph", "name": "Graph", "type": "panel", "version": "" }, { "id": "prometheus", "name": "Prometheus", "type": "datasource", "version": "1.0.0" }, { "id": "singlestat", "name": "Singlestat", "type": "panel", "version": "" } ], "annotations": { "list": [] }, "description": "Dashboard to show the resource requests vs allocatable in the cluster", "editable": true, "gnetId": null, "graphTooltip": 0, "hideControls": false, "id": null, "links": [], "rows": [ { "collapse": false, "height": "300", "panels": [ { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", "fill": 1, "id": 1, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 9, "stack": false, "steppedLine": false, "targets": [ { "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", "hide": false, "intervalFactor": 2, "legendFormat": "Allocatable CPU Cores", "refId": "A", "step": 10 }, { "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", "intervalFactor": 2, "legendFormat": "Requested CPU Cores", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "CPU Cores", "tooltip": { "shared": true, "sort": 0, "value_type": 
"individual" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "CPU Cores", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": null, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "id": 2, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": true }, "targets": [ { "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", "intervalFactor": 2, "legendFormat": "", "refId": "A", "step": 240 } ], "thresholds": "80, 90", "title": "CPU Cores", "type": "singlestat", "valueFontSize": "110%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "CPU Cores", "titleSize": "h6" }, { "collapse": false, "height": "300", "panels": [ { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "description": "This represents the total [memory resource 
requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", "fill": 1, "id": 3, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 9, "stack": false, "steppedLine": false, "targets": [ { "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", "hide": false, "intervalFactor": 2, "legendFormat": "Allocatable Memory", "refId": "A", "step": 10 }, { "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", "intervalFactor": 2, "legendFormat": "Requested Memory", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Memory", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "Memory", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": null, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "id": 4, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, 
"nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": true }, "targets": [ { "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", "intervalFactor": 2, "legendFormat": "", "refId": "A", "step": 240 } ], "thresholds": "80, 90", "title": "Memory", "type": "singlestat", "valueFontSize": "110%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Memory", "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-3h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Resource Requests", "version": 1 } , "inputs": [ { "name": "DS_PROMETHEUS", "pluginId": "prometheus", "type": "datasource", "value": "prometheus" } ], "overwrite": true } prometheus-datasource.json: |+ { "access": "proxy", "basicAuth": false, "name": "prometheus", "type": "prometheus", "url": "http://prometheus-k8s.monitoring.svc:9090" } ================================================ FILE: manifests/grafana/grafana-deployment.yaml ================================================ apiVersion: extensions/v1beta1 kind: Deployment metadata: name: grafana spec: replicas: 1 template: metadata: labels: app: grafana spec: containers: - name: grafana image: grafana/grafana:4.4.1 env: - name: GF_AUTH_BASIC_ENABLED value: "true" - name: GF_AUTH_ANONYMOUS_ENABLED value: "true" - name: 
GF_SECURITY_ADMIN_USER valueFrom: secretKeyRef: name: grafana-credentials key: user - name: GF_SECURITY_ADMIN_PASSWORD valueFrom: secretKeyRef: name: grafana-credentials key: password volumeMounts: - name: grafana-storage mountPath: /var/grafana-storage ports: - name: web containerPort: 3000 resources: requests: memory: 100Mi cpu: 100m limits: memory: 200Mi cpu: 200m - name: grafana-watcher image: quay.io/coreos/grafana-watcher:v0.0.6 args: - '--watch-dir=/var/grafana-dashboards' - '--grafana-url=http://localhost:3000' env: - name: GRAFANA_USER valueFrom: secretKeyRef: name: grafana-credentials key: user - name: GRAFANA_PASSWORD valueFrom: secretKeyRef: name: grafana-credentials key: password resources: requests: memory: "16Mi" cpu: "50m" limits: memory: "32Mi" cpu: "100m" volumeMounts: - name: grafana-dashboards mountPath: /var/grafana-dashboards volumes: - name: grafana-storage emptyDir: {} - name: grafana-dashboards configMap: name: grafana-dashboards ================================================ FILE: manifests/grafana/grafana-service.yaml ================================================ apiVersion: v1 kind: Service metadata: name: grafana labels: app: grafana spec: ports: - name: web port: 3000 protocol: TCP selector: app: grafana ================================================ FILE: manifests/heapster.yaml ================================================ apiVersion: extensions/v1beta1 kind: Deployment metadata: name: heapster namespace: kube-system labels: k8s-app: heapster kubernetes.io/cluster-service: "true" spec: replicas: 1 template: metadata: labels: task: monitoring k8s-app: heapster spec: serviceAccountName: heapster containers: - name: heapster image: gcr.io/google_containers/heapster-amd64:v1.5.1 imagePullPolicy: IfNotPresent command: - /heapster - --source=kubernetes:https://kubernetes.default tolerations: - key: CriticalAddonsOnly operator: Exists - key: node-role.kubernetes.io/master operator: Exists effect: NoSchedule --- apiVersion: v1 
kind: Service metadata: labels: task: monitoring kubernetes.io/cluster-service: 'true' kubernetes.io/name: Heapster name: heapster namespace: kube-system spec: ports: - port: 80 targetPort: 8082 selector: k8s-app: heapster --- apiVersion: v1 kind: ServiceAccount metadata: name: heapster namespace: kube-system --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1beta1 metadata: name: heapster roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: system:heapster subjects: - kind: ServiceAccount name: heapster namespace: kube-system ================================================ FILE: manifests/kube-dashboard.yaml ================================================ apiVersion: v1 kind: Secret metadata: labels: k8s-app: kubernetes-dashboard name: kubernetes-dashboard-certs namespace: kube-system type: Opaque --- apiVersion: v1 kind: ServiceAccount metadata: labels: k8s-app: kubernetes-dashboard name: kubernetes-dashboard namespace: kube-system --- kind: Deployment apiVersion: apps/v1beta2 metadata: labels: k8s-app: kubernetes-dashboard name: kubernetes-dashboard namespace: kube-system spec: replicas: 1 revisionHistoryLimit: 10 selector: matchLabels: k8s-app: kubernetes-dashboard template: metadata: labels: k8s-app: kubernetes-dashboard spec: containers: - name: kubernetes-dashboard image: gcr.io/google_containers/kubernetes-dashboard-amd64:v1.8.3 ports: - containerPort: 8443 protocol: TCP args: - --auto-generate-certificates # Uncomment the following line to manually specify Kubernetes API server Host # If not specified, Dashboard will attempt to auto discover the API server and connect # to it. Uncomment only if the default does not work. 
# - --apiserver-host=http://my-address:port volumeMounts: - name: kubernetes-dashboard-certs mountPath: /certs # Create on-disk volume to store exec logs - mountPath: /tmp name: tmp-volume livenessProbe: httpGet: scheme: HTTPS path: / port: 8443 initialDelaySeconds: 30 timeoutSeconds: 30 volumes: - name: kubernetes-dashboard-certs secret: secretName: kubernetes-dashboard-certs - name: tmp-volume emptyDir: {} serviceAccountName: kubernetes-dashboard # Comment the following tolerations if Dashboard must not be deployed on master tolerations: - key: node-role.kubernetes.io/master effect: NoSchedule --- kind: Service apiVersion: v1 metadata: labels: k8s-app: kubernetes-dashboard name: kubernetes-dashboard namespace: kube-system spec: ports: - port: 443 targetPort: 8443 selector: k8s-app: kubernetes-dashboard --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: kubernetes-dashboard labels: k8s-app: kubernetes-dashboard roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: cluster-admin subjects: - kind: ServiceAccount name: kubernetes-dashboard namespace: kube-system --- apiVersion: extensions/v1beta1 kind: Ingress metadata: name: dashboard-ingress namespace: kube-system annotations: kubernetes.io/ingress.class: "traefik" ingress.kubernetes.io/auth-type: "basic" ingress.kubernetes.io/auth-secret: "kubesecret" spec: rules: - host: kube.${DOMAIN} http: paths: - backend: serviceName: kubernetes-dashboard servicePort: 443 ================================================ FILE: manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml ================================================ apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: kube-state-metrics roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: kube-state-metrics subjects: - kind: ServiceAccount name: kube-state-metrics namespace: monitoring ================================================ FILE: 
manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml ================================================ apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: kube-state-metrics rules: - apiGroups: [""] resources: - nodes - pods - services - resourcequotas - replicationcontrollers - limitranges verbs: ["list", "watch"] - apiGroups: ["extensions"] resources: - daemonsets - deployments - replicasets verbs: ["list", "watch"] ================================================ FILE: manifests/kube-state-metrics/kube-state-metrics-deployment.yaml ================================================ apiVersion: extensions/v1beta1 kind: Deployment metadata: name: kube-state-metrics spec: replicas: 1 template: metadata: labels: app: kube-state-metrics spec: serviceAccountName: kube-state-metrics containers: - name: kube-state-metrics image: quay.io/coreos/kube-state-metrics:v0.5.0 ports: - name: metrics containerPort: 8080 resources: requests: memory: 100Mi cpu: 100m limits: memory: 200Mi cpu: 200m ================================================ FILE: manifests/kube-state-metrics/kube-state-metrics-service-account.yaml ================================================ apiVersion: v1 kind: ServiceAccount metadata: name: kube-state-metrics ================================================ FILE: manifests/kube-state-metrics/kube-state-metrics-service.yaml ================================================ apiVersion: v1 kind: Service metadata: labels: app: kube-state-metrics k8s-app: kube-state-metrics name: kube-state-metrics spec: ports: - name: http-metrics port: 8080 targetPort: metrics protocol: TCP selector: app: kube-state-metrics ================================================ FILE: manifests/node-exporter/node-exporter-daemonset.yaml ================================================ apiVersion: extensions/v1beta1 kind: DaemonSet metadata: name: node-exporter spec: template: metadata: labels: app: node-exporter name: node-exporter spec: 
hostNetwork: true hostPID: true containers: - image: quay.io/prometheus/node-exporter:v0.14.0 args: - "-collector.procfs=/host/proc" - "-collector.sysfs=/host/sys" name: node-exporter ports: - containerPort: 9100 hostPort: 9100 name: scrape resources: requests: memory: 30Mi cpu: 100m limits: memory: 50Mi cpu: 200m volumeMounts: - name: proc readOnly: true mountPath: /host/proc - name: sys readOnly: true mountPath: /host/sys volumes: - name: proc hostPath: path: /proc - name: sys hostPath: path: /sys ================================================ FILE: manifests/node-exporter/node-exporter-service.yaml ================================================ apiVersion: v1 kind: Service metadata: labels: app: node-exporter k8s-app: node-exporter name: node-exporter spec: type: ClusterIP clusterIP: None ports: - name: http-metrics port: 9100 protocol: TCP selector: app: node-exporter ================================================ FILE: manifests/prometheus/prometheus-k8s-ingress.yaml ================================================ apiVersion: extensions/v1beta1 kind: Ingress metadata: name: prometheus-ingress namespace: monitoring annotations: kubernetes.io/ingress.class: "traefik" ingress.kubernetes.io/auth-type: "basic" ingress.kubernetes.io/auth-secret: "kubesecret" spec: rules: - host: alertmanager.${DOMAIN} http: paths: - backend: serviceName: alertmanager-main servicePort: web - host: prometheus.${DOMAIN} http: paths: - backend: serviceName: prometheus-k8s servicePort: web - host: grafana.${DOMAIN} http: paths: - backend: serviceName: grafana servicePort: web ================================================ FILE: manifests/prometheus/prometheus-k8s-role-bindings.yaml ================================================ apiVersion: rbac.authorization.k8s.io/v1beta1 kind: RoleBinding metadata: name: prometheus-k8s namespace: monitoring roleRef: apiGroup: rbac.authorization.k8s.io kind: Role name: prometheus-k8s subjects: - kind: ServiceAccount name: prometheus-k8s 
namespace: monitoring --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: RoleBinding metadata: name: prometheus-k8s namespace: kube-system roleRef: apiGroup: rbac.authorization.k8s.io kind: Role name: prometheus-k8s subjects: - kind: ServiceAccount name: prometheus-k8s namespace: monitoring --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: RoleBinding metadata: name: prometheus-k8s namespace: rook roleRef: apiGroup: rbac.authorization.k8s.io kind: Role name: prometheus-k8s subjects: - kind: ServiceAccount name: prometheus-k8s namespace: monitoring --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: RoleBinding metadata: name: prometheus-k8s namespace: default roleRef: apiGroup: rbac.authorization.k8s.io kind: Role name: prometheus-k8s subjects: - kind: ServiceAccount name: prometheus-k8s namespace: monitoring --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: prometheus-k8s roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: prometheus-k8s subjects: - kind: ServiceAccount name: prometheus-k8s namespace: monitoring ================================================ FILE: manifests/prometheus/prometheus-k8s-roles.yaml ================================================ apiVersion: rbac.authorization.k8s.io/v1beta1 kind: Role metadata: name: prometheus-k8s namespace: monitoring rules: - apiGroups: [""] resources: - nodes - services - endpoints - pods verbs: ["get", "list", "watch"] - apiGroups: [""] resources: - configmaps verbs: ["get"] --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: Role metadata: name: prometheus-k8s namespace: rook rules: - apiGroups: [""] resources: - nodes - services - endpoints - pods verbs: ["get", "list", "watch"] - apiGroups: [""] resources: - configmaps verbs: ["get"] --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: Role metadata: name: prometheus-k8s namespace: kube-system rules: - apiGroups: [""] resources: - services - endpoints - pods verbs: ["get", 
"list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: Role metadata: name: prometheus-k8s namespace: default rules: - apiGroups: [""] resources: - services - endpoints - pods verbs: ["get", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: prometheus-k8s rules: - nonResourceURLs: ["/metrics"] verbs: ["get"] ================================================ FILE: manifests/prometheus/prometheus-k8s-rules.yaml ================================================ apiVersion: v1 kind: ConfigMap metadata: name: prometheus-k8s-rules labels: role: prometheus-rulefiles prometheus: k8s data: alertmanager.rules.yaml: |+ groups: - name: ./alertmanager.rules rules: - alert: AlertmanagerConfigInconsistent expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 for: 5m labels: severity: critical annotations: description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. summary: Alertmanager configurations are inconsistent - alert: AlertmanagerDownOrMissing expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 for: 5m labels: severity: warning annotations: description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. summary: Alertmanager down or not discovered - alert: FailedReload expr: alertmanager_config_last_reload_successful == 0 for: 10m labels: severity: warning annotations: description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. 
summary: Alertmanager configuration reload has failed etcd3.rules.yaml: |+ groups: - name: ./etcd3.rules rules: - alert: InsufficientMembers expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) for: 3m labels: severity: critical annotations: description: If one more etcd member goes down the cluster will be unavailable summary: etcd cluster insufficient members - alert: NoLeader expr: etcd_server_has_leader{job="etcd"} == 0 for: 1m labels: severity: critical annotations: description: etcd member {{ $labels.instance }} has no leader summary: etcd member has no leader - alert: HighNumberOfLeaderChanges expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 labels: severity: warning annotations: description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour summary: a high number of leader changes within the etcd cluster are happening - alert: HighNumberOfFailedGRPCRequests expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 for: 10m labels: severity: warning annotations: description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' summary: a high number of gRPC requests are failing - alert: HighNumberOfFailedGRPCRequests expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 for: 5m labels: severity: critical annotations: description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' summary: a high number of gRPC requests are failing - alert: GRPCRequestsSlow expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 for: 10m labels: severity: critical annotations: description: on etcd instance {{ $labels.instance }} gRPC requests to {{ 
$labels.grpc_method }} are slow summary: slow gRPC requests - alert: HighNumberOfFailedHTTPRequests expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01 for: 10m labels: severity: warning annotations: description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' summary: a high number of HTTP requests are failing - alert: HighNumberOfFailedHTTPRequests expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05 for: 5m labels: severity: critical annotations: description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' summary: a high number of HTTP requests are failing - alert: HTTPRequestsSlow expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 for: 10m labels: severity: warning annotations: description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow summary: slow HTTP requests - alert: EtcdMemberCommunicationSlow expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 for: 10m labels: severity: warning annotations: description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow summary: etcd member communication is slow - alert: HighNumberOfFailedProposals expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 labels: severity: warning annotations: description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour summary: a high number of proposals within the etcd cluster are failing - alert: HighFsyncDurations expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 for: 10m labels: severity: warning annotations: description: etcd instance 
{{ $labels.instance }} fsync durations are high summary: high fsync durations - alert: HighCommitDurations expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 for: 10m labels: severity: warning annotations: description: etcd instance {{ $labels.instance }} commit durations are high summary: high commit durations general.rules.yaml: |+ groups: - name: ./general.rules rules: - alert: TargetDown expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 for: 10m labels: severity: warning annotations: description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' summary: Targets are down - alert: DeadMansSwitch expr: vector(1) labels: severity: none annotations: description: This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional. summary: Alerting DeadMansSwitch - alert: TooManyOpenFileDescriptors expr: 100 * (process_open_fds / process_max_fds) > 95 for: 10m labels: severity: critical annotations: description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' 
summary: too many open file descriptors - record: instance:fd_utilization expr: process_open_fds / process_max_fds - alert: FdExhaustionClose expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 for: 10m labels: severity: warning annotations: description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust its file/socket descriptors soon' summary: file descriptors soon exhausted - alert: FdExhaustionClose expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 for: 10m labels: severity: critical annotations: description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust its file/socket descriptors soon' summary: file descriptors soon exhausted kube-apiserver.rules.yaml: |+ groups: - name: ./kube-apiserver.rules rules: - alert: K8SApiserverDown expr: absent(up{job="apiserver"} == 1) for: 5m labels: severity: critical annotations: description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery. summary: API server unreachable - alert: K8SApiServerLatency expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) WITHOUT (instance, resource)) / 1e+06 > 1 for: 10m labels: severity: warning annotations: description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s. summary: Kubernetes apiserver latency is high kube-controller-manager.rules.yaml: |+ groups: - name: ./kube-controller-manager.rules rules: - alert: K8SControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) for: 5m labels: severity: critical annotations: description: There is no running K8S controller manager. Deployments and replication controllers are not making progress. 
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager summary: Controller manager is down kube-scheduler.rules.yaml: |+ groups: - name: ./kube-scheduler.rules rules: - alert: K8SSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 5m labels: severity: critical annotations: description: There is no running K8S scheduler. New pods are not being assigned to nodes. runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler summary: Scheduler is down kubelet.rules.yaml: |+ groups: - name: ./kubelet.rules rules: - alert: K8SNodeNotReady expr: kube_node_status_condition{condition="Ready",status="true"} == 0 for: 1h labels: severity: warning annotations: description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour summary: Node status is NotReady - alert: K8SManyNodesNotReady expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 for: 1m labels: severity: critical annotations: description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.' summary: Many Kubernetes nodes are Not Ready - alert: K8SKubeletDown expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 for: 1h labels: severity: warning annotations: description: Prometheus failed to scrape {{ $value }}% of kubelets. summary: Many Kubelets cannot be scraped - alert: K8SKubeletDown expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 for: 1h labels: severity: critical annotations: description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery. 
summary: Many Kubelets cannot be scraped - alert: K8SKubeletTooManyPods expr: kubelet_running_pod_count > 100 labels: severity: warning annotations: description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 summary: Kubelet is close to pod limit kubernetes.rules.yaml: |+ groups: - name: ./kubernetes.rules rules: - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) - record: cluster_namespace_controller_pod_container:spec_cpu_shares expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) - record: cluster_namespace_controller_pod_container:cpu_usage:rate expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) - record: cluster_namespace_controller_pod_container:memory_usage:bytes expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) - record: cluster_namespace_controller_pod_container:memory_working_set:bytes expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) - record: cluster_namespace_controller_pod_container:memory_rss:bytes expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) - record: 
cluster_namespace_controller_pod_container:memory_cache:bytes expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) - record: cluster_namespace_controller_pod_container:disk_usage:bytes expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type) - record: cluster_namespace_controller_pod_container:memory_oom:rate expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type) - record: cluster:memory_allocation:percent expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster) - record: cluster:memory_used:percent expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster) - record: cluster:cpu_allocation:percent expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster) - record: cluster:node_cpu_use:percent expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster) - record: cluster_resource_verb:apiserver_latency:quantile_seconds expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 labels: quantile: "0.99" - record: 
cluster_resource_verb:apiserver_latency:quantile_seconds expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 labels: quantile: "0.9" - record: cluster_resource_verb:apiserver_latency:quantile_seconds expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 labels: quantile: "0.5" - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 labels: quantile: "0.99" - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 labels: quantile: "0.9" - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 labels: quantile: "0.5" - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 labels: quantile: "0.99" - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 labels: quantile: "0.9" - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 labels: quantile: "0.5" - record: cluster:scheduler_binding_latency:quantile_seconds expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 labels: quantile: "0.99" - record: cluster:scheduler_binding_latency:quantile_seconds expr: histogram_quantile(0.9, 
sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 labels: quantile: "0.9" - record: cluster:scheduler_binding_latency:quantile_seconds expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 labels: quantile: "0.5" node.rules.yaml: |+ groups: - name: ./node.rules rules: - alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 10m labels: severity: warning annotations: description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery. summary: node-exporter cannot be scraped - alert: K8SNodeOutOfDisk expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 labels: service: k8s severity: critical annotations: description: '{{ $labels.node }} has run out of disk space.' summary: Node ran out of disk space. - alert: K8SNodeMemoryPressure expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 labels: service: k8s severity: warning annotations: description: '{{ $labels.node }} is under memory pressure.' summary: Node is under memory pressure. - alert: K8SNodeDiskPressure expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 labels: service: k8s severity: warning annotations: description: '{{ $labels.node }} is under disk pressure.' summary: Node is under disk pressure. prometheus.rules.yaml: |+ groups: - name: ./prometheus.rules rules: - alert: FailedReload expr: prometheus_config_last_reload_successful == 0 for: 10m labels: severity: warning annotations: description: Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. 
summary: Prometheus configuration reload has failed ================================================ FILE: manifests/prometheus/prometheus-k8s-service-account.yaml ================================================ apiVersion: v1 kind: ServiceAccount metadata: name: prometheus-k8s ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: alertmanager labels: k8s-app: alertmanager spec: selector: matchLabels: alertmanager: main namespaceSelector: matchNames: - monitoring endpoints: - port: web interval: 30s ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kube-apiserver labels: k8s-app: apiserver spec: jobLabel: component selector: matchLabels: component: apiserver provider: kubernetes namespaceSelector: matchNames: - default endpoints: - port: https interval: 30s scheme: https tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt serverName: kubernetes bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kube-controller-manager labels: k8s-app: kube-controller-manager spec: jobLabel: k8s-app endpoints: - port: http-metrics interval: 30s selector: matchLabels: k8s-app: kube-controller-manager namespaceSelector: matchNames: - kube-system ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml 
================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kube-scheduler labels: k8s-app: kube-scheduler spec: jobLabel: k8s-app endpoints: - port: http-metrics interval: 30s selector: matchLabels: k8s-app: kube-scheduler namespaceSelector: matchNames: - kube-system ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kube-state-metrics labels: k8s-app: kube-state-metrics spec: jobLabel: k8s-app selector: matchLabels: k8s-app: kube-state-metrics namespaceSelector: matchNames: - monitoring endpoints: - port: http-metrics interval: 30s honorLabels: true ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: kubelet labels: k8s-app: kubelet spec: jobLabel: k8s-app endpoints: - port: http-metrics interval: 30s - port: cadvisor interval: 30s honorLabels: true selector: matchLabels: k8s-app: kubelet namespaceSelector: matchNames: - kube-system ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: node-exporter labels: k8s-app: node-exporter spec: jobLabel: k8s-app selector: matchLabels: k8s-app: node-exporter namespaceSelector: matchNames: - monitoring endpoints: - port: http-metrics interval: 30s ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor 
metadata: name: prometheus-operator labels: k8s-app: prometheus-operator spec: endpoints: - port: http selector: matchLabels: k8s-app: prometheus-operator ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: prometheus labels: k8s-app: prometheus spec: selector: matchLabels: prometheus: k8s namespaceSelector: matchNames: - monitoring endpoints: - port: web interval: 30s ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-rook.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: rook-api labels: k8s-app: rook spec: selector: matchLabels: app: rook-api rook_cluster: rook namespaceSelector: matchNames: - rook endpoints: - port: rook-api path: /metrics interval: 60s scrapeTimeout: 30s ================================================ FILE: manifests/prometheus/prometheus-k8s-service-monitor-traefik.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: traefik labels: k8s-app: traefik spec: endpoints: - port: web path: /metrics interval: 5s selector: matchNames: - traefik-console namespaceSelector: matchNames: - kube-system ================================================ FILE: manifests/prometheus/prometheus-k8s-service.yaml ================================================ apiVersion: v1 kind: Service metadata: labels: prometheus: k8s name: prometheus-k8s spec: ports: - name: web port: 9090 protocol: TCP selector: prometheus: k8s ================================================ FILE: manifests/prometheus/prometheus-k8s.yaml ================================================ apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: name: k8s labels: prometheus: k8s spec: 
replicas: 1 version: v2.0.0 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: - {key: k8s-app, operator: Exists} ruleSelector: matchLabels: role: prometheus-rulefiles prometheus: k8s resources: requests: # 2Gi is default, but won't schedule if you don't have a node with >2Gi # memory. Modify based on your target and time-series count for # production use. This value is mainly meant for demonstration/testing # purposes. memory: 400Mi alerting: alertmanagers: - namespace: monitoring name: alertmanager-main port: web ================================================ FILE: manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml ================================================ apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: prometheus-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: prometheus-operator subjects: - kind: ServiceAccount name: prometheus-operator namespace: monitoring ================================================ FILE: manifests/prometheus-operator/prometheus-operator-cluster-role.yaml ================================================ apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: prometheus-operator rules: - apiGroups: - extensions resources: - thirdpartyresources verbs: - "*" - apiGroups: - apiextensions.k8s.io resources: - customresourcedefinitions verbs: - "*" - apiGroups: - monitoring.coreos.com resources: - alertmanagers - prometheuses - servicemonitors verbs: - "*" - apiGroups: - apps resources: - statefulsets verbs: ["*"] - apiGroups: [""] resources: - configmaps - secrets verbs: ["*"] - apiGroups: [""] resources: - pods verbs: ["list", "delete"] - apiGroups: [""] resources: - services - endpoints verbs: ["get", "create", "update"] - apiGroups: [""] resources: - nodes verbs: ["list", "watch"] - apiGroups: [""] resources: - namespaces verbs: ["list"] ================================================ 
FILE: manifests/prometheus-operator/prometheus-operator-service-account.yaml
================================================
# Identity the Prometheus Operator pod runs as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-operator

================================================
FILE: manifests/prometheus-operator/prometheus-operator-service.yaml
================================================
# ClusterIP Service exposing the operator's `http` port (8080).
apiVersion: v1
kind: Service
metadata:
  name: prometheus-operator
  labels:
    k8s-app: prometheus-operator
spec:
  type: ClusterIP
  ports:
  - name: http
    port: 8080
    targetPort: http
    protocol: TCP
  selector:
    k8s-app: prometheus-operator

================================================
FILE: manifests/prometheus-operator/prometheus-operator.yaml
================================================
# Single-replica Deployment of the Prometheus Operator (v0.17.0).
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  labels:
    k8s-app: prometheus-operator
  name: prometheus-operator
spec:
  replicas: 1
  template:
    metadata:
      labels:
        k8s-app: prometheus-operator
    spec:
      containers:
      - args:
        - --kubelet-service=kube-system/kubelet
        - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1
        image: quay.io/coreos/prometheus-operator:v0.17.0
        name: prometheus-operator
        ports:
        - containerPort: 8080
          name: http
        resources:
          limits:
            cpu: 200m
            memory: 100Mi
          requests:
            cpu: 100m
            memory: 50Mi
      serviceAccountName: prometheus-operator

================================================
FILE: manifests/rook/rook-cluster.yaml
================================================
# Namespace that holds the Rook cluster.
apiVersion: v1
kind: Namespace
metadata:
  name: rook
---
# Rook storage cluster definition.
apiVersion: rook.io/v1alpha1
kind: Cluster
metadata:
  name: rook
  namespace: rook
spec:
  versionTag: master
  # The path on the host where configuration files will be persisted. If not specified, a kubernetes emptyDir will be created (not recommended).
  # Important: if you reinstall the cluster, make sure you delete this directory from each host or else the mons will fail to start on the new cluster.
  dataDirHostPath: /var/lib/rook
  # toggle to use hostNetwork
  hostNetwork: false
  # set the amount of mons to be started
  monCount: 1
  # To control where various services will be scheduled by kubernetes, use the placement configuration sections below.
  # The example under 'all' would have all services scheduled on kubernetes nodes labeled with 'role=storage-node' and
  # tolerate taints with a key of 'storage-node'.
  # placement:
  #   all:
  #     nodeAffinity:
  #       requiredDuringSchedulingIgnoredDuringExecution:
  #         nodeSelectorTerms:
  #         - matchExpressions:
  #           - key: role
  #             operator: In
  #             values:
  #             - storage-node
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  #     - key: storage-node
  #       operator: Exists
  #   api:
  #     nodeAffinity:
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  #   mgr:
  #     nodeAffinity:
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  #   mon:
  #     nodeAffinity:
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  #   osd:
  #     nodeAffinity:
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  storage:
    useAllNodes: true
    useAllDevices: false
    deviceFilter: ^sd[^a]
    metadataDevice:
    location:
    storeConfig:
      storeType: bluestore
      databaseSizeMB: 1024 # this value can be removed for environments with normal sized disks (100 GB or larger)
      journalSizeMB: 1024 # this value can be removed for environments with normal sized disks (20 GB or larger)
    # Individual nodes and their config can be specified as well, but 'useAllNodes' above must be set to false. Then, only the named
    # nodes below will be used as storage resources. Each node's 'name' field should match their 'kubernetes.io/hostname' label.
    # nodes:
    # - name: "172.17.4.101"
    #   directories: # specific directories to use for storage can be specified for each node
    #   - path: "/rook/storage-dir"
    # - name: "172.17.4.201"
    #   devices: # specific devices to use for storage can be specified for each node
    #   - name: "sdb"
    #   - name: "sdc"
    #   storeConfig: # configuration can be specified at the node level which overrides the cluster level config
    #     storeType: bluestore
    # - name: "172.17.4.301"
    #   deviceFilter: "^sd."

================================================
FILE: manifests/rook/rook-operator.yaml
================================================
---
# Custom resource definitions for the rook.io/v1alpha1 API group.
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: clusters.rook.io
spec:
  group: rook.io
  names:
    kind: Cluster
    listKind: ClusterList
    plural: clusters
    singular: cluster
  scope: Namespaced
  version: v1alpha1
---
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: filesystems.rook.io
spec:
  group: rook.io
  names:
    kind: Filesystem
    listKind: FilesystemList
    plural: filesystems
    singular: filesystem
  scope: Namespaced
  version: v1alpha1
---
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: objectstores.rook.io
spec:
  group: rook.io
  names:
    kind: ObjectStore
    listKind: ObjectStoreList
    plural: objectstores
    singular: objectstore
  scope: Namespaced
  version: v1alpha1
---
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: pools.rook.io
spec:
  group: rook.io
  names:
    kind: Pool
    listKind: PoolList
    plural: pools
    singular: pool
  scope: Namespaced
  version: v1alpha1
---
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: volumeattachments.rook.io
spec:
  group: rook.io
  names:
    kind: VolumeAttachment
    listKind: VolumeAttachmentList
    plural: volumeattachments
    singular: volumeattachment
  scope: Namespaced
  version: v1alpha1
---
# Cluster-wide permissions for the Rook operator.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: rook-operator
rules:
- apiGroups:
  - ""
  resources:
  - namespaces
  - serviceaccounts
  - secrets
  - pods
  - services
  - nodes
  - nodes/proxy
  - configmaps
  - events
  - persistentvolumes
  - persistentvolumeclaims
  verbs:
  - get
  - list
  - watch
  - patch
  - create
  - update
  - delete
- apiGroups:
  - extensions
  resources:
  - deployments
  - daemonsets
  - replicasets
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - delete
- apiGroups:
  - rbac.authorization.k8s.io
  resources:
  - clusterroles
  - clusterrolebindings
  - roles
  - rolebindings
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - delete
- apiGroups:
  - storage.k8s.io
  resources:
  - storageclasses
  verbs:
  - get
  - list
  - watch
  - delete
- apiGroups:
  - rook.io
  resources:
  - "*"
  verbs:
  - "*"
---
# Namespace that holds the Rook operator.
apiVersion: v1
kind: Namespace
metadata:
  name: rook-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: rook-operator
  namespace: rook-system
---
# Binds the rook-operator ClusterRole to the operator's ServiceAccount.
# ClusterRoleBinding is cluster-scoped, so it carries no metadata.namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: rook-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: rook-operator
subjects:
- kind: ServiceAccount
  name: rook-operator
  namespace: rook-system
---
# Single-replica Deployment of the Rook operator (rook/rook:v0.7.1).
apiVersion: apps/v1beta1
kind: Deployment
metadata:
  name: rook-operator
  namespace: rook-system
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: rook-operator
    spec:
      serviceAccountName: rook-operator
      containers:
      - name: rook-operator
        image: rook/rook:v0.7.1
        args: ["operator"]
        env:
        # To disable RBAC, uncomment the following:
        # - name: RBAC_ENABLED
        #   value: "false"
        # Rook Agent toleration. Will tolerate all taints with all keys.
        # Choose between NoSchedule, PreferNoSchedule and NoExecute:
        # - name: AGENT_TOLERATION
        #   value: "NoSchedule"
        # (Optional) Rook Agent toleration key. Set this to the key of the taint you want to tolerate
        # - name: AGENT_TOLERATION_KEY
        #   value: ""
        # Set the path where the Rook agent can find the flex volumes
        - name: FLEXVOLUME_DIR_PATH
          value: "/var/lib/kubelet/volumeplugins"
        # The interval to check if every mon is in the quorum.
        - name: ROOK_MON_HEALTHCHECK_INTERVAL
          value: "45s"
        # The duration to wait before trying to failover or remove/replace the
        # current mon with a new mon (useful for compensating flapping network).
        - name: ROOK_MON_OUT_TIMEOUT
          value: "300s"
        # Downward-API values describing where this pod is running.
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace

================================================
FILE: manifests/rook/rook-storageclass.yaml
================================================
# Replicated pool (one copy of the data) backing the block StorageClass below.
apiVersion: rook.io/v1alpha1
kind: Pool
metadata:
  name: replicapool
  namespace: rook
spec:
  replicated:
    size: 1
  # For an erasure-coded pool, comment out the replication size above and uncomment the following settings.
  # Make sure you have enough OSDs to support the replica size or erasure code chunks.
  #erasureCoded:
  #  dataChunks: 2
  #  codingChunks: 1
---
# Default StorageClass: provisions block volumes from `replicapool`.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: rook-block
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
provisioner: rook.io/block
parameters:
  pool: replicapool
  # Specify the Rook cluster from which to create volumes.
  # If not specified, it will use `rook` as the name of the cluster.
  # This is also the namespace the cluster is defined in (see rook-cluster.yaml).
  clusterName: rook
  # Specify the filesystem type of the volume. If not specified, it will use `ext4`.
# fstype: ext4

================================================
FILE: manifests/traefik.yaml
================================================
---
# Read-only access to the objects Traefik's Kubernetes provider watches.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: traefik-ingress-controller
rules:
- apiGroups:
  - ""
  resources:
  - services
  - endpoints
  - secrets
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - extensions
  resources:
  - ingresses
  verbs:
  - get
  - list
  - watch
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: traefik-ingress-controller
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: traefik-ingress-controller
subjects:
- kind: ServiceAccount
  name: traefik-ingress-controller
  namespace: kube-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: traefik-ingress-controller
  namespace: kube-system
---
# Public entry points (HTTP/HTTPS) of the ingress controller.
apiVersion: v1
kind: Service
metadata:
  name: traefik
  namespace: kube-system
  labels:
    k8s-app: traefik-ingress-lb
spec:
  selector:
    k8s-app: traefik-ingress-lb
  ports:
  - port: 80
    name: http
  - port: 443
    name: https
---
# Traefik web console / metrics endpoint (port 8080, named `web`).
apiVersion: v1
kind: Service
metadata:
  name: traefik-console
  namespace: kube-system
  labels:
    k8s-app: traefik-ingress-lb
spec:
  selector:
    k8s-app: traefik-ingress-lb
  ports:
  - port: 8080
    name: web
---
# Traefik static configuration. `$EMAIL` and `${DOMAIN}` are placeholders
# (presumably substituted by the install scripts before applying - confirm).
# caServer uses the Let's Encrypt ACME v2 endpoint: the v01 API has been
# retired, and Traefik 1.6 is an ACME v2 client.
apiVersion: v1
kind: ConfigMap
metadata:
  name: traefik-conf
  namespace: kube-system
data:
  traefik.toml: |
    # traefik.toml
    defaultEntryPoints = ["http","https"]
    InsecureSkipVerify = true
    [entryPoints]
      [entryPoints.http]
      address = ":80"
        [entryPoints.http.redirect]
        entryPoint = "https"
      [entryPoints.https]
      address = ":443"
        [entryPoints.https.tls]
    [acme]
    email = "$EMAIL"
    storage = "/acme/acme.json"
    entryPoint = "https"
    onDemand = true
    onHostRule = true
    caServer = "https://acme-v02.api.letsencrypt.org/directory"
    [acme.httpChallenge]
    entryPoint = "http"
    [[acme.domains]]
    main = "${DOMAIN}"
    [web]
    address = ":8080"
    [web.metrics.prometheus]
    Buckets=[0.1,0.3,1.2,5.0]
---
# Traefik ingress controller, one pod per matching node, bound to host ports
# 80 and 443.
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: traefik-ingress-controller
  namespace: kube-system
  labels:
    k8s-app: traefik-ingress-lb
spec:
  revisionHistoryLimit: 0
  template:
    metadata:
      labels:
        k8s-app: traefik-ingress-lb
        name: traefik-ingress-lb
    spec:
      # Run under the ServiceAccount that the ClusterRoleBinding above grants
      # the traefik-ingress-controller ClusterRole to; without this the pod
      # uses the namespace's default ServiceAccount and that RBAC is unused.
      serviceAccountName: traefik-ingress-controller
      terminationGracePeriodSeconds: 60
      volumes:
      - name: config
        configMap:
          name: traefik-conf
      # ACME state (acme.json) is kept on the host so it survives pod restarts.
      - name: acme
        hostPath:
          path: /etc/traefik/acme
      containers:
      - image: traefik:v1.6.5
        name: traefik-ingress-lb
        imagePullPolicy: Always
        volumeMounts:
        - mountPath: "/config"
          name: "config"
        - mountPath: "/acme"
          name: "acme"
        ports:
        - containerPort: 80
          hostPort: 80
        - containerPort: 443
          hostPort: 443
        - containerPort: 8080
        args:
        - --configfile=/config/traefik.toml
        - --web
        - --web.metrics.prometheus
        - --web.metrics.prometheus.buckets=0.1,0.3,1.2,5.0
        - --kubernetes
        - --logLevel=DEBUG
      # Schedule only onto master nodes. The value is spelled out as an
      # explicit empty string rather than a bare `-` (YAML null); master nodes
      # conventionally carry this label with an empty value.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: node-role.kubernetes.io/master
                operator: In
                values:
                - ""
      tolerations:
      - key: CriticalAddonsOnly
        operator: Exists
      - key: node-role.kubernetes.io/master
        operator: Exists
        effect: NoSchedule
---
# Exposes the Traefik console at traefik.${DOMAIN}, protected by basic auth
# using the `kubesecret` Secret.
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: traefik-ingress
  namespace: kube-system
  annotations:
    kubernetes.io/ingress.class: "traefik"
    ingress.kubernetes.io/auth-type: "basic"
    ingress.kubernetes.io/auth-secret: "kubesecret"
spec:
  rules:
  - host: traefik.${DOMAIN}
    http:
      paths:
      - backend:
          serviceName: traefik-console
          servicePort: web