Repository: kahkhang/kube-linode
Branch: master
Commit: 812ac407badd
Files: 51
Total size: 340.7 KB
Directory structure:
gitextract_psepesfk/
├── .gitignore
├── LICENSE
├── README.md
├── display.sh
├── install-coreos.sh
├── kube-linode.sh
├── linode-utilities.sh
└── manifests/
├── alertmanager/
│ ├── alertmanager-config.yaml
│ ├── alertmanager-service.yaml
│ └── alertmanager.yaml
├── container-linux/
│ ├── master-config.yaml
│ └── worker-config.yaml
├── grafana/
│ ├── grafana-dashboards.yaml
│ ├── grafana-deployment.yaml
│ └── grafana-service.yaml
├── heapster.yaml
├── kube-dashboard.yaml
├── kube-state-metrics/
│ ├── kube-state-metrics-cluster-role-binding.yaml
│ ├── kube-state-metrics-cluster-role.yaml
│ ├── kube-state-metrics-deployment.yaml
│ ├── kube-state-metrics-service-account.yaml
│ └── kube-state-metrics-service.yaml
├── node-exporter/
│ ├── node-exporter-daemonset.yaml
│ └── node-exporter-service.yaml
├── prometheus/
│ ├── prometheus-k8s-ingress.yaml
│ ├── prometheus-k8s-role-bindings.yaml
│ ├── prometheus-k8s-roles.yaml
│ ├── prometheus-k8s-rules.yaml
│ ├── prometheus-k8s-service-account.yaml
│ ├── prometheus-k8s-service-monitor-alertmanager.yaml
│ ├── prometheus-k8s-service-monitor-apiserver.yaml
│ ├── prometheus-k8s-service-monitor-kube-controller-manager.yaml
│ ├── prometheus-k8s-service-monitor-kube-scheduler.yaml
│ ├── prometheus-k8s-service-monitor-kube-state-metrics.yaml
│ ├── prometheus-k8s-service-monitor-kubelet.yaml
│ ├── prometheus-k8s-service-monitor-node-exporter.yaml
│ ├── prometheus-k8s-service-monitor-prometheus-operator.yaml
│ ├── prometheus-k8s-service-monitor-prometheus.yaml
│ ├── prometheus-k8s-service-monitor-rook.yaml
│ ├── prometheus-k8s-service-monitor-traefik.yaml
│ ├── prometheus-k8s-service.yaml
│ └── prometheus-k8s.yaml
├── prometheus-operator/
│ ├── prometheus-operator-cluster-role-binding.yaml
│ ├── prometheus-operator-cluster-role.yaml
│ ├── prometheus-operator-service-account.yaml
│ ├── prometheus-operator-service.yaml
│ └── prometheus-operator.yaml
├── rook/
│ ├── rook-cluster.yaml
│ ├── rook-operator.yaml
│ └── rook-storageclass.yaml
└── traefik.yaml
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
certs/
cluster/
acme.json
auth
settings.env
testing.sh
package.sh
*.zip
demo.mov
resolv.conf
bootkube
manifests/grafana/grafana-credentials.yaml
install.exp
manifests/container-linux/master-config.yaml.bak
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2017 Andrew Low
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
## :whale: Provision a Kubernetes / CoreOS Cluster on Linode
[](https://github.com/kahkhang/kube-linode)
[](https://raw.githubusercontent.com/kahkhang/kube-linode/master/LICENSE)
[](https://gitter.im/kube-linode/support)
Automatically provision a scalable CoreOS/Kubernetes cluster on Linode with zero configuration.

The cluster will comprise a single Kubernetes master host and a configurable number of worker nodes.
### What's included
* [Kubernetes 1.11.0](https://kubernetes.io/) with [Bootkube](https://github.com/kubernetes-incubator/bootkube)
* Load Balancer and automatic SSL/TLS renewal using [Traefik](https://github.com/containous/traefik)
* Distributed block storage with [Rook](https://github.com/rook/rook)
* Pre-configured [Grafana](https://github.com/grafana/grafana) dashboard using [Kube-Prometheus](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) with Rook and Traefik monitoring
* Basic auth protected subdomains (assuming you are using example.com):
* https://kube.example.com ([Kubernetes Dashboard](https://github.com/kubernetes/dashboard))
* https://grafana.example.com ([Grafana](https://github.com/grafana/grafana))
* https://alertmanager.example.com ([Alert Manager](https://github.com/prometheus/alertmanager))
* https://prometheus.example.com ([Prometheus Web UI](https://github.com/prometheus/prometheus))
* https://traefik.example.com ([Traefik Web UI](https://github.com/containous/traefik#web-ui))
### Usage
```sh
git clone https://github.com/kahkhang/kube-linode
cd kube-linode
chmod +x kube-linode.sh
```
Just run `./kube-linode.sh create` in your console, key in your configuration, then sit back and have a :coffee:!
Settings are stored in `settings.env`, or you can pass them in as key-value flags as such:
```sh
./kube-linode.sh create --no_of_workers=3 --api_key=12345
```
To increase the number of workers, modify `NO_OF_WORKERS` in `settings.env` as desired and run `./kube-linode.sh create` again.
Use `kubectl` to control the cluster (e.g. `kubectl get nodes`)
If you want to destroy the cluster created by kube-linode, you can run the following command:
```sh
./kube-linode.sh destroy
```
A prompt will list all the nodes that will be destroyed and ask for confirmation before deleting them.
### Dependencies
You should have a Linode account, which you can get [here](https://www.linode.com/?r=0affaec6ca42ca06f5f2c2d3d8d1ceb354e222c1).
You should also have an API Key with a valid domain that uses [Linode's DNS servers](https://www.linode.com/docs/networking/dns/dns-manager-overview#set-domain-names-to-use-linodes-name-servers).
OSX: ``` brew install jq openssl curl kubectl ```
Arch Linux: Follow the instructions [here](https://github.com/kahkhang/kube-linode/issues/4#issuecomment-311601422)
### Acknowledgements
This script uses [Bootkube](https://github.com/kubernetes-incubator/bootkube) to bootstrap the initial cluster using [Linode's API](https://www.linode.com/api).
================================================
FILE: display.sh
================================================
#!/bin/bash
_SPINNER_POS=0

# spinner MESSAGE COMMAND [VARNAME]
#
# Runs COMMAND (a string handed to eval) in the background with its stdout
# captured in a temporary file and its stderr discarded, while animating a
# braille spinner in front of MESSAGE. Once COMMAND has finished, the
# captured stdout is stored in the variable named VARNAME, if one was given.
# The animation frame is remembered in _SPINNER_POS between calls.
spinner() {
    # NOTE(review): IFS is changed for the whole shell and never restored,
    # exactly as before. Code running after the first spinner call may have
    # come to rely on newline-only splitting, so this is left untouched.
    IFS=$'\n'
    local delay=0.05
    local list=(
        $'\xe2\xa0\x8b'
        $'\xe2\xa0\x99'
        $'\xe2\xa0\xb9'
        $'\xe2\xa0\xb8'
        $'\xe2\xa0\xbc'
        $'\xe2\xa0\xb4'
        $'\xe2\xa0\xa6'
        $'\xe2\xa0\xa7'
        $'\xe2\xa0\x87'
        $'\xe2\xa0\x8f'
    )
    local i=$_SPINNER_POS
    local tempfile
    tempfile=$(mktemp)
    eval "$2" >> "$tempfile" 2>/dev/null &
    local pid=$!
    tput sc
    printf "%s %s" "${list[i]}" "$1"
    tput el
    tput rc
    i=$(( (i + 1) % 10 ))
    # kill -0 probes exactly this PID; grepping the ps output also matched
    # unrelated processes whose PID merely contained the same digits.
    while kill -0 "$pid" 2>/dev/null; do
        printf "%s" "${list[i]}"
        i=$(( (i + 1) % 10 ))
        sleep $delay
        printf "\b\b\b"
    done
    _SPINNER_POS=$i
    if [ -n "${3:-}" ]; then
        # printf -v assigns the text verbatim, so output containing quotes
        # can no longer break (or be executed by) an eval-built assignment.
        printf -v "$3" '%s' "$(cat "$tempfile")"
    fi
    rm -f "$tempfile"
}
# Glyphs used by the interactive prompts, spelled as UTF-8 byte sequences.
arrow=$'\xe2\x9d\xaf'
checked=$'\xe2\x97\x89'
unchecked=$'\xe2\x97\xaf'

# Colour and attribute escape sequences obtained from tput.
black=$(tput setaf 0)
red=$(tput setaf 1)
green=$(tput setaf 2)
yellow=$(tput setaf 3)
blue=$(tput setaf 4)
magenta=$(tput setaf 5)
cyan=$(tput setaf 6)
white=$(tput setaf 7)
bold=$(tput bold)
normal=$(tput sgr0)
# Written out by hand rather than looked up through tput.
dim=$'\e[2m'
# print TEXT
# Echoes TEXT on its own line, then clears from the cursor to end of line.
print() {
    local text=$1
    echo "$text"
    tput el
}
# join ARRAY_NAME [SEPARATOR]
#
# Prints the elements of the array named ARRAY_NAME on one line, separated
# by SEPARATOR (default ", "). Elements are re-split on newlines only, so
# empty elements are dropped, as before.
join() {
    local IFS=$'\n'
    local _join_list
    eval _join_list=( '"${'${1}'[@]}"' )
    # The separator is passed as data rather than spliced into the format
    # string: a separator starting with "-" used to be parsed as a printf
    # option and one containing "%" as a conversion. %b keeps backslash
    # escapes such as "\n" working the way they did inside the format.
    local sep=${2-, }
    local first=true
    for item in ${_join_list[@]}; do
        if [ "$first" = true ]; then
            printf '%s' "$item"
            first=false
        else
            printf '%b%s' "$sep" "$item"
        fi
    done
}
# gen_env_from_options INDICES_ARRAY_NAME ENV_NAMES_ARRAY_NAME
#
# Prints one NAME=true|false line per entry of the env-name array: true when
# the entry's position appears in the indices array, false otherwise.
function gen_env_from_options() {
    local IFS=$'\n'
    local _indices _env_names _checkbox_selected
    eval _indices=( '"${'${1}'[@]}"' )
    eval _env_names=( '"${'${2}'[@]}"' )

    # Start with every option off, then switch on the selected positions.
    for i in $(gen_index ${#_env_names[@]}); do _checkbox_selected[$i]=false; done
    for i in ${_indices[@]}; do _checkbox_selected[$i]=true; done

    for i in $(gen_index ${#_env_names[@]}); do
        printf "%s=%s\n" "${_env_names[$i]}" "${_checkbox_selected[$i]}"
    done
}
# No-op callback used for any key that has no handler assigned.
on_default() {
    :
}
# on_keypress [on_up] [on_down] [on_space] [on_enter] [on_left] [on_right] [on_ascii] [on_backspace]
#
# Reads stdin one character at a time and dispatches each key to the callback
# named by the matching positional argument (on_default when omitted).
# The loop ends when a callback sets _break_keypress=true or when read fails.
on_keypress() {
local OLD_IFS
local IFS
local key
OLD_IFS=$IFS
local on_up=${1:-on_default}
local on_down=${2:-on_default}
local on_space=${3:-on_default}
local on_enter=${4:-on_default}
local on_left=${5:-on_default}
local on_right=${6:-on_default}
local on_ascii=${7:-on_default}
local on_backspace=${8:-on_default}
_break_keypress=false
while IFS="" read -rsn1 key; do
case "$key" in
# ESC: arrow keys arrive as ESC, "[", then one of A/B/C/D.
$'\x1b')
read -rsn1 key
if [[ "$key" == "[" ]]; then
read -rsn1 key
case "$key" in
'A') eval $on_up;;
'B') eval $on_down;;
'D') eval $on_left;;
'C') eval $on_right;;
esac
fi
;;
' ') eval $on_space ' ';;
# Only the characters listed in this class are forwarded to on_ascii.
[a-z0-9A-Z\!\#\$\&\+\,\-\.\/\;\=\?\@\[\]\^\_\{\}\~]) eval $on_ascii $key;;
# 0x7f (DEL) is treated as backspace.
$'\x7f') eval $on_backspace $key;;
# An empty read result is treated as Enter.
'') eval $on_enter $key;;
esac
if [ $_break_keypress = true ]; then
break
fi
done
IFS=$OLD_IFS
}
# gen_index COUNT
#
# Prints the integers 0 .. COUNT-1, one per line. Prints nothing when COUNT
# is 0 or omitted. Uses shell arithmetic instead of forking seq plus one bc
# process per index, and no longer errors out on an empty argument.
gen_index() {
    local k=${1:-0}
    local l
    for (( l = 0; l < k; l++ )); do
        echo "$l"
    done
}
# Trap handler (installed for SIGINT and EXIT by the prompt functions):
# clears the current line, restores sane terminal settings and the cursor,
# then exits.
control_c() {
    # Capture the status the shell had on entry. Previously `exit $?` ran
    # after `stty echo` and therefore reported that command's status
    # (normally 0), hiding failures and interrupts from the caller.
    local status=$?
    tput cub "$(tput cols)"
    tput el
    stty sane
    tput cnorm
    stty echo
    exit "$status"
}
# select_indices LIST_ARRAY_NAME INDICES_ARRAY_NAME OUTPUT_ARRAY_NAME
#
# Rebuilds the array named OUTPUT_ARRAY_NAME so that it holds, in order, the
# elements of the list array found at each position in the indices array.
select_indices() {
    local _select_list
    local _select_indices
    local _select_selected=()
    eval _select_list=( '"${'${1}'[@]}"' )
    eval _select_indices=( '"${'${2}'[@]}"' )
    local _select_var_name=$3

    # Reset the output array, then append one picked element per index.
    eval "$_select_var_name=()"
    for i in $(gen_index ${#_select_indices[@]}); do
        eval "$_select_var_name+=(\"${_select_list[${_select_indices[$i]}]}\")"
    done
}
#!/bin/bash
set -e
# Up-arrow handler for the checkbox prompt: redraws the current row without
# the arrow marker, moves the selection one row up (wrapping from the first
# row to the last), then redraws the new current row with the arrow marker.
# NOTE(review): the option text is interpolated into the printf format
# string, so an option containing "%" or "\" would be interpreted by printf.
on_checkbox_input_up() {
remove_checkbox_instructions
tput cub "$(tput cols)"
# Repaint the row being left, without the arrow.
if [ "${_checkbox_selected[$_current_index]}" = true ]; then
printf " ${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}"
else
printf " ${unchecked} ${_checkbox_list[$_current_index]} ${normal}"
fi
tput el
if [ $_current_index = 0 ]; then
# Wrap around: jump the cursor down to the last row.
_current_index=$((${#_checkbox_list[@]}-1))
tput cud $((${#_checkbox_list[@]}-1))
tput cub "$(tput cols)"
else
_current_index=$((_current_index-1))
tput cuu1
tput cub "$(tput cols)"
tput el
fi
# Paint the newly selected row with the arrow.
if [ "${_checkbox_selected[$_current_index]}" = true ]; then
printf "${cyan}${arrow}${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}"
else
printf "${cyan}${arrow}${normal}${unchecked} ${_checkbox_list[$_current_index]} ${normal}"
fi
}
# Down-arrow handler for the checkbox prompt: redraws the current row without
# the arrow marker, moves the selection one row down (wrapping from the last
# row to the first), then redraws the new current row with the arrow marker.
on_checkbox_input_down() {
remove_checkbox_instructions
tput cub "$(tput cols)"
# Repaint the row being left, without the arrow.
if [ "${_checkbox_selected[$_current_index]}" = true ]; then
printf " ${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}"
else
printf " ${unchecked} ${_checkbox_list[$_current_index]} ${normal}"
fi
tput el
if [ $_current_index = $((${#_checkbox_list[@]}-1)) ]; then
# Wrap around: jump the cursor back up to the first row.
_current_index=0
tput cuu $((${#_checkbox_list[@]}-1))
tput cub "$(tput cols)"
else
_current_index=$((_current_index+1))
tput cud1
tput cub "$(tput cols)"
tput el
fi
# Paint the newly selected row with the arrow.
if [ "${_checkbox_selected[$_current_index]}" = true ]; then
printf "${cyan}${arrow}${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}"
else
printf "${cyan}${arrow}${normal}${unchecked} ${_checkbox_list[$_current_index]} ${normal}"
fi
}
# Enter handler for the checkbox prompt: collects the ticked rows into
# _checkbox_selected_indices / _checkbox_selected_options, erases the option
# rows, prints the joined selection after the prompt text and asks
# on_keypress to stop by setting _break_keypress=true.
on_checkbox_input_enter() {
local OLD_IFS
OLD_IFS=$IFS
_checkbox_selected_indices=()
_checkbox_selected_options=()
IFS=$'\n'
for i in $(gen_index ${#_checkbox_list[@]}); do
if [ "${_checkbox_selected[$i]}" = true ]; then
_checkbox_selected_indices+=($i)
_checkbox_selected_options+=("${_checkbox_list[$i]}")
fi
done
# Move below the last option row, then clear each row while walking back up.
tput cud $((${#_checkbox_list[@]}-${_current_index}))
tput cub "$(tput cols)"
for i in $(seq $((${#_checkbox_list[@]}+1))); do
tput el1
tput el
tput cuu1
done
# Position the cursor just past the prompt text and print the summary.
tput cub "$(tput cols)"
tput cuf $((${#prompt}+3))
printf "${cyan}$(join _checkbox_selected_options)${normal}"
tput el
tput cud1
tput cub "$(tput cols)"
tput el
_break_keypress=true
IFS=$OLD_IFS
}
# Space handler for the checkbox prompt: flips the tick state of the current
# row and repaints that row (with the arrow marker) to show the new state.
on_checkbox_input_space() {
    remove_checkbox_instructions
    tput cub "$(tput cols)"
    tput el
    if [ "${_checkbox_selected[$_current_index]}" = true ]; then
        # Was ticked: untick and draw the empty box.
        _checkbox_selected[$_current_index]=false
        printf "${cyan}${arrow}${normal}${unchecked} ${_checkbox_list[$_current_index]} ${normal}"
    else
        # Was unticked: tick and draw the filled box.
        _checkbox_selected[$_current_index]=true
        printf "${cyan}${arrow}${green}${checked}${normal} ${_checkbox_list[$_current_index]} ${normal}"
    fi
}
# On the first keystroke only: jumps up to the prompt line, erases everything
# after the prompt text (the usage hint), and returns to the current row.
remove_checkbox_instructions() {
    [ $_first_keystroke = true ] || return 0
    local rows_up=$((${_current_index}+1))
    tput cuu $rows_up
    tput cub "$(tput cols)"
    tput cuf $((${#prompt}+3))
    tput el
    tput cud $rows_up
    _first_keystroke=false
}
# _checkbox_input PROMPT OPTIONS_ARRAY_NAME [PRESELECTED_INDICES_ARRAY_NAME]
#
# Draws a multi-select prompt: the PROMPT line followed by one row per entry
# of the options array, with rows listed in the optional indices array
# pre-ticked. It then hands control to on_keypress with the checkbox
# handlers. State is kept in the globals prompt, _checkbox_list,
# _checkbox_selected, _current_index and _first_keystroke.
_checkbox_input() {
    local i
    local j
    prompt=$1
    eval _checkbox_list=( '"${'${2}'[@]}"' )
    _current_index=0
    _first_keystroke=true
    trap control_c SIGINT EXIT
    stty -echo
    tput civis
    # The hint names the keys handled by on_checkbox_input_space and
    # on_checkbox_input_enter; it previously read "(Press to select, to
    # finalize)" with the key names missing.
    print "${normal}${green}?${normal} ${bold}${prompt}${normal} ${dim}(Press <space> to select, <enter> to finalize)${normal}"
    for i in $(gen_index ${#_checkbox_list[@]}); do
        _checkbox_selected[$i]=false
    done
    if [ -n "$3" ]; then
        eval _selected_indices=( '"${'${3}'[@]}"' )
        for i in ${_selected_indices[@]}; do
            _checkbox_selected[$i]=true
        done
    fi
    for i in $(gen_index ${#_checkbox_list[@]}); do
        tput cub "$(tput cols)"
        if [ $i = 0 ]; then
            # The first row starts out as the current row (arrow marker).
            if [ "${_checkbox_selected[$i]}" = true ]; then
                print "${cyan}${arrow}${green}${checked}${normal} ${_checkbox_list[$i]} ${normal}"
            else
                print "${cyan}${arrow}${normal}${unchecked} ${_checkbox_list[$i]} ${normal}"
            fi
        else
            if [ "${_checkbox_selected[$i]}" = true ]; then
                print " ${green}${checked}${normal} ${_checkbox_list[$i]} ${normal}"
            else
                print " ${unchecked} ${_checkbox_list[$i]} ${normal}"
            fi
        fi
        tput el
    done
    # Walk the cursor back up to the first option row.
    for j in $(gen_index ${#_checkbox_list[@]}); do
        tput cuu1
    done
    on_keypress on_checkbox_input_up on_checkbox_input_down on_checkbox_input_space on_checkbox_input_enter
}
# checkbox_input PROMPT OPTIONS_ARRAY_NAME OUTPUT_ARRAY_NAME
#
# Shows the multi-select prompt and stores the chosen option texts in the
# array named OUTPUT_ARRAY_NAME, then discards the prompt's global state.
checkbox_input() {
    _checkbox_input "$1" "$2"
    _checkbox_input_output_var_name=$3
    select_indices _checkbox_list _checkbox_selected_indices $_checkbox_input_output_var_name
    unset _checkbox_list _break_keypress _first_keystroke _current_index
    unset _checkbox_input_output_var_name
    unset _checkbox_selected_indices _checkbox_selected_options
}
# checkbox_input_indices PROMPT OPTIONS_ARRAY_NAME INDICES_ARRAY_NAME
#
# Shows the multi-select prompt with the positions held in the array named
# INDICES_ARRAY_NAME pre-ticked, then overwrites that same array with the
# positions the user ended up selecting and discards the prompt's state.
checkbox_input_indices() {
    _checkbox_input "$1" "$2" "$3"
    _checkbox_input_output_var_name=$3
    eval "$_checkbox_input_output_var_name=()"
    for i in $(gen_index ${#_checkbox_selected_indices[@]}); do
        eval "$_checkbox_input_output_var_name+=(${_checkbox_selected_indices[$i]})"
    done
    unset _checkbox_list _break_keypress _first_keystroke _current_index
    unset _checkbox_input_output_var_name
    unset _checkbox_selected_indices _checkbox_selected_options
}
#!/bin/bash
set -e
# Up-arrow handler for the single-select list prompt: repaints the current
# row without the arrow, moves the selection one row up (wrapping from the
# first row to the last) and repaints the new row with the arrow.
on_list_input_up() {
remove_list_instructions
tput cub "$(tput cols)"
# Repaint the row being left, without the arrow.
printf " ${_list_options[$_list_selected_index]}"
tput el
if [ $_list_selected_index = 0 ]; then
# Wrap around: jump the cursor down to the last row.
_list_selected_index=$((${#_list_options[@]}-1))
tput cud $((${#_list_options[@]}-1))
tput cub "$(tput cols)"
else
_list_selected_index=$((_list_selected_index-1))
tput cuu1
tput cub "$(tput cols)"
tput el
fi
printf "${cyan}${arrow} %s ${normal}" "${_list_options[$_list_selected_index]}"
}
# Down-arrow handler for the single-select list prompt: repaints the current
# row without the arrow, moves the selection one row down (wrapping from the
# last row to the first) and repaints the new row with the arrow.
on_list_input_down() {
remove_list_instructions
tput cub "$(tput cols)"
# Repaint the row being left, without the arrow.
printf " ${_list_options[$_list_selected_index]}"
tput el
if [ $_list_selected_index = $((${#_list_options[@]}-1)) ]; then
# Wrap around: jump the cursor back up to the first row.
_list_selected_index=0
tput cuu $((${#_list_options[@]}-1))
tput cub "$(tput cols)"
else
_list_selected_index=$((_list_selected_index+1))
tput cud1
tput cub "$(tput cols)"
tput el
fi
printf "${cyan}${arrow} %s ${normal}" "${_list_options[$_list_selected_index]}"
}
# Enter/space handler for the list prompt: erases the option rows, prints the
# chosen option after the prompt text and asks on_keypress to stop by setting
# _break_keypress=true.
on_list_input_enter_space() {
local OLD_IFS
OLD_IFS=$IFS
IFS=$'\n'
# Move below the last option row, then clear each row while walking back up.
tput cud $((${#_list_options[@]}-${_list_selected_index}))
tput cub "$(tput cols)"
for i in $(seq $((${#_list_options[@]}+1))); do
tput el1
tput el
tput cuu1
done
# Position the cursor just past the prompt text and print the choice.
tput cub "$(tput cols)"
tput cuf $((${#prompt}+3))
printf "${cyan}${_list_options[$_list_selected_index]}${normal}"
tput el
tput cud1
tput cub "$(tput cols)"
tput el
_break_keypress=true
IFS=$OLD_IFS
}
# On the first keystroke only: jumps up to the prompt line, erases everything
# after the prompt text (the usage hint), and returns to the current row.
remove_list_instructions() {
    [ $_first_keystroke = true ] || return 0
    local rows_up=$((${_list_selected_index}+1))
    tput cuu $rows_up
    tput cub "$(tput cols)"
    tput cuf $((${#prompt}+3))
    tput el
    tput cud $rows_up
    _first_keystroke=false
}
# _list_input PROMPT OPTIONS_ARRAY_NAME
#
# Draws a single-select prompt: the PROMPT line followed by one row per entry
# of the options array, with the first row marked as current, then hands
# control to on_keypress. Both space and enter confirm the selection.
# State is kept in the globals prompt, _list_options, _list_selected_index
# and _first_keystroke.
_list_input() {
local i
local j
prompt=$1
eval _list_options=( '"${'${2}'[@]}"' )
_list_selected_index=0
_first_keystroke=true
trap control_c SIGINT EXIT
stty -echo
tput civis
print "${normal}${green}?${normal} ${bold}${prompt}${normal} ${dim}(Use arrow keys)${normal}"
for i in $(gen_index ${#_list_options[@]}); do
tput cub "$(tput cols)"
if [ $i = 0 ]; then
print "${cyan}${arrow} ${_list_options[$i]} ${normal}"
else
print " ${_list_options[$i]}"
fi
tput el
done
# Walk the cursor back up to the first option row.
for j in $(gen_index ${#_list_options[@]}); do
tput cuu1
done
on_keypress on_list_input_up on_list_input_down on_list_input_enter_space on_list_input_enter_space
}
# list_input PROMPT OPTIONS_ARRAY_NAME OUTPUT_VAR_NAME
#
# Shows the single-select prompt and stores the text of the chosen option in
# the variable named OUTPUT_VAR_NAME, then discards the prompt's state.
list_input() {
    _list_input "$1" "$2"
    local var_name=$3
    # printf -v stores the option text verbatim. The previous eval-built
    # assignment wrapped it in single quotes and therefore failed for any
    # option that itself contained a single quote.
    printf -v "$var_name" '%s' "${_list_options[$_list_selected_index]}"
    unset _list_selected_index _list_options _break_keypress _first_keystroke
}
# list_input_index PROMPT OPTIONS_ARRAY_NAME OUTPUT_VAR_NAME
#
# Shows the single-select prompt and stores the position of the chosen option
# in the variable named OUTPUT_VAR_NAME, then discards the prompt's state.
list_input_index() {
    _list_input "$1" "$2"
    local var_name=$3
    eval "$var_name='$_list_selected_index'"
    unset _list_selected_index _list_options _break_keypress _first_keystroke
}
#!/bin/bash
set -e
# Left-arrow handler for text input: moves the cursor one column left unless
# it is already at the start of the typed text.
on_text_input_left() {
    remove_regex_failed
    [ $_current_pos -gt 0 ] || return 0
    tput cub1
    _current_pos=$((_current_pos - 1))
}
# Right-arrow handler for text input: moves the cursor one column right
# unless it is already at the end of the typed text.
on_text_input_right() {
    remove_regex_failed
    [ $_current_pos -lt ${#_text_input} ] || return 0
    tput cuf1
    _current_pos=$((_current_pos + 1))
}
# Enter handler for text input. The typed text is accepted when it matches
# _text_input_regex and the validator command prints "true"; it is then
# echoed in cyan after the prompt, stored in the variable named by var_name,
# and on_keypress is asked to stop. Otherwise the failure message is shown on
# the line below and the input is cleared so the user can retype.
on_text_input_enter() {
remove_regex_failed
if [[ "$_text_input" =~ $_text_input_regex && "$(eval $_text_input_validator "$_text_input")" = true ]]; then
tput cub "$(tput cols)"
# NOTE(review): the 19 presumably discounts the non-printing escape
# characters contained in _read_prompt; confirm before changing the prompt.
tput cuf $((${#_read_prompt}-19))
printf "${cyan}${_text_input}${normal}"
tput el
tput cud1
tput cub "$(tput cols)"
tput el
eval $var_name=\'"${_text_input}"\'
_break_keypress=true
else
_text_input_regex_failed=true
tput civis
tput cud1
tput cub "$(tput cols)"
tput el
printf "${red}>>${normal} $_text_input_regex_failed_msg"
# Return to the input line, just past the prompt, and wipe the old text.
tput cuu1
tput cub "$(tput cols)"
tput cuf $((${#_read_prompt}-19))
tput el
_text_input=""
_current_pos=0
tput cnorm
fi
}
# on_text_input_ascii CHAR
#
# Inserts CHAR (a space when CHAR is empty) into _text_input at the cursor
# position, redraws the tail of the line and leaves the cursor just after the
# inserted character.
on_text_input_ascii() {
    remove_regex_failed
    local c=$1
    if [ "$c" = '' ]; then
        c=' '
    fi
    local rest="${_text_input:$_current_pos}"
    _text_input="${_text_input:0:$_current_pos}$c$rest"
    _current_pos=$(($_current_pos+1))
    tput civis
    # Print the text as data, not as the format string: inserting "-" in
    # front of existing characters used to make printf treat "-abc" as an
    # option and fail.
    printf '%s' "$c$rest"
    tput el
    if [ ${#rest} -gt 0 ]; then
        # Step back over the redrawn tail so the cursor follows the insert.
        tput cub ${#rest}
    fi
    tput cnorm
}
# Backspace handler for text input: deletes the character to the left of the
# cursor (if any), redraws the rest of the line and keeps the cursor in place.
on_text_input_backspace() {
    remove_regex_failed
    if [ $_current_pos -gt 0 ]; then
        local start="${_text_input:0:$(($_current_pos-1))}"
        local rest="${_text_input:$_current_pos}"
        _current_pos=$(($_current_pos-1))
        tput cub 1
        tput el
        tput sc
        # Print the tail as data, not as the format string: a tail starting
        # with "-" (e.g. after deleting "a" from "a-b") used to be parsed as
        # a printf option and fail.
        printf '%s' "$rest"
        tput rc
        _text_input="$start$rest"
    fi
}
# If a validation failure message is currently shown, clears the line below
# the input and resets the failure flag; the cursor position is preserved.
remove_regex_failed() {
    [ $_text_input_regex_failed = true ] || return 0
    _text_input_regex_failed=false
    tput sc
    tput cud1
    tput el1
    tput el
    tput rc
}
# Default validator for text_input: accepts any input by printing "true".
text_input_default_validator() {
    printf '%s\n' true
}
# text_input PROMPT VAR_NAME [REGEX] [FAILURE_MESSAGE] [VALIDATOR]
#
# Prompts for a line of text and stores it in the variable named VAR_NAME.
# Input is accepted once it matches REGEX (default: any non-empty text) and
# VALIDATOR, a command given the input as its argument, prints "true".
# FAILURE_MESSAGE is shown when the input is rejected.
text_input() {
    local prompt=$1
    local var_name=$2
    # The default used to be "\.+", which only matches input containing a
    # literal dot; ".+" accepts any non-empty input.
    local _text_input_regex="${3:-.+}"
    local _text_input_regex_failed_msg=${4:-"Input validation failed"}
    local _text_input_validator=${5:-text_input_default_validator}
    local _read_prompt_start=$'\e[32m?\e[39m\e[1m'
    local _read_prompt_end=$'\e[22m'
    local _read_prompt="$( echo "$_read_prompt_start ${prompt} $_read_prompt_end")"
    local _current_pos=0
    local _text_input_regex_failed=false
    local _text_input=""
    printf "$_read_prompt"
    trap control_c SIGINT EXIT
    stty -echo
    tput cnorm
    # Space is routed to the same insert handler as other printable keys;
    # up/down are ignored.
    on_keypress on_default on_default on_text_input_ascii on_text_input_enter on_text_input_left on_text_input_right on_text_input_ascii on_text_input_backspace
    eval $var_name=\'"${_text_input}"\'
}
================================================
FILE: install-coreos.sh
================================================
#!/bin/bash
# Renders container-linux-config.yaml for this machine and installs
# Container Linux onto /dev/sda with the resulting Ignition config.
# Requires REBOOT_STRATEGY in the environment.
set -euo pipefail

# die MESSAGE: print MESSAGE to stderr and abort. The script called die
# before but never defined it.
die() {
    echo "$*" >&2
    exit 1
}

# ${VAR:-} keeps `set -u` from aborting with "unbound variable" before the
# explanatory message can be shown.
[[ -n "${REBOOT_STRATEGY:-}" ]] || die "Need a reboot strategy. Run with eg. 'REBOOT_STRATEGY=off ./install-coreos.sh'"

# The public address is taken from the /24 entry on eth0, the private one
# from the /17 entry.
PUBLIC_IP=$(ip addr show eth0 | grep "inet\b" | grep "/24" | awk '{print $2}' | cut -d/ -f1)
PRIVATE_IP=$(ip addr show eth0 | grep "inet\b" | grep "/17" | awk '{print $2}' | cut -d/ -f1)

# NOTE(review): --no-check-certificate disables TLS verification for a
# binary that is executed below; consider removing it.
wget --quiet --no-check-certificate https://github.com/coreos/container-linux-config-transpiler/releases/download/v0.5.0/ct-v0.5.0-x86_64-unknown-linux-gnu -O ct
chmod +x ct
apt-get -y install gawk
wget --quiet https://raw.githubusercontent.com/coreos/init/master/bin/coreos-install
chmod u+x coreos-install

# Fill in the #PLACEHOLDER# tokens of the template and transpile the result
# with ct.
cat container-linux-config.yaml \
    | sed "s/#SSH_KEY#/$(cat ~/.ssh/authorized_keys | grep '^ssh-rsa' | sed -n 1p | sed 's/\//\\\//g')/g" \
    | sed "s/#COREOS_PUBLIC_IPV4#/$PUBLIC_IP/g" \
    | sed "s/#COREOS_PRIVATE_IPV4#/$PRIVATE_IP/g" \
    | sed "s/#HOSTNAME#/$(echo $PUBLIC_IP | sed "s/\./-/g")/g" \
    | sed "s/#GATEWAY#/${PUBLIC_IP%.*}.1/g" \
    | sed "s/#DNS#/$(cat /etc/resolv.conf | awk '/^nameserver /{ print $0 }' | sed 's/nameserver //g' | tr '\n' ' ')/g" \
    | sed "s/#REBOOT_STRATEGY#/${REBOOT_STRATEGY}/g" \
    | ./ct > container-linux-config.json

./coreos-install -d /dev/sda -i container-linux-config.json
================================================
FILE: kube-linode.sh
================================================
#!/bin/bash
# Probe whether this platform's base64 accepts --wrap=0; the flag is only
# passed along later when the probe succeeds.
base64_args=""
if base64 --wrap=0 <(echo "test") >/dev/null 2>&1; then
    base64_args="--wrap=0"
fi
set -e

source display.sh
source linode-utilities.sh

# Abort early if any external tool used by the scripts is missing.
check_dep jq
check_dep openssl
check_dep curl
check_dep htpasswd
check_dep kubectl
check_dep ssh
check_dep base64
check_dep bc
check_dep ssh-keygen
check_dep awk
check_dep sed
check_dep cat
check_dep tr

# The first argument selects the action; anything else is rejected.
case "$1" in
    create|destroy)
        ;;
    *)
        echo "${bold}${red}Not a valid action!${normal}"
        echo "Type ${green}./kube-linode.sh create${normal} to create a cluster"
        echo "Type ${green}./kube-linode.sh destroy${normal} to destroy created cluster"
        exit 1
        ;;
esac
# Start from a clean slate so values inherited from the environment cannot
# leak into the run; settings.env and the command line are the only sources.
unset DATACENTER_ID MASTER_PLAN WORKER_PLAN DOMAIN EMAIL MASTER_ID
unset API_KEY USERNAME NO_OF_WORKERS REBOOT_STRATEGY WORKER_IDS

stty -echo
tput civis

if [ -f settings.env ]; then
    . settings.env
else
    touch settings.env
fi

# -- command line argument overrides --
# NOTE(review): `options` is kept in case sourced code reads it; it is no
# longer used for the loop below.
options=$@
# Iterate over "$@" so a value containing whitespace stays one argument, and
# strip only up to the first "=" so values that themselves contain "="
# (e.g. base64 padding) are kept whole. The earlier ${argument/*=/""} removed
# everything up to the last "=".
for argument in "$@"; do
    case $argument in
        --datacenter_id=*) DATACENTER_ID=${argument#*=} ;;
        --master_plan=*) MASTER_PLAN=${argument#*=} ;;
        --worker_plan=*) WORKER_PLAN=${argument#*=} ;;
        --no_of_workers=*) NO_OF_WORKERS=${argument#*=} ;;
        --domain=*) DOMAIN=${argument#*=} ;;
        --email=*) EMAIL=${argument#*=} ;;
        --master_id=*) MASTER_ID=${argument#*=} ;;
        --api_key=*) API_KEY=${argument#*=} ;;
        --username=*) USERNAME=${argument#*=} ;;
        --install_k8s_dashboard=*) INSTALL_K8S_DASHBOARD=${argument#*=} ;;
        --install_traefik=*) INSTALL_TRAEFIK=${argument#*=} ;;
        --install_rook=*) INSTALL_ROOK=${argument#*=} ;;
        --install_prometheus=*) INSTALL_PROMETHEUS=${argument#*=} ;;
        --reboot_strategy=*) REBOOT_STRATEGY=${argument#*=} ;;
    esac
done
# Collect every setting that is still missing.
read_api_key
read_datacenter
read_master_plan
read_worker_plan
read_domain
read_email
read_no_of_workers
read_username
read_install_options
read_reboot_strategy

# Make sure an RSA key pair exists; when it already does, load it into an
# ssh-agent unless the agent already lists its fingerprint.
if [[ ! ( -f ~/.ssh/id_rsa && -f ~/.ssh/id_rsa.pub ) ]]; then
    spinner "Generating new SSH key" "ssh-keygen -b 2048 -t rsa -f ~/.ssh/id_rsa -q -N \"\""
else
    eval `ssh-agent -s` >/dev/null 2>&1
    ssh-add -l | grep -q "$(ssh-keygen -lf ~/.ssh/id_rsa | awk '{print $2}')" || ssh-add ~/.ssh/id_rsa >/dev/null 2>&1
fi

# Create the basic-auth file and the Grafana credentials secret unless both
# are already present.
if [[ ! ( -f auth && -f manifests/grafana/grafana-credentials.yaml ) ]]; then
    # -r keeps backslashes in the password literal.
    read -r -s -p "${green}?${normal}${bold} Enter your dashboard password: ${normal}" PASSWORD
    tput cub "$(tput cols)"
    tput el
    rm -f auth
    # Quoted so a user name or password containing whitespace or glob
    # characters is passed to htpasswd as a single argument.
    htpasswd -b -c auth "$USERNAME" "$PASSWORD" >/dev/null 2>&1
    rm -f manifests/grafana/grafana-credentials.yaml
    # printf '%s' encodes the exact value; `echo -n $VAR` would collapse
    # whitespace and misread values such as "-e". The nested keys are
    # indented so the file is a valid Secret manifest.
    cat > manifests/grafana/grafana-credentials.yaml <<EOF
apiVersion: v1
kind: Secret
metadata:
  name: grafana-credentials
data:
  user: $(printf '%s' "$USERNAME" | base64 $base64_args)
  password: $(printf '%s' "$PASSWORD" | base64 $base64_args)
EOF
fi
# Main action dispatch. "destroy" optionally removes the linodes, the DNS
# record and the local configuration; "create" provisions the master and
# then brings the worker count up to NO_OF_WORKERS.
if [ "$1" == "destroy" ]; then
spinner "Retrieving master linode (if any)" get_master_id MASTER_ID
if ! [[ $MASTER_ID =~ ^[0-9]+$ ]] 2>/dev/null; then
tput el
echo "${red}No master node found! Cluster is likely to have been deleted.${normal}"
else
# List everything that would be removed and ask before deleting anything.
spinner "Retrieving worker linodes (if any)" list_worker_ids WORKER_IDS
tput el
echo "${bold}${red}The following nodes will be deleted:${normal}"
echo " ${cyan}${arrow}${normal} master_$MASTER_ID [https://manager.linode.com/linodes/dashboard/master_$MASTER_ID]"
for WORKER_ID in $WORKER_IDS; do
echo " ${cyan}${arrow}${normal} worker_$WORKER_ID [https://manager.linode.com/linodes/dashboard/worker_$WORKER_ID]"
done
text_input "Are you sure you want to delete the cluster? [y/n] " \
response "^[yn]$" "Please enter either 'y' or 'n'"
tput civis
if [[ "$response" =~ ^y$ ]]; then
# Workers are deleted first, the master last.
for WORKER_ID in $WORKER_IDS; do
spinner "${CYAN}[$WORKER_ID]${NORMAL} Deleting worker node" "delete_linode $WORKER_ID"
done
spinner "${CYAN}[$MASTER_ID]${NORMAL} Deleting master node" "delete_linode $MASTER_ID"
fi
fi
# The DNS record is offered for deletion only when a numeric domain id is
# returned for DOMAIN.
spinner "Retrieving DNS record for $DOMAIN" "get_domains \"$DOMAIN\"" DOMAIN_ID
if [[ $DOMAIN_ID =~ ^[0-9]+$ ]] 2>/dev/null; then
text_input "Do you want to delete the DNS record for $DOMAIN? [y/n] " \
response "^[yn]$" "Please enter either 'y' or 'n'"
tput civis
if [[ "$response" =~ ^y$ ]]; then
spinner "Deleting DNS record for $DOMAIN" delete_domain
fi
fi
text_input "Do you want to delete the current cluster configuration (including ~/.kube/config)? [y/n] " \
response "^[yn]$" "Please enter either 'y' or 'n'"
tput civis
if [[ "$response" =~ ^y$ ]]; then
[ -e manifests/grafana/grafana-credentials.yaml ] && rm manifests/grafana/grafana-credentials.yaml
[ -e cluster ] && rm -rf cluster
[ -e ~/.kube/config ] && rm ~/.kube/config
[ -e auth ] && rm auth
[ -e settings.env ] && rm settings.env
# settings.env is recreated holding only the API key.
touch settings.env
echo "API_KEY=$API_KEY" >> settings.env
fi
elif [ "$1" == "create" ]; then
spinner "Retrieving master linode (if any)" get_master_id MASTER_ID
if ! [[ $MASTER_ID =~ ^[0-9]+$ ]] 2>/dev/null; then
# No master exists: remove any leftover workers before creating a new master.
spinner "Retrieving list of workers" list_worker_ids WORKER_IDS
for WORKER_ID in $WORKER_IDS; do
spinner "${CYAN}[$WORKER_ID]${NORMAL} Deleting worker (since certs are now invalid)"\
"linode_api linode.delete LinodeID=$WORKER_ID skipChecks=true"
done
spinner "Creating master linode" "create_linode $DATACENTER_ID $MASTER_PLAN" MASTER_ID
spinner "Adding private IP" "add_private_ip $MASTER_ID"
spinner "${CYAN}[$MASTER_ID]${NORMAL} Initializing labels" \
"linode_api linode.update LinodeID=$MASTER_ID Label=\"master_${MASTER_ID}\" lpm_displayGroup=\"$DOMAIN (Unprovisioned)\""
fi
# Addresses are stored in dynamically named variables PUBLIC_<id> and
# PRIVATE_<id>.
spinner "${CYAN}[$MASTER_ID]${NORMAL} Getting public IP" "get_public_ip $MASTER_ID" MASTER_IP
declare "PUBLIC_$MASTER_ID=$MASTER_IP"
spinner "${CYAN}[$MASTER_IP]${NORMAL} Getting private IP" "get_private_ip $MASTER_ID" PRIVATE_IP
declare "PRIVATE_$MASTER_ID=$PRIVATE_IP"
spinner "${CYAN}[$MASTER_IP]${NORMAL} Retrieving provision status" "is_provisioned $MASTER_ID" IS_PROVISIONED
if [ $IS_PROVISIONED = false ] ; then
update_dns $MASTER_ID
install master $MASTER_ID
fi
tput el
echo "${CYAN}[$MASTER_IP]${NORMAL} Master provisioned"
# Create only as many workers as are missing to reach NO_OF_WORKERS.
spinner "${CYAN}[$MASTER_IP]${NORMAL} Retrieving current number of workers" get_no_of_workers CURRENT_NO_OF_WORKERS
NO_OF_NEW_WORKERS=$( echo "$NO_OF_WORKERS - $CURRENT_NO_OF_WORKERS" | bc )
if [[ $NO_OF_NEW_WORKERS -gt 0 ]]; then
for WORKER in $( seq $NO_OF_NEW_WORKERS ); do
spinner "Creating worker linode" "create_linode $DATACENTER_ID $WORKER_PLAN" WORKER_ID
spinner "Adding private IP" "add_private_ip $WORKER_ID"
spinner "Initializing labels" "change_to_unprovisioned $WORKER_ID worker"
done
fi
# Install every worker that is not yet marked as provisioned.
spinner "Retrieving list of workers" list_worker_ids WORKER_IDS
for WORKER_ID in $WORKER_IDS; do
spinner "${CYAN}[$WORKER_ID]${NORMAL} Getting public IP" "get_public_ip $WORKER_ID" PUBLIC_IP
declare "PUBLIC_$WORKER_ID=$PUBLIC_IP"
spinner "${CYAN}[$PUBLIC_IP]${NORMAL} Getting private IP" "get_private_ip $WORKER_ID" PRIVATE_IP
declare "PRIVATE_$WORKER_ID=$PRIVATE_IP"
if [ "$( is_provisioned $WORKER_ID )" = false ] ; then
install worker $WORKER_ID
fi
tput el
echo "${CYAN}[$PUBLIC_IP]${NORMAL} Worker provisioned"
done
fi
# Wait for any background jobs, then restore the cursor and terminal echo.
wait
tput cnorm
stty echo
================================================
FILE: linode-utilities.sh
================================================
#!/bin/bash
# Fall back to the per-user kubeconfig when none is configured.
[ -n "${KUBECONFIG}" ] || export KUBECONFIG=~/.kube/config
# SIGINT handler: wipe the current status line, put the terminal back into a
# usable state (sane modes, visible cursor, echo on) and abort the script.
control_c() {
  tput cub "$(tput cols)"   # move the cursor back to the first column
  tput el                   # erase to the end of the line
  stty sane
  tput cnorm
  stty echo
  # 128 + SIGINT(2). The previous `exit $?` propagated the status of
  # `stty echo`, so an interrupted run normally reported success.
  exit 130
}
# Run the cleanup handler when the user presses Ctrl-C.
trap control_c INT

# Terminal control sequences used to decorate status messages.
BOLD="$(tput bold)"
NORMAL="$(tput sgr0)"
CYAN="$(tput setaf 6)"
# Abort the script with a hint when the executable named by $1 is not on PATH.
check_dep() {
  if ! command -v $1 >/dev/null 2>&1; then
    echo "Please install \`${BOLD}$1${NORMAL}\` before running this script." >&2
    exit 1
  fi
}
# Call one action of the Linode HTTP API.
#   $1   api_action name (e.g. linode.list)
#   $2.. extra form fields, each given as KEY=VALUE
# The API key is read from $API_KEY. curl's response body is written to
# stdout unchanged.
linode_api() {
  # Keep the scratch variables local so a call never clobbers a caller's
  # `args` / `arg`.
  local -a args=(-F "api_action=$1") ; shift
  local arg
  for arg in "$@" ; do
    args+=(-F "$arg")
  done
  curl -s -X POST "https://api.linode.com/" -H 'cache-control: no-cache' \
    -F "api_key=$API_KEY" "${args[@]}"
}
# Poll every 3 seconds until the linode $1 has no pending jobs left.
wait_jobs() {
  LINODE_ID=$1
  until linode_api linode.job.list LinodeID=$LINODE_ID pendingOnly=1 | jq -Mje '.DATA == []' >/dev/null; do
    sleep 3
  done
}
# Poll every 3 seconds until the job list of linode $1 contains a successful
# "Lassie initiated boot: CoreOS" job, then wait another 10 seconds.
wait_boot() {
  LINODE_ID=$1
  until [[ $(linode_api linode.job.list LinodeID=$LINODE_ID | jq ".DATA" | \
    jq -c "[ .[] | select(.LABEL == \"Lassie initiated boot: CoreOS\") | select(.HOST_SUCCESS == 1)]" | \
    jq ".[] | .JOBID") =~ ^[0-9]+ ]]; do
    sleep 3
  done
  sleep 10
}
# Print the STATUS field of linode $1 (first line of output only).
get_status() {
  local status_filter=".[] | .STATUS"
  linode_api linode.list LinodeID=$1 \
    | jq ".DATA" \
    | jq -c "$status_filter" \
    | sed -n 1p
}
# Print the LINODEID of every linode whose display group contains $DOMAIN and
# whose label starts with "worker_", one id per line.
list_worker_ids() {
  local in_cluster="[ .[] | select(.LPM_DISPLAYGROUP | contains (\"$DOMAIN\")) ]"
  local worker_ids=".[] | select(.LABEL | startswith(\"worker_\")) | .LINODEID"
  linode_api linode.list | jq ".DATA" | jq -c "$in_cluster" | jq -c "$worker_ids"
}
# Print the LINODEID of the first linode whose display group contains $DOMAIN
# and whose label starts with "master_".
get_master_id() {
  local in_cluster="[ .[] | select(.LPM_DISPLAYGROUP | contains (\"$DOMAIN\")) ]"
  local master_ids=".[] | select(.LABEL | startswith(\"master_\")) | .LINODEID"
  linode_api linode.list | jq ".DATA" | jq -c "$in_cluster" | jq -c "$master_ids" | sed -n 1p
}
# Print "true" when the display group of linode $1 equals $DOMAIN exactly,
# otherwise "false". (change_to_unprovisioned appends " (Unprovisioned)" to
# the group, which therefore yields "false".)
is_provisioned() {
  local IS_PROVISIONED=false
  # The substitution is quoted so that an empty or multi-line API answer is
  # compared as a plain string instead of breaking the `[` expression.
  if [ "$( linode_api linode.list LinodeID=$1 | jq ".DATA" | jq -c ".[] | .LPM_DISPLAYGROUP == \"$DOMAIN\"" )" = true ] ; then
    IS_PROVISIONED=true
  fi
  echo $IS_PROVISIONED
}
# Request a shutdown of linode $1 and block until its pending jobs are done.
# The API response is discarded.
shutdown() {
  local LINODE_ID
  LINODE_ID=$1
  linode_api linode.shutdown LinodeID=$LINODE_ID > /dev/null
  wait_jobs $LINODE_ID
}
# Print the DISKID of every disk attached to linode $1, one per line.
get_disk_ids() {
  local LINODE_ID=$1
  local disk_id_filter=".[] | .DISKID"
  linode_api linode.disk.list LinodeID=$LINODE_ID | jq ".DATA" | jq -c "$disk_id_filter"
}
# Print the ConfigID of every configuration profile of linode $1, one per line.
get_config_ids() {
  local LINODE_ID=$1
  local config_id_filter=".[] | .ConfigID"
  linode_api linode.config.list LinodeID=$LINODE_ID | jq ".DATA" | jq -c "$config_id_filter"
}
# Return linode $1 to a blank state: shut it down when its status is "1",
# delete every disk and every configuration profile, then wait for the
# resulting jobs to finish. Each step is reported through `spinner`.
reset_linode() {
  local LINODE_ID=$1
  local DISK_IDS CONFIG_IDS STATUS
  PUBLIC_IP=$(get_public_ip $LINODE_ID)
  # Common "[ip]" prefix shown in front of every status message.
  local node_tag="${CYAN}[$PUBLIC_IP]${NORMAL}"

  spinner "$node_tag Getting status" "get_status $LINODE_ID" STATUS
  if [ "$STATUS" = "1" ]; then
    spinner "$node_tag Shutting down linode" "shutdown $LINODE_ID"
  fi

  spinner "$node_tag Retrieving disk list" "get_disk_ids $LINODE_ID" DISK_IDS
  for DISK_ID in $DISK_IDS; do
    spinner "$node_tag Deleting disk $DISK_ID" \
      "linode_api linode.disk.delete LinodeID=$LINODE_ID DiskID=$DISK_ID"
  done

  spinner "$node_tag Retrieving config list" "get_config_ids $LINODE_ID" CONFIG_IDS
  for CONFIG_ID in $CONFIG_IDS; do
    spinner "$node_tag Deleting config $CONFIG_ID" \
      "linode_api linode.config.delete LinodeID=$LINODE_ID ConfigID=$CONFIG_ID"
  done

  spinner "$node_tag Waiting for all jobs to complete" "wait_jobs $LINODE_ID"
}
# Print the public IPv4 address of linode $1. A value cached in the variable
# PUBLIC_<id> is used when it looks like a dotted quad; otherwise the address
# is looked up through the API.
get_public_ip() {
  local LINODE_ID=$1
  local cache_var="PUBLIC_$LINODE_ID"
  local IP=${!cache_var}
  if ! [[ $IP =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]] 2>/dev/null; then
    IP="$( linode_api linode.ip.list LinodeID=$LINODE_ID | jq -Mje '.DATA[] | select(.ISPUBLIC==1) | .IPADDRESS' | sed -n 1p )"
  fi
  echo $IP
}
# Print the private IPv4 address of linode $1. A value cached in the variable
# PRIVATE_<id> is used when it looks like a dotted quad; otherwise the address
# is looked up through the API.
get_private_ip() {
  local LINODE_ID=$1
  local cache_var="PRIVATE_$LINODE_ID"
  local IP=${!cache_var}
  if ! [[ $IP =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]] 2>/dev/null; then
    IP="$( linode_api linode.ip.list LinodeID=$LINODE_ID | jq -Mje '.DATA[] | select(.ISPUBLIC==0) | .IPADDRESS' | sed -n 1p )"
  fi
  echo $IP
}
# Print the PLANID of linode $1.
get_plan_id() {
  local LINODE_ID=$1
  local plan_filter=".DATA[0].PLANID"
  linode_api linode.list LinodeID=$LINODE_ID | jq "$plan_filter"
}
# Print the DISK value of plan $1 multiplied by 1024 (computed with bc).
get_max_disk_size() {
  local PLAN=$1
  local plan_disk=$( linode_api avail.linodeplans PlanID=$PLAN | jq ".DATA[0].DISK" )
  echo "$plan_disk" "*1024" | bc
}
# Create a raw disk and print its DiskID.
#   $1 linode id   $2 size   $3 disk label
create_raw_disk() {
  linode_api linode.disk.create LinodeID=$1 Label="$3" Type=raw Size=$2 \
    | jq '.DATA.DiskID'
}
# Create an ext4 disk and print its DiskID.
#   $1 linode id   $2 size   $3 disk label
create_ext4_disk() {
  linode_api linode.disk.create LinodeID=$1 Label="$3" Type=ext4 Size=$2 \
    | jq '.DATA.DiskID'
}
# Create the "Installer" disk from distribution 140 and print its DiskID.
# Reads $LINODE_ID, $INSTALL_DISK_SIZE and $ROOT_PASSWORD from the caller's
# scope and authorises the key in ~/.ssh/id_rsa.pub for root.
create_install_disk() {
  local ssh_public_key="$( cat ~/.ssh/id_rsa.pub )"
  linode_api linode.disk.createFromDistribution \
    LinodeID=$LINODE_ID \
    DistributionID=140 \
    Label=Installer \
    Size=$INSTALL_DISK_SIZE \
    rootPass="$ROOT_PASSWORD" \
    rootSSHKey="$ssh_public_key" | jq ".DATA.DiskID"
}
# Create the "Installer" configuration profile (kernel 138, root device 2,
# network helper on) over $DISK_ID and $INSTALL_DISK_ID and print its ConfigID.
# Reads $LINODE_ID, $DISK_ID and $INSTALL_DISK_ID from the caller's scope.
create_boot_configuration() {
  linode_api linode.config.create \
    LinodeID=$LINODE_ID \
    KernelID=138 \
    Label="Installer" \
    DiskList=$DISK_ID,$INSTALL_DISK_ID \
    RootDeviceNum=2 \
    helper_network=true | jq ".DATA.ConfigID"
}
# Boot linode $1 with configuration profile $2 and block until its pending
# jobs are done. The API response is discarded.
boot_linode() {
  local LINODE_ID CONFIG_ID
  LINODE_ID=$1
  CONFIG_ID=$2
  linode_api linode.boot LinodeID=$LINODE_ID ConfigID=$CONFIG_ID > /dev/null
  wait_jobs $LINODE_ID
}
# Turn configuration profile $CONFIG_ID into the "CoreOS" profile: disks
# $DISK_ID and $STORAGE_DISK_ID, kernel 213, root device 1, network helper off.
# Reads $LINODE_ID, $CONFIG_ID, $DISK_ID and $STORAGE_DISK_ID from the caller.
update_coreos_config() {
  linode_api linode.config.update \
    LinodeID=$LINODE_ID \
    ConfigID=$CONFIG_ID \
    Label="CoreOS" \
    DiskList=$DISK_ID,$STORAGE_DISK_ID \
    KernelID=213 \
    RootDeviceNum=1 \
    helper_network=false
}
# Copy the local acme.json to /etc/traefik/acme/acme.json on host $1 (as user
# core) and restrict it to mode 600. The file travels base64-encoded inside
# the remote command line; all ssh output is discarded.
transfer_acme() {
  IP=$1
  local -a remote_shell=(ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -tt "core@$IP")
  local encoded_acme="$( base64 $base64_args < acme.json )"
  "${remote_shell[@]}" \
    "sudo truncate -s 0 /etc/traefik/acme/acme.json; echo '$encoded_acme' | base64 --decode | sudo tee --append /etc/traefik/acme/acme.json" 2>/dev/null >/dev/null
  "${remote_shell[@]}" \
    "sudo chmod 600 /etc/traefik/acme/acme.json" 2>/dev/null >/dev/null
}
# Mark linode $1 of node type $2 (master/worker) as provisioned: label it
# "<type>_<id>" and set its display group to exactly $DOMAIN.
change_to_provisioned() {
  linode_api linode.update LinodeID=$1 Label="${2}_${1}" lpm_displayGroup="$DOMAIN"
}
# Mark linode $1 of node type $2 (master/worker) as not yet provisioned: label
# it "<type>_<id>" and set its display group to "$DOMAIN (Unprovisioned)".
change_to_unprovisioned() {
  linode_api linode.update LinodeID=$1 Label="${2}_${1}" lpm_displayGroup="$DOMAIN (Unprovisioned)"
}
# Upload install-coreos.sh and the Container Linux config for node type $2 to
# linode $1 (as root) and run the installer there. Every step is retried every
# 5 seconds until it succeeds, so errexit is suspended for the duration.
install_coreos() {
  LINODE_ID=$1
  NODE_TYPE=$2
  PUBLIC_IP=$(get_public_ip $LINODE_ID)
  local -a ssh_opts=(-i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no)
  set +e
  until scp "${ssh_opts[@]}" \
    -r install-coreos.sh root@${PUBLIC_IP}:~/install-coreos.sh; do
    sleep 5
  done
  until scp "${ssh_opts[@]}" \
    -r manifests/container-linux/${NODE_TYPE}-config.yaml root@${PUBLIC_IP}:~/container-linux-config.yaml; do
    sleep 5
  done
  until ssh "${ssh_opts[@]}" root@${PUBLIC_IP} \
    "chmod +x ./install-coreos.sh"; do
    sleep 5
  done
  until ssh "${ssh_opts[@]}" root@${PUBLIC_IP} \
    "REBOOT_STRATEGY=${REBOOT_STRATEGY} ./install-coreos.sh"; do
    sleep 5
  done
  set -e
}
# Install and provision one node from scratch.
#   $1 node type ("master" or "worker")   $2 linode id
# Wipes the linode, creates a 10240mb CoreOS disk and a temporary installer
# disk, boots the installer to write CoreOS, swaps the installer disk for a
# storage disk that takes the remaining space, boots CoreOS and runs
# provision_<type>. When the last line of the provisioning output is not
# "provisioned <type>" the whole installation is started over.
install() {
  local NODE_TYPE=$1
  local LINODE_ID=$2
  local PLAN
  local ROOT_PASSWORD
  PUBLIC_IP=$(get_public_ip $LINODE_ID)
  reset_linode $LINODE_ID
  # Common "[ip]" prefix shown in front of every status message.
  local node_tag="${CYAN}[$PUBLIC_IP]${NORMAL}"

  spinner "$node_tag Generating root password" "openssl rand -base64 32" ROOT_PASSWORD
  spinner "$node_tag Retrieving current plan" "get_plan_id $LINODE_ID" PLAN
  spinner "$node_tag Retrieving maximum available disk size" "get_max_disk_size $PLAN" TOTAL_DISK_SIZE

  INSTALL_DISK_SIZE=2000
  COREOS_DISK_SIZE=10240
  STORAGE_DISK_SIZE=$((${TOTAL_DISK_SIZE}-${COREOS_DISK_SIZE}))

  spinner "$node_tag Creating ${COREOS_DISK_SIZE}mb CoreOS disk" \
    "create_raw_disk $LINODE_ID $COREOS_DISK_SIZE CoreOS" DISK_ID
  # Disk holding the temporary installer OS.
  spinner "$node_tag Creating ${INSTALL_DISK_SIZE}mb install disk" create_install_disk INSTALL_DISK_ID
  # Configuration profile that boots the installer.
  spinner "$node_tag Creating boot configuration" create_boot_configuration CONFIG_ID
  spinner "$node_tag Booting installer" "boot_linode $LINODE_ID $CONFIG_ID"
  spinner "$node_tag Installing CoreOS (might take a while)" "install_coreos $LINODE_ID $NODE_TYPE"
  spinner "$node_tag Shutting down CoreOS" "linode_api linode.shutdown LinodeID=$LINODE_ID"
  spinner "$node_tag Deleting install disk $INSTALL_DISK_ID" \
    "linode_api linode.disk.delete LinodeID=$LINODE_ID DiskID=$INSTALL_DISK_ID"
  spinner "$node_tag Waiting for existing jobs to complete" "wait_jobs $LINODE_ID"
  spinner "$node_tag Creating ${STORAGE_DISK_SIZE}mb storage disk" \
    "create_raw_disk $LINODE_ID $STORAGE_DISK_SIZE Storage" STORAGE_DISK_ID
  spinner "$node_tag Updating CoreOS config" update_coreos_config
  spinner "$node_tag Waiting for existing jobs to complete" "wait_jobs $LINODE_ID"
  spinner "$node_tag Booting CoreOS" "linode_api linode.boot LinodeID=$LINODE_ID ConfigID=$CONFIG_ID"
  spinner "$node_tag Waiting for CoreOS to be ready" "wait_jobs $LINODE_ID; sleep 20"

  # A previously saved acme.json is only pushed to the master.
  if [ "$NODE_TYPE" = "master" ] && [ -e acme.json ] ; then
    spinner "$node_tag Transferring acme.json" "transfer_acme $PUBLIC_IP"
  fi

  spinner "$node_tag Provisioning $NODE_TYPE node (might take a while)" \
    "provision_$NODE_TYPE $PUBLIC_IP" PROVISION_LOGS
  if [ "$( echo "${PROVISION_LOGS}" | tail -n1 )" = "provisioned $NODE_TYPE" ]; then
    spinner "$node_tag Changing status to provisioned" "change_to_provisioned $LINODE_ID $NODE_TYPE"
  else
    install $NODE_TYPE $LINODE_ID
  fi
}
# Provision the master node reachable at IP $1.
# Starts bootkube on the node, copies the generated assets into ./cluster,
# installs cluster/auth/kubeconfig as ~/.kube/config (keeping one .bak copy of
# a previous config), creates the `kubesecret` secret from the local `auth`
# file, runs the node's bootstrap.sh and then applies the optional Rook and
# Prometheus manifests. kubectl calls are retried every 5 seconds until they
# succeed.
# The final line printed must be "provisioned master": `install` compares the
# last line of this function's output with "provisioned $NODE_TYPE" and
# re-installs the node when they differ.
provision_master() {
  IP=$1
  until ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -tt "core@$IP" \
    "sudo systemctl start bootkube"; do sleep 5; done

  # Fetch the bootkube assets into a fresh ./cluster directory.
  [ -e cluster ] && rm -rf cluster
  mkdir cluster
  ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no core@${IP} "sudo chown -R core:core /opt/bootkube/assets"
  scp -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -r core@${IP}:/opt/bootkube/assets/* cluster

  # Install the new kubeconfig, keeping the previous one as config.bak.
  mkdir -p ~/.kube
  [ -e ~/.kube/config.bak ] && rm ~/.kube/config.bak
  [ -e ~/.kube/config ] && mv ~/.kube/config ~/.kube/config.bak
  cp cluster/auth/kubeconfig ~/.kube/config

  until kubectl --namespace=kube-system create secret generic kubesecret --from-file auth --request-timeout 0; do sleep 5; done
  cat </dev/null >/dev/null
  ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -tt "core@$IP" "sudo ./bootstrap.sh" 2>/dev/null >/dev/null
  ssh -i ~/.ssh/id_rsa -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -tt "core@$IP" "rm -rf /home/core/kubeconfig && rm -rf /home/core/bootstrap.sh" 2>/dev/null >/dev/null

  # From here on failures are handled by retrying, not by errexit.
  set +e
  until kubectl get nodes > /dev/null 2>&1; do sleep 1; done

  if [ $INSTALL_ROOK = true ]; then
    # Skip when a rook-api pod already exists.
    if ! kubectl --namespace rook get pods --request-timeout 0 2>/dev/null | grep -q "^rook-api"; then
      until kubectl apply -f manifests/rook/rook-operator.yaml --request-timeout 0; do sleep 5; done
      until kubectl apply -f manifests/rook/rook-cluster.yaml --request-timeout 0; do sleep 5; done
      until kubectl apply -f manifests/rook/rook-storageclass.yaml --request-timeout 0; do sleep 5; done
    fi
  fi

  if [ $INSTALL_PROMETHEUS = true ]; then
    until kubectl get nodes > /dev/null 2>&1; do sleep 1; done
    # Skip when the prometheus ingress already exists.
    if ! kubectl --namespace monitoring get ingress --request-timeout 0 2>/dev/null | grep -q "^prometheus-ingress"; then
      until kubectl --namespace monitoring apply -f manifests/prometheus-operator --request-timeout 0; do sleep 5; done
      printf "Waiting for Operator to register third party objects..."
      until kubectl --namespace monitoring get servicemonitor > /dev/null 2>&1; do sleep 1; printf "."; done
      until kubectl --namespace monitoring get prometheus > /dev/null 2>&1; do sleep 1; printf "."; done
      until kubectl --namespace monitoring get alertmanager > /dev/null 2>&1; do sleep 1; printf "."; done
      until kubectl --namespace monitoring apply -f manifests/node-exporter --request-timeout 0; do sleep 5; done
      until kubectl --namespace monitoring apply -f manifests/kube-state-metrics --request-timeout 0; do sleep 5; done
      until kubectl --namespace monitoring apply -f manifests/grafana/grafana-credentials.yaml --request-timeout 0; do sleep 5; done
      until kubectl --namespace monitoring apply -f manifests/grafana --request-timeout 0; do sleep 5; done
      # Roles, role bindings and the ingress are applied separately below.
      until find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml ! -name prometheus-k8s-ingress.yaml -exec kubectl --request-timeout 0 --namespace "monitoring" apply -f {} \; ; do sleep 5; done
      until kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml --request-timeout 0; do sleep 5; done
      until kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml --request-timeout 0; do sleep 5; done
      until kubectl --namespace monitoring apply -f manifests/alertmanager/ --request-timeout 0; do sleep 5; done
      # The ingress manifest carries a ${DOMAIN} placeholder.
      until cat manifests/prometheus/prometheus-k8s-ingress.yaml | sed "s/\${DOMAIN}/${DOMAIN}/g" | kubectl apply --request-timeout 0 --validate=false -f -; do sleep 5; done
    fi
  fi
  set -e
  echo "provisioned master"
}
# Make sure $API_KEY holds a key the Linode API accepts, prompting until it
# does, then store it in settings.env (replacing any previous API_KEY line).
read_api_key() {
  local result=false
  local api_key_prompt="Enter Linode API Key (https://manager.linode.com/profile/api) : "

  # Ask until the value at least looks like a key.
  until [[ $API_KEY =~ ^-?[0-9a-zA-Z]+$ ]] 2>/dev/null; do
    text_input "$api_key_prompt" API_KEY
    tput civis
  done

  # Ask again for as long as the API rejects the key.
  while true ; do
    spinner "Verifying API Key" check_api_key result
    if [ $result = true ] ; then
      break
    fi
    text_input "$api_key_prompt" API_KEY
    tput civis
  done

  sed -i.bak '/^API_KEY/d' settings.env
  echo "API_KEY=$API_KEY" >> settings.env
  rm settings.env.bak
}
# Print "true" when a test.echo call comes back with an empty ERRORARRAY,
# otherwise "false".
check_api_key() {
  local key_ok=false
  linode_api test.echo | jq -e ".ERRORARRAY == []" >/dev/null && key_ok=true
  echo $key_ok
}
# Print the available Linode plans as a JSON array sorted by PRICE.
get_plans() {
  local by_price=".DATA | sort_by(.PRICE)"
  linode_api avail.linodeplans | jq "$by_price"
}
# Ask which optional components to install unless all four INSTALL_* settings
# are already set. The choice is evaluated into the current shell and written
# to settings.env, replacing earlier INSTALL_* lines.
read_install_options() {
  if [[ -n $INSTALL_K8S_DASHBOARD && -n $INSTALL_TRAEFIK && -n $INSTALL_ROOK && -n $INSTALL_PROMETHEUS ]]; then
    return 0
  fi

  options=('K8S Dashboard' 'Traefik (Load Balancer)' 'Rook (Distributed Storage)' 'Prometheus (Monitoring)')
  env_names=('INSTALL_K8S_DASHBOARD' 'INSTALL_TRAEFIK' 'INSTALL_ROOK' 'INSTALL_PROMETHEUS')
  selected_indices=(0 1 2 3)   # everything is pre-selected
  checkbox_input_indices "What should be included in your cluster?" options selected_indices
  eval "$(gen_env_from_options selected_indices env_names)"

  # Drop stale entries before appending the fresh ones.
  local _stale_setting
  for _stale_setting in "${env_names[@]}"; do
    sed -i.bak "/^${_stale_setting}/d" settings.env
  done
  echo "$(gen_env_from_options selected_indices env_names)" >> settings.env
  rm settings.env.bak
}
# Let the user pick the master plan from the plans with at least 2048 RAM
# unless $MASTER_PLAN is already numeric, then append it to settings.env.
read_master_plan() {
  # Nothing to do when a plain numeric plan id is already configured.
  if [[ $MASTER_PLAN =~ ^[0-9]+$ ]] 2>/dev/null; then
    return 0
  fi

  until [[ $MASTER_PLAN =~ ^-?[0-9]+$ ]] 2>/dev/null; do
    IFS=$'\n'   # split the jq/awk output below on newlines only
    spinner "Retrieving plans" get_plans plan_data
    local plan_ids=($(echo $plan_data | jq -r '.[] | select(.RAM >= 2048) | .PLANID'))
    local plan_list=($(echo $plan_data | jq -r '.[] | select(.RAM >= 2048) | [.RAM, .PRICE] | @csv' | \
      awk -v FS="," '{ram=$1/1024; printf "%3sGB (\$%s/mo)%s",ram,$2,ORS}' 2>/dev/null))
    list_input_index "Select a master plan (https://www.linode.com/pricing)" plan_list selected_disk_id
    MASTER_PLAN=${plan_ids[$selected_disk_id]}
  done

  echo "MASTER_PLAN=$MASTER_PLAN" >> settings.env
}
# Let the user pick the worker plan from the plans with at least 2048 RAM
# unless $WORKER_PLAN is already numeric, then append it to settings.env.
read_worker_plan() {
  # Nothing to do when a plain numeric plan id is already configured.
  if [[ $WORKER_PLAN =~ ^[0-9]+$ ]] 2>/dev/null; then
    return 0
  fi

  until [[ $WORKER_PLAN =~ ^-?[0-9]+$ ]] 2>/dev/null; do
    IFS=$'\n'   # split the jq/awk output below on newlines only
    spinner "Retrieving plans" get_plans plan_data
    tput el
    local plan_ids=($(echo $plan_data | jq -r '.[] | select(.RAM >= 2048) | .PLANID'))
    local plan_list=($(echo $plan_data | jq -r '.[] | select(.RAM >= 2048) | [.RAM, .PRICE] | @csv' | \
      awk -v FS="," '{ram=$1/1024; printf "%3sGB (\$%s/mo)%s",ram,$2,ORS}' 2>/dev/null))
    list_input_index "Select a worker plan (https://www.linode.com/pricing)" plan_list selected_disk_id
    WORKER_PLAN=${plan_ids[$selected_disk_id]}
  done

  echo "WORKER_PLAN=$WORKER_PLAN" >> settings.env
}
# Print the available datacenters as a JSON array sorted by LOCATION.
get_datacenters() {
  local by_location=".DATA | sort_by(.LOCATION)"
  linode_api avail.datacenters | jq "$by_location"
}
# Let the user pick a datacenter unless $DATACENTER_ID is already numeric,
# then append the choice to settings.env.
read_datacenter() {
  # Nothing to do when a plain numeric datacenter id is already configured.
  if [[ $DATACENTER_ID =~ ^[0-9]+$ ]] 2>/dev/null; then
    return 0
  fi

  until [[ $DATACENTER_ID =~ ^-?[0-9]+$ ]] 2>/dev/null; do
    IFS=$'\n'   # split the jq output below on newlines only
    spinner "Retrieving datacenters" get_datacenters datacenters_data
    tput el
    datacenters_ids=($(echo $datacenters_data | jq -r '.[] | .DATACENTERID'))
    datacenters_list=($(echo $datacenters_data | jq -r '.[] | .LOCATION'))
    list_input_index "Select a datacenter" datacenters_list selected_data_center_index
    DATACENTER_ID=${datacenters_ids[$selected_data_center_index]}
  done

  echo "DATACENTER_ID=$DATACENTER_ID" >> settings.env
}
# Prompt for $DOMAIN until it matches the domain-name pattern. The value is
# appended to settings.env only when it had to be asked for. Hides the cursor
# afterwards.
read_domain() {
  domain_regex="^([a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9]\.)+[a-zA-Z]{2,}$"
  local _domain_was_asked=false
  until [[ $DOMAIN =~ $domain_regex ]] 2>/dev/null; do
    text_input "Enter Domain Name: " DOMAIN "$domain_regex" "Please enter a valid domain name"
    _domain_was_asked=true
  done
  if [ "$_domain_was_asked" = true ]; then
    echo "DOMAIN=$DOMAIN" >> settings.env
  fi
  tput civis
}
# Prompt for $EMAIL until it matches the e-mail pattern. The value is appended
# to settings.env only when it had to be asked for. Hides the cursor
# afterwards.
read_email() {
  email_regex="^[a-z0-9!#\$%&'*+/=?^_\`{|}~-]+(\.[a-z0-9!#$%&'*+/=?^_\`{|}~-]+)*@([a-z0-9]([a-z0-9-]*[a-z0-9])?\.)+[a-z0-9]([a-z0-9-]*[a-z0-9])?\$"
  local _email_was_asked=false
  until [[ $EMAIL =~ $email_regex ]] 2>/dev/null; do
    # The prompt validates against the same pattern as the loop condition.
    text_input "Enter Email (for ACME registration): " EMAIL "$email_regex" "Please enter a valid email"
    _email_was_asked=true
  done
  if [ "$_email_was_asked" = true ]; then
    echo "EMAIL=$EMAIL" >> settings.env
  fi
  tput civis
}
# Ask for the dashboard username when $USERNAME is empty and append it to
# settings.env. Existing `auth` and grafana credential files are removed
# first. Hides the cursor afterwards.
read_username() {
  if [ -z "$USERNAME" ]; then
    if [ -e auth ]; then
      rm auth
    fi
    if [ -e manifests/grafana/grafana-credentials.yaml ]; then
      rm manifests/grafana/grafana-credentials.yaml
    fi
    text_input "Enter dashboard username: " USERNAME
    echo "USERNAME=$USERNAME" >> settings.env
  fi
  tput civis
}
# Let the user choose the update strategy when $REBOOT_STRATEGY is empty and
# append the choice to settings.env.
read_reboot_strategy() {
  if [ -n "$REBOOT_STRATEGY" ]; then
    return 0
  fi
  strategies=("off" "etcd-lock" "reboot")
  list_input_index "Select a update strategy (see https://coreos.com/os/docs/latest/update-strategies.html)" strategies strategy
  REBOOT_STRATEGY=${strategies[$strategy]}
  echo "REBOOT_STRATEGY=$REBOOT_STRATEGY" >> settings.env
}
# Print the DOMAINID of every DNS zone whose DOMAIN equals $1.
get_domains() {
  local zone_filter=".[] | select(.DOMAIN == \"$1\") | .DOMAINID"
  linode_api domain.list | jq ".DATA" | jq -c "$zone_filter"
}
# Print the DATA array of the resource records of DNS zone $1.
get_resources() {
  linode_api domain.resource.list DomainID=$1 \
    | jq ".DATA"
}
# Add an 'A' record with an empty name that points zone $DOMAIN_ID at $IP.
# Reads both values from the caller's scope; the API response is discarded.
create_A_domain() {
  local -a record_fields=(
    TARGET="$IP" TTL_SEC=0 PORT=80 PROTOCOL=''
    PRIORITY=10 WEIGHT=5 TYPE='A' NAME=''
  )
  linode_api domain.resource.create DomainID=$DOMAIN_ID "${record_fields[@]}" >/dev/null
}
# Add a wildcard ('*') CNAME record in zone $DOMAIN_ID that targets $DOMAIN.
# Reads both values from the caller's scope; the API response is discarded.
create_CNAME_domain() {
  local -a record_fields=(
    TARGET="$DOMAIN" TTL_SEC=0 PORT=80 PROTOCOL=""
    PRIORITY=10 WEIGHT=5 TYPE="CNAME" NAME="*"
  )
  linode_api domain.resource.create DomainID=$DOMAIN_ID "${record_fields[@]}" >/dev/null
}
# Print the IPADDRESSID of the first address record equal to $IP (read from
# the caller's scope).
get_ip_address_id() {
  local address_filter=".[] | select(.IPADDRESS == \"$IP\") | .IPADDRESSID"
  linode_api linode.ip.list | jq ".DATA" | jq -c "$address_filter" | sed -n 1p
}
# Reset the SOA settings of the existing master zone $DOMAIN_ID for $DOMAIN.
# Reads $DOMAIN_ID, $DOMAIN and $EMAIL from the caller's scope; the API
# response is discarded.
update_domain() {
  local -a zone_fields=(
    Domain="$DOMAIN" TTL_sec=300 axfr_ips="none" Expire_sec=604800
    SOA_Email="$EMAIL" Retry_sec=300 status=1 Refresh_sec=300 Type=master
  )
  linode_api domain.update DomainID=$DOMAIN_ID "${zone_fields[@]}" >/dev/null
}
# Create a master DNS zone for $DOMAIN with $EMAIL as SOA contact. Reads both
# values from the caller's scope; the API response is discarded.
create_domain() {
  local -a zone_fields=(
    Domain="$DOMAIN" TTL_sec=300 axfr_ips="none" Expire_sec=604800
    SOA_Email="$EMAIL" Retry_sec=300 status=1 Refresh_sec=300 Type=master
  )
  linode_api domain.create "${zone_fields[@]}" >/dev/null
}
# Delete DNS zone $DOMAIN_ID ($DOMAIN). Reads both values from the caller's
# scope; the API response is discarded.
delete_domain() {
  linode_api domain.delete \
    DomainID="$DOMAIN_ID" \
    Domain="$DOMAIN" > /dev/null
}
# Point the DNS zone for $DOMAIN at linode $1.
# Creates the zone when it does not exist, refreshes its SOA settings, makes
# sure the apex 'A' record targets the linode's cached public IP (other apex
# 'A' records are deleted first) and makes sure a CNAME targeting $DOMAIN
# exists (a wildcard one is created otherwise).
# The public IP is read from the PUBLIC_<id> variable set by the caller.
update_dns() {
  local LINODE_ID=$1
  local DOMAIN_ID
  local IP
  local RESOURCE_IDS
  eval IP=\$PUBLIC_$LINODE_ID
  spinner "${CYAN}[$IP]${NORMAL} Retrieving DNS record for $DOMAIN" "get_domains \"$DOMAIN\"" DOMAIN_ID
  if ! [[ $DOMAIN_ID =~ ^[0-9]+$ ]] 2>/dev/null; then
    spinner "${CYAN}[$IP]${NORMAL} Creating DNS record for $DOMAIN" create_domain
  fi
  # Look the zone up again so DOMAIN_ID is set when it was just created.
  spinner "${CYAN}[$IP]${NORMAL} Retrieving DNS record for $DOMAIN" "get_domains \"$DOMAIN\"" DOMAIN_ID
  spinner "${CYAN}[$IP]${NORMAL} Updating DNS record for $DOMAIN" update_domain
  spinner "${CYAN}[$IP]${NORMAL} Retrieving list of resources for $DOMAIN" "get_resources $DOMAIN_ID" RESOURCE_LIST
  IFS=$'\n'
  if ! [[ $(echo $RESOURCE_LIST | jq -c ".[] | select(.TYPE == \"A\" and .TARGET == \"$IP\") | .RESOURCEID" | sed -n 1p) =~ ^[0-9]+$ ]] 2>/dev/null; then
    RESOURCE_IDS=$(echo $RESOURCE_LIST | jq -c ".[] | select(.TYPE == \"A\" and .NAME == \"\") | .RESOURCEID")
    for RESOURCE_ID in $RESOURCE_IDS; do
      spinner "${CYAN}[$IP]${NORMAL} Deleting 'A' DNS record $RESOURCE_ID" "linode_api domain.resource.delete DomainID=$DOMAIN_ID ResourceID=$RESOURCE_ID"
    done
    spinner "${CYAN}[$IP]${NORMAL} Adding 'A' DNS record to $DOMAIN with target $IP" create_A_domain
  fi
  # Only the first match is tested, as in the 'A' check above: with several
  # matching CNAME records the multi-line output would never match the
  # single-number pattern and a new wildcard would be requested on every run.
  if ! [[ $(echo $RESOURCE_LIST | jq -c ".[] | select(.TYPE == \"CNAME\" and .TARGET == \"$DOMAIN\") | .RESOURCEID" | sed -n 1p) =~ ^[0-9]+$ ]] 2>/dev/null; then
    spinner "${CYAN}[$IP]${NORMAL} Adding wildcard 'CNAME' record with target $DOMAIN" create_CNAME_domain
  fi
}
# Prompt for $NO_OF_WORKERS until it is a non-negative integer. The value is
# appended to settings.env only when it had to be asked for. Hides the cursor
# afterwards.
read_no_of_workers() {
  local _workers_were_asked=false
  until [[ $NO_OF_WORKERS =~ ^[0-9]+$ ]] 2>/dev/null; do
    text_input "Enter number of workers: " NO_OF_WORKERS "^[0-9]+$" "Please enter a number"
    _workers_were_asked=true
  done
  if [ "$_workers_were_asked" = true ]; then
    echo "NO_OF_WORKERS=$NO_OF_WORKERS" >> settings.env
  fi
  tput civis
}
# Create a linode in datacenter $1 with plan $2 and print its LinodeID.
# Note: DATACENTER_ID and PLAN_ID are assigned without `local`.
create_linode() {
  DATACENTER_ID=$1
  PLAN_ID=$2
  linode_api linode.create \
    DatacenterID=$DATACENTER_ID \
    PlanID=$PLAN_ID | jq ".DATA.LinodeID"
}
# Delete linode $1 with skipChecks=true. The API response is discarded.
delete_linode() {
  linode_api linode.delete LinodeID=$1 skipChecks=true > /dev/null
}
# Request a private IP address for linode $1; the API response goes to stdout.
add_private_ip() {
  linode_api linode.ip.addprivate LinodeID=$1
}
# Print how many worker linodes exist, i.e. the number of lines produced by
# list_worker_ids. The "+ 0" pass through bc yields a bare integer.
get_no_of_workers() {
  local line_count=$( list_worker_ids | wc -l )
  echo "$line_count + 0" | bc
}
================================================
FILE: manifests/alertmanager/alertmanager-config.yaml
================================================
# Secret that carries the Alertmanager configuration file.
# The value is the base64 encoding of an alertmanager.yaml whose routes,
# including the DeadMansSwitch alert, all end at a receiver named 'null'.
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-main
data:
  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
================================================
FILE: manifests/alertmanager/alertmanager-service.yaml
================================================
# Service in front of the pods labelled `alertmanager: main`; it exposes
# TCP port 9093 under the port name `web`.
apiVersion: v1
kind: Service
metadata:
  labels:
    alertmanager: main
  name: alertmanager-main
spec:
  ports:
    - name: web
      port: 9093
      protocol: TCP
  selector:
    alertmanager: main
================================================
FILE: manifests/alertmanager/alertmanager.yaml
================================================
# Alertmanager custom resource (monitoring.coreos.com/v1): a single replica
# of Alertmanager v0.14.0 named `main`.
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: main
  labels:
    alertmanager: main
spec:
  replicas: 1
  version: v0.14.0
================================================
FILE: manifests/container-linux/master-config.yaml
================================================
# Container Linux Config for the Kubernetes master node.
# Tokens of the form #NAME# (#SSH_KEY#, #HOSTNAME#, #COREOS_PUBLIC_IPV4#, ...)
# are placeholders; presumably the installer replaces them textually before
# the file is transpiled -- confirm against install-coreos.sh.
passwd:
  users:
    - name: core
      ssh_authorized_keys:
        - "#SSH_KEY#"
storage:
  files:
    - path: /etc/hostname
      filesystem: root
      # Octal rw-r--r--. The previous 0420 looks like decimal 420 (= octal
      # 0644) written with a leading zero.
      mode: 0644
      contents:
        inline: |
          #HOSTNAME#
    # Empty file that Traefik uses as its ACME store.
    - path: /etc/traefik/acme/acme.json
      filesystem: root
      contents:
        inline: |
      mode: 0600
    - path: /etc/environment
      filesystem: root
      contents:
        inline: |
          COREOS_PUBLIC_IPV4=#COREOS_PUBLIC_IPV4#
          COREOS_PRIVATE_IPV4=#COREOS_PRIVATE_IPV4#
    # Image used by kubelet.service (read through EnvironmentFile).
    - path: /etc/kubernetes/kubelet.env
      filesystem: root
      mode: 0644
      contents:
        inline: |
          KUBELET_IMAGE_URL=docker://gcr.io/google_containers/hyperkube
          KUBELET_IMAGE_TAG=v1.11.0
    - path: /etc/sysctl.d/max-user-watches.conf
      filesystem: root
      contents:
        inline: |
          fs.inotify.max_user_watches=16184
    # Entry point of bootkube.service: renders the cluster assets, installs
    # the kubeconfig and etcd TLS material, starts the kubelet and finally
    # runs `bootkube start`.
    - path: /opt/bootkube/bootkube-start
      filesystem: root
      mode: 0544
      user:
        id: 500
      group:
        id: 500
      contents:
        inline: |
          #!/bin/bash
          # Wrapper for bootkube start
          set -e
          # Move experimental manifests
          [ -n "$(ls /opt/bootkube/assets/manifests-*/* 2>/dev/null)" ] && \
            mv /opt/bootkube/assets/manifests-*/* /opt/bootkube/assets/manifests && \
            rm -rf /opt/bootkube/assets/manifests-*
          BOOTKUBE_ACI="${BOOTKUBE_ACI:-quay.io/coreos/bootkube}"
          BOOTKUBE_VERSION="${BOOTKUBE_VERSION:-v0.13.0}"
          BOOTKUBE_ASSETS="${BOOTKUBE_ASSETS:-/opt/bootkube/assets}"
          # ======== START OF RESOURCE RENDERING ===========
          sudo mkdir -p /opt/bootkube /etc/kubernetes
          [ -e /opt/bootkube/assets ] && sudo rm -rf /opt/bootkube/assets
          sudo /usr/bin/rkt run \
            --trust-keys-from-https \
            --volume assets,kind=host,source=/opt/bootkube \
            --mount volume=assets,target=/opt/bootkube \
            --volume bootstrap,kind=host,source=/etc/kubernetes \
            --mount volume=bootstrap,target=/etc/kubernetes \
            ${RKT_OPTS} \
            ${BOOTKUBE_ACI}:${BOOTKUBE_VERSION} \
            --net=host \
            --dns=host \
            --exec=/bootkube -- render --asset-dir=/opt/bootkube/assets \
            --etcd-servers=https://#COREOS_PRIVATE_IPV4#:2379 \
            --network-provider=flannel \
            --api-servers=https://#COREOS_PUBLIC_IPV4#:6443,https://#COREOS_PRIVATE_IPV4#:6443
          sudo mkdir -p /etc/kubernetes
          sudo cp ${BOOTKUBE_ASSETS}/auth/kubeconfig /etc/kubernetes/
          sudo cp ${BOOTKUBE_ASSETS}/tls/ca.crt /etc/kubernetes/ca.crt
          sudo mkdir -p /etc/etcd/tls
          sudo cp ${BOOTKUBE_ASSETS}/tls/etcd-* /etc/etcd/tls
          sudo mkdir -p /etc/etcd/tls/etcd
          sudo cp ${BOOTKUBE_ASSETS}/tls/etcd/* /etc/etcd/tls/etcd
          sudo chown -R etcd:etcd /etc/etcd
          sudo chmod -R u=rX,g=,o= /etc/etcd
          sudo systemctl enable kubelet; sudo systemctl start kubelet
          # ======= END OF RESOURCE RENDERING ========
          exec /usr/bin/rkt run \
            --trust-keys-from-https \
            --volume assets,kind=host,source=${BOOTKUBE_ASSETS} \
            --mount volume=assets,target=/assets \
            --volume bootstrap,kind=host,source=/etc/kubernetes \
            --mount volume=bootstrap,target=/etc/kubernetes \
            ${RKT_OPTS} \
            ${BOOTKUBE_ACI}:${BOOTKUBE_VERSION} \
            --net=host \
            --dns=host \
            --exec=/bootkube -- start --asset-dir=/assets "$@"
networkd:
  units:
    # Static addressing for eth0: public address /24, private address /17.
    - name: 00-eth0.network
      contents: |
        [Match]
        Name=eth0
        [Network]
        DHCP=no
        DNS=#DNS#
        Domains=members.linode.com
        IPv6PrivacyExtensions=false
        Gateway=#GATEWAY#
        Address=#COREOS_PUBLIC_IPV4#/24
        Address=#COREOS_PRIVATE_IPV4#/17
systemd:
  units:
    # Single-member etcd ("controller") advertised on the private address.
    - name: etcd-member.service
      enable: true
      dropins:
        - name: 40-etcd-cluster.conf
          contents: |
            [Service]
            Environment="ETCD_IMAGE_TAG=v3.2.13"
            Environment="ETCD_NAME=controller"
            Environment="ETCD_INITIAL_CLUSTER=controller=https://#COREOS_PRIVATE_IPV4#:2380"
            Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://#COREOS_PRIVATE_IPV4#:2380"
            Environment="ETCD_ADVERTISE_CLIENT_URLS=https://#COREOS_PRIVATE_IPV4#:2379"
            Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
            Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
            Environment="ETCD_SSL_DIR=/etc/etcd/tls"
            Environment="ETCD_TRUSTED_CA_FILE=/etc/ssl/certs/etcd/server-ca.crt"
            Environment="ETCD_CERT_FILE=/etc/ssl/certs/etcd/server.crt"
            Environment="ETCD_KEY_FILE=/etc/ssl/certs/etcd/server.key"
            Environment="ETCD_CLIENT_CERT_AUTH=true"
            Environment="ETCD_PEER_TRUSTED_CA_FILE=/etc/ssl/certs/etcd/peer-ca.crt"
            Environment="ETCD_PEER_CERT_FILE=/etc/ssl/certs/etcd/peer.crt"
            Environment="ETCD_PEER_KEY_FILE=/etc/ssl/certs/etcd/peer.key"
    - name: docker.service
      enable: true
    # Path unit that triggers once /etc/kubernetes/kubeconfig exists.
    - name: kubelet.path
      enable: true
      contents: |
        [Unit]
        Description=Watch for kubeconfig
        [Path]
        PathExists=/etc/kubernetes/kubeconfig
        [Install]
        WantedBy=multi-user.target
    # Holds kubelet.service back until resolv.conf has a non-comment entry.
    - name: wait-for-dns.service
      enable: true
      contents: |
        [Unit]
        Description=Wait for DNS entries
        Wants=systemd-resolved.service
        Before=kubelet.service
        [Service]
        Type=oneshot
        RemainAfterExit=true
        ExecStart=/bin/sh -c 'while ! /usr/bin/grep '^[^#[:space:]]' /etc/resolv.conf > /dev/null; do sleep 1; done'
        [Install]
        RequiredBy=kubelet.service
    # One-shot bootstrap; the init_bootkube.done marker prevents re-runs.
    - name: bootkube.service
      contents: |
        [Unit]
        Description=Bootstrap a Kubernetes cluster
        ConditionPathExists=!/opt/bootkube/init_bootkube.done
        [Service]
        Type=oneshot
        RemainAfterExit=true
        EnvironmentFile=/etc/environment
        WorkingDirectory=/opt/bootkube
        ExecStart=/opt/bootkube/bootkube-start
        ExecStartPost=/bin/touch /opt/bootkube/init_bootkube.done
        [Install]
        WantedBy=multi-user.target
    # Kubelet registered with the master label and the master NoSchedule taint.
    - name: kubelet.service
      contents: |
        [Unit]
        Description=Kubelet via Hyperkube ACI
        Wants=rpc-statd.service
        [Service]
        EnvironmentFile=/etc/kubernetes/kubelet.env
        EnvironmentFile=/etc/environment
        Environment="RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \
          --volume=resolv,kind=host,source=/etc/resolv.conf \
          --mount volume=resolv,target=/etc/resolv.conf \
          --volume var-lib-cni,kind=host,source=/var/lib/cni \
          --mount volume=var-lib-cni,target=/var/lib/cni \
          --volume opt-cni-bin,kind=host,source=/opt/cni/bin \
          --mount volume=opt-cni-bin,target=/opt/cni/bin \
          --volume var-log,kind=host,source=/var/log \
          --mount volume=var-log,target=/var/log \
          --insecure-options=image"
        ExecStartPre=/bin/mkdir -p /opt/cni/bin
        ExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests
        ExecStartPre=/bin/mkdir -p /etc/kubernetes/cni/net.d
        ExecStartPre=/bin/mkdir -p /etc/kubernetes/checkpoint-secrets
        ExecStartPre=/bin/mkdir -p /etc/kubernetes/inactive-manifests
        ExecStartPre=/bin/mkdir -p /var/lib/cni
        ExecStartPre=/bin/mkdir -p /var/lib/kubelet/volumeplugins
        ExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid
        ExecStart=/usr/lib/coreos/kubelet-wrapper \
          --allow-privileged \
          --anonymous-auth=false \
          --client-ca-file=/etc/kubernetes/ca.crt \
          --cloud-provider= \
          --cluster_dns=10.3.0.10 \
          --cluster_domain=cluster.local \
          --cni-conf-dir=/etc/kubernetes/cni/net.d \
          --exit-on-lock-contention \
          --hostname-override=#COREOS_PUBLIC_IPV4# \
          --kubeconfig=/etc/kubernetes/kubeconfig \
          --lock-file=/var/run/lock/kubelet.lock \
          --network-plugin=cni \
          --node-labels=node-role.kubernetes.io/master \
          --pod-manifest-path=/etc/kubernetes/manifests \
          --register-with-taints=node-role.kubernetes.io/master=:NoSchedule \
          --volume-plugin-dir=/var/lib/kubelet/volumeplugins
        ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid
        Restart=always
        RestartSec=10
        [Install]
        WantedBy=multi-user.target
locksmith:
  # Quoted like "#SSH_KEY#": unquoted, the placeholder is a YAML comment and
  # a substituted value such as `off` can be read as a boolean.
  reboot_strategy: "#REBOOT_STRATEGY#"
================================================
FILE: manifests/container-linux/worker-config.yaml
================================================
# Container Linux Config for a Kubernetes worker node.
#
# Values wrapped in hash marks are placeholders. Presumably the provisioning
# scripts substitute them textually before this file is transpiled; verify
# against the caller.
#
# File modes are intentionally unquoted integers with a leading zero so that
# they are read as octal permissions.
passwd:
  users:
    - name: core
      ssh_authorized_keys:
        - "#SSH_KEY#"
storage:
  files:
    # Hostname of the node. Mode is the octal permission rw-r--r--; the former
    # value 0420 was the decimal spelling of 0644 and yielded r---w---- instead.
    - path: /etc/hostname
      filesystem: root
      mode: 0644
      contents:
        inline: |
          #HOSTNAME#
    # Image and tag consumed by kubelet.service through EnvironmentFile.
    - path: /etc/kubernetes/kubelet.env
      filesystem: root
      mode: 0644
      contents:
        inline: |
          KUBELET_IMAGE_URL=docker://gcr.io/google_containers/hyperkube
          KUBELET_IMAGE_TAG=v1.11.0
    # Raises the inotify watch limit. Mode is stated explicitly instead of
    # relying on a tool default.
    - path: /etc/sysctl.d/max-user-watches.conf
      filesystem: root
      mode: 0644
      contents:
        inline: |
          fs.inotify.max_user_watches=16184
    # Script run by delete-node.service on stop: removes this node from the
    # cluster using kubectl from the hyperkube image. The image tag is kept
    # equal to KUBELET_IMAGE_TAG above so kubectl stays within the supported
    # version skew of the cluster (it was previously pinned to v1.9.3).
    - path: /etc/kubernetes/delete-node
      filesystem: root
      mode: 0744
      contents:
        inline: |
          #!/bin/bash
          set -e
          exec /usr/bin/rkt run \
            --trust-keys-from-https \
            --volume config,kind=host,source=/etc/kubernetes \
            --mount volume=config,target=/etc/kubernetes \
            --insecure-options=image \
            docker://gcr.io/google_containers/hyperkube:v1.11.0 \
            --net=host \
            --dns=host \
            --exec=/kubectl -- --kubeconfig=/etc/kubernetes/kubeconfig delete node #COREOS_PUBLIC_IPV4#
    # Public and private addresses, loaded by kubelet.service through
    # EnvironmentFile.
    - path: /etc/environment
      filesystem: root
      mode: 0644
      contents:
        inline: |
          COREOS_PUBLIC_IPV4=#COREOS_PUBLIC_IPV4#
          COREOS_PRIVATE_IPV4=#COREOS_PRIVATE_IPV4#
    # One-shot bootstrap: installs the kubeconfig copied to /home/core,
    # extracts the cluster CA from it, disables update-engine and starts the
    # kubelet. It writes under /etc and calls systemctl directly, so it has to
    # be run as root; the stray "sudo" on the final command was dropped to
    # match the rest of the script.
    - path: /home/core/bootstrap.sh
      filesystem: root
      mode: 0700
      contents:
        inline: |
          #!/usr/bin/env bash
          set -euo pipefail
          # Setup kubeconfig
          mkdir -p /etc/kubernetes
          cp /home/core/kubeconfig /etc/kubernetes/kubeconfig
          # Pulled out of the kubeconfig. Other installations should place the root
          # CA here manually.
          grep 'certificate-authority-data' /home/core/kubeconfig | awk '{print $2}' | base64 -d > /etc/kubernetes/ca.crt
          # Start services
          systemctl daemon-reload
          systemctl stop update-engine
          systemctl mask update-engine
          systemctl enable kubelet
          systemctl start kubelet
# Static addressing for eth0 (DHCP disabled): one public and one private
# address on the same interface.
networkd:
  units:
    - name: 00-eth0.network
      contents: |
        [Match]
        Name=eth0
        [Network]
        DHCP=no
        DNS=#DNS#
        Domains=members.linode.com
        IPv6PrivacyExtensions=false
        Gateway=#GATEWAY#
        Address=#COREOS_PUBLIC_IPV4#/24
        Address=#COREOS_PRIVATE_IPV4#/17
systemd:
  units:
    - name: docker.service
      enable: true
    # Path unit keyed on the kubeconfig that bootstrap.sh installs.
    - name: kubelet.path
      enable: true
      contents: |
        [Unit]
        Description=Watch for kubeconfig
        [Path]
        PathExists=/etc/kubernetes/kubeconfig
        [Install]
        WantedBy=multi-user.target
    # Blocks kubelet.service until /etc/resolv.conf has at least one line that
    # is neither a comment nor blank.
    - name: wait-for-dns.service
      enable: true
      contents: |
        [Unit]
        Description=Wait for DNS entries
        Wants=systemd-resolved.service
        Before=kubelet.service
        [Service]
        Type=oneshot
        RemainAfterExit=true
        ExecStart=/bin/sh -c 'while ! /usr/bin/grep '^[^#[:space:]]' /etc/resolv.conf > /dev/null; do sleep 1; done'
        [Install]
        RequiredBy=kubelet.service
    # Does nothing on start and stays "active" (RemainAfterExit) so that its
    # ExecStop, the delete-node script, runs when the unit is stopped.
    - name: delete-node.service
      enable: true
      contents: |
        [Unit]
        Description=Waiting to delete Kubernetes node on shutdown
        [Service]
        Type=oneshot
        RemainAfterExit=true
        ExecStart=/bin/true
        ExecStop=/etc/kubernetes/delete-node
        [Install]
        WantedBy=multi-user.target
    # Kubelet run through the CoreOS kubelet-wrapper (rkt). The unit carries no
    # "enable" key here; bootstrap.sh enables and starts it.
    - name: kubelet.service
      contents: |
        [Unit]
        Description=Kubelet via Hyperkube ACI
        Wants=rpc-statd.service
        [Service]
        EnvironmentFile=/etc/kubernetes/kubelet.env
        EnvironmentFile=/etc/environment
        Environment="RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \
          --volume=resolv,kind=host,source=/etc/resolv.conf \
          --mount volume=resolv,target=/etc/resolv.conf \
          --volume var-lib-cni,kind=host,source=/var/lib/cni \
          --mount volume=var-lib-cni,target=/var/lib/cni \
          --volume opt-cni-bin,kind=host,source=/opt/cni/bin \
          --mount volume=opt-cni-bin,target=/opt/cni/bin \
          --volume var-log,kind=host,source=/var/log \
          --mount volume=var-log,target=/var/log \
          --insecure-options=image"
        ExecStartPre=/bin/mkdir -p /opt/cni/bin
        ExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests
        ExecStartPre=/bin/mkdir -p /etc/kubernetes/cni/net.d
        ExecStartPre=/bin/mkdir -p /etc/kubernetes/checkpoint-secrets
        ExecStartPre=/bin/mkdir -p /etc/kubernetes/inactive-manifests
        ExecStartPre=/bin/mkdir -p /var/lib/cni
        ExecStartPre=/bin/mkdir -p /var/lib/kubelet/volumeplugins
        ExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid
        ExecStart=/usr/lib/coreos/kubelet-wrapper \
          --allow-privileged \
          --anonymous-auth=false \
          --client-ca-file=/etc/kubernetes/ca.crt \
          --cloud-provider= \
          --cluster_dns=10.3.0.10 \
          --cluster_domain=cluster.local \
          --cni-conf-dir=/etc/kubernetes/cni/net.d \
          --exit-on-lock-contention \
          --hostname-override=#COREOS_PUBLIC_IPV4# \
          --kubeconfig=/etc/kubernetes/kubeconfig \
          --lock-file=/var/run/lock/kubelet.lock \
          --network-plugin=cni \
          --node-labels=node-role.kubernetes.io/node \
          --pod-manifest-path=/etc/kubernetes/manifests \
          --volume-plugin-dir=/var/lib/kubelet/volumeplugins
        ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid
        Restart=always
        RestartSec=10
        [Install]
        WantedBy=multi-user.target
locksmith:
  reboot_strategy: #REBOOT_STRATEGY#
================================================
FILE: manifests/grafana/grafana-dashboards.yaml
================================================
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboards
data:
all-nodes-dashboard.json: |+
{
"dashboard":
{
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "4.1.1"
},
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
},
{
"id": "singlestat",
"name": "Singlestat",
"type": "panel",
"version": ""
}
],
"annotations": {
"list": []
},
"description": "Dashboard to get an overview of one server",
"editable": true,
"gnetId": 22,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [],
"refresh": false,
"rows": [
{
"collapse": false,
"height": "250px",
"panels": [
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 3,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100",
"hide": false,
"intervalFactor": 10,
"legendFormat": "",
"refId": "A",
"step": 50
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Idle cpu",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": "cpu usage",
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 9,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(node_load1)",
"intervalFactor": 4,
"legendFormat": "load 1m",
"refId": "A",
"step": 20,
"target": ""
},
{
"expr": "sum(node_load5)",
"intervalFactor": 4,
"legendFormat": "load 5m",
"refId": "B",
"step": 20,
"target": ""
},
{
"expr": "sum(node_load15)",
"intervalFactor": 4,
"legendFormat": "load 15m",
"refId": "C",
"step": 20,
"target": ""
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "System load",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
"yaxis": 2
}
],
"span": 9,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)",
"intervalFactor": 2,
"legendFormat": "memory usage",
"metric": "memo",
"refId": "A",
"step": 4,
"target": ""
},
{
"expr": "sum(node_memory_Buffers)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "memory buffers",
"metric": "memo",
"refId": "B",
"step": 4,
"target": ""
},
{
"expr": "sum(node_memory_Cached)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "memory cached",
"metric": "memo",
"refId": "C",
"step": 4,
"target": ""
},
{
"expr": "sum(node_memory_MemFree)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "memory free",
"metric": "memo",
"refId": "D",
"step": 4,
"target": ""
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Memory usage",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 5,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100",
"intervalFactor": 2,
"metric": "",
"refId": "A",
"step": 60,
"target": ""
}
],
"thresholds": "80, 90",
"title": "Memory usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "read",
"yaxis": 1
},
{
"alias": "{instance=\"172.17.0.1:9100\"}",
"yaxis": 2
},
{
"alias": "io time",
"yaxis": 2
}
],
"span": 9,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_disk_bytes_read[5m]))",
"hide": false,
"intervalFactor": 4,
"legendFormat": "read",
"refId": "A",
"step": 8,
"target": ""
},
{
"expr": "sum(rate(node_disk_bytes_written[5m]))",
"intervalFactor": 4,
"legendFormat": "written",
"refId": "B",
"step": 8
},
{
"expr": "sum(rate(node_disk_io_time_ms[5m]))",
"intervalFactor": 4,
"legendFormat": "io time",
"refId": "C",
"step": 8
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Disk I/O",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "percentunit",
"gauge": {
"maxValue": 1,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 7,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})",
"intervalFactor": 2,
"refId": "A",
"step": 60,
"target": ""
}
],
"thresholds": "0.75, 0.9",
"title": "Disk space usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 8,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "transmitted ",
"yaxis": 2
}
],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 10,
"target": ""
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Network received",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 10,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "transmitted ",
"yaxis": 2
}
],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 10,
"target": ""
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Network transmitted",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [
"prometheus"
],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "All Nodes",
"version": 1
}
,
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}
deployment-dashboard.json: |+
{
"dashboard":
{
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "singlestat",
"name": "Singlestat",
"type": "panel",
"version": ""
},
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "3.1.1"
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
}
],
"annotations": {
"list": []
},
"editable": true,
"gnetId": null,
"hideControls": false,
"id": null,
"links": [],
"rows": [
{
"collapse": false,
"editable": true,
"height": "200px",
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 8,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "cores",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ",
"intervalFactor": 2,
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "CPU",
"type": "singlestat",
"valueFontSize": "110%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 9,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "GB",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "80%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"targets": [
{
"expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3",
"intervalFactor": 2,
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "Memory",
"type": "singlestat",
"valueFontSize": "110%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "Bps",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": false
},
"id": 7,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 4,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ",
"intervalFactor": 2,
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "Network",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
}
],
"showTitle": false,
"title": "Row"
},
{
"collapse": false,
"editable": true,
"height": "100px",
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": null,
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": false
},
"id": 5,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)",
"intervalFactor": 2,
"metric": "kube_deployment_spec_replicas",
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "Desired Replicas",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 6,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)",
"intervalFactor": 2,
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "Available Replicas",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 3,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "Observed Generation",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 2,
"interval": null,
"isNew": true,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 600
}
],
"thresholds": "",
"title": "Metadata Generation",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
}
],
"title": "New row"
},
{
"collapse": false,
"editable": true,
"height": "350px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"hideZero": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)",
"intervalFactor": 2,
"legendFormat": "current replicas",
"refId": "A",
"step": 30
},
{
"expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)",
"intervalFactor": 2,
"legendFormat": "available",
"refId": "B",
"step": 30
},
{
"expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)",
"intervalFactor": 2,
"legendFormat": "unavailable",
"refId": "C",
"step": 30
},
{
"expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)",
"intervalFactor": 2,
"legendFormat": "updated",
"refId": "D",
"step": 30
},
{
"expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)",
"intervalFactor": 2,
"legendFormat": "desired",
"refId": "E",
"step": 30
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Replicas",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"transparent": false,
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"showTitle": false,
"title": "New row"
}
],
"schemaVersion": 12,
"sharedCrosshair": true,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "deployment_namespace",
"options": [],
"query": "label_values(kube_deployment_metadata_generation, namespace)",
"refresh": 1,
"regex": "",
"sort": 0,
"tagValuesQuery": null,
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": false,
"label": "Deployment",
"multi": false,
"name": "deployment_name",
"options": [],
"query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)",
"refresh": 1,
"regex": "",
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "deployment",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Deployment",
"version": 2
}
,
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}
kubernetes-pods-dashboard.json: |+
{
"dashboard":
{
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "3.1.1"
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
}
],
"annotations": {
"list": []
},
"editable": true,
"gnetId": null,
"hideControls": false,
"id": null,
"links": [],
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "Current: {{ container_name }}",
"metric": "container_memory_usage_bytes",
"refId": "A",
"step": 10
},
{
"expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}",
"interval": "10s",
"intervalFactor": 2,
"legendFormat": "Requested: {{ container }}",
"metric": "kube_pod_container_resource_requests_memory_bytes",
"refId": "B",
"step": 20
}
],
"timeFrom": null,
"timeShift": null,
"title": "Memory Usage",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )",
"intervalFactor": 2,
"legendFormat": "{{ container_name }}",
"refId": "A",
"step": 30
}
],
"timeFrom": null,
"timeShift": null,
"title": "CPU Usage",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "New row"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 3,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))",
"intervalFactor": 2,
"legendFormat": "{{ pod_name }}",
"refId": "A",
"step": 30
}
],
"timeFrom": null,
"timeShift": null,
"title": "Network I/O",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "New row"
}
],
"schemaVersion": 12,
"sharedCrosshair": true,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": true,
"label": "Namespace",
"multi": false,
"name": "namespace",
"options": [],
"query": "label_values(kube_pod_info, namespace)",
"refresh": 1,
"regex": "",
"type": "query"
},
{
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": false,
"label": "Pod",
"multi": false,
"name": "pod",
"options": [],
"query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)",
"refresh": 1,
"regex": "",
"type": "query"
},
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": true,
"label": "Container",
"multi": false,
"name": "container",
"options": [],
"query": "label_values(kube_pod_container_info{namespace=~\"$namespace\", pod=\"$pod\"}, container)",
"refresh": 1,
"regex": "",
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Pods",
"version": 26
}
,
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}
node-dashboard.json: |+
{
"dashboard":
{
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "4.1.1"
},
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
},
{
"id": "singlestat",
"name": "Singlestat",
"type": "panel",
"version": ""
}
],
"annotations": {
"list": []
},
"description": "Dashboard to get an overview of one server",
"editable": true,
"gnetId": 22,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [],
"refresh": false,
"rows": [
{
"collapse": false,
"height": "250px",
"panels": [
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 3,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)",
"hide": false,
"intervalFactor": 10,
"legendFormat": "{{cpu}}",
"refId": "A",
"step": 50
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "CPU usage",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": "cpu usage",
"logBase": 1,
"max": 100,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 9,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "node_load1{instance=\"$server\"}",
"intervalFactor": 4,
"legendFormat": "load 1m",
"refId": "A",
"step": 20,
"target": ""
},
{
"expr": "node_load5{instance=\"$server\"}",
"intervalFactor": 4,
"legendFormat": "load 5m",
"refId": "B",
"step": 20,
"target": ""
},
{
"expr": "node_load15{instance=\"$server\"}",
"intervalFactor": 4,
"legendFormat": "load 15m",
"refId": "C",
"step": 20,
"target": ""
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "System load",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 4,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"hideEmpty": false,
"hideZero": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
"yaxis": 2
}
],
"span": 9,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}",
"hide": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "memory used",
"metric": "",
"refId": "C",
"step": 4
},
{
"expr": "node_memory_Buffers{instance=\"$server\"}",
"interval": "",
"intervalFactor": 2,
"legendFormat": "memory buffers",
"metric": "",
"refId": "E",
"step": 4
},
{
"expr": "node_memory_Cached{instance=\"$server\"}",
"intervalFactor": 2,
"legendFormat": "memory cached",
"metric": "",
"refId": "F",
"step": 4
},
{
"expr": "node_memory_MemFree{instance=\"$server\"}",
"intervalFactor": 2,
"legendFormat": "memory free",
"metric": "",
"refId": "D",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Memory usage",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 5,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100",
"intervalFactor": 2,
"refId": "A",
"step": 60,
"target": ""
}
],
"thresholds": "80, 90",
"title": "Memory usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "read",
"yaxis": 1
},
{
"alias": "{instance=\"172.17.0.1:9100\"}",
"yaxis": 2
},
{
"alias": "io time",
"yaxis": 2
}
],
"span": 9,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))",
"hide": false,
"intervalFactor": 4,
"legendFormat": "read",
"refId": "A",
"step": 8,
"target": ""
},
{
"expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))",
"intervalFactor": 4,
"legendFormat": "written",
"refId": "B",
"step": 8
},
{
"expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))",
"intervalFactor": 4,
"legendFormat": "io time",
"refId": "C",
"step": 8
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Disk I/O",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "percentunit",
"gauge": {
"maxValue": 1,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 7,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})",
"intervalFactor": 2,
"refId": "A",
"step": 60,
"target": ""
}
],
"thresholds": "0.75, 0.9",
"title": "Disk space usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 8,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "transmitted ",
"yaxis": 2
}
],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{device}}",
"refId": "A",
"step": 10,
"target": ""
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Network received",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"alerting": {},
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 10,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "transmitted ",
"yaxis": 2
}
],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{device}}",
"refId": "B",
"step": 10,
"target": ""
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Network transmitted",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [
"prometheus"
],
"templating": {
"list": [
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "server",
"options": [],
"query": "label_values(node_boot_time, instance)",
"refresh": 1,
"regex": "",
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Nodes",
"version": 1
}
,
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}
traefik-dashboard.json: |+
{
"dashboard": {
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "4.1.1"
},
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
},
{
"id": "singlestat",
"name": "Singlestat",
"type": "panel",
"version": ""
}
],
"annotations": {
"list": []
},
"description": "Visualize Traefik Health Metrics",
"editable": true,
"gnetId": 2240,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [],
"refresh": "30s",
"rows": [
{
"collapse": false,
"height": 288,
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": 0,
"format": "s",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 3,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 1,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "time() - process_start_time_seconds{job=\"load-balancer\"}",
"intervalFactor": 2,
"refId": "A",
"step": 1800
}
],
"thresholds": "",
"title": "Uptime",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(traefik_requests_total{service=\"http\"})",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{http}}",
"metric": "",
"refId": "A",
"step": 240
},
{
"expr": "sum(traefik_requests_total{service=\"https\"})",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{https}}",
"refId": "B",
"step": 240
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Total requests",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": "Count",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(traefik_request_duration_seconds_sum) / sum(traefik_requests_total) * 1000",
"intervalFactor": 2,
"legendFormat": "Average response time (ms)",
"refId": "A",
"step": 240
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Average response time",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 5,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(traefik_requests_total[5m]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{requests}}",
"refId": "A",
"step": 240
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Requests in last 5 minutes",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "General info",
"titleSize": "h6"
},
{
"collapse": false,
"height": 412,
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 0,
"fill": 1,
"id": 5,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(traefik_requests_total{service=~\"http|https\",code=\"200\"}[5m])",
"intervalFactor": 2,
"legendFormat": "{{service}} {{method}} {{code}}",
"refId": "A",
"step": 240
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Successful Status Code Count (5m)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 0,
"fill": 1,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(traefik_requests_total{service=~\"http|https\",code!=\"200\"}[5m])",
"intervalFactor": 2,
"legendFormat": "{{service}} {{method}} {{code}}",
"refId": "A",
"step": 240
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Bad Status Code Count (5m)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6"
},
{
"collapse": false,
"height": 382,
"panels": [],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Detailed statuses",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now/d",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Traefik",
"version": 1
},
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}
rook-dashboard.json: |+
{
"dashboard": {
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "4.1.1"
},
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
},
{
"id": "singlestat",
"name": "Singlestat",
"type": "panel",
"version": ""
}
],
"annotations": {
"list": []
},
"description": "Rook cluster monitoring",
"editable": true,
"gnetId": 917,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [],
"refresh": "1m",
"rows": [
{
"collapse": false,
"height": "150px",
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 21,
"interval": "1m",
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "count(ceph_health_status)",
"interval": "$interval",
"intervalFactor": 1,
"refId": "A",
"step": 60
}
],
"thresholds": "0,1",
"title": "Status",
"transparent": false,
"type": "singlestat",
"valueFontSize": "100%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
},
{
"op": "=",
"text": "WARNING",
"value": "0"
},
{
"op": "=",
"text": "HEALTHY",
"value": "1"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 14,
"interval": "1m",
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "ceph_monitor_quorum_count",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "2,3",
"title": "Monitors In Quorum",
"transparent": false,
"type": "singlestat",
"valueFontSize": "100%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 22,
"interval": "1m",
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "count(ceph_pool_available_bytes)",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "",
"title": "Pools",
"transparent": false,
"type": "singlestat",
"valueFontSize": "100%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "bytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 33,
"interval": "1m",
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "ceph_cluster_capacity_bytes",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "0.025,0.1",
"title": "Cluster Capacity",
"transparent": false,
"type": "singlestat",
"valueFontSize": "100%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "bytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 34,
"interval": "1m",
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "ceph_cluster_used_bytes",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "0.025,0.1",
"title": "Used Capacity",
"transparent": false,
"type": "singlestat",
"valueFontSize": "100%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "percentunit",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 23,
"interval": "1m",
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "ceph_cluster_available_bytes/ceph_cluster_capacity_bytes",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "70,80",
"title": "Available Capacity",
"transparent": false,
"type": "singlestat",
"valueFontSize": "100%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
},
{
"collapse": false,
"height": "100px",
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 26,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 1,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "ceph_osds_in",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "",
"title": "OSDs IN",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": true,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 40, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 27,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 1,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "ceph_osds - ceph_osds_in",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "1,1",
"title": "OSDs OUT",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 28,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 1,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum(ceph_osd_up)",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "",
"title": "OSDs UP",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": true,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 40, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 29,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 1,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "ceph_osds_down",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "1,1",
"title": "OSDs DOWN",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 30,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "avg(ceph_osd_pgs)",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "250,300",
"title": "Average PGs per OSD",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "s",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 31,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "avg(ceph_osd_perf_apply_latency_seconds)",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "0.01,0.05",
"title": "Average OSD Apply Latency",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "s",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 32,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "avg(ceph_osd_perf_commit_latency_seconds)",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "0.01,0.05",
"title": "Average OSD Commit Latency",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"format": "s",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 24,
"interval": "1m",
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"repeat": null,
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "",
"targets": [
{
"expr": "avg(ceph_monitor_latency_seconds)",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 60
}
],
"thresholds": "70,80",
"title": "Average Monitor Latency",
"transparent": false,
"type": "singlestat",
"valueFontSize": "100%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {
"Available": "#EAB839",
"Total Capacity": "#447EBC",
"Used": "#BF1B00",
"total_avail": "#6ED0E0",
"total_space": "#7EB26D",
"total_used": "#890F02"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 4,
"grid": {},
"height": "300",
"id": 1,
"interval": "$interval",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 0,
"links": [],
"minSpan": null,
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "Total Capacity",
"fill": 0,
"linewidth": 3,
"stack": false
}
],
"spaceLength": 10,
"span": 4,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceph_cluster_available_bytes",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Available",
"refId": "A",
"step": 60
},
{
"expr": "ceph_cluster_used_bytes",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Used",
"refId": "B",
"step": 60
},
{
"expr": "ceph_cluster_capacity_bytes",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Total Capacity",
"refId": "C",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Capacity",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {
"Total Capacity": "#7EB26D",
"Used": "#BF1B00",
"total_avail": "#6ED0E0",
"total_space": "#7EB26D",
"total_used": "#890F02"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 0,
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"height": "300",
"id": 3,
"interval": "$interval",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"minSpan": null,
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 4,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceph_client_io_write_ops",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Write",
"refId": "A",
"step": 60
},
{
"expr": "ceph_client_io_read_ops",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Read",
"refId": "B",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "IOPS",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": "",
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"height": "300",
"id": 7,
"interval": "$interval",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 4,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceph_client_io_write_bytes",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Write",
"refId": "A",
"step": 60
},
{
"expr": "ceph_client_io_read_bytes",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Read",
"refId": "B",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Throughput",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "CLUSTER",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 18,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "/^Total.*$/",
"stack": false
}
],
"spaceLength": 10,
"span": 12,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceph_cluster_objects",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Total",
"refId": "A",
"step": 60
},
{
"expr": "ceph_degraded_objects",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Degraded",
"refId": "B",
"step": 60
},
{
"expr": "ceph_misplaced_objects",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Misplaced",
"refId": "C",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Objects in the Cluster",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 1,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 19,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "/^Total.*$/",
"stack": false
}
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(ceph_osd_pgs)",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Total",
"refId": "A",
"step": 60
},
{
"expr": "ceph_degraded_pgs",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Degraded",
"refId": "B",
"step": 60
},
{
"expr": "ceph_stale_pgs",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Stale",
"refId": "C",
"step": 60
},
{
"expr": "ceph_unclean_pgs",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Unclean",
"refId": "D",
"step": 60
},
{
"expr": "ceph_undersized_pgs",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Undersized",
"refId": "E",
"step": 60
},
{
"expr": "ceph_stuck_degraded_pgs + ceph_stuck_stale_pgs + ceph_stuck_unclean_pgs + ceph_stuck_undersized_pgs",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Stuck",
"refId": "F",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "PGs",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 1,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 20,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "/^Total.*$/",
"stack": false
}
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "ceph_stuck_degraded_pgs",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Degraded",
"refId": "F",
"step": 60
},
{
"expr": "ceph_stuck_stale_pgs",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Stale",
"refId": "A",
"step": 60
},
{
"expr": "ceph_stuck_unclean_pgs",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Unclean",
"refId": "B",
"step": 60
},
{
"expr": "ceph_stuck_undersized_pgs",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Undersized",
"refId": "C",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Stuck PGs",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 1,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "New row",
"titleSize": "h6"
},
{
"collapse": false,
"height": "150px",
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 15,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "ceph_recovery_io_bytes",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Bytes",
"refId": "A",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Bytes",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 16,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "/^.*/",
"color": "#E0752D"
}
],
"spaceLength": 10,
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "ceph_recovery_io_keys",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Keys",
"refId": "A",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Keys",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"editable": true,
"error": false,
"fill": 1,
"grid": {},
"id": 17,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "/^.*$/",
"color": "#890F02"
}
],
"spaceLength": 10,
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "ceph_recovery_io_objects",
"interval": "$interval",
"intervalFactor": 1,
"legendFormat": "Objects",
"refId": "A",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Objects",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Recovery",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [
"ceph",
"rook"
],
"templating": {
"list": [
{
"auto": true,
"auto_count": 10,
"auto_min": "1m",
"current": {
"text": "1m",
"value": "1m"
},
"datasource": null,
"hide": 0,
"includeAll": false,
"label": "Interval",
"multi": false,
"name": "interval",
"options": [
{
"selected": false,
"text": "auto",
"value": "$__auto_interval"
},
{
"selected": true,
"text": "1m",
"value": "1m"
},
{
"selected": false,
"text": "10m",
"value": "10m"
},
{
"selected": false,
"text": "30m",
"value": "30m"
},
{
"selected": false,
"text": "1h",
"value": "1h"
},
{
"selected": false,
"text": "6h",
"value": "6h"
},
{
"selected": false,
"text": "12h",
"value": "12h"
},
{
"selected": false,
"text": "1d",
"value": "1d"
},
{
"selected": false,
"text": "7d",
"value": "7d"
},
{
"selected": false,
"text": "14d",
"value": "14d"
},
{
"selected": false,
"text": "30d",
"value": "30d"
}
],
"query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
"refresh": 2,
"type": "interval"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Rook",
"version": 3
},
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}
resource-requests-dashboard.json: |+
{
"dashboard":
{
"__inputs": [
{
"description": "",
"label": "prometheus",
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"pluginName": "Prometheus",
"type": "datasource"
}
],
"__requires": [
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "4.1.1"
},
{
"id": "graph",
"name": "Graph",
"type": "panel",
"version": ""
},
{
"id": "prometheus",
"name": "Prometheus",
"type": "datasource",
"version": "1.0.0"
},
{
"id": "singlestat",
"name": "Singlestat",
"type": "panel",
"version": ""
}
],
"annotations": {
"list": []
},
"description": "Dashboard to show the resource requests vs allocatable in the cluster",
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [],
"rows": [
{
"collapse": false,
"height": "300",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.",
"fill": 1,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 9,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "Allocatable CPU Cores",
"refId": "A",
"step": 10
},
{
"expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))",
"intervalFactor": 2,
"legendFormat": "Requested CPU Cores",
"refId": "B",
"step": 10
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "CPU Cores",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "CPU Cores",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": null,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 2,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"targets": [
{
"expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 240
}
],
"thresholds": "80, 90",
"title": "CPU Cores",
"type": "singlestat",
"valueFontSize": "110%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "CPU Cores",
"titleSize": "h6"
},
{
"collapse": false,
"height": "300",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.",
"fill": 1,
"id": 3,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 9,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "Allocatable Memory",
"refId": "A",
"step": 10
},
{
"expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))",
"intervalFactor": 2,
"legendFormat": "Requested Memory",
"refId": "B",
"step": 10
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Memory",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": "Memory",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "${DS_PROMETHEUS}",
"decimals": null,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"id": 4,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"targets": [
{
"expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 240
}
],
"thresholds": "80, 90",
"title": "Memory",
"type": "singlestat",
"valueFontSize": "110%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Memory",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-3h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Resource Requests",
"version": 1
}
,
"inputs": [
{
"name": "DS_PROMETHEUS",
"pluginId": "prometheus",
"type": "datasource",
"value": "prometheus"
}
],
"overwrite": true
}
prometheus-datasource.json: |+
{
"access": "proxy",
"basicAuth": false,
"name": "prometheus",
"type": "prometheus",
"url": "http://prometheus-k8s.monitoring.svc:9090"
}
================================================
FILE: manifests/grafana/grafana-deployment.yaml
================================================
# Grafana Deployment: one pod running the Grafana server plus the
# grafana-watcher sidecar, which is pointed at the mounted dashboards
# directory and at Grafana on http://localhost:3000.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: grafana
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:4.4.1
          env:
            - name: GF_AUTH_BASIC_ENABLED
              value: "true"
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "true"
            # Admin credentials are read from the grafana-credentials Secret.
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: user
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: password
          volumeMounts:
            # Mount the scratch volume on Grafana's default data directory
            # (GF_PATHS_DATA defaults to /var/lib/grafana). The previous
            # path, /var/grafana-storage, is not a location this container
            # is configured to write to, so the volume went unused.
            - name: grafana-storage
              mountPath: /var/lib/grafana
          ports:
            - name: web
              containerPort: 3000
          resources:
            requests:
              memory: 100Mi
              cpu: 100m
            limits:
              memory: 200Mi
              cpu: 200m
        - name: grafana-watcher
          image: quay.io/coreos/grafana-watcher:v0.0.6
          args:
            - '--watch-dir=/var/grafana-dashboards'
            - '--grafana-url=http://localhost:3000'
          env:
            # Same Secret keys as the Grafana container's admin credentials.
            - name: GRAFANA_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: user
            - name: GRAFANA_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: password
          resources:
            requests:
              memory: "16Mi"
              cpu: "50m"
            limits:
              memory: "32Mi"
              cpu: "100m"
          volumeMounts:
            # Dashboards ConfigMap, mounted at the directory given to
            # --watch-dir above.
            - name: grafana-dashboards
              mountPath: /var/grafana-dashboards
      volumes:
        - name: grafana-storage
          emptyDir: {}
        - name: grafana-dashboards
          configMap:
            name: grafana-dashboards
================================================
FILE: manifests/grafana/grafana-service.yaml
================================================
# Service for the Grafana pods (selected by app=grafana); exposes TCP port
# 3000 under the port name "web".
apiVersion: v1
kind: Service
metadata:
  name: grafana
  labels:
    app: grafana
spec:
  selector:
    app: grafana
  ports:
    - name: web
      protocol: TCP
      port: 3000
================================================
FILE: manifests/heapster.yaml
================================================
# Heapster Deployment (kube-system): a single replica started with
# --source=kubernetes:https://kubernetes.default.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: heapster
  namespace: kube-system
  labels:
    k8s-app: heapster
    kubernetes.io/cluster-service: "true"
spec:
  replicas: 1
  template:
    metadata:
      labels:
        k8s-app: heapster
        task: monitoring
    spec:
      serviceAccountName: heapster
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
        # Tolerate the NoSchedule taint keyed node-role.kubernetes.io/master.
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
      containers:
        - name: heapster
          image: gcr.io/google_containers/heapster-amd64:v1.5.1
          imagePullPolicy: IfNotPresent
          command:
            - /heapster
            - "--source=kubernetes:https://kubernetes.default"
---
# Service in front of Heapster: port 80 forwards to target port 8082 on pods
# labelled k8s-app=heapster.
apiVersion: v1
kind: Service
metadata:
  name: heapster
  namespace: kube-system
  labels:
    task: monitoring
    kubernetes.io/cluster-service: "true"
    kubernetes.io/name: Heapster
spec:
  selector:
    k8s-app: heapster
  ports:
    - port: 80
      targetPort: 8082
---
# ServiceAccount referenced by the Heapster pod spec (serviceAccountName).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: kube-system
  name: heapster
---
# Binds the system:heapster ClusterRole to the heapster ServiceAccount in
# kube-system.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: heapster
subjects:
  - kind: ServiceAccount
    name: heapster
    namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:heapster
================================================
FILE: manifests/kube-dashboard.yaml
================================================
# Opaque Secret declared without data; the Dashboard Deployment mounts it at
# /certs.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: kubernetes-dashboard-certs
  namespace: kube-system
  labels:
    k8s-app: kubernetes-dashboard
---
# ServiceAccount the Dashboard pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kubernetes-dashboard
  namespace: kube-system
  labels:
    k8s-app: kubernetes-dashboard
---
# Kubernetes Dashboard Deployment (kube-system): one replica listening on
# container port 8443, started with --auto-generate-certificates.
apiVersion: apps/v1beta2
kind: Deployment
metadata:
  name: kubernetes-dashboard
  namespace: kube-system
  labels:
    k8s-app: kubernetes-dashboard
spec:
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      k8s-app: kubernetes-dashboard
  template:
    metadata:
      labels:
        k8s-app: kubernetes-dashboard
    spec:
      serviceAccountName: kubernetes-dashboard
      containers:
        - name: kubernetes-dashboard
          image: gcr.io/google_containers/kubernetes-dashboard-amd64:v1.8.3
          args:
            - --auto-generate-certificates
            # To set the Kubernetes API server host by hand, uncomment the
            # line below. When it is left out, the Dashboard tries to
            # auto-discover the API server; only uncomment if that default
            # does not work.
            # - --apiserver-host=http://my-address:port
          ports:
            - containerPort: 8443
              protocol: TCP
          livenessProbe:
            httpGet:
              scheme: HTTPS
              path: /
              port: 8443
            initialDelaySeconds: 30
            timeoutSeconds: 30
          volumeMounts:
            - name: kubernetes-dashboard-certs
              mountPath: /certs
            # On-disk volume used to store exec logs.
            - name: tmp-volume
              mountPath: /tmp
      # Drop these tolerations if the Dashboard must not be deployed on the
      # master.
      tolerations:
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
      volumes:
        - name: kubernetes-dashboard-certs
          secret:
            secretName: kubernetes-dashboard-certs
        - name: tmp-volume
          emptyDir: {}
---
# Service for the Dashboard: port 443 forwards to target port 8443 on pods
# labelled k8s-app=kubernetes-dashboard.
apiVersion: v1
kind: Service
metadata:
  name: kubernetes-dashboard
  namespace: kube-system
  labels:
    k8s-app: kubernetes-dashboard
spec:
  selector:
    k8s-app: kubernetes-dashboard
  ports:
    - port: 443
      targetPort: 8443
---
# Grants the cluster-admin ClusterRole to the kubernetes-dashboard
# ServiceAccount in kube-system.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: kubernetes-dashboard
  labels:
    k8s-app: kubernetes-dashboard
subjects:
  - kind: ServiceAccount
    name: kubernetes-dashboard
    namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
---
# Ingress (class "traefik", basic auth using the "kubesecret" secret) that
# routes host kube.${DOMAIN} to the kubernetes-dashboard Service, port 443.
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: dashboard-ingress
  namespace: kube-system
  annotations:
    kubernetes.io/ingress.class: traefik
    ingress.kubernetes.io/auth-type: basic
    ingress.kubernetes.io/auth-secret: kubesecret
spec:
  rules:
    - host: kube.${DOMAIN}
      http:
        paths:
          - backend:
              serviceName: kubernetes-dashboard
              servicePort: 443
================================================
FILE: manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml
================================================
# Binds the kube-state-metrics ClusterRole to the kube-state-metrics
# ServiceAccount in the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
subjects:
  - kind: ServiceAccount
    name: kube-state-metrics
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
================================================
FILE: manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml
================================================
# ClusterRole for kube-state-metrics: list/watch on the core and
# "extensions" resources enumerated below.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
  # Core API group.
  - apiGroups:
      - ""
    resources:
      - nodes
      - pods
      - services
      - resourcequotas
      - replicationcontrollers
      - limitranges
    verbs:
      - list
      - watch
  # "extensions" API group.
  - apiGroups:
      - extensions
    resources:
      - daemonsets
      - deployments
      - replicasets
    verbs:
      - list
      - watch
================================================
FILE: manifests/kube-state-metrics/kube-state-metrics-deployment.yaml
================================================
# kube-state-metrics Deployment: a single replica exposing container port
# 8080 under the port name "metrics".
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: kube-state-metrics
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
        - name: kube-state-metrics
          image: quay.io/coreos/kube-state-metrics:v0.5.0
          ports:
            - name: metrics
              containerPort: 8080
          resources:
            limits:
              cpu: 200m
              memory: 200Mi
            requests:
              cpu: 100m
              memory: 100Mi
================================================
FILE: manifests/kube-state-metrics/kube-state-metrics-service-account.yaml
================================================
# ServiceAccount referenced by the kube-state-metrics Deployment. No
# namespace is set in this manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
================================================
FILE: manifests/kube-state-metrics/kube-state-metrics-service.yaml
================================================
# Service for kube-state-metrics: port 8080 ("http-metrics") forwards to the
# container port named "metrics" on pods labelled app=kube-state-metrics.
apiVersion: v1
kind: Service
metadata:
  name: kube-state-metrics
  labels:
    app: kube-state-metrics
    k8s-app: kube-state-metrics
spec:
  selector:
    app: kube-state-metrics
  ports:
    - name: http-metrics
      protocol: TCP
      port: 8080
      targetPort: metrics
================================================
FILE: manifests/node-exporter/node-exporter-daemonset.yaml
================================================
# DaemonSet for the Prometheus node-exporter. The pod uses the host network
# and host PID namespace, and mounts the host's /proc and /sys read-only.
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
name: node-exporter
spec:
template:
metadata:
labels:
app: node-exporter
name: node-exporter
spec:
hostNetwork: true
hostPID: true
containers:
- image: quay.io/prometheus/node-exporter:v0.14.0
# Point the procfs/sysfs collectors at the host mounts declared below.
args:
- "-collector.procfs=/host/proc"
- "-collector.sysfs=/host/sys"
name: node-exporter
ports:
# Port 9100 is declared both as container port and as host port.
- containerPort: 9100
hostPort: 9100
name: scrape
resources:
requests:
memory: 30Mi
cpu: 100m
limits:
memory: 50Mi
cpu: 200m
volumeMounts:
- name: proc
readOnly: true
mountPath: /host/proc
- name: sys
readOnly: true
mountPath: /host/sys
# hostPath volumes backing the read-only mounts above.
volumes:
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys
================================================
FILE: manifests/node-exporter/node-exporter-service.yaml
================================================
# Headless Service (clusterIP: None) for node-exporter pods, exposing TCP
# port 9100 under the port name "http-metrics".
apiVersion: v1
kind: Service
metadata:
  name: node-exporter
  labels:
    app: node-exporter
    k8s-app: node-exporter
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    app: node-exporter
  ports:
    - name: http-metrics
      protocol: TCP
      port: 9100
================================================
FILE: manifests/prometheus/prometheus-k8s-ingress.yaml
================================================
# Ingress (class "traefik", basic auth using the "kubesecret" secret) in the
# monitoring namespace. Each host below is routed to the "web" port of the
# named Service.
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: prometheus-ingress
  namespace: monitoring
  annotations:
    kubernetes.io/ingress.class: traefik
    ingress.kubernetes.io/auth-type: basic
    ingress.kubernetes.io/auth-secret: kubesecret
spec:
  rules:
    # alertmanager.${DOMAIN} -> alertmanager-main
    - host: alertmanager.${DOMAIN}
      http:
        paths:
          - backend:
              serviceName: alertmanager-main
              servicePort: web
    # prometheus.${DOMAIN} -> prometheus-k8s
    - host: prometheus.${DOMAIN}
      http:
        paths:
          - backend:
              serviceName: prometheus-k8s
              servicePort: web
    # grafana.${DOMAIN} -> grafana
    - host: grafana.${DOMAIN}
      http:
        paths:
          - backend:
              serviceName: grafana
              servicePort: web
================================================
FILE: manifests/prometheus/prometheus-k8s-role-bindings.yaml
================================================
# Binds the prometheus-k8s Role in the monitoring namespace to the
# prometheus-k8s ServiceAccount (namespace monitoring).
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: monitoring
subjects:
  - kind: ServiceAccount
    name: prometheus-k8s
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
---
# Binds the prometheus-k8s Role in kube-system to the prometheus-k8s
# ServiceAccount (namespace monitoring).
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: kube-system
subjects:
  - kind: ServiceAccount
    name: prometheus-k8s
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
---
# Binds the prometheus-k8s Role in the rook namespace to the prometheus-k8s
# ServiceAccount (namespace monitoring).
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: rook
subjects:
  - kind: ServiceAccount
    name: prometheus-k8s
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
---
# Binds the prometheus-k8s Role in the default namespace to the
# prometheus-k8s ServiceAccount (namespace monitoring).
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: default
subjects:
  - kind: ServiceAccount
    name: prometheus-k8s
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
---
# Binds the prometheus-k8s ClusterRole to the prometheus-k8s ServiceAccount
# in the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus-k8s
subjects:
  - kind: ServiceAccount
    name: prometheus-k8s
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-k8s
================================================
FILE: manifests/prometheus/prometheus-k8s-roles.yaml
================================================
# Role in the monitoring namespace: get/list/watch on nodes, services,
# endpoints and pods, plus get on configmaps.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: monitoring
rules:
  - apiGroups:
      - ""
    resources: [nodes, services, endpoints, pods]
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources: [configmaps]
    verbs:
      - get
---
# Role in the rook namespace: get/list/watch on nodes, services, endpoints
# and pods, plus get on configmaps.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: rook
rules:
  - apiGroups:
      - ""
    resources: [nodes, services, endpoints, pods]
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources: [configmaps]
    verbs:
      - get
---
# Role in kube-system: get/list/watch on services, endpoints and pods.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: kube-system
rules:
  - apiGroups:
      - ""
    resources: [services, endpoints, pods]
    verbs:
      - get
      - list
      - watch
---
# Role in the default namespace: get/list/watch on services, endpoints and
# pods.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: default
rules:
  - apiGroups:
      - ""
    resources: [services, endpoints, pods]
    verbs:
      - get
      - list
      - watch
---
# ClusterRole allowing "get" on the non-resource URL /metrics.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus-k8s
rules:
  - nonResourceURLs:
      - "/metrics"
    verbs:
      - get
================================================
FILE: manifests/prometheus/prometheus-k8s-rules.yaml
================================================
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-k8s-rules
labels:
role: prometheus-rulefiles
prometheus: k8s
data:
alertmanager.rules.yaml: |+
groups:
- name: ./alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
"alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
annotations:
description: The configuration of the instances of the Alertmanager cluster
`{{$labels.service}}` are out of sync.
summary: Alertmanager configurations are inconsistent
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m
labels:
severity: warning
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
disappeared from discovery.
summary: Alertmanager down or not discovered
- alert: FailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
summary: Alertmanager configuration reload has failed
etcd3.rules.yaml: |+
groups:
- name: ./etcd3.rules
rules:
- alert: InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
/ sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
/ sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
}} are slow
summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow
summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} member communication with
{{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} fsync durations are high
summary: high fsync durations
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations
general.rules.yaml: |+
groups:
- name: ./general.rules
rules:
- alert: TargetDown
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% or more of {{ $labels.job }} targets are down.'
summary: Targets are down
- alert: DeadMansSwitch
expr: vector(1)
labels:
severity: none
annotations:
description: This is a DeadMansSwitch meant to ensure that the entire Alerting
pipeline is functional.
summary: Alerting DeadMansSwitch
- alert: TooManyOpenFileDescriptors
expr: 100 * (process_open_fds / process_max_fds) > 95
for: 10m
labels:
severity: critical
annotations:
description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
$labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.'
summary: too many open file descriptors
- record: instance:fd_utilization
expr: process_open_fds / process_max_fds
- alert: FdExhaustionClose
expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
for: 10m
labels:
severity: warning
annotations:
description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
$labels.instance }}) instance will exhaust in file/socket descriptors soon'
summary: file descriptors soon exhausted
- alert: FdExhaustionClose
expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
for: 10m
labels:
severity: critical
annotations:
description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
$labels.instance }}) instance will exhaust in file/socket descriptors soon'
summary: file descriptors soon exhausted
kube-apiserver.rules.yaml: |+
groups:
- name: ./kube-apiserver.rules
rules:
- alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: Prometheus failed to scrape API server(s), or all API servers have
disappeared from service discovery.
summary: API server unreachable
- alert: K8SApiServerLatency
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"})
WITHOUT (instance, resource)) / 1e+06 > 1
for: 10m
labels:
severity: warning
annotations:
description: 99th percentile Latency for {{ $labels.verb }} requests to the
kube-apiserver is higher than 1s.
summary: Kubernetes apiserver latency is high
kube-controller-manager.rules.yaml: |+
groups:
- name: ./kube-controller-manager.rules
rules:
- alert: K8SControllerManagerDown
expr: absent(up{job="kube-controller-manager"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S controller manager. Deployments and replication
controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down
kube-scheduler.rules.yaml: |+
groups:
- name: ./kube-scheduler.rules
rules:
- alert: K8SSchedulerDown
expr: absent(up{job="kube-scheduler"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S scheduler. New pods are not being assigned
to nodes.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
summary: Scheduler is down
kubelet.rules.yaml: |+
groups:
- name: ./kubelet.rules
rules:
- alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
annotations:
description: The Kubelet on {{ $labels.node }} has not checked in with the API,
or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
> 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }} Kubernetes nodes (more than 20% are in the NotReady
state).'
summary: Many Kubernetes nodes are Not Ready
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
for: 1h
labels:
severity: warning
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletDown
expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})
> 0.1
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
labels:
severity: warning
annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
to the limit of 110
summary: Kubelet is close to pod limit
kubernetes.rules.yaml: |+
groups:
- name: ./kubernetes.rules
rules:
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:spec_cpu_shares
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:cpu_usage:rate
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_usage:bytes
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_rss:bytes
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:memory_cache:bytes
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:disk_usage:bytes
expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name, scope, type)
- record: cluster_namespace_controller_pod_container:memory_oom:rate
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name, scope, type)
- record: cluster:memory_allocation:percent
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
/ sum(machine_memory_bytes) BY (cluster)
- record: cluster:memory_used:percent
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
BY (cluster)
- record: cluster:cpu_allocation:percent
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
* ON(cluster, instance) machine_cpu_cores) BY (cluster)
- record: cluster:node_cpu_use:percent
expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
BY (cluster)
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.99"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.9"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
node.rules.yaml: |+
groups:
- name: ./node.rules
rules:
- alert: NodeExporterDown
expr: absent(up{job="node-exporter"} == 1)
for: 10m
labels:
severity: warning
annotations:
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery.
summary: node-exporter cannot be scraped
- alert: K8SNodeOutOfDisk
expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
labels:
service: k8s
severity: critical
annotations:
description: '{{ $labels.node }} has run out of disk space.'
summary: Node ran out of disk space.
- alert: K8SNodeMemoryPressure
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
1
labels:
service: k8s
severity: warning
annotations:
description: '{{ $labels.node }} is under memory pressure.'
summary: Node is under memory pressure.
- alert: K8SNodeDiskPressure
expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
labels:
service: k8s
severity: warning
annotations:
description: '{{ $labels.node }} is under disk pressure.'
summary: Node is under disk pressure.
prometheus.rules.yaml: |+
groups:
- name: ./prometheus.rules
rules:
- alert: FailedReload
expr: prometheus_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Prometheus' configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
summary: Prometheus configuration reload has failed
================================================
FILE: manifests/prometheus/prometheus-k8s-service-account.yaml
================================================
# ServiceAccount "prometheus-k8s"; the Prometheus resource in
# prometheus-k8s.yaml names it in spec.serviceAccountName.
kind: ServiceAccount
apiVersion: v1
metadata:
  name: "prometheus-k8s"
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml
================================================
# Scrape the "web" port of Services labelled alertmanager=main in the
# "monitoring" namespace every 30 seconds.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: alertmanager
  name: alertmanager
spec:
  endpoints:
    - interval: 30s
      port: web
  namespaceSelector:
    matchNames: [monitoring]
  selector:
    matchLabels:
      alertmanager: main
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml
================================================
# Scrape the API server Service (component=apiserver, provider=kubernetes)
# in the "default" namespace over HTTPS, authenticating with the pod's
# mounted service-account token and verifying against the mounted CA.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: apiserver
  name: kube-apiserver
spec:
  # The Prometheus job name is taken from the Service's "component" label.
  jobLabel: component
  endpoints:
    - scheme: https
      port: https
      interval: 30s
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      tlsConfig:
        serverName: kubernetes
        caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  namespaceSelector:
    matchNames: [default]
  selector:
    matchLabels:
      provider: kubernetes
      component: apiserver
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml
================================================
# Scrape the "http-metrics" port of Services labelled
# k8s-app=kube-controller-manager in kube-system every 30 seconds.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: kube-controller-manager
  name: kube-controller-manager
spec:
  # The Prometheus job name is taken from the Service's "k8s-app" label.
  jobLabel: k8s-app
  namespaceSelector:
    matchNames: [kube-system]
  selector:
    matchLabels:
      k8s-app: kube-controller-manager
  endpoints:
    - interval: 30s
      port: http-metrics
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml
================================================
# Scrape the "http-metrics" port of Services labelled
# k8s-app=kube-scheduler in kube-system every 30 seconds.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: kube-scheduler
  name: kube-scheduler
spec:
  # The Prometheus job name is taken from the Service's "k8s-app" label.
  jobLabel: k8s-app
  namespaceSelector:
    matchNames: [kube-system]
  selector:
    matchLabels:
      k8s-app: kube-scheduler
  endpoints:
    - interval: 30s
      port: http-metrics
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml
================================================
# Scrape the "http-metrics" port of Services labelled
# k8s-app=kube-state-metrics in the "monitoring" namespace.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: kube-state-metrics
  name: kube-state-metrics
spec:
  jobLabel: k8s-app
  endpoints:
    # honorLabels keeps the labels exposed by the target when they
    # collide with labels Prometheus would attach itself.
    - honorLabels: true
      interval: 30s
      port: http-metrics
  namespaceSelector:
    matchNames: [monitoring]
  selector:
    matchLabels:
      k8s-app: kube-state-metrics
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml
================================================
# Scrape two ports of the kubelet Service in kube-system: "http-metrics"
# and "cadvisor", both every 30 seconds.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: kubelet
  name: kubelet
spec:
  jobLabel: k8s-app
  namespaceSelector:
    matchNames: [kube-system]
  selector:
    matchLabels:
      k8s-app: kubelet
  endpoints:
    - interval: 30s
      port: http-metrics
    # Only the cadvisor endpoint keeps target-supplied labels on conflict.
    - interval: 30s
      port: cadvisor
      honorLabels: true
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml
================================================
# Scrape the "http-metrics" port of Services labelled
# k8s-app=node-exporter in the "monitoring" namespace every 30 seconds.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: node-exporter
  name: node-exporter
spec:
  jobLabel: k8s-app
  endpoints:
    - interval: 30s
      port: http-metrics
  namespaceSelector:
    matchNames: [monitoring]
  selector:
    matchLabels:
      k8s-app: node-exporter
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml
================================================
# Scrape the "http" port of Services labelled k8s-app=prometheus-operator.
# No namespaceSelector or interval is set here.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: prometheus-operator
  name: prometheus-operator
spec:
  selector:
    matchLabels:
      k8s-app: prometheus-operator
  endpoints:
    - port: http
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml
================================================
# Scrape the "web" port of Services labelled prometheus=k8s in the
# "monitoring" namespace every 30 seconds.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: prometheus
  name: prometheus
spec:
  endpoints:
    - interval: 30s
      port: web
  namespaceSelector:
    matchNames: [monitoring]
  selector:
    matchLabels:
      prometheus: k8s
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-rook.yaml
================================================
# Scrape /metrics on the "rook-api" port of Services labelled
# app=rook-api, rook_cluster=rook in the "rook" namespace.
kind: ServiceMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    k8s-app: rook
  name: rook-api
spec:
  endpoints:
    # One scrape per minute, each allowed up to 30 seconds.
    - interval: 60s
      scrapeTimeout: 30s
      path: /metrics
      port: rook-api
  namespaceSelector:
    matchNames: [rook]
  selector:
    matchLabels:
      rook_cluster: rook
      app: rook-api
================================================
FILE: manifests/prometheus/prometheus-k8s-service-monitor-traefik.yaml
================================================
# Scrape Traefik's Prometheus metrics from /metrics on the "web" port of
# the Traefik Services in kube-system every 5 seconds.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: traefik
  labels:
    k8s-app: traefik
spec:
  endpoints:
    - port: web
      path: /metrics
      interval: 5s
  # spec.selector is a label selector (matchLabels / matchExpressions);
  # "matchNames" is only valid under namespaceSelector. Select on the label
  # carried by the traefik-console Service in traefik.yaml. The "traefik"
  # Service shares this label but exposes no port named "web", so only
  # traefik-console yields scrape targets.
  selector:
    matchLabels:
      k8s-app: traefik-ingress-lb
  namespaceSelector:
    matchNames:
      - kube-system
================================================
FILE: manifests/prometheus/prometheus-k8s-service.yaml
================================================
# Service exposing TCP port 9090 (named "web") of pods labelled
# prometheus=k8s.
kind: Service
apiVersion: v1
metadata:
  name: prometheus-k8s
  labels:
    prometheus: k8s
spec:
  selector:
    prometheus: k8s
  ports:
    - protocol: TCP
      port: 9090
      name: web
================================================
FILE: manifests/prometheus/prometheus-k8s.yaml
================================================
# Prometheus instance "k8s" (single replica, v2.0.0) managed by the
# Prometheus Operator.
kind: Prometheus
apiVersion: monitoring.coreos.com/v1
metadata:
  labels:
    prometheus: k8s
  name: k8s
spec:
  version: v2.0.0
  replicas: 1
  serviceAccountName: prometheus-k8s
  # Pick up every ServiceMonitor that carries a "k8s-app" label, whatever
  # its value.
  serviceMonitorSelector:
    matchExpressions:
      - key: k8s-app
        operator: Exists
  # Rule files are selected by these two labels.
  ruleSelector:
    matchLabels:
      prometheus: k8s
      role: prometheus-rulefiles
  alerting:
    alertmanagers:
      - name: alertmanager-main
        namespace: monitoring
        port: web
  resources:
    requests:
      # The default request is 2Gi, which cannot be scheduled unless some
      # node has more than 2Gi of memory. This lower value is mainly meant
      # for demonstration/testing; tune it to your target and time-series
      # count for production use.
      memory: 400Mi
================================================
FILE: manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml
================================================
# Bind the "prometheus-operator" ClusterRole to the ServiceAccount of the
# same name in the "monitoring" namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: prometheus-operator
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: prometheus-operator
roleRef:
  kind: ClusterRole
  name: prometheus-operator
  apiGroup: rbac.authorization.k8s.io
================================================
FILE: manifests/prometheus-operator/prometheus-operator-cluster-role.yaml
================================================
# Cluster-wide permissions for the "prometheus-operator" ClusterRole.
# Rules are listed in the original order; each rule grants the listed
# verbs on the listed resources of the listed API groups.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: prometheus-operator
rules:
  # Full control over ThirdPartyResources and CustomResourceDefinitions.
  - apiGroups: [extensions]
    resources: [thirdpartyresources]
    verbs: ["*"]
  - apiGroups: [apiextensions.k8s.io]
    resources: [customresourcedefinitions]
    verbs: ["*"]
  # Full control over the monitoring.coreos.com custom resources.
  - apiGroups: [monitoring.coreos.com]
    resources:
      - alertmanagers
      - prometheuses
      - servicemonitors
    verbs: ["*"]
  # Full control over StatefulSets, ConfigMaps and Secrets.
  - apiGroups: [apps]
    resources: [statefulsets]
    verbs: ["*"]
  - apiGroups: [""]
    resources:
      - configmaps
      - secrets
    verbs: ["*"]
  # Narrower access to the remaining core resources.
  - apiGroups: [""]
    resources: [pods]
    verbs:
      - list
      - delete
  - apiGroups: [""]
    resources:
      - services
      - endpoints
    verbs:
      - get
      - create
      - update
  - apiGroups: [""]
    resources: [nodes]
    verbs:
      - list
      - watch
  - apiGroups: [""]
    resources: [namespaces]
    verbs:
      - list
================================================
FILE: manifests/prometheus-operator/prometheus-operator-service-account.yaml
================================================
# ServiceAccount "prometheus-operator", used by the prometheus-operator
# Deployment (spec.template.spec.serviceAccountName).
kind: ServiceAccount
apiVersion: v1
metadata:
  name: "prometheus-operator"
================================================
FILE: manifests/prometheus-operator/prometheus-operator-service.yaml
================================================
# ClusterIP Service in front of the prometheus-operator pods: port 8080
# forwards to the container port named "http".
kind: Service
apiVersion: v1
metadata:
  labels:
    k8s-app: prometheus-operator
  name: prometheus-operator
spec:
  type: ClusterIP
  selector:
    k8s-app: prometheus-operator
  ports:
    - protocol: TCP
      name: http
      port: 8080
      targetPort: http
================================================
FILE: manifests/prometheus-operator/prometheus-operator.yaml
================================================
# Single-replica Deployment of the Prometheus Operator (v0.17.0), running
# under the "prometheus-operator" ServiceAccount.
kind: Deployment
apiVersion: extensions/v1beta1
metadata:
  name: prometheus-operator
  labels:
    k8s-app: prometheus-operator
spec:
  replicas: 1
  template:
    metadata:
      labels:
        k8s-app: prometheus-operator
    spec:
      serviceAccountName: prometheus-operator
      containers:
        - name: prometheus-operator
          image: quay.io/coreos/prometheus-operator:v0.17.0
          args:
            - "--kubelet-service=kube-system/kubelet"
            - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
          ports:
            - name: http
              containerPort: 8080
          resources:
            requests:
              cpu: 100m
              memory: 50Mi
            limits:
              cpu: 200m
              memory: 100Mi
================================================
FILE: manifests/rook/rook-cluster.yaml
================================================
# Namespace that holds the Rook Cluster resource defined in this file.
kind: Namespace
apiVersion: v1
metadata:
  name: "rook"
---
# Rook storage cluster "rook" in the "rook" namespace.
apiVersion: rook.io/v1alpha1
kind: Cluster
metadata:
  name: rook
  namespace: rook
spec:
  # Pin the Rook image tag to the release the operator runs
  # (rook/rook:v0.7.1 in rook-operator.yaml) instead of the floating
  # "master" tag, so the cluster daemons and the operator stay in step.
  versionTag: v0.7.1
  # The path on the host where configuration files will be persisted. If
  # not specified, a kubernetes emptyDir will be created (not recommended).
  # Important: if you reinstall the cluster, make sure you delete this
  # directory from each host or else the mons will fail to start on the
  # new cluster.
  dataDirHostPath: /var/lib/rook
  # toggle to use hostNetwork
  hostNetwork: false
  # set the amount of mons to be started
  monCount: 1
  # To control where various services will be scheduled by kubernetes, use
  # the placement configuration sections below. The example under 'all'
  # would have all services scheduled on kubernetes nodes labeled with
  # 'role=storage' and tolerate taints with a key of 'storage-node'.
  # placement:
  #   all:
  #     nodeAffinity:
  #       requiredDuringSchedulingIgnoredDuringExecution:
  #         nodeSelectorTerms:
  #           - matchExpressions:
  #               - key: role
  #                 operator: In
  #                 values:
  #                   - storage-node
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  #       - key: storage-node
  #         operator: Exists
  #   api:
  #     nodeAffinity:
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  #   mgr:
  #     nodeAffinity:
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  #   mon:
  #     nodeAffinity:
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  #   osd:
  #     nodeAffinity:
  #     podAffinity:
  #     podAntiAffinity:
  #     tolerations:
  storage:
    useAllNodes: true
    useAllDevices: false
    # Regex over device names: "sd" followed by any character except "a".
    deviceFilter: ^sd[^a]
    metadataDevice:
    location:
    storeConfig:
      storeType: bluestore
      # this value can be removed for environments with normal sized disks
      # (100 GB or larger)
      databaseSizeMB: 1024
      # this value can be removed for environments with normal sized disks
      # (20 GB or larger)
      journalSizeMB: 1024
    # Individual nodes and their config can be specified as well, but
    # 'useAllNodes' above must be set to false. Then, only the named nodes
    # below will be used as storage resources. Each node's 'name' field
    # should match their 'kubernetes.io/hostname' label.
    # nodes:
    #   - name: "172.17.4.101"
    #     # specific directories to use for storage can be specified for
    #     # each node
    #     directories:
    #       - path: "/rook/storage-dir"
    #   - name: "172.17.4.201"
    #     # specific devices to use for storage can be specified for each
    #     # node
    #     devices:
    #       - name: "sdb"
    #       - name: "sdc"
    #     # configuration can be specified at the node level which
    #     # overrides the cluster level config
    #     storeConfig:
    #       storeType: bluestore
    #   - name: "172.17.4.301"
    #     deviceFilter: "^sd."
================================================
FILE: manifests/rook/rook-operator.yaml
================================================
---
# CRD registering the namespaced "Cluster" kind in group rook.io/v1alpha1.
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: clusters.rook.io
spec:
  group: rook.io
  version: v1alpha1
  scope: Namespaced
  names:
    kind: Cluster
    listKind: ClusterList
    singular: cluster
    plural: clusters
---
# CRD registering the namespaced "Filesystem" kind in group
# rook.io/v1alpha1.
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: filesystems.rook.io
spec:
  group: rook.io
  version: v1alpha1
  scope: Namespaced
  names:
    kind: Filesystem
    listKind: FilesystemList
    singular: filesystem
    plural: filesystems
---
# CRD registering the namespaced "ObjectStore" kind in group
# rook.io/v1alpha1.
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: objectstores.rook.io
spec:
  group: rook.io
  version: v1alpha1
  scope: Namespaced
  names:
    kind: ObjectStore
    listKind: ObjectStoreList
    singular: objectstore
    plural: objectstores
---
# CRD registering the namespaced "Pool" kind in group rook.io/v1alpha1.
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: pools.rook.io
spec:
  group: rook.io
  version: v1alpha1
  scope: Namespaced
  names:
    kind: Pool
    listKind: PoolList
    singular: pool
    plural: pools
---
# CRD registering the namespaced "VolumeAttachment" kind in group
# rook.io/v1alpha1.
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: volumeattachments.rook.io
spec:
  group: rook.io
  version: v1alpha1
  scope: Namespaced
  names:
    kind: VolumeAttachment
    listKind: VolumeAttachmentList
    singular: volumeattachment
    plural: volumeattachments
---
# Cluster-wide permissions for the "rook-operator" ClusterRole. Rules are
# listed in the original order.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: rook-operator
rules:
  # Core API group: read, patch and write access to these resources.
  - apiGroups: [""]
    resources:
      - namespaces
      - serviceaccounts
      - secrets
      - pods
      - services
      - nodes
      - nodes/proxy
      - configmaps
      - events
      - persistentvolumes
      - persistentvolumeclaims
    verbs: [get, list, watch, patch, create, update, delete]
  # Workload controllers in the "extensions" group (no patch).
  - apiGroups: [extensions]
    resources:
      - deployments
      - daemonsets
      - replicasets
    verbs: [get, list, watch, create, update, delete]
  # RBAC objects (no patch).
  - apiGroups: [rbac.authorization.k8s.io]
    resources:
      - clusterroles
      - clusterrolebindings
      - roles
      - rolebindings
    verbs: [get, list, watch, create, update, delete]
  # StorageClasses: read and delete only.
  - apiGroups: [storage.k8s.io]
    resources: [storageclasses]
    verbs: [get, list, watch, delete]
  # Everything in the rook.io group.
  - apiGroups: [rook.io]
    resources: ["*"]
    verbs: ["*"]
---
# Namespace for the Rook operator's ServiceAccount and Deployment.
kind: Namespace
apiVersion: v1
metadata:
  name: "rook-system"
---
# ServiceAccount the rook-operator Deployment runs as.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: rook-system
  name: rook-operator
---
# Bind the "rook-operator" ClusterRole to the rook-operator ServiceAccount
# in the rook-system namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  # ClusterRoleBinding is cluster-scoped, so it takes no
  # metadata.namespace; only the subject below is namespaced.
  name: rook-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: rook-operator
subjects:
  - kind: ServiceAccount
    name: rook-operator
    namespace: rook-system
---
# Single-replica Deployment of the Rook operator (rook/rook:v0.7.1) in the
# rook-system namespace.
kind: Deployment
apiVersion: apps/v1beta1
metadata:
  namespace: rook-system
  name: rook-operator
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: rook-operator
    spec:
      serviceAccountName: rook-operator
      containers:
        - name: rook-operator
          image: rook/rook:v0.7.1
          args:
            - operator
          env:
            # To disable RBAC, uncomment the following:
            # - name: RBAC_ENABLED
            #   value: "false"
            # Rook Agent toleration. Will tolerate all taints with all
            # keys. Choose between NoSchedule, PreferNoSchedule and
            # NoExecute:
            # - name: AGENT_TOLERATION
            #   value: "NoSchedule"
            # (Optional) Rook Agent toleration key. Set this to the key of
            # the taint you want to tolerate
            # - name: AGENT_TOLERATION_KEY
            #   value: ""
            # Set the path where the Rook agent can find the flex volumes
            - name: FLEXVOLUME_DIR_PATH
              value: "/var/lib/kubelet/volumeplugins"
            # The interval to check if every mon is in the quorum.
            - name: ROOK_MON_HEALTHCHECK_INTERVAL
              value: "45s"
            # The duration to wait before trying to failover or
            # remove/replace the current mon with a new mon (useful for
            # compensating flapping network).
            - name: ROOK_MON_OUT_TIMEOUT
              value: "300s"
            # The remaining variables are filled in from the pod's own
            # fields via the downward API.
            - name: NODE_NAME
              valueFrom:
                fieldRef: {fieldPath: spec.nodeName}
            - name: POD_NAME
              valueFrom:
                fieldRef: {fieldPath: metadata.name}
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef: {fieldPath: metadata.namespace}
================================================
FILE: manifests/rook/rook-storageclass.yaml
================================================
# Rook pool "replicapool" in the "rook" namespace, replicated with size 1.
kind: Pool
apiVersion: rook.io/v1alpha1
metadata:
  namespace: rook
  name: replicapool
spec:
  replicated:
    size: 1
  # To use an erasure-coded pool instead, comment out the replication size
  # above and uncomment the settings below. Make sure you have enough OSDs
  # to support the replica size or erasure code chunks.
  # erasureCoded:
  #   dataChunks: 2
  #   codingChunks: 1
---
# StorageClass "rook-block", annotated as the cluster's default class and
# provisioned by rook.io/block from the "replicapool" pool.
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: rook-block
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
provisioner: rook.io/block
parameters:
  pool: replicapool
  # Specify the Rook cluster from which to create volumes.
  # If not specified, it will use `rook` as the name of the cluster.
  # This is also the namespace where the cluster will be running.
  clusterName: rook
  # Specify the filesystem type of the volume. If not specified, it will
  # use `ext4`.
  # fstype: ext4
================================================
FILE: manifests/traefik.yaml
================================================
---
# Read-only (get/list/watch) access to the objects named below: services,
# endpoints and secrets in the core group, ingresses in "extensions".
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: traefik-ingress-controller
rules:
  - apiGroups: [""]
    resources:
      - services
      - endpoints
      - secrets
    verbs: [get, list, watch]
  - apiGroups: [extensions]
    resources:
      - ingresses
    verbs: [get, list, watch]
---
# Bind the "traefik-ingress-controller" ClusterRole to the ServiceAccount
# of the same name in kube-system.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: traefik-ingress-controller
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: traefik-ingress-controller
roleRef:
  kind: ClusterRole
  name: traefik-ingress-controller
  apiGroup: rbac.authorization.k8s.io
---
# ServiceAccount referenced by the traefik-ingress-controller
# ClusterRoleBinding.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: kube-system
  name: traefik-ingress-controller
---
# Service exposing ports 80 ("http") and 443 ("https") of pods labelled
# k8s-app=traefik-ingress-lb.
kind: Service
apiVersion: v1
metadata:
  namespace: kube-system
  name: traefik
  labels:
    k8s-app: traefik-ingress-lb
spec:
  ports:
    - name: http
      port: 80
    - name: https
      port: 443
  selector:
    k8s-app: traefik-ingress-lb
---
# Service exposing port 8080 ("web") of pods labelled
# k8s-app=traefik-ingress-lb; it is the backend of the traefik-ingress
# Ingress.
kind: Service
apiVersion: v1
metadata:
  namespace: kube-system
  name: traefik-console
  labels:
    k8s-app: traefik-ingress-lb
spec:
  ports:
    - name: web
      port: 8080
  selector:
    k8s-app: traefik-ingress-lb
---
# Traefik configuration file, mounted into the Traefik pods at
# /config/traefik.toml. $EMAIL and ${DOMAIN} are placeholders that must be
# substituted before this manifest is applied.
apiVersion: v1
kind: ConfigMap
metadata:
  name: traefik-conf
  namespace: kube-system
data:
  traefik.toml: |
    # traefik.toml
    defaultEntryPoints = ["http","https"]
    InsecureSkipVerify = true
    [entryPoints]
      [entryPoints.http]
      address = ":80"
        [entryPoints.http.redirect]
        entryPoint = "https"
      [entryPoints.https]
      address = ":443"
        [entryPoints.https.tls]
    [acme]
    email = "$EMAIL"
    storage = "/acme/acme.json"
    entryPoint = "https"
    onDemand = true
    onHostRule = true
    # Traefik 1.6 (the image deployed by the DaemonSet) talks ACME v2, so
    # the directory URL must be the v02 endpoint, not acme-v01.
    caServer = "https://acme-v02.api.letsencrypt.org/directory"
      [acme.httpChallenge]
      entryPoint = "http"
    [[acme.domains]]
    main = "${DOMAIN}"
    [web]
    address = ":8080"
      [web.metrics.prometheus]
      Buckets=[0.1,0.3,1.2,5.0]
---
# Traefik ingress controller, run as a DaemonSet restricted to master
# nodes and bound to host ports 80 and 443.
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: traefik-ingress-controller
  namespace: kube-system
  labels:
    k8s-app: traefik-ingress-lb
spec:
  revisionHistoryLimit: 0
  template:
    metadata:
      labels:
        k8s-app: traefik-ingress-lb
        name: traefik-ingress-lb
    spec:
      # Run under the dedicated ServiceAccount so the
      # traefik-ingress-controller ClusterRole/Binding defined in this file
      # actually applies, instead of the namespace's default account.
      serviceAccountName: traefik-ingress-controller
      terminationGracePeriodSeconds: 60
      volumes:
        - name: config
          configMap:
            name: traefik-conf
        # ACME state is kept on the host so it survives pod restarts.
        - name: acme
          hostPath:
            path: /etc/traefik/acme
      containers:
        - image: traefik:v1.6.5
          name: traefik-ingress-lb
          imagePullPolicy: Always
          volumeMounts:
            - mountPath: "/config"
              name: "config"
            - mountPath: "/acme"
              name: "acme"
          ports:
            - containerPort: 80
              hostPort: 80
            - containerPort: 443
              hostPort: 443
            - containerPort: 8080
          args:
            - --configfile=/config/traefik.toml
            - --web
            - --web.metrics.prometheus
            - --web.metrics.prometheus.buckets=0.1,0.3,1.2,5.0
            - --kubernetes
            - --logLevel=DEBUG
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  # Match nodes whose master-role label has an empty value.
                  # The empty string is written explicitly: a bare "-" is a
                  # YAML null, not a string.
                  - key: node-role.kubernetes.io/master
                    operator: In
                    values:
                      - ""
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
---
# Ingress routing traefik.${DOMAIN} to the "web" port of the
# traefik-console Service. The annotations request basic auth backed by
# the "kubesecret" secret. ${DOMAIN} is a placeholder substituted before
# the manifest is applied.
kind: Ingress
apiVersion: extensions/v1beta1
metadata:
  namespace: kube-system
  name: traefik-ingress
  annotations:
    kubernetes.io/ingress.class: 'traefik'
    ingress.kubernetes.io/auth-type: 'basic'
    ingress.kubernetes.io/auth-secret: 'kubesecret'
spec:
  rules:
    - host: "traefik.${DOMAIN}"
      http:
        paths:
          - backend:
              servicePort: web
              serviceName: traefik-console